[Intel-gfx] [PATCH] drm/i915: add error detection & collection
Jesse Barnes
jbarnes at virtuousgeek.org
Fri Jun 19 20:42:17 CEST 2009
[I'm still testing this on my 945, but it seems ok on my GM45. Next
patch will add hang detection & reset capability.]
This patch enables error detection by enabling several types of error
interrupts. When an error interrupt is received, the interrupt
handler captures the error state; hopefully resulting in an accurate
set of error data (error type, active head pointer, etc.).
The captured error record is dumped to the syslog and made available
through debugfs. A uevent is also generated, to indicate to userspace
that a GPU dump should be captured along with the contents of the
debugfs files.
Signed-off-by: Jesse Barnes <jbarnes at virtuousgeek.org>
diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 1a60626..1029750 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -1192,6 +1192,7 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags)
pci_enable_msi(dev->pdev);
spin_lock_init(&dev_priv->user_irq_lock);
+ spin_lock_init(&dev_priv->error_lock);
dev_priv->user_irq_refcount = 0;
ret = drm_vblank_init(dev, I915_NUM_PIPE);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 7a84f04..7756f78 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -133,6 +133,22 @@ struct sdvo_device_mapping {
u8 initialized;
};
+struct drm_i915_error_state {
+ u32 eir;
+ u32 pgtbl_er;
+ u32 pipeastat;
+ u32 pipebstat;
+ u32 ipeir;
+ u32 ipehr;
+ u32 instdone;
+ u32 acthd;
+ u32 instpm;
+ u32 instps;
+ u32 instdone1;
+ u32 seqno;
+ struct timeval time;
+};
+
typedef struct drm_i915_private {
struct drm_device *dev;
@@ -209,6 +225,10 @@ typedef struct drm_i915_private {
int fence_reg_start; /* 4 if userland hasn't ioctl'd us yet */
int num_fence_regs; /* 8 on pre-965, 16 otherwise */
+ spinlock_t error_lock;
+ struct drm_i915_error_state *first_error;
+ struct work_struct error_work;
+
/* Register state */
u8 saveLBB;
u32 saveDSPACNTR;
diff --git a/drivers/gpu/drm/i915/i915_gem_debugfs.c b/drivers/gpu/drm/i915/i915_gem_debugfs.c
index 28146e4..3c2e39e 100644
--- a/drivers/gpu/drm/i915/i915_gem_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_gem_debugfs.c
@@ -323,6 +323,41 @@ static int i915_ringbuffer_info(struct seq_file *m, void *data)
return 0;
}
+static int i915_error_state(struct seq_file *m, void *unused)
+{
+ struct drm_info_node *node = (struct drm_info_node *) m->private;
+ struct drm_device *dev = node->minor->dev;
+ drm_i915_private_t *dev_priv = dev->dev_private;
+ struct drm_i915_error_state *error;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev_priv->error_lock, flags);
+ if (!dev_priv->first_error) {
+ seq_printf(m, "no error state collected\n");
+ goto out;
+ }
+
+ error = dev_priv->first_error;
+
+ seq_printf(m, "Time: %ld s %ld us\n", error->time.tv_sec,
+ error->time.tv_usec);
+ seq_printf(m, "EIR: 0x%08x\n", error->eir);
+ seq_printf(m, " PGTBL_ER: 0x%08x\n", error->pgtbl_er);
+ seq_printf(m, " INSTPM: 0x%08x\n", error->instpm);
+ seq_printf(m, " IPEIR: 0x%08x\n", error->ipeir);
+ seq_printf(m, " IPEHR: 0x%08x\n", error->ipehr);
+ seq_printf(m, " INSTDONE: 0x%08x\n", error->instdone);
+ seq_printf(m, " ACTHD: 0x%08x\n", error->acthd);
+ if (IS_I965G(dev)) {
+ seq_printf(m, " INSTPS: 0x%08x\n", error->instps);
+ seq_printf(m, " INSTDONE1: 0x%08x\n", error->instdone1);
+ }
+
+out:
+ spin_unlock_irqrestore(&dev_priv->error_lock, flags);
+
+ return 0;
+}
static struct drm_info_list i915_gem_debugfs_list[] = {
{"i915_gem_active", i915_gem_object_list_info, 0, (void *) ACTIVE_LIST},
@@ -336,6 +371,7 @@ static struct drm_info_list i915_gem_debugfs_list[] = {
{"i915_ringbuffer_data", i915_ringbuffer_data, 0},
{"i915_ringbuffer_info", i915_ringbuffer_info, 0},
{"i915_batchbuffers", i915_batchbuffer_info, 0},
+ {"i915_error_state", i915_error_state, 0},
};
#define I915_GEM_DEBUGFS_ENTRIES ARRAY_SIZE(i915_gem_debugfs_list)
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index b86b7b7..39a2b40 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -26,6 +26,7 @@
*
*/
+#include <linux/sysrq.h>
#include "drmP.h"
#include "drm.h"
#include "i915_drm.h"
@@ -41,9 +42,10 @@
* we leave them always unmasked in IMR and then control enabling them through
* PIPESTAT alone.
*/
-#define I915_INTERRUPT_ENABLE_FIX (I915_ASLE_INTERRUPT | \
- I915_DISPLAY_PIPE_A_EVENT_INTERRUPT | \
- I915_DISPLAY_PIPE_B_EVENT_INTERRUPT)
+#define I915_INTERRUPT_ENABLE_FIX (I915_ASLE_INTERRUPT | \
+ I915_DISPLAY_PIPE_A_EVENT_INTERRUPT | \
+ I915_DISPLAY_PIPE_B_EVENT_INTERRUPT | \
+ I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
/** Interrupts that we mask and unmask at runtime. */
#define I915_INTERRUPT_ENABLE_VAR (I915_USER_INTERRUPT)
@@ -278,6 +280,201 @@ irqreturn_t igdng_irq_handler(struct drm_device *dev)
return ret;
}
+/**
+ * i915_error_work_func - do process context error handling work
+ * @work: work struct
+ *
+ * Fire an error uevent so userspace can see that a hang or error
+ * was detected.
+ */
+static void i915_error_work_func(struct work_struct *work)
+{
+ drm_i915_private_t *dev_priv = container_of(work, drm_i915_private_t,
+ error_work);
+ struct drm_device *dev = dev_priv->dev;
+ char *event_string = "ERROR=1";
+ char *envp[] = { event_string, NULL };
+
+ DRM_DEBUG("generating error event\n");
+
+ kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, envp);
+}
+
+/**
+ * i915_capture_error_state - capture an error record for later analysis
+ * @dev: drm device
+ *
+ * Should be called when an error is detected (either a hang or an error
+ * interrupt) to capture error state from the time of the error. Fills
+ * out a structure which becomes available in debugfs for user level tools
+ * to pick up.
+ */
+static void i915_capture_error_state(struct drm_device *dev)
+{
+ struct drm_i915_private *dev_priv = dev->dev_private;
+ struct drm_i915_error_state *error;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev_priv->error_lock, flags);
+ if (dev_priv->first_error)
+ goto out;
+
+ error = kmalloc(sizeof(*error), GFP_ATOMIC);
+ if (!error) {
+ DRM_DEBUG("out ot memory, not capturing error state\n");
+ goto out;
+ }
+
+ error->eir = I915_READ(EIR);
+ error->pgtbl_er = I915_READ(PGTBL_ER);
+ error->pipeastat = I915_READ(PIPEASTAT);
+ error->pipebstat = I915_READ(PIPEBSTAT);
+ error->instpm = I915_READ(INSTPM);
+ if (!IS_I965G(dev)) {
+ error->ipeir = I915_READ(IPEIR);
+ error->ipehr = I915_READ(IPEHR);
+ error->instdone = I915_READ(INSTDONE);
+ error->acthd = I915_READ(ACTHD);
+ } else {
+ error->ipeir = I915_READ(IPEIR_I965);
+ error->ipehr = I915_READ(IPEHR_I965);
+ error->instdone = I915_READ(INSTDONE_I965);
+ error->instps = I915_READ(INSTPS);
+ error->instdone1 = I915_READ(INSTDONE1);
+ error->acthd = I915_READ(ACTHD_I965);
+ }
+
+ do_gettimeofday(&error->time);
+
+ dev_priv->first_error = error;
+
+out:
+ spin_unlock_irqrestore(&dev_priv->error_lock, flags);
+}
+
+/**
+ * i915_handle_error - handle an error interrupt
+ * @dev: drm device
+ *
+ * Do some basic checking of regsiter state at error interrupt time and
+ * dump it to the syslog. Also call i915_capture_error_state() to make
+ * sure we get a record and make it available in debugfs. Fire a uevent
+ * so userspace knows something bad happened (should trigger collection
+ * of a ring dump etc.).
+ */
+static void i915_handle_error(struct drm_device *dev)
+{
+ struct drm_i915_private *dev_priv = dev->dev_private;
+ u32 eir = I915_READ(EIR);
+ u32 pipea_stats = I915_READ(PIPEASTAT);
+ u32 pipeb_stats = I915_READ(PIPEBSTAT);
+
+ i915_capture_error_state(dev);
+
+ printk(KERN_ERR "render error detected, EIR: 0x%08x\n",
+ eir);
+
+ if (IS_G4X(dev)) {
+ if (eir & (GM45_ERROR_MEM_PRIV | GM45_ERROR_CP_PRIV)) {
+ u32 ipeir = I915_READ(IPEIR_I965);
+
+ printk(KERN_ERR " IPEIR: 0x%08x\n",
+ I915_READ(IPEIR_I965));
+ printk(KERN_ERR " IPEHR: 0x%08x\n",
+ I915_READ(IPEHR_I965));
+ printk(KERN_ERR " INSTDONE: 0x%08x\n",
+ I915_READ(INSTDONE_I965));
+ printk(KERN_ERR " INSTPS: 0x%08x\n",
+ I915_READ(INSTPS));
+ printk(KERN_ERR " INSTDONE1: 0x%08x\n",
+ I915_READ(INSTDONE1));
+ printk(KERN_ERR " ACTHD: 0x%08x\n",
+ I915_READ(ACTHD_I965));
+ I915_WRITE(IPEIR_I965, ipeir);
+ (void)I915_READ(IPEIR_I965);
+ }
+ if (eir & GM45_ERROR_PAGE_TABLE) {
+ u32 pgtbl_err = I915_READ(PGTBL_ER);
+ printk(KERN_ERR "page table error\n");
+ printk(KERN_ERR " PGTBL_ER: 0x%08x\n",
+ pgtbl_err);
+ I915_WRITE(PGTBL_ER, pgtbl_err);
+ (void)I915_READ(PGTBL_ER);
+ }
+ }
+
+ if (IS_I9XX(dev)) {
+ if (eir & I915_ERROR_PAGE_TABLE) {
+ u32 pgtbl_err = I915_READ(PGTBL_ER);
+ printk(KERN_ERR "page table error\n");
+ printk(KERN_ERR " PGTBL_ER: 0x%08x\n",
+ pgtbl_err);
+ I915_WRITE(PGTBL_ER, pgtbl_err);
+ (void)I915_READ(PGTBL_ER);
+ }
+ }
+
+ if (eir & I915_ERROR_MEMORY_REFRESH) {
+ printk(KERN_ERR "memory refresh error\n");
+ printk(KERN_ERR "PIPEASTAT: 0x%08x\n",
+ pipea_stats);
+ printk(KERN_ERR "PIPEBSTAT: 0x%08x\n",
+ pipeb_stats);
+ /* pipestat has already been acked */
+ }
+ if (eir & I915_ERROR_INSTRUCTION) {
+ printk(KERN_ERR "instruction error\n");
+ printk(KERN_ERR " INSTPM: 0x%08x\n",
+ I915_READ(INSTPM));
+ if (!IS_I965G(dev)) {
+ u32 ipeir = I915_READ(IPEIR);
+
+ printk(KERN_ERR " IPEIR: 0x%08x\n",
+ I915_READ(IPEIR));
+ printk(KERN_ERR " IPEHR: 0x%08x\n",
+ I915_READ(IPEHR));
+ printk(KERN_ERR " INSTDONE: 0x%08x\n",
+ I915_READ(INSTDONE));
+ printk(KERN_ERR " ACTHD: 0x%08x\n",
+ I915_READ(ACTHD));
+ I915_WRITE(IPEIR, ipeir);
+ (void)I915_READ(IPEIR);
+ } else {
+ u32 ipeir = I915_READ(IPEIR_I965);
+
+ printk(KERN_ERR " IPEIR: 0x%08x\n",
+ I915_READ(IPEIR_I965));
+ printk(KERN_ERR " IPEHR: 0x%08x\n",
+ I915_READ(IPEHR_I965));
+ printk(KERN_ERR " INSTDONE: 0x%08x\n",
+ I915_READ(INSTDONE_I965));
+ printk(KERN_ERR " INSTPS: 0x%08x\n",
+ I915_READ(INSTPS));
+ printk(KERN_ERR " INSTDONE1: 0x%08x\n",
+ I915_READ(INSTDONE1));
+ printk(KERN_ERR " ACTHD: 0x%08x\n",
+ I915_READ(ACTHD_I965));
+ I915_WRITE(IPEIR_I965, ipeir);
+ (void)I915_READ(IPEIR_I965);
+ }
+ }
+
+ I915_WRITE(EIR, eir);
+ (void)I915_READ(EIR);
+ eir = I915_READ(EIR);
+ if (eir) {
+ /*
+ * some errors might have become stuck,
+ * mask them.
+ */
+ DRM_ERROR("EIR stuck: 0x%08x, masking\n", eir);
+ I915_WRITE(EMR, I915_READ(EMR) | eir);
+ I915_WRITE(IIR, I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT);
+ }
+
+ schedule_work(&dev_priv->error_work);
+}
+
irqreturn_t i915_driver_irq_handler(DRM_IRQ_ARGS)
{
struct drm_device *dev = (struct drm_device *) arg;
@@ -319,6 +516,10 @@ irqreturn_t i915_driver_irq_handler(DRM_IRQ_ARGS)
pipea_stats = I915_READ(PIPEASTAT);
pipeb_stats = I915_READ(PIPEBSTAT);
+ /* Make sure pipestat regs are still valid... */
+ if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
+ i915_handle_error(dev);
+
/*
* Clear the PIPE(A|B)STAT regs before the IIR
*/
@@ -699,6 +900,7 @@ void i915_driver_irq_preinstall(struct drm_device * dev)
atomic_set(&dev_priv->irq_received, 0);
INIT_WORK(&dev_priv->hotplug_work, i915_hotplug_work_func);
+ INIT_WORK(&dev_priv->error_work, i915_error_work_func);
if (IS_IGDNG(dev)) {
igdng_irq_preinstall(dev);
@@ -722,6 +924,7 @@ int i915_driver_irq_postinstall(struct drm_device *dev)
{
drm_i915_private_t *dev_priv = (drm_i915_private_t *) dev->dev_private;
u32 enable_mask = I915_INTERRUPT_ENABLE_FIX | I915_INTERRUPT_ENABLE_VAR;
+ u32 error_mask;
DRM_INIT_WAITQUEUE(&dev_priv->irq_queue);
@@ -758,6 +961,21 @@ int i915_driver_irq_postinstall(struct drm_device *dev)
i915_enable_irq(dev_priv, I915_DISPLAY_PORT_INTERRUPT);
}
+ /*
+ * Enable some error detection, note the instruction error mask
+ * bit is reserved, so we leave it masked.
+ */
+ if (IS_G4X(dev)) {
+ error_mask = ~(GM45_ERROR_PAGE_TABLE |
+ GM45_ERROR_MEM_PRIV |
+ GM45_ERROR_CP_PRIV |
+ I915_ERROR_MEMORY_REFRESH);
+ } else {
+ error_mask = ~(I915_ERROR_PAGE_TABLE |
+ I915_ERROR_MEMORY_REFRESH);
+ }
+ I915_WRITE(EMR, error_mask);
+
/* Disable pipe interrupt enables, clear pending pipe status */
I915_WRITE(PIPEASTAT, I915_READ(PIPEASTAT) & 0x8000ffff);
I915_WRITE(PIPEBSTAT, I915_READ(PIPEBSTAT) & 0x8000ffff);
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index f6237a0..afb9835 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -206,6 +206,7 @@
/*
* Instruction and interrupt control regs
*/
+#define PGTBL_ER 0x02024
#define PRB0_TAIL 0x02030
#define PRB0_HEAD 0x02034
#define PRB0_START 0x02038
@@ -226,11 +227,18 @@
#define PRB1_HEAD 0x02044 /* 915+ only */
#define PRB1_START 0x02048 /* 915+ only */
#define PRB1_CTL 0x0204c /* 915+ only */
+#define IPEIR_I965 0x02064
+#define IPEHR_I965 0x02068
+#define INSTDONE_I965 0x0206c
+#define INSTPS 0x02070 /* 965+ only */
+#define INSTDONE1 0x0207c /* 965+ only */
#define ACTHD_I965 0x02074
#define HWS_PGA 0x02080
#define HWS_ADDRESS_MASK 0xfffff000
#define HWS_START_ADDRESS_SHIFT 4
#define IPEIR 0x02088
+#define IPEHR 0x0208c
+#define INSTDONE 0x02090
#define NOPID 0x02094
#define HWSTAM 0x02098
#define SCPD0 0x0209c /* 915+ only */
@@ -258,6 +266,12 @@
#define EIR 0x020b0
#define EMR 0x020b4
#define ESR 0x020b8
+#define GM45_ERROR_PAGE_TABLE (1<<5)
+#define GM45_ERROR_MEM_PRIV (1<<4)
+#define I915_ERROR_PAGE_TABLE (1<<4)
+#define GM45_ERROR_CP_PRIV (1<<3)
+#define I915_ERROR_MEMORY_REFRESH (1<<1)
+#define I915_ERROR_INSTRUCTION (1<<0)
#define INSTPM 0x020c0
#define ACTHD 0x020c8
#define FW_BLC 0x020d8
More information about the Intel-gfx
mailing list