[Intel-gfx] [PATCH] i915: add error detection & state dumping

Jesse Barnes jbarnes at virtuousgeek.org
Tue Apr 21 00:38:01 CEST 2009


Add error state detection and state dumping to the i915 driver.

This is still pretty rudimentary, since it just dumps error state at
detect time or when the i915_error_state file from debugfs is read.  To
really figure things out it would be good to track the PGTBL_ER offset
back to its originating batchbuffer and process, and save the
batchbuffer for later fetch & decode by the dumping tool.

We'd also like to be able to recover from errors by killing the
offending process and/or resetting the chip as needed.

I've tested this on 965, but re-reviewed the offets for pre-965 (good
thing I did, all the debug registers moved), so it should work on
earlier chips too.  However I've only successfully dumped instruction
parse errors, it would be good to get additional testing for pipe
underruns & page table errors.

Signed-off-by: Jesse Barnes <jbarnes at virtuousgeek.org>

diff --git a/drivers/gpu/drm/i915/i915_gem_debugfs.c b/drivers/gpu/drm/i915/i915_gem_debugfs.c
index 986f108..c023b40 100644
--- a/drivers/gpu/drm/i915/i915_gem_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_gem_debugfs.c
@@ -323,6 +323,44 @@ static int i915_ringbuffer_info(struct seq_file *m, void *data)
 	return 0;
 }
 
+static int i915_error_state(struct seq_file *m, void *unused)
+{
+	struct drm_info_node *node = (struct drm_info_node *) m->private;
+	struct drm_device *dev = node->minor->dev;
+	drm_i915_private_t *dev_priv = dev->dev_private;
+	u32 eir = I915_READ(EIR);
+
+	seq_printf(m, "ESR: 0x%08x\n", I915_READ(ESR));
+	seq_printf(m, "EMR: 0x%08x\n", I915_READ(EMR));
+	seq_printf(m, "EIR: 0x%08x\n", I915_READ(EIR));
+	if (eir & I915_ERROR_PAGE_TABLE) {
+		seq_printf(m, "  page table error\n");
+	}
+	if (eir & I915_ERROR_MEMORY_REFRESH) {
+		seq_printf(m, "  memory refresh error\n");
+	}
+	if (eir & I915_ERROR_INSTRUCTION) {
+		seq_printf(m, "  instruction error\n");
+	}
+	seq_printf(m, "  PGTBL_ER: 0x%08x\n", I915_READ(PGTBL_ER));
+	seq_printf(m, "  INSTPM: 0x%08x\n", I915_READ(INSTPM));
+	if (!IS_I965G(dev)) {
+		seq_printf(m, "  IPEIR: 0x%08x\n", I915_READ(IPEIR));
+		seq_printf(m, "  IPEHR: 0x%08x\n", I915_READ(IPEHR));
+		seq_printf(m, "  INSTDONE: 0x%08x\n", I915_READ(INSTDONE));
+		seq_printf(m, "  ACTHD: 0x%08x\n", I915_READ(ACTHD));
+	} else {
+		seq_printf(m, "  IPEIR: 0x%08x\n", I915_READ(IPEIR_I965));
+		seq_printf(m, "  IPEHR: 0x%08x\n", I915_READ(IPEHR_I965));
+		seq_printf(m, "  INSTDONE: 0x%08x\n",
+			   I915_READ(INSTDONE_I965));
+		seq_printf(m, "  INSTPS: 0x%08x\n", I915_READ(INSTPS));
+		seq_printf(m, "  INSTDONE1: 0x%08x\n", I915_READ(INSTDONE1));
+		seq_printf(m, "  ACTHD: 0x%08x\n", I915_READ(ACTHD_I965));
+	}
+
+	return 0;
+}
 
 static struct drm_info_list i915_gem_debugfs_list[] = {
 	{"i915_gem_active", i915_gem_object_list_info, 0, (void *) ACTIVE_LIST},
@@ -336,6 +374,7 @@ static struct drm_info_list i915_gem_debugfs_list[] = {
 	{"i915_ringbuffer_data", i915_ringbuffer_data, 0},
 	{"i915_ringbuffer_info", i915_ringbuffer_info, 0},
 	{"i915_batchbuffers", i915_batchbuffer_info, 0},
+	{"i915_error_state", i915_error_state, 0},
 };
 #define I915_GEM_DEBUGFS_ENTRIES ARRAY_SIZE(i915_gem_debugfs_list)
 
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 98bb4c8..57606d3 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -41,8 +41,9 @@
  * we leave them always unmasked in IMR and then control enabling them through
  * PIPESTAT alone.
  */
-#define I915_INTERRUPT_ENABLE_FIX (I915_ASLE_INTERRUPT | \
-				   I915_DISPLAY_PIPE_A_EVENT_INTERRUPT |  \
+#define I915_INTERRUPT_ENABLE_FIX (I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT |\
+				   I915_ASLE_INTERRUPT |		       \
+				   I915_DISPLAY_PIPE_A_EVENT_INTERRUPT |       \
 				   I915_DISPLAY_PIPE_B_EVENT_INTERRUPT)
 
 /** Interrupts that we mask and unmask at runtime. */
@@ -267,6 +268,66 @@ irqreturn_t i915_driver_irq_handler(DRM_IRQ_ARGS)
 			I915_READ(PORT_HOTPLUG_STAT);
 		}
 
+		if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT) {
+			u32 eir = I915_READ(EIR);
+			printk(KERN_ERR "render error detected, EIR: 0x%08x\n",
+			       eir);
+			if (eir & I915_ERROR_PAGE_TABLE) {
+				u32 pgtbl_err = I915_READ(PGTBL_ER);
+				printk(KERN_ERR "page table error\n");
+				printk(KERN_ERR "  PGTBL_ER: 0x%08x\n",
+				       pgtbl_err);
+				I915_WRITE(PGTBL_ER, pgtbl_err);
+				(void)I915_READ(PGTBL_ER);
+			}
+			if (eir & I915_ERROR_MEMORY_REFRESH) {
+				printk(KERN_ERR "memory refresh error\n");
+				printk(KERN_ERR "PIPEASTAT: 0x%08x\n",
+				       pipea_stats);
+				printk(KERN_ERR "PIPEBSTAT: 0x%08x\n",
+				       pipeb_stats);
+				/* pipestat has already been acked */
+			}
+			if (eir & I915_ERROR_INSTRUCTION) {
+				printk(KERN_ERR "instruction error\n");
+				printk(KERN_ERR "  INSTPM: 0x%08x\n",
+				       I915_READ(INSTPM));
+				if (!IS_I965G(dev)) {
+					u32 ipeir = I915_READ(IPEIR);
+
+					printk(KERN_ERR "  IPEIR: 0x%08x\n",
+					       I915_READ(IPEIR));
+					printk(KERN_ERR "  IPEHR: 0x%08x\n",
+						   I915_READ(IPEHR));
+					printk(KERN_ERR "  INSTDONE: 0x%08x\n",
+						   I915_READ(INSTDONE));
+					printk(KERN_ERR "  ACTHD: 0x%08x\n",
+						   I915_READ(ACTHD));
+					I915_WRITE(IPEIR, ipeir);
+					(void)I915_READ(IPEIR);
+				} else {
+					u32 ipeir = I915_READ(IPEIR_I965);
+
+					printk(KERN_ERR "  IPEIR: 0x%08x\n",
+					       I915_READ(IPEIR_I965));
+					printk(KERN_ERR "  IPEHR: 0x%08x\n",
+					       I915_READ(IPEHR_I965));
+					printk(KERN_ERR "  INSTDONE: 0x%08x\n",
+					       I915_READ(INSTDONE_I965));
+					printk(KERN_ERR "  INSTPS: 0x%08x\n",
+					       I915_READ(INSTPS));
+					printk(KERN_ERR "  INSTDONE1: 0x%08x\n",
+					       I915_READ(INSTDONE1));
+					printk(KERN_ERR "  ACTHD: 0x%08x\n",
+					       I915_READ(ACTHD_I965));
+					I915_WRITE(IPEIR_I965, ipeir);
+					(void)I915_READ(IPEIR_I965);
+				}
+			}
+			I915_WRITE(EIR, eir);
+			(void)I915_READ(EIR);
+		}
+
 		I915_WRITE(IIR, iir);
 		new_iir = I915_READ(IIR); /* Flush posted writes */
 
@@ -569,6 +631,7 @@ int i915_driver_irq_postinstall(struct drm_device *dev)
 {
 	drm_i915_private_t *dev_priv = (drm_i915_private_t *) dev->dev_private;
 	u32 enable_mask = I915_INTERRUPT_ENABLE_FIX | I915_INTERRUPT_ENABLE_VAR;
+	u32 error_mask;
 
 	dev_priv->vblank_pipe = DRM_I915_VBLANK_PIPE_A | DRM_I915_VBLANK_PIPE_B;
 
@@ -602,6 +665,11 @@ int i915_driver_irq_postinstall(struct drm_device *dev)
 		i915_enable_irq(dev_priv, I915_DISPLAY_PORT_INTERRUPT);
 	}
 
+	/* Enable some error detection */
+	error_mask = ~(I915_ERROR_PAGE_TABLE | I915_ERROR_MEMORY_REFRESH |
+		       I915_ERROR_INSTRUCTION);
+	I915_WRITE(EMR, error_mask);
+
 	/* Disable pipe interrupt enables, clear pending pipe status */
 	I915_WRITE(PIPEASTAT, I915_READ(PIPEASTAT) & 0x8000ffff);
 	I915_WRITE(PIPEBSTAT, I915_READ(PIPEBSTAT) & 0x8000ffff);
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 5211947..5a748cf 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -205,6 +205,7 @@
 /*
  * Instruction and interrupt control regs
  */
+#define PGTBL_ER	0x02024
 #define PRB0_TAIL	0x02030
 #define PRB0_HEAD	0x02034
 #define PRB0_START	0x02038
@@ -225,11 +226,18 @@
 #define PRB1_HEAD	0x02044 /* 915+ only */
 #define PRB1_START	0x02048 /* 915+ only */
 #define PRB1_CTL	0x0204c /* 915+ only */
+#define IPEIR_I965	0x02064
+#define IPEHR_I965	0x02068
+#define INSTDONE_I965	0x0206c
+#define INSTPS		0x02070 /* 965+ only */
+#define INSTDONE1	0x0207c /* 965+ only */
 #define ACTHD_I965	0x02074
 #define HWS_PGA		0x02080
 #define HWS_ADDRESS_MASK	0xfffff000
 #define HWS_START_ADDRESS_SHIFT	4
 #define IPEIR		0x02088
+#define IPEHR		0x0208c
+#define INSTDONE	0x02090
 #define NOPID		0x02094
 #define HWSTAM		0x02098
 #define SCPD0		0x0209c /* 915+ only */
@@ -257,6 +265,9 @@
 #define EIR		0x020b0
 #define EMR		0x020b4
 #define ESR		0x020b8
+#define   I915_ERROR_PAGE_TABLE				(1<<4)
+#define   I915_ERROR_MEMORY_REFRESH			(1<<1)
+#define   I915_ERROR_INSTRUCTION			(1<<0)
 #define INSTPM	        0x020c0
 #define ACTHD	        0x020c8
 #define FW_BLC		0x020d8



More information about the Intel-gfx mailing list