[Intel-gfx] [PATCH] drm/i915: add error detection & collection

Jesse Barnes jbarnes at virtuousgeek.org
Fri Jun 19 20:42:17 CEST 2009


[I'm still testing this on my 945, but it seems ok on my GM45.  Next
patch will add hang detection & reset capability.]

This patch enables error detection by enabling several types of error
interrupts.  When an error interrupt is received, the interrupt
handler captures the error state, hopefully resulting in an accurate
set of error data (error type, active head pointer, etc.).

The captured error record is dumped to the syslog and made available
through debugfs.  A uevent is also generated, to indicate to userspace
that a GPU dump should be captured along with the contents of the
debugfs files.

Signed-off-by: Jesse Barnes <jbarnes at virtuousgeek.org>

diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 1a60626..1029750 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -1192,6 +1192,7 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags)
 		pci_enable_msi(dev->pdev);
 
 	spin_lock_init(&dev_priv->user_irq_lock);
+	spin_lock_init(&dev_priv->error_lock);
 	dev_priv->user_irq_refcount = 0;
 
 	ret = drm_vblank_init(dev, I915_NUM_PIPE);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 7a84f04..7756f78 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -133,6 +133,22 @@ struct sdvo_device_mapping {
 	u8 initialized;
 };
 
+struct drm_i915_error_state {	/* filled by i915_capture_error_state() */
+	u32 eir;		/* EIR */
+	u32 pgtbl_er;		/* PGTBL_ER */
+	u32 pipeastat;		/* PIPEASTAT */
+	u32 pipebstat;		/* PIPEBSTAT */
+	u32 ipeir;		/* IPEIR, or IPEIR_I965 when IS_I965G() */
+	u32 ipehr;		/* IPEHR, or IPEHR_I965 when IS_I965G() */
+	u32 instdone;		/* INSTDONE, or INSTDONE_I965 when IS_I965G() */
+	u32 acthd;		/* ACTHD, or ACTHD_I965 when IS_I965G() */
+	u32 instpm;		/* INSTPM */
+	u32 instps;		/* INSTPS; only read when IS_I965G() */
+	u32 instdone1;		/* INSTDONE1; only read when IS_I965G() */
+	u32 seqno;		/* not filled in by the capture code yet */
+	struct timeval time;	/* do_gettimeofday() at capture */
+};
+
 typedef struct drm_i915_private {
 	struct drm_device *dev;
 
@@ -209,6 +225,10 @@ typedef struct drm_i915_private {
 	int fence_reg_start; /* 4 if userland hasn't ioctl'd us yet */
 	int num_fence_regs; /* 8 on pre-965, 16 otherwise */
 
+	spinlock_t error_lock;
+	struct drm_i915_error_state *first_error;
+	struct work_struct error_work;
+
 	/* Register state */
 	u8 saveLBB;
 	u32 saveDSPACNTR;
diff --git a/drivers/gpu/drm/i915/i915_gem_debugfs.c b/drivers/gpu/drm/i915/i915_gem_debugfs.c
index 28146e4..3c2e39e 100644
--- a/drivers/gpu/drm/i915/i915_gem_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_gem_debugfs.c
@@ -323,6 +323,41 @@ static int i915_ringbuffer_info(struct seq_file *m, void *data)
 	return 0;
 }
 
+static int i915_error_state(struct seq_file *m, void *unused)
+{	/* "i915_error_state" debugfs entry: print first_error, if any */
+	struct drm_info_node *node = (struct drm_info_node *) m->private;
+	struct drm_device *dev = node->minor->dev;
+	drm_i915_private_t *dev_priv = dev->dev_private;
+	struct drm_i915_error_state *error;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev_priv->error_lock, flags); /* guards first_error */
+	if (!dev_priv->first_error) {
+		seq_printf(m, "no error state collected\n");
+		goto out;
+	}
+
+	error = dev_priv->first_error;
+
+	seq_printf(m, "Time: %ld s %ld us\n", error->time.tv_sec,
+		   error->time.tv_usec);
+	seq_printf(m, "EIR: 0x%08x\n", error->eir);
+	seq_printf(m, "  PGTBL_ER: 0x%08x\n", error->pgtbl_er);
+	seq_printf(m, "  INSTPM: 0x%08x\n", error->instpm);
+	seq_printf(m, "  IPEIR: 0x%08x\n", error->ipeir);
+	seq_printf(m, "  IPEHR: 0x%08x\n", error->ipehr);
+	seq_printf(m, "  INSTDONE: 0x%08x\n", error->instdone);
+	seq_printf(m, "  ACTHD: 0x%08x\n", error->acthd);
+	if (IS_I965G(dev)) {	/* these two are only captured when IS_I965G() */
+		seq_printf(m, "  INSTPS: 0x%08x\n", error->instps);
+		seq_printf(m, "  INSTDONE1: 0x%08x\n", error->instdone1);
+	}
+
+out:
+	spin_unlock_irqrestore(&dev_priv->error_lock, flags);
+
+	return 0;	/* always 0, whether or not a record exists */
+}
 
 static struct drm_info_list i915_gem_debugfs_list[] = {
 	{"i915_gem_active", i915_gem_object_list_info, 0, (void *) ACTIVE_LIST},
@@ -336,6 +371,7 @@ static struct drm_info_list i915_gem_debugfs_list[] = {
 	{"i915_ringbuffer_data", i915_ringbuffer_data, 0},
 	{"i915_ringbuffer_info", i915_ringbuffer_info, 0},
 	{"i915_batchbuffers", i915_batchbuffer_info, 0},
+	{"i915_error_state", i915_error_state, 0},
 };
 #define I915_GEM_DEBUGFS_ENTRIES ARRAY_SIZE(i915_gem_debugfs_list)
 
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index b86b7b7..39a2b40 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -26,6 +26,7 @@
  *
  */
 
+#include <linux/sysrq.h>
 #include "drmP.h"
 #include "drm.h"
 #include "i915_drm.h"
@@ -41,9 +42,10 @@
  * we leave them always unmasked in IMR and then control enabling them through
  * PIPESTAT alone.
  */
-#define I915_INTERRUPT_ENABLE_FIX (I915_ASLE_INTERRUPT | \
-				   I915_DISPLAY_PIPE_A_EVENT_INTERRUPT |  \
-				   I915_DISPLAY_PIPE_B_EVENT_INTERRUPT)
+#define I915_INTERRUPT_ENABLE_FIX (I915_ASLE_INTERRUPT |		 \
+				   I915_DISPLAY_PIPE_A_EVENT_INTERRUPT | \
+				   I915_DISPLAY_PIPE_B_EVENT_INTERRUPT | \
+				   I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
 
 /** Interrupts that we mask and unmask at runtime. */
 #define I915_INTERRUPT_ENABLE_VAR (I915_USER_INTERRUPT)
@@ -278,6 +280,201 @@ irqreturn_t igdng_irq_handler(struct drm_device *dev)
 	return ret;
 }
 
+/**
+ * i915_error_work_func - do process context error handling work
+ * @work: the error_work member embedded in drm_i915_private
+ *
+ * Fire a KOBJ_CHANGE uevent carrying "ERROR=1" on dev->primary's kobject
+ * so userspace can see that a hang or error was detected.
+ */
+static void i915_error_work_func(struct work_struct *work)
+{
+	drm_i915_private_t *dev_priv = container_of(work, drm_i915_private_t,
+						    error_work);
+	struct drm_device *dev = dev_priv->dev;
+	char *event_string = "ERROR=1";
+	char *envp[] = { event_string, NULL };
+
+	DRM_DEBUG("generating error event\n");
+
+	kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, envp);
+}
+
+/**
+ * i915_capture_error_state - capture an error record for later analysis
+ * @dev: drm device
+ *
+ * Should be called when an error is detected (either a hang or an error
+ * interrupt) to capture error state from the time of the error.  Only the
+ * first error is kept: if dev_priv->first_error is already set, nothing is
+ * captured.  The record is made available in debugfs for user level tools.
+ */
+static void i915_capture_error_state(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct drm_i915_error_state *error;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev_priv->error_lock, flags);
+	if (dev_priv->first_error)
+		goto out;
+	/* Atomic: error_lock is held.  Zeroed: unread fields (seqno) stay 0. */
+	error = kzalloc(sizeof(*error), GFP_ATOMIC);
+	if (!error) {
+		DRM_DEBUG("out of memory, not capturing error state\n");
+		goto out;
+	}
+
+	error->eir = I915_READ(EIR);
+	error->pgtbl_er = I915_READ(PGTBL_ER);
+	error->pipeastat = I915_READ(PIPEASTAT);
+	error->pipebstat = I915_READ(PIPEBSTAT);
+	error->instpm = I915_READ(INSTPM);
+	if (!IS_I965G(dev)) {	/* instps/instdone1 are left zero here */
+		error->ipeir = I915_READ(IPEIR);
+		error->ipehr = I915_READ(IPEHR);
+		error->instdone = I915_READ(INSTDONE);
+		error->acthd = I915_READ(ACTHD);
+	} else {
+		error->ipeir = I915_READ(IPEIR_I965);
+		error->ipehr = I915_READ(IPEHR_I965);
+		error->instdone = I915_READ(INSTDONE_I965);
+		error->instps = I915_READ(INSTPS);
+		error->instdone1 = I915_READ(INSTDONE1);
+		error->acthd = I915_READ(ACTHD_I965);
+	}
+
+	do_gettimeofday(&error->time);
+
+	dev_priv->first_error = error;	/* publish; read by debugfs under the lock */
+
+out:
+	spin_unlock_irqrestore(&dev_priv->error_lock, flags);
+}
+
+/**
+ * i915_handle_error - handle an error interrupt
+ * @dev: drm device
+ *
+ * Do some basic checking of register state at error interrupt time and
+ * dump it to the syslog.  Also call i915_capture_error_state() to make
+ * sure we get a record and make it available in debugfs.  Fire a uevent
+ * so userspace knows something bad happened (should trigger collection
+ * of a ring dump etc.).
+ */
+static void i915_handle_error(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	u32 eir = I915_READ(EIR);
+	u32 pipea_stats = I915_READ(PIPEASTAT);
+	u32 pipeb_stats = I915_READ(PIPEBSTAT);
+
+	i915_capture_error_state(dev);	/* snapshot before registers are cleared */
+
+	printk(KERN_ERR "render error detected, EIR: 0x%08x\n",
+	       eir);
+
+	if (IS_G4X(dev)) {
+		if (eir & (GM45_ERROR_MEM_PRIV | GM45_ERROR_CP_PRIV)) {
+			u32 ipeir = I915_READ(IPEIR_I965);
+
+			printk(KERN_ERR "  IPEIR: 0x%08x\n",
+			       I915_READ(IPEIR_I965));
+			printk(KERN_ERR "  IPEHR: 0x%08x\n",
+			       I915_READ(IPEHR_I965));
+			printk(KERN_ERR "  INSTDONE: 0x%08x\n",
+			       I915_READ(INSTDONE_I965));
+			printk(KERN_ERR "  INSTPS: 0x%08x\n",
+			       I915_READ(INSTPS));
+			printk(KERN_ERR "  INSTDONE1: 0x%08x\n",
+			       I915_READ(INSTDONE1));
+			printk(KERN_ERR "  ACTHD: 0x%08x\n",
+			       I915_READ(ACTHD_I965));
+			I915_WRITE(IPEIR_I965, ipeir);
+			(void)I915_READ(IPEIR_I965); /* presumably a posting read */
+		}
+		if (eir & GM45_ERROR_PAGE_TABLE) {
+			u32 pgtbl_err = I915_READ(PGTBL_ER);
+			printk(KERN_ERR "page table error\n");
+			printk(KERN_ERR "  PGTBL_ER: 0x%08x\n",
+			       pgtbl_err);
+			I915_WRITE(PGTBL_ER, pgtbl_err);
+			(void)I915_READ(PGTBL_ER);
+		}
+	}
+	/* NOTE(review): (1<<4) is GM45_ERROR_MEM_PRIV on G4X - confirm */
+	if (IS_I9XX(dev)) {
+		if (eir & I915_ERROR_PAGE_TABLE) {
+			u32 pgtbl_err = I915_READ(PGTBL_ER);
+			printk(KERN_ERR "page table error\n");
+			printk(KERN_ERR "  PGTBL_ER: 0x%08x\n",
+			       pgtbl_err);
+			I915_WRITE(PGTBL_ER, pgtbl_err);
+			(void)I915_READ(PGTBL_ER);
+		}
+	}
+
+	if (eir & I915_ERROR_MEMORY_REFRESH) {
+		printk(KERN_ERR "memory refresh error\n");
+		printk(KERN_ERR "PIPEASTAT: 0x%08x\n",
+		       pipea_stats);
+		printk(KERN_ERR "PIPEBSTAT: 0x%08x\n",
+		       pipeb_stats);
+		/* pipestat has already been acked */
+	}
+	if (eir & I915_ERROR_INSTRUCTION) {
+		printk(KERN_ERR "instruction error\n");
+		printk(KERN_ERR "  INSTPM: 0x%08x\n",
+		       I915_READ(INSTPM));
+		if (!IS_I965G(dev)) {
+			u32 ipeir = I915_READ(IPEIR);
+
+			printk(KERN_ERR "  IPEIR: 0x%08x\n",
+			       I915_READ(IPEIR));
+			printk(KERN_ERR "  IPEHR: 0x%08x\n",
+			       I915_READ(IPEHR));
+			printk(KERN_ERR "  INSTDONE: 0x%08x\n",
+			       I915_READ(INSTDONE));
+			printk(KERN_ERR "  ACTHD: 0x%08x\n",
+			       I915_READ(ACTHD));
+			I915_WRITE(IPEIR, ipeir);
+			(void)I915_READ(IPEIR);
+		} else {
+			u32 ipeir = I915_READ(IPEIR_I965);
+
+			printk(KERN_ERR "  IPEIR: 0x%08x\n",
+			       I915_READ(IPEIR_I965));
+			printk(KERN_ERR "  IPEHR: 0x%08x\n",
+			       I915_READ(IPEHR_I965));
+			printk(KERN_ERR "  INSTDONE: 0x%08x\n",
+			       I915_READ(INSTDONE_I965));
+			printk(KERN_ERR "  INSTPS: 0x%08x\n",
+			       I915_READ(INSTPS));
+			printk(KERN_ERR "  INSTDONE1: 0x%08x\n",
+			       I915_READ(INSTDONE1));
+			printk(KERN_ERR "  ACTHD: 0x%08x\n",
+			       I915_READ(ACTHD_I965));
+			I915_WRITE(IPEIR_I965, ipeir);
+			(void)I915_READ(IPEIR_I965);
+		}
+	}
+
+	I915_WRITE(EIR, eir);	/* write back the bits read at entry */
+	(void)I915_READ(EIR);
+	eir = I915_READ(EIR);	/* re-read: anything still set is treated as stuck */
+	if (eir) {
+		/*
+		 * some errors might have become stuck,
+		 * mask them.
+		 */
+		DRM_ERROR("EIR stuck: 0x%08x, masking\n", eir);
+		I915_WRITE(EMR, I915_READ(EMR) | eir);
+		I915_WRITE(IIR, I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT);
+	}
+
+	schedule_work(&dev_priv->error_work); /* runs i915_error_work_func() */
+}
+
 irqreturn_t i915_driver_irq_handler(DRM_IRQ_ARGS)
 {
 	struct drm_device *dev = (struct drm_device *) arg;
@@ -319,6 +516,10 @@ irqreturn_t i915_driver_irq_handler(DRM_IRQ_ARGS)
 		pipea_stats = I915_READ(PIPEASTAT);
 		pipeb_stats = I915_READ(PIPEBSTAT);
 
+		/* Make sure pipestat regs are still valid... */
+		if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
+			i915_handle_error(dev);
+
 		/*
 		 * Clear the PIPE(A|B)STAT regs before the IIR
 		 */
@@ -699,6 +900,7 @@ void i915_driver_irq_preinstall(struct drm_device * dev)
 	atomic_set(&dev_priv->irq_received, 0);
 
 	INIT_WORK(&dev_priv->hotplug_work, i915_hotplug_work_func);
+	INIT_WORK(&dev_priv->error_work, i915_error_work_func);
 
 	if (IS_IGDNG(dev)) {
 		igdng_irq_preinstall(dev);
@@ -722,6 +924,7 @@ int i915_driver_irq_postinstall(struct drm_device *dev)
 {
 	drm_i915_private_t *dev_priv = (drm_i915_private_t *) dev->dev_private;
 	u32 enable_mask = I915_INTERRUPT_ENABLE_FIX | I915_INTERRUPT_ENABLE_VAR;
+	u32 error_mask;
 
 	DRM_INIT_WAITQUEUE(&dev_priv->irq_queue);
 
@@ -758,6 +961,21 @@ int i915_driver_irq_postinstall(struct drm_device *dev)
 		i915_enable_irq(dev_priv, I915_DISPLAY_PORT_INTERRUPT);
 	}
 
+	/*
+	 * Enable some error detection, note the instruction error mask
+	 * bit is reserved, so we leave it masked.
+	 */
+	if (IS_G4X(dev)) {
+		error_mask = ~(GM45_ERROR_PAGE_TABLE |
+			       GM45_ERROR_MEM_PRIV |
+			       GM45_ERROR_CP_PRIV |
+			       I915_ERROR_MEMORY_REFRESH);
+	} else {
+		error_mask = ~(I915_ERROR_PAGE_TABLE |
+			       I915_ERROR_MEMORY_REFRESH);
+	}
+	I915_WRITE(EMR, error_mask);
+
 	/* Disable pipe interrupt enables, clear pending pipe status */
 	I915_WRITE(PIPEASTAT, I915_READ(PIPEASTAT) & 0x8000ffff);
 	I915_WRITE(PIPEBSTAT, I915_READ(PIPEBSTAT) & 0x8000ffff);
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index f6237a0..afb9835 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -206,6 +206,7 @@
 /*
  * Instruction and interrupt control regs
  */
+#define PGTBL_ER	0x02024
 #define PRB0_TAIL	0x02030
 #define PRB0_HEAD	0x02034
 #define PRB0_START	0x02038
@@ -226,11 +227,18 @@
 #define PRB1_HEAD	0x02044 /* 915+ only */
 #define PRB1_START	0x02048 /* 915+ only */
 #define PRB1_CTL	0x0204c /* 915+ only */
+#define IPEIR_I965	0x02064
+#define IPEHR_I965	0x02068
+#define INSTDONE_I965	0x0206c
+#define INSTPS		0x02070 /* 965+ only */
+#define INSTDONE1	0x0207c /* 965+ only */
 #define ACTHD_I965	0x02074
 #define HWS_PGA		0x02080
 #define HWS_ADDRESS_MASK	0xfffff000
 #define HWS_START_ADDRESS_SHIFT	4
 #define IPEIR		0x02088
+#define IPEHR		0x0208c
+#define INSTDONE	0x02090
 #define NOPID		0x02094
 #define HWSTAM		0x02098
 #define SCPD0		0x0209c /* 915+ only */
@@ -258,6 +266,12 @@
 #define EIR		0x020b0
 #define EMR		0x020b4
 #define ESR		0x020b8
+#define   GM45_ERROR_PAGE_TABLE				(1<<5)
+#define   GM45_ERROR_MEM_PRIV				(1<<4)
+#define   I915_ERROR_PAGE_TABLE				(1<<4)
+#define   GM45_ERROR_CP_PRIV				(1<<3)
+#define   I915_ERROR_MEMORY_REFRESH			(1<<1)
+#define   I915_ERROR_INSTRUCTION			(1<<0)
 #define INSTPM	        0x020c0
 #define ACTHD	        0x020c8
 #define FW_BLC		0x020d8



More information about the Intel-gfx mailing list