[Intel-gfx] [PATCH 1/5] drm/i915: add error detection & collection
Ben Gamari
bgamari.foss at gmail.com
Thu Jul 9 00:30:11 CEST 2009
From: Jesse Barnes <jbarnes at virtuousgeek.org>
[I'm still testing this on my 945, but it seems ok on my GM45. Next
patch will add hang detection & reset capability.]
This patch enables error detection by enabling several types of error
interrupts. When an error interrupt is received, the interrupt
handler captures the error state; hopefully resulting in an accurate
set of error data (error type, active head pointer, etc.).
The captured error record is dumped to the syslog and made available
through debugfs. A uevent is also generated, to indicate to userspace
that a GPU dump should be captured along with the contents of the
debugfs files.
Signed-off-by: Jesse Barnes <jbarnes at virtuousgeek.org>
Signed-off-by: Ben Gamari <bgamari.foss at gmail.com>
---
drivers/gpu/drm/i915/i915_drv.h | 1 +
drivers/gpu/drm/i915/i915_gem_debugfs.c | 2 +
drivers/gpu/drm/i915/i915_irq.c | 232 +++++++++++++++++++++----------
3 files changed, 161 insertions(+), 74 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 9d68897..775fe91 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -229,6 +229,7 @@ typedef struct drm_i915_private {
spinlock_t error_lock;
struct drm_i915_error_state *first_error;
+ struct work_struct error_work;
/* Register state */
u8 saveLBB;
diff --git a/drivers/gpu/drm/i915/i915_gem_debugfs.c b/drivers/gpu/drm/i915/i915_gem_debugfs.c
index 9a44bfc..cb3b974 100644
--- a/drivers/gpu/drm/i915/i915_gem_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_gem_debugfs.c
@@ -343,6 +343,8 @@ static int i915_error_state(struct seq_file *m, void *unused)
error = dev_priv->first_error;
+ seq_printf(m, "Time: %ld s %ld us\n", error->time.tv_sec,
+ error->time.tv_usec);
seq_printf(m, "EIR: 0x%08x\n", error->eir);
seq_printf(m, " PGTBL_ER: 0x%08x\n", error->pgtbl_er);
seq_printf(m, " INSTPM: 0x%08x\n", error->instpm);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 7ba23a6..ba65431 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -290,6 +290,35 @@ irqreturn_t igdng_irq_handler(struct drm_device *dev)
return ret;
}
+/**
+ * i915_error_work_func - do process context error handling work
+ * @work: work struct
+ *
+ * Fire an error uevent so userspace can see that a hang or error
+ * was detected.
+ */
+static void i915_error_work_func(struct work_struct *work)
+{
+ drm_i915_private_t *dev_priv = container_of(work, drm_i915_private_t,
+ error_work);
+ struct drm_device *dev = dev_priv->dev;
+ char *event_string = "ERROR=1";
+ char *envp[] = { event_string, NULL };
+
+ DRM_DEBUG("generating error event\n");
+
+ kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, envp);
+}
+
+/**
+ * i915_capture_error_state - capture an error record for later analysis
+ * @dev: drm device
+ *
+ * Should be called when an error is detected (either a hang or an error
+ * interrupt) to capture error state from the time of the error. Fills
+ * out a structure which becomes available in debugfs for user level tools
+ * to pick up.
+ */
static void i915_capture_error_state(struct drm_device *dev)
{
struct drm_i915_private *dev_priv = dev->dev_private;
@@ -325,12 +354,137 @@ static void i915_capture_error_state(struct drm_device *dev)
error->acthd = I915_READ(ACTHD_I965);
}
+ do_gettimeofday(&error->time);
+
dev_priv->first_error = error;
out:
spin_unlock_irqrestore(&dev_priv->error_lock, flags);
}
+/**
+ * i915_handle_error - handle an error interrupt
+ * @dev: drm device
+ *
+ * Do some basic checking of regsiter state at error interrupt time and
+ * dump it to the syslog. Also call i915_capture_error_state() to make
+ * sure we get a record and make it available in debugfs. Fire a uevent
+ * so userspace knows something bad happened (should trigger collection
+ * of a ring dump etc.).
+ */
+static void i915_handle_error(struct drm_device *dev)
+{
+ struct drm_i915_private *dev_priv = dev->dev_private;
+ u32 eir = I915_READ(EIR);
+ u32 pipea_stats = I915_READ(PIPEASTAT);
+ u32 pipeb_stats = I915_READ(PIPEBSTAT);
+
+ i915_capture_error_state(dev);
+
+ printk(KERN_ERR "render error detected, EIR: 0x%08x\n",
+ eir);
+
+ if (IS_G4X(dev)) {
+ if (eir & (GM45_ERROR_MEM_PRIV | GM45_ERROR_CP_PRIV)) {
+ u32 ipeir = I915_READ(IPEIR_I965);
+
+ printk(KERN_ERR " IPEIR: 0x%08x\n",
+ I915_READ(IPEIR_I965));
+ printk(KERN_ERR " IPEHR: 0x%08x\n",
+ I915_READ(IPEHR_I965));
+ printk(KERN_ERR " INSTDONE: 0x%08x\n",
+ I915_READ(INSTDONE_I965));
+ printk(KERN_ERR " INSTPS: 0x%08x\n",
+ I915_READ(INSTPS));
+ printk(KERN_ERR " INSTDONE1: 0x%08x\n",
+ I915_READ(INSTDONE1));
+ printk(KERN_ERR " ACTHD: 0x%08x\n",
+ I915_READ(ACTHD_I965));
+ I915_WRITE(IPEIR_I965, ipeir);
+ (void)I915_READ(IPEIR_I965);
+ }
+ if (eir & GM45_ERROR_PAGE_TABLE) {
+ u32 pgtbl_err = I915_READ(PGTBL_ER);
+ printk(KERN_ERR "page table error\n");
+ printk(KERN_ERR " PGTBL_ER: 0x%08x\n",
+ pgtbl_err);
+ I915_WRITE(PGTBL_ER, pgtbl_err);
+ (void)I915_READ(PGTBL_ER);
+ }
+ }
+
+ if (IS_I9XX(dev)) {
+ if (eir & I915_ERROR_PAGE_TABLE) {
+ u32 pgtbl_err = I915_READ(PGTBL_ER);
+ printk(KERN_ERR "page table error\n");
+ printk(KERN_ERR " PGTBL_ER: 0x%08x\n",
+ pgtbl_err);
+ I915_WRITE(PGTBL_ER, pgtbl_err);
+ (void)I915_READ(PGTBL_ER);
+ }
+ }
+
+ if (eir & I915_ERROR_MEMORY_REFRESH) {
+ printk(KERN_ERR "memory refresh error\n");
+ printk(KERN_ERR "PIPEASTAT: 0x%08x\n",
+ pipea_stats);
+ printk(KERN_ERR "PIPEBSTAT: 0x%08x\n",
+ pipeb_stats);
+ /* pipestat has already been acked */
+ }
+ if (eir & I915_ERROR_INSTRUCTION) {
+ printk(KERN_ERR "instruction error\n");
+ printk(KERN_ERR " INSTPM: 0x%08x\n",
+ I915_READ(INSTPM));
+ if (!IS_I965G(dev)) {
+ u32 ipeir = I915_READ(IPEIR);
+
+ printk(KERN_ERR " IPEIR: 0x%08x\n",
+ I915_READ(IPEIR));
+ printk(KERN_ERR " IPEHR: 0x%08x\n",
+ I915_READ(IPEHR));
+ printk(KERN_ERR " INSTDONE: 0x%08x\n",
+ I915_READ(INSTDONE));
+ printk(KERN_ERR " ACTHD: 0x%08x\n",
+ I915_READ(ACTHD));
+ I915_WRITE(IPEIR, ipeir);
+ (void)I915_READ(IPEIR);
+ } else {
+ u32 ipeir = I915_READ(IPEIR_I965);
+
+ printk(KERN_ERR " IPEIR: 0x%08x\n",
+ I915_READ(IPEIR_I965));
+ printk(KERN_ERR " IPEHR: 0x%08x\n",
+ I915_READ(IPEHR_I965));
+ printk(KERN_ERR " INSTDONE: 0x%08x\n",
+ I915_READ(INSTDONE_I965));
+ printk(KERN_ERR " INSTPS: 0x%08x\n",
+ I915_READ(INSTPS));
+ printk(KERN_ERR " INSTDONE1: 0x%08x\n",
+ I915_READ(INSTDONE1));
+ printk(KERN_ERR " ACTHD: 0x%08x\n",
+ I915_READ(ACTHD_I965));
+ I915_WRITE(IPEIR_I965, ipeir);
+ (void)I915_READ(IPEIR_I965);
+ }
+ }
+
+ I915_WRITE(EIR, eir);
+ (void)I915_READ(EIR);
+ eir = I915_READ(EIR);
+ if (eir) {
+ /*
+ * some errors might have become stuck,
+ * mask them.
+ */
+ DRM_ERROR("EIR stuck: 0x%08x, masking\n", eir);
+ I915_WRITE(EMR, I915_READ(EMR) | eir);
+ I915_WRITE(IIR, I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT);
+ }
+
+ schedule_work(&dev_priv->error_work);
+}
+
irqreturn_t i915_driver_irq_handler(DRM_IRQ_ARGS)
{
struct drm_device *dev = (struct drm_device *) arg;
@@ -371,6 +525,9 @@ irqreturn_t i915_driver_irq_handler(DRM_IRQ_ARGS)
spin_lock_irqsave(&dev_priv->user_irq_lock, irqflags);
pipea_stats = I915_READ(PIPEASTAT);
pipeb_stats = I915_READ(PIPEBSTAT);
+
+ if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
+ i915_handle_error(dev);
/*
* Clear the PIPE(A|B)STAT regs before the IIR
@@ -409,80 +566,6 @@ irqreturn_t i915_driver_irq_handler(DRM_IRQ_ARGS)
I915_READ(PORT_HOTPLUG_STAT);
}
- if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT) {
- u32 eir = I915_READ(EIR);
-
- i915_capture_error_state(dev);
-
- printk(KERN_ERR "render error detected, EIR: 0x%08x\n",
- eir);
- if (eir & I915_ERROR_PAGE_TABLE) {
- u32 pgtbl_err = I915_READ(PGTBL_ER);
- printk(KERN_ERR "page table error\n");
- printk(KERN_ERR " PGTBL_ER: 0x%08x\n",
- pgtbl_err);
- I915_WRITE(PGTBL_ER, pgtbl_err);
- (void)I915_READ(PGTBL_ER);
- }
- if (eir & I915_ERROR_MEMORY_REFRESH) {
- printk(KERN_ERR "memory refresh error\n");
- printk(KERN_ERR "PIPEASTAT: 0x%08x\n",
- pipea_stats);
- printk(KERN_ERR "PIPEBSTAT: 0x%08x\n",
- pipeb_stats);
- /* pipestat has already been acked */
- }
- if (eir & I915_ERROR_INSTRUCTION) {
- printk(KERN_ERR "instruction error\n");
- printk(KERN_ERR " INSTPM: 0x%08x\n",
- I915_READ(INSTPM));
- if (!IS_I965G(dev)) {
- u32 ipeir = I915_READ(IPEIR);
-
- printk(KERN_ERR " IPEIR: 0x%08x\n",
- I915_READ(IPEIR));
- printk(KERN_ERR " IPEHR: 0x%08x\n",
- I915_READ(IPEHR));
- printk(KERN_ERR " INSTDONE: 0x%08x\n",
- I915_READ(INSTDONE));
- printk(KERN_ERR " ACTHD: 0x%08x\n",
- I915_READ(ACTHD));
- I915_WRITE(IPEIR, ipeir);
- (void)I915_READ(IPEIR);
- } else {
- u32 ipeir = I915_READ(IPEIR_I965);
-
- printk(KERN_ERR " IPEIR: 0x%08x\n",
- I915_READ(IPEIR_I965));
- printk(KERN_ERR " IPEHR: 0x%08x\n",
- I915_READ(IPEHR_I965));
- printk(KERN_ERR " INSTDONE: 0x%08x\n",
- I915_READ(INSTDONE_I965));
- printk(KERN_ERR " INSTPS: 0x%08x\n",
- I915_READ(INSTPS));
- printk(KERN_ERR " INSTDONE1: 0x%08x\n",
- I915_READ(INSTDONE1));
- printk(KERN_ERR " ACTHD: 0x%08x\n",
- I915_READ(ACTHD_I965));
- I915_WRITE(IPEIR_I965, ipeir);
- (void)I915_READ(IPEIR_I965);
- }
- }
-
- I915_WRITE(EIR, eir);
- (void)I915_READ(EIR);
- eir = I915_READ(EIR);
- if (eir) {
- /*
- * some errors might have become stuck,
- * mask them.
- */
- DRM_ERROR("EIR stuck: 0x%08x, masking\n", eir);
- I915_WRITE(EMR, I915_READ(EMR) | eir);
- I915_WRITE(IIR, I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT);
- }
- }
-
I915_WRITE(IIR, iir);
new_iir = I915_READ(IIR); /* Flush posted writes */
@@ -830,6 +913,7 @@ void i915_driver_irq_preinstall(struct drm_device * dev)
atomic_set(&dev_priv->irq_received, 0);
INIT_WORK(&dev_priv->hotplug_work, i915_hotplug_work_func);
+ INIT_WORK(&dev_priv->error_work, i915_error_work_func);
if (IS_IGDNG(dev)) {
igdng_irq_preinstall(dev);
--
1.6.3.3
More information about the Intel-gfx
mailing list