[Intel-gfx] [PATCH] drm/i915: add hang detection and reset

Jesse Barnes jbarnes at virtuousgeek.org
Fri Jun 19 22:08:29 CEST 2009


This patch adds a hangcheck timer, which set set to a 5s timeout at
each batch execution, and cleared when sequence number pass or when a
leave VT event happens.  It also adds infrastructure for saving &
restoring just the display state, in the event we need to reset the
display engine too (though that's currently unused).

If the timer fires, the hangcheck function captures some error state
and attempts a reset (on 965+ only thus far), and generates a uevent.
It's enough to recover from the gem_hang test case, allowing me to
start a fresh desktop after the hang has been detected and recovered
from, which wasn't possible before.

Wider testing would be much appreciated, with other types of hangs.
And of course review is always welcome.  I'm hoping this is safe to
include in 2.6.31 since it shouldn't make things any worse than they
are today.

Signed-off-by: Jesse Barnes <jbarnes at virtuousgeek.org>

diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 1029750..a8a8b44 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -1014,6 +1014,9 @@ static int i915_load_modeset_init(struct drm_device *dev)
 	/* Basic memrange allocator for stolen space (aka vram) */
 	drm_mm_init(&dev_priv->vram, 0, prealloc_size);
 
+	/* We're off and running w/KMS */
+	dev_priv->mm.suspended = 0;
+
 	/* Let GEM Manage from end of prealloc space to end of aperture.
 	 *
 	 * However, leave one page at the end still bound to the scratch page.
@@ -1025,7 +1028,9 @@ static int i915_load_modeset_init(struct drm_device *dev)
 	 */
 	i915_gem_do_init(dev, prealloc_size, agp_size - 4096);
 
+	mutex_lock(&dev->struct_mutex);
 	ret = i915_gem_init_ringbuffer(dev);
+	mutex_unlock(&dev->struct_mutex);
 	if (ret)
 		goto out;
 
@@ -1202,6 +1207,9 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags)
 		return ret;
 	}
 
+	/* Start out suspended */
+	dev_priv->mm.suspended = 1;
+
 	if (drm_core_check_feature(dev, DRIVER_MODESET)) {
 		ret = i915_load_modeset_init(dev);
 		if (ret < 0) {
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 98560e1..4d5b907 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -119,6 +119,100 @@ static int i915_resume(struct drm_device *dev)
 	return ret;
 }
 
+/**
+ * i965_reset - reset chip after a hang
+ * @dev: drm device to reset
+ * @flags: reset domains
+ *
+ * Reset the chip.  Useful if a hang is detected.
+ *
+ * Procedure is fairly simple:
+ *   - reset the chip using the reset reg
+ *   - re-init context state
+ *   - re-init hardware status page
+ *   - re-init ring buffer
+ *   - re-init interrupt state
+ *   - re-init display
+ */
+void i965_reset(struct drm_device *dev, u8 flags)
+{
+	drm_i915_private_t *dev_priv = dev->dev_private;
+	unsigned long timeout = jiffies + msecs_to_jiffies(500);
+	u8 gdrst;
+
+	BUG_ON(mutex_is_locked(&dev->struct_mutex));
+
+	if (flags & GDRST_FULL)
+		i915_save_display(dev);
+
+	/*
+	 * Set the reset bit, wait for reset, then clear it.  Hardware
+	 * will clear the status bit (bit 1) when it's actually ready
+	 * for action again.
+	 */
+	pci_read_config_byte(dev->pdev, GDRST, &gdrst);
+	pci_write_config_byte(dev->pdev, GDRST, gdrst | flags);
+	udelay(50);
+	pci_write_config_byte(dev->pdev, GDRST, gdrst & 0xfe);
+
+	/* ...we don't want to loop forever though, 500ms should be plenty */
+	do {
+		udelay(100);
+		pci_read_config_byte(dev->pdev, GDRST, &gdrst);
+	} while ((gdrst & 2) && time_after(timeout, jiffies));
+
+	/* Ok now get things going again... */
+
+	/*
+	 * Everything depends on having the GTT running, so we need to start
+	 * there.  Fortunately we don't need to do this unless we reset the
+	 * chip at a PCI level.
+	 *
+	 * Next we need to restore the context, but we don't use those
+	 * yet either...
+	 *
+	 * Ring buffer needs to be re-initialized in the KMS case, or if X
+	 * was running at the time of the reset (i.e. we weren't VT
+	 * switched away).
+	 */
+	if (drm_core_check_feature(dev, DRIVER_MODESET) ||
+	    !dev_priv->mm.suspended) {
+		drm_i915_ring_buffer_t *ring = &dev_priv->ring;
+		struct drm_gem_object *obj = ring->ring_obj;
+		struct drm_i915_gem_object *obj_priv = obj->driver_private;
+		dev_priv->mm.suspended = 0;
+
+		/* Stop the ring if it's running. */
+		I915_WRITE(PRB0_CTL, 0);
+		I915_WRITE(PRB0_TAIL, 0);
+		I915_WRITE(PRB0_HEAD, 0);
+
+		/* Initialize the ring. */
+		I915_WRITE(PRB0_START, obj_priv->gtt_offset);
+		I915_WRITE(PRB0_CTL,
+			   ((obj->size - 4096) & RING_NR_PAGES) |
+			   RING_NO_REPORT |
+			   RING_VALID);
+		if (!drm_core_check_feature(dev, DRIVER_MODESET))
+			i915_kernel_lost_context(dev);
+		else {
+			ring->head = I915_READ(PRB0_HEAD) & HEAD_ADDR;
+			ring->tail = I915_READ(PRB0_TAIL) & TAIL_ADDR;
+			ring->space = ring->head - (ring->tail + 8);
+			if (ring->space < 0)
+				ring->space += ring->Size;
+		}
+
+		drm_irq_install(dev);
+	}
+
+	/*
+	 * Display needs restore too...
+	 */
+	if (flags & GDRST_FULL)
+		i915_restore_display(dev);
+}
+
 static int __devinit
 i915_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 {
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 7756f78..c51ddb2 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -228,6 +228,7 @@ typedef struct drm_i915_private {
 	spinlock_t error_lock;
 	struct drm_i915_error_state *first_error;
 	struct work_struct error_work;
+	struct timer_list hangcheck_timer;
 
 	/* Register state */
 	u8 saveLBB;
@@ -557,6 +558,8 @@ extern struct drm_ioctl_desc i915_ioctls[];
 extern int i915_max_ioctl;
 extern unsigned int i915_fbpercrtc;
 
+extern void i915_save_display(struct drm_device *dev);
+extern void i915_restore_display(struct drm_device *dev);
 extern int i915_master_create(struct drm_device *dev, struct drm_master *master);
 extern void i915_master_destroy(struct drm_device *dev, struct drm_master *master);
 
@@ -576,6 +579,7 @@ extern long i915_compat_ioctl(struct file *filp, unsigned int cmd,
 extern int i915_emit_box(struct drm_device *dev,
 			 struct drm_clip_rect *boxes,
 			 int i, int DR1, int DR4);
+extern void i965_reset(struct drm_device *dev, u8 flags);
 
 /* i915_irq.c */
 extern int i915_irq_emit(struct drm_device *dev, void *data,
@@ -607,7 +611,7 @@ i915_enable_pipestat(drm_i915_private_t *dev_priv, int pipe, u32 mask);
 
 void
 i915_disable_pipestat(drm_i915_private_t *dev_priv, int pipe, u32 mask);
-
+extern void i915_gem_hangcheck(unsigned long arg);
 
 /* i915_mem.c */
 extern int i915_mem_alloc(struct drm_device *dev, void *data,
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 174aef2..c74e29f 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1545,8 +1545,12 @@ i915_add_request(struct drm_device *dev, struct drm_file *file_priv,
 
 	}
 
-	if (was_empty && !dev_priv->mm.suspended)
-		schedule_delayed_work(&dev_priv->mm.retire_work, HZ);
+	if (!dev_priv->mm.suspended) {
+		if (was_empty)
+		    schedule_delayed_work(&dev_priv->mm.retire_work, HZ);
+
+		mod_timer(&dev_priv->hangcheck_timer, jiffies + 5*DRM_HZ);
+	}
 	return seqno;
 }
 
@@ -3866,6 +3870,7 @@ i915_gem_idle(struct drm_device *dev)
 	 * We need to replace this with a semaphore, or something.
 	 */
 	dev_priv->mm.suspended = 1;
+	del_timer(&dev_priv->hangcheck_timer);
 
 	/* Cancel the retire work handler, wait for it to finish if running
 	 */
@@ -4250,6 +4255,9 @@ i915_gem_load(struct drm_device *dev)
 		dev_priv->num_fence_regs = 8;
 
 	i915_gem_detect_bit_6_swizzle(dev);
+
+	setup_timer(&dev_priv->hangcheck_timer, i915_gem_hangcheck,
+		    (unsigned long)dev);
 }
 
 /*
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 39a2b40..df32ee8 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -298,6 +298,18 @@ static void i915_error_work_func(struct work_struct *work)
 	DRM_DEBUG("generating error event\n");
 
 	kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, envp);
+
+	if (dev_priv->mm.wedged) {
+		printk(KERN_ERR "GPU hang detected...");
+		if (IS_I965G(dev)) {
+			printk("resetting...");
+			i965_reset(dev, GDRST_RENDER);
+			printk("done.\n");
+			dev_priv->mm.wedged = 0;
+		} else {
+			printk("reboot required\n");
+		}
+	}
 }
 
 /**
@@ -475,6 +487,23 @@ static void i915_handle_error(struct drm_device *dev)
 	schedule_work(&dev_priv->error_work);
 }
 
+/**
+ * i915_gem_hangcheck - hang timer function
+ * @dev: drm device
+ *
+ * This function is called by the hangcheck timer if a batch doesn't finish
+ * within the timeout value.  It should capture error state and try to reset
+ * the GPU.
+ */
+void i915_gem_hangcheck(unsigned long arg)
+{
+	struct drm_device *dev = (struct drm_device *)arg;
+	drm_i915_private_t *dev_priv = dev->dev_private;
+
+	dev_priv->mm.wedged = 1;
+	i915_handle_error(dev);
+}
+
 irqreturn_t i915_driver_irq_handler(DRM_IRQ_ARGS)
 {
 	struct drm_device *dev = (struct drm_device *) arg;
@@ -566,6 +595,7 @@ irqreturn_t i915_driver_irq_handler(DRM_IRQ_ARGS)
 		if (iir & I915_USER_INTERRUPT) {
 			dev_priv->mm.irq_gem_seqno = i915_get_gem_seqno(dev);
 			DRM_WAKEUP(&dev_priv->irq_queue);
+			del_timer(&dev_priv->hangcheck_timer);
 		}
 
 		if (pipea_stats & vblank_status) {
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index afb9835..2ffafbf 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -66,6 +66,10 @@
 #define   GC_DISPLAY_CLOCK_333_MHZ	(4 << 4)
 #define   GC_DISPLAY_CLOCK_MASK		(7 << 4)
 #define LBB	0xf4
+#define GDRST 0xc0
+#define  GDRST_FULL	(0<<2)
+#define  GDRST_RENDER	(1<<2)
+#define  GDRST_MEDIA	(3<<2)
 
 /* VGA stuff */
 
diff --git a/drivers/gpu/drm/i915/i915_suspend.c b/drivers/gpu/drm/i915/i915_suspend.c
index a98e283..cfb469c 100644
--- a/drivers/gpu/drm/i915/i915_suspend.c
+++ b/drivers/gpu/drm/i915/i915_suspend.c
@@ -222,19 +222,15 @@ static void i915_restore_vga(struct drm_device *dev)
 	I915_WRITE8(VGA_DACMASK, dev_priv->saveDACMASK);
 }
 
-int i915_save_state(struct drm_device *dev)
+/**
+ * i915_save_display - save display & mode info
+ * @dev: DRM device
+ *
+ * Save mode timings and display info.
+ */
+void i915_save_display(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
-	int i;
-
-	pci_read_config_byte(dev->pdev, LBB, &dev_priv->saveLBB);
-
-	/* Render Standby */
-	if (IS_I965G(dev) && IS_MOBILE(dev))
-		dev_priv->saveRENDERSTANDBY = I915_READ(MCHBAR_RENDER_STANDBY);
-
-	/* Hardware status page */
-	dev_priv->saveHWS = I915_READ(HWS_PGA);
 
 	/* Display arbitration control */
 	dev_priv->saveDSPARB = I915_READ(DSPARB);
@@ -330,17 +326,36 @@ int i915_save_state(struct drm_device *dev)
 	dev_priv->saveFBC_CONTROL2 = I915_READ(FBC_CONTROL2);
 	dev_priv->saveFBC_CONTROL = I915_READ(FBC_CONTROL);
 
-	/* Interrupt state */
-	dev_priv->saveIIR = I915_READ(IIR);
-	dev_priv->saveIER = I915_READ(IER);
-	dev_priv->saveIMR = I915_READ(IMR);
-
 	/* VGA state */
 	dev_priv->saveVGA0 = I915_READ(VGA0);
 	dev_priv->saveVGA1 = I915_READ(VGA1);
 	dev_priv->saveVGA_PD = I915_READ(VGA_PD);
 	dev_priv->saveVGACNTRL = I915_READ(VGACNTRL);
 
+	i915_save_vga(dev);
+}
+
+int i915_save_state(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	int i;
+
+	pci_read_config_byte(dev->pdev, LBB, &dev_priv->saveLBB);
+
+	/* Render Standby */
+	if (IS_I965G(dev) && IS_MOBILE(dev))
+		dev_priv->saveRENDERSTANDBY = I915_READ(MCHBAR_RENDER_STANDBY);
+
+	/* Hardware status page */
+	dev_priv->saveHWS = I915_READ(HWS_PGA);
+
+	i915_save_display(dev);
+
+	/* Interrupt state */
+	dev_priv->saveIIR = I915_READ(IIR);
+	dev_priv->saveIER = I915_READ(IER);
+	dev_priv->saveIMR = I915_READ(IMR);
+
 	/* Clock gating state */
 	dev_priv->saveD_STATE = I915_READ(D_STATE);
 	dev_priv->saveCG_2D_DIS = I915_READ(CG_2D_DIS);
@@ -371,25 +386,15 @@ int i915_save_state(struct drm_device *dev)
 			for (i = 0; i < 8; i++)
 				dev_priv->saveFENCE[i+8] = I915_READ(FENCE_REG_945_8 + (i * 4));
 	}
-	i915_save_vga(dev);
 
 	return 0;
 }
 
-int i915_restore_state(struct drm_device *dev)
+void i915_restore_display(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	int i;
 
-	pci_write_config_byte(dev->pdev, LBB, dev_priv->saveLBB);
-
-	/* Render Standby */
-	if (IS_I965G(dev) && IS_MOBILE(dev))
-		I915_WRITE(MCHBAR_RENDER_STANDBY, dev_priv->saveRENDERSTANDBY);
-
-	/* Hardware status page */
-	I915_WRITE(HWS_PGA, dev_priv->saveHWS);
-
 	/* Display arbitration */
 	I915_WRITE(DSPARB, dev_priv->saveDSPARB);
 
@@ -533,6 +538,25 @@ int i915_restore_state(struct drm_device *dev)
 	I915_WRITE(VGA_PD, dev_priv->saveVGA_PD);
 	DRM_UDELAY(150);
 
+	i915_restore_vga(dev);
+}
+
+int i915_restore_state(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	int i;
+
+	pci_write_config_byte(dev->pdev, LBB, dev_priv->saveLBB);
+
+	/* Render Standby */
+	if (IS_I965G(dev) && IS_MOBILE(dev))
+		I915_WRITE(MCHBAR_RENDER_STANDBY, dev_priv->saveRENDERSTANDBY);
+
+	/* Hardware status page */
+	I915_WRITE(HWS_PGA, dev_priv->saveHWS);
+
+	i915_restore_display(dev);
+
 	/* Clock gating state */
 	I915_WRITE (D_STATE, dev_priv->saveD_STATE);
 	I915_WRITE (CG_2D_DIS, dev_priv->saveCG_2D_DIS);
@@ -550,8 +574,6 @@ int i915_restore_state(struct drm_device *dev)
 	for (i = 0; i < 3; i++)
 		I915_WRITE(SWF30 + (i << 2), dev_priv->saveSWF2[i]);
 
-	i915_restore_vga(dev);
-
 	return 0;
 }
 



More information about the Intel-gfx mailing list