[Intel-gfx] [PATCH 5/5] drm/i915: Add hangcheck timer

Ben Gamari bgamari.foss at gmail.com
Thu Jul 9 00:30:15 CEST 2009


We set a periodic timer to check on the GPU, resetting it every time a
batch is completed. If the timer elapses, we check acthd. If acthd
hasn't changed in two timer periods, we assume the chip is wedged.

This is implemented in such a way that it leaves the option open to
employ adaptive timer intervals in the future. One could wait until
several timer periods have elapsed before declaring the chip dead. If
the chip comes back after several periods but before the "dead"
threshold, the timer interval or dead threshold could be raised.

Signed-off-by: Ben Gamari <bgamari.foss at gmail.com>
---
 drivers/gpu/drm/i915/i915_dma.c |    4 ++++
 drivers/gpu/drm/i915/i915_drv.h |    7 +++++++
 drivers/gpu/drm/i915/i915_gem.c |    9 +++++++--
 drivers/gpu/drm/i915/i915_irq.c |   31 +++++++++++++++++++++++++++++++
 4 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index a0a503d..99de86c 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -1251,6 +1251,8 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags)
 	if (!IS_IGDNG(dev))
 		intel_opregion_init(dev, 0);
 
+	setup_timer(&dev_priv->hangcheck_timer, i915_hangcheck_elapsed,
+		    (unsigned long) dev);
 	return 0;
 
 out_iomapfree:
@@ -1266,6 +1268,8 @@ int i915_driver_unload(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
+	del_timer_sync(&dev_priv->hangcheck_timer);
+
 	io_mapping_free(dev_priv->mm.gtt_mapping);
 	if (dev_priv->mm.gtt_mtrr >= 0) {
 		mtrr_del(dev_priv->mm.gtt_mtrr, dev->agp->base,
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index b0d632e..bd9fdef 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -198,6 +198,12 @@ typedef struct drm_i915_private {
 	unsigned int sr01, adpa, ppcr, dvob, dvoc, lvds;
 	int vblank_pipe;
 
+	/* For hangcheck timer */
+#define DRM_I915_HANGCHECK_PERIOD 10 /* in jiffies */
+	struct timer_list hangcheck_timer;
+	int hangcheck_count;
+	uint32_t last_acthd;
+
 	bool cursor_needs_physical;
 
 	struct drm_mm vram;
@@ -591,6 +597,7 @@ extern int i915_emit_box(struct drm_device *dev,
 extern void i965_reset(struct drm_device *dev, u8 flags);
 
 /* i915_irq.c */
+void i915_hangcheck_elapsed(unsigned long data);
 extern int i915_irq_emit(struct drm_device *dev, void *data,
 			 struct drm_file *file_priv);
 extern int i915_irq_wait(struct drm_device *dev, void *data,
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 876b65c..0882abb 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1544,8 +1544,12 @@ i915_add_request(struct drm_device *dev, struct drm_file *file_priv,
 
 	}
 
-	if (was_empty && !dev_priv->mm.suspended)
-		schedule_delayed_work(&dev_priv->mm.retire_work, HZ);
+	if (!dev_priv->mm.suspended) {
+		if (was_empty)
+			schedule_delayed_work(&dev_priv->mm.retire_work, HZ);
+
+		mod_timer(&dev_priv->hangcheck_timer, jiffies + DRM_I915_HANGCHECK_PERIOD);
+	}
 	return seqno;
 }
 
@@ -3863,6 +3867,7 @@ i915_gem_idle(struct drm_device *dev)
 	 * We need to replace this with a semaphore, or something.
 	 */
 	dev_priv->mm.suspended = 1;
+	del_timer(&dev_priv->hangcheck_timer);
 
 	/* Cancel the retire work handler, wait for it to finish if running
 	 */
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 38e3d5c..b4dae5f 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -591,6 +591,8 @@ irqreturn_t i915_driver_irq_handler(DRM_IRQ_ARGS)
 		if (iir & I915_USER_INTERRUPT) {
 			dev_priv->mm.irq_gem_seqno = i915_get_gem_seqno(dev);
 			DRM_WAKEUP(&dev_priv->irq_queue);
+			dev_priv->hangcheck_count = 0;
+			mod_timer(&dev_priv->hangcheck_timer, jiffies + DRM_I915_HANGCHECK_PERIOD);
 		}
 
 		if (pipea_stats & vblank_status) {
@@ -870,6 +872,35 @@ int i915_vblank_swap(struct drm_device *dev, void *data,
 	return -EINVAL;
 }
 
+/**
+ * This is called when the chip hasn't reported back with completed
+ * batchbuffers in a long time. The first time this is called we simply record
+ * ACTHD. If ACTHD hasn't changed by the time the hangcheck timer elapses
+ * again, we assume the chip is wedged and try to fix it.
+ */
+void i915_hangcheck_elapsed(unsigned long data)
+{
+	struct drm_device *dev = (struct drm_device *)data;
+	drm_i915_private_t *dev_priv = dev->dev_private;
+	uint32_t acthd = I915_READ(ACTHD);
+
+	if (acthd != dev_priv->last_acthd) {
+		dev_priv->hangcheck_count = 0;
+		dev_priv->last_acthd = acthd;
+		return;
+	}
+
+	if (dev_priv->hangcheck_count && dev_priv->last_acthd == acthd) {
+		printk(KERN_ERR "i915 hang detected\n");
+		dev_priv->mm.wedged = true;
+		i915_handle_error(dev);
+		return;
+	}
+	
+	dev_priv->hangcheck_count++;
+	dev_priv->last_acthd = acthd;
+}
+
 /* drm_dma.h hooks
 */
 static void igdng_irq_preinstall(struct drm_device *dev)
-- 
1.6.3.3




More information about the Intel-gfx mailing list