[Intel-gfx] [RFC 01/13] drm/i915: Periodic sampling for hang detection

Lister, Ian ian.lister at intel.com
Mon Dec 16 17:02:23 CET 2013


From c3b6469bd01017a121a52f02453a41b2fd8bc424 Mon Sep 17 00:00:00 2001
Message-Id: <c3b6469bd01017a121a52f02453a41b2fd8bc424.1387201899.git.ian.lister at intel.com>
In-Reply-To: <cover.1387201899.git.ian.lister at intel.com>
References: <cover.1387201899.git.ian.lister at intel.com>
From: ian-lister <ian.lister at intel.com>
Date: Wed, 4 Dec 2013 17:13:46 +0000
Subject: [RFC 01/13] drm/i915: Periodic sampling for hang detection

Convert the hang detection logic to sample the rings periodically whilst the
GPU has pending work to complete on any of the rings. Previously it would only
fire after work stopped being submitted and there were no completion reports
for the timer period, however there are several problems with this:

  a) The USER interrupt is only enabled when the driver knows that someone
     is waiting on the seqno so we are not really getting the desired behaviour
     of restarting the timer when work completes.

  b) There is a possible DoS attack because the hang detection timer can be
     defeated simply by adding more work to any one of the rings.

  c) The current scheme does not lend itself to per-ring TDR because we do not
     want submission/completion events on one ring to prevent a hang being
     detected on another ring.

Periodic sampling creates a disconnect between the current request and the time
it has been executing so the ring_stuck function can no longer poke the
wait_event bit immediately, instead it should be tried just prior to triggering
recovery work.

Signed-off-by: ian-lister <ian.lister at intel.com>
---
 drivers/gpu/drm/i915/i915_dma.c |  1 +
 drivers/gpu/drm/i915/i915_drv.h |  1 +
 drivers/gpu/drm/i915/i915_gem.c |  1 +
 drivers/gpu/drm/i915/i915_irq.c | 69 ++++++++++++++++++++++++++++-------------
 4 files changed, 50 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index c5df900..e17a5d20 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -1749,6 +1749,7 @@ int i915_driver_unload(struct drm_device *dev)
 
 	/* Free error state after interrupts are fully disabled. */
 	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
+	atomic_set(&dev_priv->gpu_error.active, 0);
 	cancel_work_sync(&dev_priv->gpu_error.work);
 	i915_destroy_error_state(dev);
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 4613f51..f261ab5 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1099,6 +1099,7 @@ struct i915_gpu_error {
 #define DRM_I915_CTX_BAN_PERIOD DIV_ROUND_UP(8*DRM_I915_HANGCHECK_PERIOD, 1000)
 
 	struct timer_list hangcheck_timer;
+	atomic_t active;
 
 	/* For reset and error_state handling. */
 	spinlock_t lock;
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 2370dba..7ed1fab 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4296,6 +4296,7 @@ i915_gem_suspend(struct drm_device *dev)
 	mutex_unlock(&dev->struct_mutex);
 
 	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
+	atomic_set(&dev_priv->gpu_error.active, 0);
 	cancel_delayed_work_sync(&dev_priv->mm.retire_work);
 	cancel_delayed_work_sync(&dev_priv->mm.idle_work);
 
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index dc5f438..289985b 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -957,7 +957,6 @@ static void notify_ring(struct drm_device *dev,
 	trace_i915_gem_request_complete(ring);
 
 	wake_up_all(&ring->irq_queue);
-	i915_queue_hangcheck(dev);
 }
 
 /**
@@ -2697,19 +2696,12 @@ ring_stuck(struct intel_ring_buffer *ring, u32 acthd)
 	if (IS_GEN2(dev))
 		return HANGCHECK_HUNG;
 
-	/* Is the chip hanging on a WAIT_FOR_EVENT?
-	 * If so we can simply poke the RB_WAIT bit
-	 * and break the hang. This should work on
-	 * all but the second generation chipsets.
-	 */
+	/* WARNING: Do not attempt to kick the ring here now that
+	* we are periodically sampling the ring state.
+	* It may be that the command has only just started
+	* and the wait condition has not yet been met.
+        */
 	tmp = I915_READ_CTL(ring);
-	if (tmp & RING_WAIT) {
-		DRM_ERROR("Kicking stuck wait on %s\n",
-			  ring->name);
-		i915_handle_error(dev, false);
-		I915_WRITE_CTL(ring, tmp);
-		return HANGCHECK_KICK;
-	}
 
 	if (INTEL_INFO(dev)->gen >= 6 && tmp & RING_WAIT_SEMAPHORE) {
 		switch (semaphore_passed(ring)) {
@@ -2730,12 +2722,10 @@ ring_stuck(struct intel_ring_buffer *ring, u32 acthd)
 }
 
 /**
- * This is called when the chip hasn't reported back with completed
- * batchbuffers in a long time. We keep track per ring seqno progress and
- * if there are no progress, hangcheck score for that ring is increased.
- * Further, acthd is inspected to see if the ring is stuck. On stuck case
- * we kick the ring. If we see no progress on three subsequent calls
- * we assume chip is wedged and try to fix it by resetting the chip.
+ * This function is called periodically while the GPU has work to do.
+ * It assesses the current state to see if the ring appears to be moving.
+ * If there is no discernible progress then it will increment the hangcheck
+ * score. If it exceeds the threshold then recovery work will be triggered.
  */
 static void i915_hangcheck_elapsed(unsigned long data)
 {
@@ -2745,11 +2735,19 @@ static void i915_hangcheck_elapsed(unsigned long data)
 	int i;
 	int busy_count = 0, rings_hung = 0;
 	bool stuck[I915_NUM_RINGS] = { 0 };
+	u32 tmp;
 #define BUSY 1
 #define KICK 5
 #define HUNG 20
 #define FIRE 30
 
+	/* Clear the active flag *before* assessing the ring state
+        * in case new work is added just after we sample the rings.
+        * This will allow new work to re-trigger the timer even
+        * if we see the rings as idle on this occasion.
+		*/
+        atomic_set(&dev_priv->gpu_error.active, 0);
+
 	if (!i915_enable_hangcheck)
 		return;
 
@@ -2836,7 +2834,29 @@ static void i915_hangcheck_elapsed(unsigned long data)
 			DRM_INFO("%s on %s\n",
 				 stuck[i] ? "stuck" : "no progress",
 				 ring->name);
-			rings_hung++;
+
+			/* Is the chip hanging on a WAIT_FOR_EVENT?
+			 * If so we can try poking the RB_WAIT bit
+			 * to break the hang. This should work on
+			 * all but the second generation chipsets.
+			 */
+			tmp = I915_READ_CTL(ring);
+			if (!IS_GEN2(dev) && (tmp & RING_WAIT)) {
+				DRM_ERROR("Kicking stuck wait on %s\n",
+					ring->name);
+				/* Send error event but don't reset.
+				 * The score is set just under the threshold so
+				 * that if we fail to break the hang it will be
+				 * recovered by a reset next time round.
+				 */
+				i915_handle_error(dev, false);
+				I915_WRITE_CTL(ring, tmp);
+				ring->hangcheck.action = HANGCHECK_KICK;
+				ring->hangcheck.score = FIRE;
+			} else {
+				rings_hung++;
+				ring->hangcheck.score = 0;
+			}
 		}
 	}
 
@@ -2855,8 +2875,13 @@ void i915_queue_hangcheck(struct drm_device *dev)
 	if (!i915_enable_hangcheck)
 		return;
 
-	mod_timer(&dev_priv->gpu_error.hangcheck_timer,
-		  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
+	/* Only re-start the timer if it is not currently active to
+	 * prevent DoS attacks which try to defeat the hang detection.
+	 */
+        if (atomic_add_unless(&dev_priv->gpu_error.active, 1, 1) != 0) {
+		mod_timer(&dev_priv->gpu_error.hangcheck_timer,
+			  (jiffies + DRM_I915_HANGCHECK_JIFFIES));
+	}
 }
 
 static void ibx_irq_preinstall(struct drm_device *dev)
-- 
1.8.5.1




More information about the Intel-gfx mailing list