[Intel-gfx] [PATCH 1/6] drm/i915: hangcheck robustification

Tue Oct 11 16:39:09 CEST 2011

From: Ben Widawsky <ben at bwidawsk.net>

This was pulled out of the per ring error handling patch series as it
actually fixes two issues, and bikeshedding appears to be going on
there.

First, remove setting hangcheck_count when we do notify ring. While it
seems counterintuitive to be setting up a timer to catch hangcheck_count
greater than 0 with hangcheck_count already greater than 0, actually
when we go to check if the GPU is hung we clear that value if the gpu is
still alive. Leaving this is actually harmful as submitting work could
falsely clear the count while the hangcheck code is checking the count.
I can't think of a case where this doesn't just delay the inevitable
reset... but I didn't spend too much time thinking about it.

Second, for Gen5+ we have more information to be considered when
determining if the GPU is stuck, primarily the media ring (and blitter
ring in gen6). This patch will check all available rings, and also updates
error state with the new information. It theoretically can fix false
positives, but I haven't actually come across such a case.

Signed-off-by: Ben Widawsky <ben at bwidawsk.net>
[danvet: remove remnants of an unrelated cleanup patch]
Signed-off-by: Daniel Vetter <daniel.vetter at ffwll.ch>
---
 drivers/gpu/drm/i915/i915_drv.h |    7 +-
 drivers/gpu/drm/i915/i915_irq.c |  148 +++++++++++++++++++++++++++-----------
 2 files changed, 108 insertions(+), 47 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 15c0ca5..4e73a86 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -330,10 +330,9 @@ typedef struct drm_i915_private {
 	/* For hangcheck timer */
 #define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
 	struct timer_list hangcheck_timer;
-	int hangcheck_count;
-	uint32_t last_acthd;
-	uint32_t last_instdone;
-	uint32_t last_instdone1;
+	int hangcheck_count; /* Should only be modified in hanghceck timer */
+	uint32_t last_acthd[I915_NUM_RINGS];
+	uint64_t last_instdone[I915_NUM_RINGS];
 
 	unsigned long cfb_size;
 	unsigned int cfb_fb;
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 012732b..2218d12 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -365,7 +365,6 @@ static void notify_ring(struct drm_device *dev,
 	ring->irq_seqno = seqno;
 	wake_up_all(&ring->irq_queue);
 	if (i915_enable_hangcheck) {
-		dev_priv->hangcheck_count = 0;
 		mod_timer(&dev_priv->hangcheck_timer,
 			  jiffies +
 			  msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD));
@@ -1656,6 +1655,91 @@ static bool kick_ring(struct intel_ring_buffer *ring)
 	return false;
 }
 
+static bool
+instdone_stuck(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	uint64_t instdone = 0, instdone1 = 0;
+	uint64_t vcs_instdone = 0, bcs_instdone = 0;
+	bool stuck;
+
+	switch (INTEL_INFO(dev)->gen) {
+	case 7:
+	case 6:
+		bcs_instdone = I915_READ(BCS_INSTDONE);
+	case 5:
+		vcs_instdone = I915_READ(VCS_INSTDONE);
+	case 4:
+		instdone = I915_READ(INSTDONE_I965);
+		instdone1 = I915_READ(INSTDONE1);
+		break;
+	case 3:
+	case 2:
+		instdone = I915_READ(INSTDONE);
+		break;
+	}
+
+	stuck =
+	    (dev_priv->last_instdone[RCS] == ((instdone << 32) | instdone1)) &&
+	    (dev_priv->last_instdone[VCS] == vcs_instdone) &&
+	    (dev_priv->last_instdone[BCS] == bcs_instdone);
+
+	dev_priv->last_instdone[RCS] = (instdone << 32) | instdone1;
+	dev_priv->last_instdone[VCS] = vcs_instdone;
+	dev_priv->last_instdone[BCS] = bcs_instdone;
+
+	return stuck;
+}
+
+static bool
+acthd_stuck(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	uint32_t acthd = 0, vcs_acthd = 0, bcs_acthd = 0;
+	bool stuck = false;
+
+	switch (INTEL_INFO(dev)->gen) {
+	case 7:
+	case 6:
+		bcs_acthd = intel_ring_get_active_head(&dev_priv->ring[BCS]);
+	case 5:
+		vcs_acthd = intel_ring_get_active_head(&dev_priv->ring[VCS]);
+	case 4:
+	case 3:
+	case 2:
+		acthd = intel_ring_get_active_head(&dev_priv->ring[RCS]);
+		break;
+	}
+
+	stuck =
+	    (dev_priv->last_acthd[RCS] == acthd) &&
+	    (dev_priv->last_acthd[VCS] == vcs_acthd) &&
+	    (dev_priv->last_acthd[BCS] == bcs_acthd);
+
+	dev_priv->last_acthd[RCS] = acthd;
+	dev_priv->last_acthd[VCS] = vcs_acthd;
+	dev_priv->last_acthd[BCS] = bcs_acthd;
+
+	return stuck;
+}
+
+static bool gpu_stuck(struct drm_device *dev)
+{
+	#define NUM_HANGCHECKS_TO_RESET 1
+
+	struct drm_i915_private *dev_priv = dev->dev_private;
+
+	if (!acthd_stuck(dev) || !instdone_stuck(dev))
+		dev_priv->hangcheck_count = 0;
+	else
+		dev_priv->hangcheck_count++;
+
+	if (dev_priv->hangcheck_count > NUM_HANGCHECKS_TO_RESET)
+		return true;
+
+	return false;
+}
+
 /**
  * This is called when the chip hasn't reported back with completed
  * batchbuffers in a long time. The first time this is called we simply record
@@ -1666,13 +1750,11 @@ void i915_hangcheck_elapsed(unsigned long data)
 {
 	struct drm_device *dev = (struct drm_device *)data;
 	drm_i915_private_t *dev_priv = dev->dev_private;
-	uint32_t acthd, instdone, instdone1;
 	bool err = false;
 
 	if (!i915_enable_hangcheck)
 		return;
 
-	/* If all work is done then ACTHD clearly hasn't advanced. */
 	if (i915_hangcheck_ring_idle(&dev_priv->ring[RCS], &err) &&
 	    i915_hangcheck_ring_idle(&dev_priv->ring[VCS], &err) &&
 	    i915_hangcheck_ring_idle(&dev_priv->ring[BCS], &err)) {
@@ -1682,50 +1764,30 @@ void i915_hangcheck_elapsed(unsigned long data)
 		return;
 	}
 
-	if (INTEL_INFO(dev)->gen < 4) {
-		acthd = I915_READ(ACTHD);
-		instdone = I915_READ(INSTDONE);
-		instdone1 = 0;
-	} else {
-		acthd = I915_READ(ACTHD_I965);
-		instdone = I915_READ(INSTDONE_I965);
-		instdone1 = I915_READ(INSTDONE1);
-	}
+	if (gpu_stuck(dev)) {
+		DRM_ERROR("Hangcheck timer elapsed... GPU hung\n");
 
-	if (dev_priv->last_acthd == acthd &&
-	    dev_priv->last_instdone == instdone &&
-	    dev_priv->last_instdone1 == instdone1) {
-		if (dev_priv->hangcheck_count++ > 1) {
-			DRM_ERROR("Hangcheck timer elapsed... GPU hung\n");
-
-			if (!IS_GEN2(dev)) {
-				/* Is the chip hanging on a WAIT_FOR_EVENT?
-				 * If so we can simply poke the RB_WAIT bit
-				 * and break the hang. This should work on
-				 * all but the second generation chipsets.
-				 */
-
-				if (kick_ring(&dev_priv->ring[RCS]))
-					goto repeat;
+		if (!IS_GEN2(dev)) {
+			/* Is the chip hanging on a WAIT_FOR_EVENT?
+			 * If so we can simply poke the RB_WAIT bit
+			 * and break the hang. This should work on
+			 * all but the second generation chipsets.
+			 */
 
-				if (HAS_BSD(dev) &&
-				    kick_ring(&dev_priv->ring[VCS]))
-					goto repeat;
+			if (kick_ring(&dev_priv->ring[RCS]))
+				goto repeat;
 
-				if (HAS_BLT(dev) &&
-				    kick_ring(&dev_priv->ring[BCS]))
-					goto repeat;
-			}
+			if (HAS_BSD(dev) &&
+			    kick_ring(&dev_priv->ring[VCS]))
+				goto repeat;
 
-			i915_handle_error(dev, true);
-			return;
+			if (HAS_BLT(dev) &&
+			    kick_ring(&dev_priv->ring[BCS]))
+				goto repeat;
 		}
-	} else {
-		dev_priv->hangcheck_count = 0;
 
-		dev_priv->last_acthd = acthd;
-		dev_priv->last_instdone = instdone;
-		dev_priv->last_instdone1 = instdone1;
+		i915_handle_error(dev, true);
+		return;
 	}
 
 repeat:
-- 
1.7.6.4