[Intel-gfx] [PATCH] drm/i915: hangcheck robustification

Ben Widawsky ben at bwidawsk.net
Fri Oct 7 21:53:22 CEST 2011


This was pulled out of the per ring error handling patch series as it
actually fixes two issues, and bikeshedding appears to be going on
there.

First, remove setting hangcheck_count in notify_ring(). While it
seems counterintuitive to be setting up a timer to catch hangcheck_count
greater than 0 with hangcheck_count already greater than 0, when we
actually go to check if the GPU is hung we clear that value if the GPU is
still alive. Leaving this in is actually harmful, as submitting work could
falsely clear the count while the hangcheck code is checking the count.
I can't think of a case where this doesn't just delay the inevitable
reset... but I didn't spend too much time thinking about it.

Second, for Gen5+ we have more information to be considered when
determining if the GPU is stuck, primarily the media ring (and blitter
ring in gen6). This patch will check all available rings, and also updates
error state with the new information. It can theoretically fix false
positives, but I haven't actually come across such a case.

Cc: Daniel Vetter <daniel.vetter at ffwll.ch>
Signed-off-by: Ben Widawsky <ben at bwidawsk.net>
---
 drivers/gpu/drm/i915/i915_debugfs.c |    6 +-
 drivers/gpu/drm/i915/i915_drv.h     |   11 +--
 drivers/gpu/drm/i915/i915_irq.c     |  152 ++++++++++++++++++++++++----------
 3 files changed, 114 insertions(+), 55 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 8e95d66..ea0b6bb 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -752,20 +752,20 @@ static int i915_error_state(struct seq_file *m, void *unused)
 	if (INTEL_INFO(dev)->gen >= 6) {
 		seq_printf(m, "ERROR: 0x%08x\n", error->error);
 		seq_printf(m, "Blitter command stream:\n");
-		seq_printf(m, "  ACTHD:    0x%08x\n", error->bcs_acthd);
+		seq_printf(m, "  ACTHD:    0x%08x\n", error->acthd[BCS]);
 		seq_printf(m, "  IPEIR:    0x%08x\n", error->bcs_ipeir);
 		seq_printf(m, "  IPEHR:    0x%08x\n", error->bcs_ipehr);
 		seq_printf(m, "  INSTDONE: 0x%08x\n", error->bcs_instdone);
 		seq_printf(m, "  seqno:    0x%08x\n", error->bcs_seqno);
 		seq_printf(m, "Video (BSD) command stream:\n");
-		seq_printf(m, "  ACTHD:    0x%08x\n", error->vcs_acthd);
+		seq_printf(m, "  ACTHD:    0x%08x\n", error->acthd[VCS]);
 		seq_printf(m, "  IPEIR:    0x%08x\n", error->vcs_ipeir);
 		seq_printf(m, "  IPEHR:    0x%08x\n", error->vcs_ipehr);
 		seq_printf(m, "  INSTDONE: 0x%08x\n", error->vcs_instdone);
 		seq_printf(m, "  seqno:    0x%08x\n", error->vcs_seqno);
 	}
 	seq_printf(m, "Render command stream:\n");
-	seq_printf(m, "  ACTHD: 0x%08x\n", error->acthd);
+	seq_printf(m, "  ACTHD: 0x%08x\n", error->acthd[RCS]);
 	seq_printf(m, "  IPEIR: 0x%08x\n", error->ipeir);
 	seq_printf(m, "  IPEHR: 0x%08x\n", error->ipehr);
 	seq_printf(m, "  INSTDONE: 0x%08x\n", error->instdone);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 15c0ca5..fe91b5f 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -152,14 +152,12 @@ struct drm_i915_error_state {
 	u32 ipeir;
 	u32 ipehr;
 	u32 instdone;
-	u32 acthd;
+	u32 acthd[I915_NUM_RINGS];
 	u32 error; /* gen6+ */
-	u32 bcs_acthd; /* gen6+ blt engine */
 	u32 bcs_ipehr;
 	u32 bcs_ipeir;
 	u32 bcs_instdone;
 	u32 bcs_seqno;
-	u32 vcs_acthd; /* gen6+ bsd engine */
 	u32 vcs_ipehr;
 	u32 vcs_ipeir;
 	u32 vcs_instdone;
@@ -330,10 +328,9 @@ typedef struct drm_i915_private {
 	/* For hangcheck timer */
 #define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
 	struct timer_list hangcheck_timer;
-	int hangcheck_count;
-	uint32_t last_acthd;
-	uint32_t last_instdone;
-	uint32_t last_instdone1;
+	int hangcheck_count; /* Should only be modified in hangcheck timer */
+	uint32_t last_acthd[I915_NUM_RINGS];
+	uint64_t last_instdone[I915_NUM_RINGS];
 
 	unsigned long cfb_size;
 	unsigned int cfb_fb;
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 012732b..eedb5d1 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -365,7 +365,6 @@ static void notify_ring(struct drm_device *dev,
 	ring->irq_seqno = seqno;
 	wake_up_all(&ring->irq_queue);
 	if (i915_enable_hangcheck) {
-		dev_priv->hangcheck_count = 0;
 		mod_timer(&dev_priv->hangcheck_timer,
 			  jiffies +
 			  msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD));
@@ -916,7 +915,7 @@ static void i915_capture_error_state(struct drm_device *dev)
 	if (INTEL_INFO(dev)->gen >= 6) {
 		error->error = I915_READ(ERROR_GEN6);
 
-		error->bcs_acthd = I915_READ(BCS_ACTHD);
+		error->acthd[BCS] = I915_READ(BCS_ACTHD);
 		error->bcs_ipehr = I915_READ(BCS_IPEHR);
 		error->bcs_ipeir = I915_READ(BCS_IPEIR);
 		error->bcs_instdone = I915_READ(BCS_INSTDONE);
@@ -924,7 +923,7 @@ static void i915_capture_error_state(struct drm_device *dev)
 		if (dev_priv->ring[BCS].get_seqno)
 			error->bcs_seqno = dev_priv->ring[BCS].get_seqno(&dev_priv->ring[BCS]);
 
-		error->vcs_acthd = I915_READ(VCS_ACTHD);
+		error->acthd[VCS] = I915_READ(VCS_ACTHD);
 		error->vcs_ipehr = I915_READ(VCS_IPEHR);
 		error->vcs_ipeir = I915_READ(VCS_IPEIR);
 		error->vcs_instdone = I915_READ(VCS_INSTDONE);
@@ -938,13 +937,13 @@ static void i915_capture_error_state(struct drm_device *dev)
 		error->instdone = I915_READ(INSTDONE_I965);
 		error->instps = I915_READ(INSTPS);
 		error->instdone1 = I915_READ(INSTDONE1);
-		error->acthd = I915_READ(ACTHD_I965);
+		error->acthd[RCS] = I915_READ(ACTHD_I965);
 		error->bbaddr = I915_READ64(BB_ADDR);
 	} else {
 		error->ipeir = I915_READ(IPEIR);
 		error->ipehr = I915_READ(IPEHR);
 		error->instdone = I915_READ(INSTDONE);
-		error->acthd = I915_READ(ACTHD);
+		error->acthd[RCS] = I915_READ(ACTHD);
 		error->bbaddr = 0;
 	}
 	i915_gem_record_fences(dev, error);
@@ -1656,6 +1655,91 @@ static bool kick_ring(struct intel_ring_buffer *ring)
 	return false;
 }
 
+static bool
+instdone_stuck(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	uint64_t instdone = 0, instdone1 = 0;
+	uint64_t vcs_instdone = 0, bcs_instdone = 0;
+	bool stuck;
+
+	switch (INTEL_INFO(dev)->gen) {
+	case 7:
+	case 6:
+		bcs_instdone = I915_READ(BCS_INSTDONE);
+	case 5:
+		vcs_instdone = I915_READ(VCS_INSTDONE);
+	case 4:
+		instdone = I915_READ(INSTDONE_I965);
+		instdone1 = I915_READ(INSTDONE1);
+		break;
+	case 3:
+	case 2:
+		instdone = I915_READ(INSTDONE);
+		break;
+	}
+
+	stuck =
+	    (dev_priv->last_instdone[RCS] == ((instdone << 32) | instdone1)) &&
+	    (dev_priv->last_instdone[VCS] == vcs_instdone) &&
+	    (dev_priv->last_instdone[BCS] == bcs_instdone);
+
+	dev_priv->last_instdone[RCS] = (instdone << 32) | instdone1;
+	dev_priv->last_instdone[VCS] = vcs_instdone;
+	dev_priv->last_instdone[BCS] = bcs_instdone;
+
+	return stuck;
+}
+
+static bool
+acthd_stuck(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	uint32_t acthd = 0, vcs_acthd = 0, bcs_acthd = 0;
+	bool stuck = false;
+
+	switch (INTEL_INFO(dev)->gen) {
+	case 7:
+	case 6:
+		bcs_acthd = intel_ring_get_active_head(&dev_priv->ring[BCS]);
+	case 5:
+		vcs_acthd = intel_ring_get_active_head(&dev_priv->ring[VCS]);
+	case 4:
+	case 3:
+	case 2:
+		acthd = intel_ring_get_active_head(&dev_priv->ring[RCS]);
+		break;
+	}
+
+	stuck =
+	    (dev_priv->last_acthd[RCS] == acthd) &&
+	    (dev_priv->last_acthd[VCS] == vcs_acthd) &&
+	    (dev_priv->last_acthd[BCS] == bcs_acthd);
+
+	dev_priv->last_acthd[RCS] = acthd;
+	dev_priv->last_acthd[VCS] = vcs_acthd;
+	dev_priv->last_acthd[BCS] = bcs_acthd;
+
+	return stuck;
+}
+
+static bool gpu_stuck(struct drm_device *dev)
+{
+	#define NUM_HANGCHECKS_TO_RESET 1
+
+	struct drm_i915_private *dev_priv = dev->dev_private;
+
+	if (!acthd_stuck(dev) || !instdone_stuck(dev))
+		dev_priv->hangcheck_count = 0;
+	else
+		dev_priv->hangcheck_count++;
+
+	if (dev_priv->hangcheck_count > NUM_HANGCHECKS_TO_RESET)
+		return true;
+
+	return false;
+}
+
 /**
  * This is called when the chip hasn't reported back with completed
  * batchbuffers in a long time. The first time this is called we simply record
@@ -1666,13 +1750,11 @@ void i915_hangcheck_elapsed(unsigned long data)
 {
 	struct drm_device *dev = (struct drm_device *)data;
 	drm_i915_private_t *dev_priv = dev->dev_private;
-	uint32_t acthd, instdone, instdone1;
 	bool err = false;
 
 	if (!i915_enable_hangcheck)
 		return;
 
-	/* If all work is done then ACTHD clearly hasn't advanced. */
 	if (i915_hangcheck_ring_idle(&dev_priv->ring[RCS], &err) &&
 	    i915_hangcheck_ring_idle(&dev_priv->ring[VCS], &err) &&
 	    i915_hangcheck_ring_idle(&dev_priv->ring[BCS], &err)) {
@@ -1682,50 +1764,30 @@ void i915_hangcheck_elapsed(unsigned long data)
 		return;
 	}
 
-	if (INTEL_INFO(dev)->gen < 4) {
-		acthd = I915_READ(ACTHD);
-		instdone = I915_READ(INSTDONE);
-		instdone1 = 0;
-	} else {
-		acthd = I915_READ(ACTHD_I965);
-		instdone = I915_READ(INSTDONE_I965);
-		instdone1 = I915_READ(INSTDONE1);
-	}
+	if (gpu_stuck(dev)) {
+		DRM_ERROR("Hangcheck timer elapsed... GPU hung\n");
 
-	if (dev_priv->last_acthd == acthd &&
-	    dev_priv->last_instdone == instdone &&
-	    dev_priv->last_instdone1 == instdone1) {
-		if (dev_priv->hangcheck_count++ > 1) {
-			DRM_ERROR("Hangcheck timer elapsed... GPU hung\n");
-
-			if (!IS_GEN2(dev)) {
-				/* Is the chip hanging on a WAIT_FOR_EVENT?
-				 * If so we can simply poke the RB_WAIT bit
-				 * and break the hang. This should work on
-				 * all but the second generation chipsets.
-				 */
-
-				if (kick_ring(&dev_priv->ring[RCS]))
-					goto repeat;
+		if (!IS_GEN2(dev)) {
+			/* Is the chip hanging on a WAIT_FOR_EVENT?
+			 * If so we can simply poke the RB_WAIT bit
+			 * and break the hang. This should work on
+			 * all but the second generation chipsets.
+			 */
 
-				if (HAS_BSD(dev) &&
-				    kick_ring(&dev_priv->ring[VCS]))
-					goto repeat;
+			if (kick_ring(&dev_priv->ring[RCS]))
+				goto repeat;
 
-				if (HAS_BLT(dev) &&
-				    kick_ring(&dev_priv->ring[BCS]))
-					goto repeat;
-			}
+			if (HAS_BSD(dev) &&
+			    kick_ring(&dev_priv->ring[VCS]))
+				goto repeat;
 
-			i915_handle_error(dev, true);
-			return;
+			if (HAS_BLT(dev) &&
+			    kick_ring(&dev_priv->ring[BCS]))
+				goto repeat;
 		}
-	} else {
-		dev_priv->hangcheck_count = 0;
 
-		dev_priv->last_acthd = acthd;
-		dev_priv->last_instdone = instdone;
-		dev_priv->last_instdone1 = instdone1;
+		i915_handle_error(dev, true);
+		return;
 	}
 
 repeat:
-- 
1.7.7




More information about the Intel-gfx mailing list