[Intel-gfx] [PATCH 2/2] drm/i915: check acthd for all rings

Ben Widawsky ben at bwidawsk.net
Mon Sep 19 22:36:37 CEST 2011


On Gen6+ we have other rings which may be in use. The GPU has not hung if
the blit or media ring is still making progress.

Signed-off-by: Ben Widawsky <ben at bwidawsk.net>
---
 drivers/gpu/drm/i915/i915_debugfs.c |    6 +-
 drivers/gpu/drm/i915/i915_drv.h     |    6 +-
 drivers/gpu/drm/i915/i915_irq.c     |  113 +++++++++++++++++++++--------------
 3 files changed, 73 insertions(+), 52 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 3cdf638..0431358 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -755,20 +755,20 @@ static int i915_error_state(struct seq_file *m, void *unused)
 		seq_printf(m, "Blitter Page Fault: 0x%08x\n", error->page_fault[BCS]);
 		seq_printf(m, "ERROR: 0x%08x\n", error->error);
 		seq_printf(m, "Blitter command stream:\n");
-		seq_printf(m, "  ACTHD:    0x%08x\n", error->bcs_acthd);
+		seq_printf(m, "  ACTHD:    0x%08x\n", error->acthd[BCS]);
 		seq_printf(m, "  IPEIR:    0x%08x\n", error->bcs_ipeir);
 		seq_printf(m, "  IPEHR:    0x%08x\n", error->bcs_ipehr);
 		seq_printf(m, "  INSTDONE: 0x%08x\n", error->bcs_instdone);
 		seq_printf(m, "  seqno:    0x%08x\n", error->bcs_seqno);
 		seq_printf(m, "Video (BSD) command stream:\n");
-		seq_printf(m, "  ACTHD:    0x%08x\n", error->vcs_acthd);
+		seq_printf(m, "  ACTHD:    0x%08x\n", error->acthd[VCS]);
 		seq_printf(m, "  IPEIR:    0x%08x\n", error->vcs_ipeir);
 		seq_printf(m, "  IPEHR:    0x%08x\n", error->vcs_ipehr);
 		seq_printf(m, "  INSTDONE: 0x%08x\n", error->vcs_instdone);
 		seq_printf(m, "  seqno:    0x%08x\n", error->vcs_seqno);
 	}
 	seq_printf(m, "Render command stream:\n");
-	seq_printf(m, "  ACTHD: 0x%08x\n", error->acthd);
+	seq_printf(m, "  ACTHD: 0x%08x\n", error->acthd[RCS]);
 	seq_printf(m, "  IPEIR: 0x%08x\n", error->ipeir);
 	seq_printf(m, "  IPEHR: 0x%08x\n", error->ipehr);
 	seq_printf(m, "  INSTDONE: 0x%08x\n", error->instdone);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 0447461..36ecae8 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -152,15 +152,13 @@ struct drm_i915_error_state {
 	u32 ipeir;
 	u32 ipehr;
 	u32 instdone;
-	u32 acthd;
+	u32 acthd[I915_NUM_RINGS];
 	u32 page_fault[I915_NUM_RINGS];
 	u32 error; /* gen6+ */
-	u32 bcs_acthd; /* gen6+ blt engine */
 	u32 bcs_ipehr;
 	u32 bcs_ipeir;
 	u32 bcs_instdone;
 	u32 bcs_seqno;
-	u32 vcs_acthd; /* gen6+ bsd engine */
 	u32 vcs_ipehr;
 	u32 vcs_ipeir;
 	u32 vcs_instdone;
@@ -330,7 +328,7 @@ typedef struct drm_i915_private {
 #define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
 	struct timer_list hangcheck_timer;
 	int hangcheck_count;
-	uint32_t last_acthd;
+	uint32_t last_acthd[I915_NUM_RINGS];
 	uint32_t last_instdone;
 	uint32_t last_instdone1;
 
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 99bd330..df14c28 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -919,7 +919,7 @@ static void i915_capture_error_state(struct drm_device *dev)
 		error->page_fault[BCS] = I915_READ(GEN6_BLT_FAULT);
 		error->error = I915_READ(ERROR_GEN6);
 
-		error->bcs_acthd = I915_READ(BCS_ACTHD);
+		error->acthd[BCS] = I915_READ(BCS_ACTHD);
 		error->bcs_ipehr = I915_READ(BCS_IPEHR);
 		error->bcs_ipeir = I915_READ(BCS_IPEIR);
 		error->bcs_instdone = I915_READ(BCS_INSTDONE);
@@ -927,7 +927,7 @@ static void i915_capture_error_state(struct drm_device *dev)
 		if (dev_priv->ring[BCS].get_seqno)
 			error->bcs_seqno = dev_priv->ring[BCS].get_seqno(&dev_priv->ring[BCS]);
 
-		error->vcs_acthd = I915_READ(VCS_ACTHD);
+		error->acthd[VCS] = I915_READ(VCS_ACTHD);
 		error->vcs_ipehr = I915_READ(VCS_IPEHR);
 		error->vcs_ipeir = I915_READ(VCS_IPEIR);
 		error->vcs_instdone = I915_READ(VCS_INSTDONE);
@@ -941,13 +941,13 @@ static void i915_capture_error_state(struct drm_device *dev)
 		error->instdone = I915_READ(INSTDONE_I965);
 		error->instps = I915_READ(INSTPS);
 		error->instdone1 = I915_READ(INSTDONE1);
-		error->acthd = I915_READ(ACTHD_I965);
+		error->acthd[RCS] = I915_READ(ACTHD_I965);
 		error->bbaddr = I915_READ64(BB_ADDR);
 	} else {
 		error->ipeir = I915_READ(IPEIR);
 		error->ipehr = I915_READ(IPEHR);
 		error->instdone = I915_READ(INSTDONE);
-		error->acthd = I915_READ(ACTHD);
+		error->acthd[RCS] = I915_READ(ACTHD);
 		error->bbaddr = 0;
 	}
 	i915_gem_record_fences(dev, error);
@@ -1659,6 +1659,50 @@ static bool kick_ring(struct intel_ring_buffer *ring)
 	return false;
 }
 
+/* True if no ring's ACTHD, nor the sampled INSTDONE, moved since last check. */
+static bool acthd_stuck(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	uint32_t acthd, vcs_acthd, bcs_acthd;
+	uint32_t instdone = 0, instdone1 = 0;
+	bool rcs_stuck, others_stuck = true;
+
+	acthd = intel_ring_get_active_head(&dev_priv->ring[RCS]);
+	switch (INTEL_INFO(dev)->gen) {
+	case 7:
+	case 6:
+		vcs_acthd = intel_ring_get_active_head(&dev_priv->ring[VCS]);
+		bcs_acthd = intel_ring_get_active_head(&dev_priv->ring[BCS]);
+		others_stuck = (dev_priv->last_acthd[BCS] == bcs_acthd) &&
+			       (dev_priv->last_acthd[VCS] == vcs_acthd);
+		dev_priv->last_acthd[BCS] = bcs_acthd;
+		dev_priv->last_acthd[VCS] = vcs_acthd;
+		break;
+	case 5:
+	case 4:
+		instdone = I915_READ(INSTDONE_I965);
+		instdone1 = I915_READ(INSTDONE1);
+		break;
+	case 3:
+	case 2:
+		instdone = I915_READ(INSTDONE);	/* instdone1 stays 0 */
+		break;
+	default:
+		BUG();
+		return false;
+	}
+	/* The render ring is stuck only if INSTDONE has not changed either. */
+	rcs_stuck = dev_priv->last_acthd[RCS] == acthd &&
+		    dev_priv->last_instdone == instdone &&
+		    dev_priv->last_instdone1 == instdone1;
+
+	dev_priv->last_acthd[RCS] = acthd;
+	dev_priv->last_instdone = instdone;
+	dev_priv->last_instdone1 = instdone1;
+	/* A count of 0 means this call only recorded a baseline sample. */
+	if (dev_priv->hangcheck_count++ == 0)
+		return false;
+	return rcs_stuck && others_stuck;
+}
 /**
  * This is called when the chip hasn't reported back with completed
  * batchbuffers in a long time. The first time this is called we simply record
@@ -1669,7 +1713,6 @@ void i915_hangcheck_elapsed(unsigned long data)
 {
 	struct drm_device *dev = (struct drm_device *)data;
 	drm_i915_private_t *dev_priv = dev->dev_private;
-	uint32_t acthd, instdone, instdone1;
 	bool err = false;
 
 	if (!i915_enable_hangcheck)
@@ -1685,50 +1728,30 @@ void i915_hangcheck_elapsed(unsigned long data)
 		return;
 	}
 
-	if (INTEL_INFO(dev)->gen < 4) {
-		acthd = I915_READ(ACTHD);
-		instdone = I915_READ(INSTDONE);
-		instdone1 = 0;
-	} else {
-		acthd = I915_READ(ACTHD_I965);
-		instdone = I915_READ(INSTDONE_I965);
-		instdone1 = I915_READ(INSTDONE1);
-	}
-
-	if (dev_priv->last_acthd == acthd &&
-	    dev_priv->last_instdone == instdone &&
-	    dev_priv->last_instdone1 == instdone1) {
-		if (dev_priv->hangcheck_count++ > 1) {
-			DRM_ERROR("Hangcheck timer elapsed... GPU hung\n");
+	if (acthd_stuck(dev)) {
+		DRM_ERROR("Hangcheck timer elapsed... GPU hung\n");
 
-			if (!IS_GEN2(dev)) {
-				/* Is the chip hanging on a WAIT_FOR_EVENT?
-				 * If so we can simply poke the RB_WAIT bit
-				 * and break the hang. This should work on
-				 * all but the second generation chipsets.
-				 */
-
-				if (kick_ring(&dev_priv->ring[RCS]))
-					goto repeat;
+		if (!IS_GEN2(dev)) {
+			/* Is the chip hanging on a WAIT_FOR_EVENT?
+			 * If so we can simply poke the RB_WAIT bit
+			 * and break the hang. This should work on
+			 * all but the second generation chipsets.
+			 */
 
-				if (HAS_BSD(dev) &&
-				    kick_ring(&dev_priv->ring[VCS]))
-					goto repeat;
+			if (kick_ring(&dev_priv->ring[RCS]))
+				goto repeat;
 
-				if (HAS_BLT(dev) &&
-				    kick_ring(&dev_priv->ring[BCS]))
-					goto repeat;
-			}
+			if (HAS_BSD(dev) &&
+			    kick_ring(&dev_priv->ring[VCS]))
+				goto repeat;
 
-			i915_handle_error(dev, true);
-			return;
+			if (HAS_BLT(dev) &&
+			    kick_ring(&dev_priv->ring[BCS]))
+				goto repeat;
 		}
-	} else {
-		dev_priv->hangcheck_count = 0;
 
-		dev_priv->last_acthd = acthd;
-		dev_priv->last_instdone = instdone;
-		dev_priv->last_instdone1 = instdone1;
+		i915_handle_error(dev, true);
+		return;
 	}
 
 repeat:
-- 
1.7.6.1




More information about the Intel-gfx mailing list