[Intel-gfx] [PATCH] drm/i915: Make hangcheck logging more compact

Thu Sep 18 10:16:16 CEST 2014

When we detect a GPU hang, we emit a loud bang with the reason. But in
the process, we also log each hung ring separately and then lose that
information in the reason. Combine the two so that an accurate reason
for why we triggered the GPU hang is logged in the error state, and so
that we no longer need to emit the incremental per-ring hang detection
messages.

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_irq.c | 55 ++++++++++++++++++++++++++++++++---------
 1 file changed, 44 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 112a3e60a661..d08538bcfb28 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -3190,9 +3190,9 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 		container_of(work, typeof(*dev_priv),
 			     gpu_error.hangcheck_work.work);
 	struct intel_engine_cs *engine;
-	int i;
-	int busy_count = 0, rings_hung = 0;
 	bool stuck[I915_NUM_ENGINES] = { 0 };
+	int busy_count = 0, hung = 0;
+	int i;
 #define BUSY 1
 #define KICK 5
 #define HUNG 20
@@ -3284,19 +3284,52 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 		engine->hangcheck.seqno = seqno;
 		engine->hangcheck.acthd = acthd;
 		busy_count += busy;
+
+		hung += engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG;
 	}
 
-	for_each_engine(engine, dev_priv, i) {
-		if (engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG) {
-			DRM_INFO("%s on %s\n",
-				 stuck[i] ? "stuck" : "no progress",
-				 engine->name);
-			rings_hung++;
+	if (hung) {
+		char msg[512];
+		int rings_stall, rings_stuck, len;
+
+		len = rings_stall = rings_stuck = 0;
+		for_each_engine(engine, dev_priv, i) {
+			if (engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG &&
+			    stuck[i]) {
+				if (rings_stuck == 0)
+					len += snprintf(msg + len,
+							sizeof(msg)-len,
+							"Stuck on");
+				len += snprintf(msg + len, sizeof(msg)-len,
+						" %s,", engine->name);
+				rings_stuck++;
+			}
 		}
-	}
+		if (rings_stuck)
+			msg[--len] = '\0';
+
+		for_each_engine(engine, dev_priv, i) {
+			if (engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG &&
+			    !stuck[i]) {
+				if (rings_stall == 0) {
+					if (rings_stuck)
+						len += snprintf(msg + len,
+								sizeof(msg)-len,
+								"; ");
+					len += snprintf(msg + len,
+							sizeof(msg)-len,
+							"No progress on");
+				}
+				len += snprintf(msg + len, sizeof(msg)-len,
+						" %s,", engine->name);
+				rings_stall++;
+			}
+		}
+		if (rings_stall)
+			msg[--len] = '\0';
 
-	if (rings_hung)
-		return i915_handle_error(dev_priv->dev, true, "Ring hung");
+		return i915_handle_error(dev_priv->dev, true, msg);
+	}
 
 	if (busy_count)
 		/* Reset timer case chip hangs without another request
-- 
2.1.0