[Intel-gfx] [RFCv2 08/12] drm/i915: Debugfs interface for per-engine hang recovery.

Tomas Elf tomas.elf at intel.com
Tue Jul 21 06:58:51 PDT 2015


1. The i915_wedged_set() function allows us to schedule three forms of hang recovery:

	a) Legacy hang recovery: By passing e.g. -1 we trigger the legacy full
	GPU reset recovery path.

	b) Single engine hang recovery: By passing an engine ID in the interval
	of [0, I915_NUM_RINGS) we can schedule hang recovery of any single
	engine assuming that the context submission consistency requirements
	are met (otherwise the hang recovery path will simply exit early and
	wait for another hang detection). The values are assumed to use up bits
	3:0 only since we certainly do not support as many as 16 engines.

	This mode is supported since there are several legacy test applications
	that rely on this interface.

	c) Multiple engine hang recovery: By passing in an engine flag mask in
	bits 31:8 (bit 8 corresponds to engine 0 = RCS, bit 9 corresponds to
	engine 1 = VCS etc) we can schedule any combination of engine hang
	recoveries as we please. For example, by passing in the value 0x3 << 8
	we would schedule hang recovery for engines 0 and 1 (RCS and VCS) at
	the same time.

	If bits in fields 3:0 and 31:8 are both used then single engine hang
	recovery mode takes precedence and bits 31:8 are ignored.

2. The i915_hangcheck_read() function produces a set of statistics related to:

	a) Number of engine hangs detected by periodic hang checker.
	b) Number of watchdog timeout hangs detected.
	c) Number of full GPU resets carried out.
	d) Number of engine resets carried out.

	These statistics are presented in a very parser-friendly way and are
	used by the TDR ULT to poll system behaviour to validate test outcomes.

* v2: (Chris Wilson)
- After review comments by Chris Wilson we're dropping the dual-mode parameter
  value interpretation in i915_wedged_set(). In this version we only accept
  engine id flag masks that contain the engine id flags of all currently hung
  engines. Full GPU reset is most easily requested by passing an all zero
  engine id flag mask.

- Moved TDR-specific engine metrics like number of detected engine hangs and
  number of per-engine resets into i915_hangcheck_info() from
  i915_hangcheck_read().

Signed-off-by: Tomas Elf <tomas.elf at intel.com>
Signed-off-by: Arun Siluvery <arun.siluvery at intel.com>
Signed-off-by: Ian Lister <ian.lister at intel.com>
Cc: Chris Wilson <chris at chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala at linux.intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c | 76 ++++++++++++++++++++++++++++++++++---
 1 file changed, 71 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index a89da48..d99c152 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1302,6 +1302,8 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 	} else
 		seq_printf(m, "Hangcheck inactive\n");
 
+	seq_printf(m, "Full GPU resets = %u\n", i915_reset_count(&dev_priv->gpu_error));
+
 	for_each_ring(ring, dev_priv, i) {
 		seq_printf(m, "%s:\n", ring->name);
 		seq_printf(m, "\tseqno = %x [current %x]\n",
@@ -1313,6 +1315,12 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 			   (long long)ring->hangcheck.max_acthd);
 		seq_printf(m, "\tscore = %d\n", ring->hangcheck.score);
 		seq_printf(m, "\taction = %d\n", ring->hangcheck.action);
+		seq_printf(m, "\tengine resets = %u\n",
+			ring->hangcheck.reset_count);
+		seq_printf(m, "\tengine hang detections = %u\n",
+			ring->hangcheck.tdr_count);
+		seq_printf(m, "\tengine watchdog timeout detections = %u\n",
+			ring->hangcheck.watchdog_count);
 	}
 
 	return 0;
@@ -2030,7 +2038,7 @@ static int i915_execlists(struct seq_file *m, void *data)
 		seq_printf(m, "%s\n", ring->name);
 
 		status = I915_READ(RING_EXECLIST_STATUS(ring));
-		ctx_id = I915_READ(RING_EXECLIST_STATUS(ring) + 4);
+		ctx_id = I915_READ(RING_EXECLIST_STATUS_CTX_ID(ring));
 		seq_printf(m, "\tExeclist status: 0x%08X, context: %u\n",
 			   status, ctx_id);
 
@@ -4164,11 +4172,47 @@ i915_wedged_get(void *data, u64 *val)
 	return 0;
 }
 
+static const char *ringid_to_str(enum intel_ring_id ring_id)
+{
+	switch (ring_id) {
+	case RCS:
+		return "RCS";
+	case VCS:
+		return "VCS";
+	case BCS:
+		return "BCS";
+	case VECS:
+		return "VECS";
+	case VCS2:
+		return "VCS2";
+	}
+
+	return "unknown";
+}
+
 static int
 i915_wedged_set(void *data, u64 val)
 {
 	struct drm_device *dev = data;
 	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct intel_engine_cs *engine;
+	u32 i;
+#define ENGINE_MSGLEN 64
+	char msg[ENGINE_MSGLEN];
+
+	/*
+	 * Val contains the engine flag mask of engines to be reset.
+	 *
+	 * Full GPU reset is implied in the following two cases:
+	 * 1. val == 0x0
+	 * 2. val >= (1 << I915_NUM_RINGS)
+	 *
+	 * Bit 0: RCS engine
+	 * Bit 1: VCS engine
+	 * Bit 2: BCS engine
+	 * Bit 3: VECS engine
+	 * Bit 4: VCS2 engine (if available)
+	 */
 
 	/*
 	 * There is no safeguard against this debugfs entry colliding
@@ -4177,14 +4221,36 @@ i915_wedged_set(void *data, u64 val)
 	 * test harness is responsible enough not to inject gpu hangs
 	 * while it is writing to 'i915_wedged'
 	 */
-
-	if (i915_reset_in_progress(&dev_priv->gpu_error))
+	if (i915_gem_check_wedge(dev_priv, NULL, true))
 		return -EAGAIN;
 
 	intel_runtime_pm_get(dev_priv);
 
-	i915_handle_error(dev, 0x0, false, val,
-			  "Manually setting wedged to %llu", val);
+	memset(msg, 0, sizeof(msg));
+
+	if (val) {
+		scnprintf(msg, sizeof(msg), "Manual reset:");
+
+		/* Assemble message string */
+		for_each_ring(engine, dev_priv, i)
+			if (intel_ring_flag(engine) & val) {
+				DRM_INFO("Manual reset: %s\n", engine->name);
+
+				scnprintf(msg, sizeof(msg),
+					  "%s [%s]",
+					  msg,
+					  ringid_to_str(i));
+			}
+
+	} else {
+		scnprintf(msg, sizeof(msg), "Manual global reset");
+	}
+
+	i915_handle_error(dev,
+			  val,
+			  false,
+			  true,
+			  msg);
 
 	intel_runtime_pm_put(dev_priv);
 
-- 
1.9.1



More information about the Intel-gfx mailing list