[Intel-gfx] [RFC 10/11] drm/i915: Debugfs interface for per-engine hang recovery.

Mon Jun 8 10:03:28 PDT 2015

1. The i915_wedged_set function allows us to schedule three forms of hang recovery:

	a) Legacy hang recovery: By passing e.g. -1 we trigger the legacy full
	GPU reset recovery path.

	b) Single engine hang recovery: By passing an engine ID in the interval
	of [0, I915_NUM_RINGS) we can schedule hang recovery of any single
	engine assuming that the context submission consistency requirements
	are met (otherwise the hang recovery path will simply exit early and
	wait for another hang detection). The values are assumed to use up bits
	3:0 only since we certainly do not support as many as 16 engines.

	This mode is supported since there are several legacy test applications
	that rely on this interface.

	c) Multiple engine hang recovery: By passing in an engine flag mask in
	bits 31:8 (bit 8 corresponds to engine 0 = RCS, bit 9 corresponds to
	engine 1 = VCS etc) we can schedule any combination of engine hang
	recoveries as we please. For example, by passing in the value 0x3 << 8
	we would schedule hang recovery for engines 0 and 1 (RCS and VCS) at
	the same time.

	If bits in fields 3:0 and 31:8 are both used then single engine hang
	recovery mode takes presidence and bits 31:8 are ignored.

2. The i915_wedged_get function produces a set of statistics related to:

	a) Number of engine hangs detected by periodic hang checker.
	b) Number of watchdog timeout hangs detected.
	c) Number of full GPU resets carried out.
	d) Number of engine resets carried out.

	These statistics are presented in a very parser-friendly way and are
	used by the TDR ULT to poll system behaviour to validate test outcomes.

Signed-off-by: Tomas Elf <tomas.elf at intel.com>
Signed-off-by: Arun Siluvery <arun.siluvery at intel.com>
Signed-off-by: Ian Lister <ian.lister at intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c |  146 +++++++++++++++++++++++++++++++++--
 1 file changed, 141 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index a89da48..f3305ed 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -2030,7 +2030,7 @@ static int i915_execlists(struct seq_file *m, void *data)
 		seq_printf(m, "%s\n", ring->name);
 
 		status = I915_READ(RING_EXECLIST_STATUS(ring));
-		ctx_id = I915_READ(RING_EXECLIST_STATUS(ring) + 4);
+		ctx_id = I915_READ(RING_EXECLIST_STATUS_CTX_ID(ring));
 		seq_printf(m, "\tExeclist status: 0x%08X, context: %u\n",
 			   status, ctx_id);
 
@@ -4164,11 +4164,50 @@ i915_wedged_get(void *data, u64 *val)
 	return 0;
 }
 
+static const char *ringid_to_str(enum intel_ring_id ring_id)
+{
+	switch (ring_id) {
+	case RCS:
+		return "RCS";
+	case VCS:
+		return "VCS";
+	case BCS:
+		return "BCS";
+	case VECS:
+		return "VECS";
+	case VCS2:
+		return "VCS2";
+	}
+
+	return "unknown";
+}
+
 static int
 i915_wedged_set(void *data, u64 val)
 {
 	struct drm_device *dev = data;
 	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct intel_engine_cs *engine;
+	const u32 engine_mask = ((1 << I915_NUM_RINGS) - 1);
+	const u32 single_engine_reset_mask = 0xF;
+	const u32 bitfield_boundary = 8;
+	u32 val_mask = 0;
+	u32 i;
+#define ENGINE_MSGLEN 64
+	char msg[ENGINE_MSGLEN] = "";
+
+	/*
+	 * Val can contain values in one of the following mutally exclusive
+	 * formats:
+	 *
+	 * 1. Bits [3:0] != 0x0 :
+	 *	Index (0 .. I915_NUM_RINGS-1) of engine to be manually reset.
+	 *	Invalid indices translate to full gpu reset.
+	 *
+	 * 2. Bits [(I915_NUM_RINGS-1)+8 : 8] != 0x0 :
+	 *	Bit mask containing the engine flags of all the engines that
+	 *	are to be manually reset.
+	 */
 
 	/*
 	 * There is no safeguard against this debugfs entry colliding
@@ -4177,14 +4216,61 @@ i915_wedged_set(void *data, u64 val)
 	 * test harness is responsible enough not to inject gpu hangs
 	 * while it is writing to 'i915_wedged'
 	 */
-
-	if (i915_reset_in_progress(&dev_priv->gpu_error))
+	if (i915_gem_check_wedge(dev_priv, NULL, true))
 		return -EAGAIN;
 
 	intel_runtime_pm_get(dev_priv);
 
-	i915_handle_error(dev, 0x0, false, val,
-			  "Manually setting wedged to %llu", val);
+	if (!val || (single_engine_reset_mask & val)) {
+		/*
+		 * Single engine hang mode
+		 *
+		 * Bits [3:0] of val contains index of engine
+		 * to be manually reset.
+		 */
+		val &= single_engine_reset_mask;
+		if (val == single_engine_reset_mask)
+			val_mask = 0x0;
+		else
+			val_mask = (1 << (val & 0xF));
+
+	} else {
+		/*
+		 * Mask mode
+		 *
+		 * Bits [31:8] of val contains bit mask of engines to be
+		 * manually reset, engine index 0 at bit 4, engine index 1 at
+		 * bit 5 and so forth.
+		 */
+		val_mask = (val >> bitfield_boundary) & engine_mask;
+	}
+
+
+	if (val_mask) {
+		u32 len;
+
+		len = scnprintf(msg, sizeof(msg), "Manual reset:");
+
+		/* Assemble message string */
+		for_each_ring(engine, dev_priv, i)
+			if (intel_ring_flag(engine) & val_mask) {
+				DRM_INFO("Manual reset: %s\n", engine->name);
+
+				len += scnprintf(msg + len, sizeof(msg) - len,
+						 "%s [%s]",
+						 msg,
+						 ringid_to_str(i));
+			}
+
+	} else {
+		scnprintf(msg, sizeof(msg), "Manual global reset");
+	}
+
+	i915_handle_error(dev,
+			  val_mask,
+			  false,
+			  true,
+			  msg);
 
 	intel_runtime_pm_put(dev_priv);
 
@@ -4195,6 +4281,55 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops,
 			i915_wedged_get, i915_wedged_set,
 			"%llu\n");
 
+static ssize_t
+i915_ring_hangcheck_read(struct file *filp, char __user *ubuf,
+			 size_t max, loff_t *ppos)
+{
+	int i;
+	int len;
+	char buf[300];
+	struct drm_device *dev = filp->private_data;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+
+	/*
+	 * Returns the total number of times the rings
+	 * have hung and been reset since boot
+	 */
+	len = scnprintf(buf, sizeof(buf), "GPU=0x%08X,",
+			i915_reset_count(&dev_priv->gpu_error));
+	for (i = 0; i < I915_NUM_RINGS; ++i)
+		len += scnprintf(buf + len, sizeof(buf) - len,
+				 "%s=0x%08lX,",
+				 ringid_to_str(i),
+				 (long unsigned)
+				 dev_priv->ring[i].hangcheck.reset_count);
+
+	for (i = 0; i < I915_NUM_RINGS; ++i)
+		len += scnprintf(buf + len, sizeof(buf) - len,
+				 "%s_T=0x%08lX,",
+				 ringid_to_str(i),
+				 (long unsigned)
+				 dev_priv->ring[i].hangcheck.tdr_count);
+
+	for (i = 0; i < I915_NUM_RINGS; ++i)
+		len += scnprintf(buf + len, sizeof(buf) - len,
+				 "%s_W=0x%08lX,",
+				 ringid_to_str(i),
+				 (long unsigned)
+				 dev_priv->ring[i].hangcheck.watchdog_count);
+
+	len += scnprintf(buf + len - 1, sizeof(buf) - len, "\n");
+
+	return simple_read_from_buffer(ubuf, max, ppos, buf, len);
+}
+
+static const struct file_operations i915_ring_hangcheck_fops = {
+	.owner = THIS_MODULE,
+	.open = simple_open,
+	.read = i915_ring_hangcheck_read,
+	.llseek = default_llseek,
+};
+
 static int
 i915_ring_stop_get(void *data, u64 *val)
 {
@@ -4825,6 +4960,7 @@ static const struct i915_debugfs_files {
 	{"i915_ring_missed_irq", &i915_ring_missed_irq_fops},
 	{"i915_ring_test_irq", &i915_ring_test_irq_fops},
 	{"i915_gem_drop_caches", &i915_drop_caches_fops},
+	{"i915_ring_hangcheck", &i915_ring_hangcheck_fops},
 	{"i915_error_state", &i915_error_state_fops},
 	{"i915_next_seqno", &i915_next_seqno_fops},
 	{"i915_display_crc_ctl", &i915_display_crc_ctl_fops},
-- 
1.7.9.5