[Intel-gfx] [RFCv2 08/12] drm/i915: Debugfs interface for per-engine hang recovery.
Tomas Elf
tomas.elf at intel.com
Tue Jul 21 06:58:51 PDT 2015
1. The i915_wedged_set() function allows us to schedule three forms of hang recovery:
a) Legacy hang recovery: By passing e.g. -1 we trigger the legacy full
GPU reset recovery path.
b) Single engine hang recovery: By passing an engine ID in the interval
of [0, I915_NUM_RINGS) we can schedule hang recovery of any single
engine assuming that the context submission consistency requirements
are met (otherwise the hang recovery path will simply exit early and
wait for another hang detection). The values are assumed to use up bits
3:0 only since we certainly do not support as many as 16 engines.
This mode is supported since there are several legacy test applications
that rely on this interface.
c) Multiple engine hang recovery: By passing in an engine flag mask in
bits 31:8 (bit 8 corresponds to engine 0 = RCS, bit 9 corresponds to
engine 1 = VCS etc) we can schedule any combination of engine hang
recoveries as we please. For example, by passing in the value 0x3 << 8
we would schedule hang recovery for engines 0 and 1 (RCS and VCS) at
the same time.
If bits in fields 3:0 and 31:8 are both used then single engine hang
recovery mode takes precedence and bits 31:8 are ignored.
2. The i915_hangcheck_read() function produces a set of statistics related to:
a) Number of engine hangs detected by periodic hang checker.
b) Number of watchdog timeout hangs detected.
c) Number of full GPU resets carried out.
d) Number of engine resets carried out.
These statistics are presented in a very parser-friendly way and are
used by the TDR ULT to poll system behaviour to validate test outcomes.
* v2: (Chris Wilson)
- Following review comments from Chris Wilson, the dual-mode parameter value
interpretation in i915_wedged_set() described in item 1 has been dropped. In
this version we only accept engine ID flag masks (bit 0 = RCS, bit 1 = VCS,
etc.) that contain the engine ID flags of all currently hung engines. A full
GPU reset is most easily requested by passing an all-zero engine ID flag mask.
- Moved TDR-specific engine metrics like number of detected engine hangs and
number of per-engine resets into i915_hangcheck_info() from
i915_hangcheck_read().
Signed-off-by: Tomas Elf <tomas.elf at intel.com>
Signed-off-by: Arun Siluvery <arun.siluvery at intel.com>
Signed-off-by: Ian Lister <ian.lister at intel.com>
Cc: Chris Wilson <chris at chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala at linux.intel.com>
---
drivers/gpu/drm/i915/i915_debugfs.c | 76 ++++++++++++++++++++++++++++++++++---
1 file changed, 71 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index a89da48..d99c152 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1302,6 +1302,8 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
} else
seq_printf(m, "Hangcheck inactive\n");
+ seq_printf(m, "Full GPU resets = %u\n", i915_reset_count(&dev_priv->gpu_error));
+
for_each_ring(ring, dev_priv, i) {
seq_printf(m, "%s:\n", ring->name);
seq_printf(m, "\tseqno = %x [current %x]\n",
@@ -1313,6 +1315,12 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
(long long)ring->hangcheck.max_acthd);
seq_printf(m, "\tscore = %d\n", ring->hangcheck.score);
seq_printf(m, "\taction = %d\n", ring->hangcheck.action);
+ seq_printf(m, "\tengine resets = %u\n",
+ ring->hangcheck.reset_count);
+ seq_printf(m, "\tengine hang detections = %u\n",
+ ring->hangcheck.tdr_count);
+ seq_printf(m, "\tengine watchdog timeout detections = %u\n",
+ ring->hangcheck.watchdog_count);
}
return 0;
@@ -2030,7 +2038,7 @@ static int i915_execlists(struct seq_file *m, void *data)
seq_printf(m, "%s\n", ring->name);
status = I915_READ(RING_EXECLIST_STATUS(ring));
- ctx_id = I915_READ(RING_EXECLIST_STATUS(ring) + 4);
+ ctx_id = I915_READ(RING_EXECLIST_STATUS_CTX_ID(ring));
seq_printf(m, "\tExeclist status: 0x%08X, context: %u\n",
status, ctx_id);
@@ -4164,11 +4172,47 @@ i915_wedged_get(void *data, u64 *val)
return 0;
}
+static const char *ringid_to_str(enum intel_ring_id ring_id)
+{ /* Returns a string literal naming the given engine id. */
+ switch (ring_id) { /* one case per engine id handled here: RCS, VCS, BCS, VECS, VCS2 */
+ case RCS:
+ return "RCS";
+ case VCS:
+ return "VCS";
+ case BCS:
+ return "BCS";
+ case VECS:
+ return "VECS";
+ case VCS2:
+ return "VCS2";
+ }
+ /* ring_id matched none of the cases above: fall through to a placeholder name. */
+ return "unknown";
+}
+
static int
i915_wedged_set(void *data, u64 val)
{
struct drm_device *dev = data;
struct drm_i915_private *dev_priv = dev->dev_private;
+ struct intel_engine_cs *engine;
+ u32 i;
+#define ENGINE_MSGLEN 64
+ char msg[ENGINE_MSGLEN];
+
+ /*
+ * Val contains the engine flag mask of engines to be reset.
+ *
+ * Full GPU reset is implied in the following two cases:
+ * 1. val == 0x0
+ * 2. val >= (1 << I915_NUM_RINGS)
+ *
+ * Bit 0: RCS engine
+ * Bit 1: VCS engine
+ * Bit 2: BCS engine
+ * Bit 3: VECS engine
+ * Bit 4: VCS2 engine (if available)
+ */
/*
* There is no safeguard against this debugfs entry colliding
@@ -4177,14 +4221,36 @@ i915_wedged_set(void *data, u64 val)
* test harness is responsible enough not to inject gpu hangs
* while it is writing to 'i915_wedged'
*/
-
- if (i915_reset_in_progress(&dev_priv->gpu_error))
+ if (i915_gem_check_wedge(dev_priv, NULL, true))
return -EAGAIN;
intel_runtime_pm_get(dev_priv);
- i915_handle_error(dev, 0x0, false, val,
- "Manually setting wedged to %llu", val);
+ memset(msg, 0, sizeof(msg));
+
+ if (val) {
+ scnprintf(msg, sizeof(msg), "Manual reset:");
+
+ /* Assemble message string */
+ for_each_ring(engine, dev_priv, i)
+ if (intel_ring_flag(engine) & val) {
+ DRM_INFO("Manual reset: %s\n", engine->name);
+
+ scnprintf(msg, sizeof(msg),
+ "%s [%s]",
+ msg,
+ ringid_to_str(i));
+ }
+
+ } else {
+ scnprintf(msg, sizeof(msg), "Manual global reset");
+ }
+
+ i915_handle_error(dev,
+ val,
+ false,
+ true,
+ msg);
intel_runtime_pm_put(dev_priv);
--
1.9.1
More information about the Intel-gfx
mailing list