[Intel-gfx] [PATCH 12/20] drm/i915: Debugfs interface for per-engine hang recovery.
Arun Siluvery
arun.siluvery at linux.intel.com
Wed Jan 13 09:28:24 PST 2016
From: Tomas Elf <tomas.elf at intel.com>
1. The i915_wedged_set() function now allows for both legacy full GPU reset and
per-engine reset of one or more engines at a time:
a) Legacy hang recovery (full GPU reset) by passing 0.
b) Multiple engine hang recovery by passing in an engine flag mask
where bit 0 corresponds to engine 0 = RCS, bit 1 corresponds to engine
1 = VCS etc. This allows for any combination of engine hang recoveries
to be tested. For example, passing in the value 0x3 schedules hang recovery
for engines 0 and 1 (RCS and VCS) at the same time.
2. The i915_hangcheck_info() function now additionally reports statistics
on:
a) Number of engine hangs detected by periodic hang checker.
b) Number of watchdog timeout hangs detected.
c) Number of full GPU resets carried out.
d) Number of engine resets carried out.
Signed-off-by: Tomas Elf <tomas.elf at intel.com>
Signed-off-by: Arun Siluvery <arun.siluvery at intel.com>
Signed-off-by: Ian Lister <ian.lister at intel.com>
Cc: Chris Wilson <chris at chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala at linux.intel.com>
---
drivers/gpu/drm/i915/i915_debugfs.c | 75 +++++++++++++++++++++++++++++++++++--
1 file changed, 71 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index dabddda..62c9a41 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1357,6 +1357,8 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
} else
seq_printf(m, "Hangcheck inactive\n");
+ seq_printf(m, "Full GPU resets = %u\n", i915_reset_count(&dev_priv->gpu_error));
+
for_each_ring(ring, dev_priv, i) {
seq_printf(m, "%s:\n", ring->name);
seq_printf(m, "\tseqno = %x [current %x]\n",
@@ -1368,6 +1370,12 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
(long long)ring->hangcheck.max_acthd);
seq_printf(m, "\tscore = %d\n", ring->hangcheck.score);
seq_printf(m, "\taction = %d\n", ring->hangcheck.action);
+ seq_printf(m, "\tengine resets = %u\n",
+ ring->hangcheck.reset_count);
+ seq_printf(m, "\tengine hang detections = %u\n",
+ ring->hangcheck.tdr_count);
+ seq_printf(m, "\tengine watchdog timeout detections = %u\n",
+ ring->hangcheck.watchdog_count);
if (ring->id == RCS) {
seq_puts(m, "\tinstdone read =");
@@ -4701,11 +4709,48 @@ i915_wedged_get(void *data, u64 *val)
return 0;
}
+static const char *ringid_to_str(enum intel_ring_id ring_id)
+{ /* Return a short, constant, printable name for the given engine ID. */
+ switch (ring_id) {
+ case RCS:
+ return "RCS";
+ case VCS:
+ return "VCS";
+ case BCS:
+ return "BCS";
+ case VECS:
+ return "VECS";
+ case VCS2:
+ return "VCS2";
+ }
+ /* Reached only when ring_id matches none of the cases above. */
+ return "unknown";
+}
+
static int
i915_wedged_set(void *data, u64 val)
{
struct drm_device *dev = data;
struct drm_i915_private *dev_priv = dev->dev_private;
+ struct intel_engine_cs *engine;
+ u32 i;
+#define ENGINE_MSGLEN 64
+ char msg[ENGINE_MSGLEN];
+
+ /*
+ * Val contains the engine flag mask of engines to be reset.
+ *
+ * * Full GPU reset is caused by passing val == 0x0
+ *
+ * * Any combination of engine hangs is caused by setting up val as a
+ * mask with the following bits set for each engine to be hung:
+ *
+ * Bit 0: RCS engine
+ * Bit 1: VCS engine
+ * Bit 2: BCS engine
+ * Bit 3: VECS engine
+ * Bit 4: VCS2 engine (if available)
+ */
/*
* There is no safeguard against this debugfs entry colliding
@@ -4714,14 +4759,36 @@ i915_wedged_set(void *data, u64 val)
* test harness is responsible enough not to inject gpu hangs
* while it is writing to 'i915_wedged'
*/
-
- if (i915_reset_in_progress(&dev_priv->gpu_error))
+ if (i915_gem_check_wedge(dev_priv, NULL, true))
return -EAGAIN;
intel_runtime_pm_get(dev_priv);
- i915_handle_error(dev, 0x0, false, val,
- "Manually setting wedged to %llu", val);
+ memset(msg, 0, sizeof(msg));
+
+ if (val) {
+ scnprintf(msg, sizeof(msg), "Manual reset:");
+
+ /* Assemble message string */
+ for_each_ring(engine, dev_priv, i)
+ if (intel_ring_flag(engine) & val) {
+ DRM_INFO("Manual reset: %s\n", engine->name);
+
+ scnprintf(msg, sizeof(msg),
+ "%s [%s]",
+ msg,
+ ringid_to_str(i));
+ }
+
+ } else {
+ scnprintf(msg, sizeof(msg), "Manual global reset");
+ }
+
+ i915_handle_error(dev,
+ val,
+ false,
+ true,
+ msg);
intel_runtime_pm_put(dev_priv);
--
1.9.1
More information about the Intel-gfx
mailing list