[PATCH 52/59] hangcheck
Chris Wilson
chris at chris-wilson.co.uk
Fri Dec 21 16:07:58 UTC 2018
---
drivers/gpu/drm/i915/i915_debugfs.c | 80 +-----
drivers/gpu/drm/i915/i915_drv.h | 5 +-
drivers/gpu/drm/i915/i915_gpu_error.c | 120 +++------
drivers/gpu/drm/i915/i915_gpu_error.h | 9 +-
drivers/gpu/drm/i915/intel_engine_cs.c | 9 +-
drivers/gpu/drm/i915/intel_hangcheck.c | 330 +++++++-----------------
drivers/gpu/drm/i915/intel_ringbuffer.h | 46 +---
7 files changed, 148 insertions(+), 451 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 6c463dbfe3f6..8b18b0fd86b5 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1252,42 +1252,10 @@ static int i915_frequency_info(struct seq_file *m, void *unused)
return ret;
}
-static void i915_instdone_info(struct drm_i915_private *dev_priv,
- struct seq_file *m,
- struct intel_instdone *instdone)
-{
- int slice;
- int subslice;
-
- seq_printf(m, "\t\tINSTDONE: 0x%08x\n",
- instdone->instdone);
-
- if (INTEL_GEN(dev_priv) <= 3)
- return;
-
- seq_printf(m, "\t\tSC_INSTDONE: 0x%08x\n",
- instdone->slice_common);
-
- if (INTEL_GEN(dev_priv) <= 6)
- return;
-
- for_each_instdone_slice_subslice(dev_priv, slice, subslice)
- seq_printf(m, "\t\tSAMPLER_INSTDONE[%d][%d]: 0x%08x\n",
- slice, subslice, instdone->sampler[slice][subslice]);
-
- for_each_instdone_slice_subslice(dev_priv, slice, subslice)
- seq_printf(m, "\t\tROW_INSTDONE[%d][%d]: 0x%08x\n",
- slice, subslice, instdone->row[slice][subslice]);
-}
-
static int i915_hangcheck_info(struct seq_file *m, void *unused)
{
struct drm_i915_private *dev_priv = node_to_i915(m->private);
struct intel_engine_cs *engine;
- u64 acthd[I915_NUM_ENGINES];
- u32 seqno[I915_NUM_ENGINES];
- struct intel_instdone instdone;
- intel_wakeref_t wakeref;
enum intel_engine_id id;
if (test_bit(I915_WEDGED, &dev_priv->gpu_error.flags))
@@ -1304,15 +1272,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
return 0;
}
- with_intel_runtime_pm(dev_priv, wakeref) {
- for_each_engine(engine, dev_priv, id) {
- acthd[id] = intel_engine_get_active_head(engine);
- seqno[id] = intel_engine_get_seqno(engine);
- }
-
- intel_engine_get_instdone(dev_priv->engine[RCS], &instdone);
- }
-
if (timer_pending(&dev_priv->gpu_error.hangcheck_work.timer))
seq_printf(m, "Hangcheck active, timer fires in %dms\n",
jiffies_to_msecs(dev_priv->gpu_error.hangcheck_work.timer.expires -
@@ -1329,15 +1288,13 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
struct rb_node *rb;
seq_printf(m, "%s:\n", engine->name);
- seq_printf(m, "\tseqno = %x [current %x, last %x]\n",
- engine->hangcheck.seqno, seqno[id],
- intel_engine_last_submit(engine));
- seq_printf(m, "\twaiters? %s, fake irq active? %s, stalled? %s, wedged? %s\n",
+ seq_printf(m, "\tseqno = %llx, emitted %dms ago\n",
+ engine->hangcheck ? engine->hangcheck->fence.seqno : 0,
+ engine->hangcheck ? jiffies_to_msecs(jiffies - engine->hangcheck->emitted_jiffies) : -1);
+ seq_printf(m, "\twaiters? %s, fake irq active? %s\n",
yesno(intel_engine_has_waiter(engine)),
yesno(test_bit(engine->id,
- &dev_priv->gpu_error.missed_irq_rings)),
- yesno(engine->hangcheck.stalled),
- yesno(engine->hangcheck.wedged));
+ &dev_priv->gpu_error.missed_irq_rings)));
spin_lock_irq(&b->rb_lock);
for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
@@ -1349,26 +1306,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
lower_32_bits(w->global_seqno));
}
spin_unlock_irq(&b->rb_lock);
-
- seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
- (long long)engine->hangcheck.acthd,
- (long long)acthd[id]);
- seq_printf(m, "\taction = %s(%d) %d ms ago\n",
- hangcheck_action_to_str(engine->hangcheck.action),
- engine->hangcheck.action,
- jiffies_to_msecs(jiffies -
- engine->hangcheck.action_timestamp));
-
- if (engine->id == RCS) {
- seq_puts(m, "\tinstdone read =\n");
-
- i915_instdone_info(dev_priv, m, &instdone);
-
- seq_puts(m, "\tinstdone accu =\n");
-
- i915_instdone_info(dev_priv, m,
- &engine->hangcheck.instdone);
- }
}
return 0;
@@ -3903,8 +3840,6 @@ static int
i915_wedged_set(void *data, u64 val)
{
struct drm_i915_private *i915 = data;
- struct intel_engine_cs *engine;
- unsigned int tmp;
/*
* There is no safeguard against this debugfs entry colliding
@@ -3917,11 +3852,6 @@ i915_wedged_set(void *data, u64 val)
if (i915_reset_backoff(&i915->gpu_error))
return -EAGAIN;
- for_each_engine_masked(engine, i915, val, tmp) {
- engine->hangcheck.seqno = intel_engine_get_seqno(engine);
- engine->hangcheck.stalled = true;
- }
-
i915_handle_error(i915, val, I915_ERROR_CAPTURE,
"Manually set wedged engine mask = %llx", val);
return 0;
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 02c2ed3cc359..2aeea38140b1 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -924,9 +924,7 @@ struct i915_gem_mm {
#define I915_RESET_TIMEOUT (10 * HZ) /* 10s */
#define I915_FENCE_TIMEOUT (10 * HZ) /* 10s */
-#define I915_ENGINE_DEAD_TIMEOUT (4 * HZ) /* Seqno, head and subunits dead */
-#define I915_SEQNO_DEAD_TIMEOUT (12 * HZ) /* Seqno dead with active head */
-
+#define I915_ENGINE_STALLED_TIMEOUT (12 * HZ) /* No progress? */
#define I915_ENGINE_WEDGED_TIMEOUT (60 * HZ) /* Reset but no recovery? */
struct ddi_vbt_port_info {
@@ -2605,7 +2603,6 @@ extern int i915_driver_load(struct pci_dev *pdev,
const struct pci_device_id *ent);
extern void i915_driver_unload(struct drm_device *dev);
-extern void intel_engine_init_hangcheck(struct intel_engine_cs *engine);
extern void intel_hangcheck_init(struct drm_i915_private *dev_priv);
extern unsigned long i915_chipset_val(struct drm_i915_private *dev_priv);
extern unsigned long i915_mch_val(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index be6c80f38351..75e978d21bc5 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -530,13 +530,6 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
err_printf(m, " waiting: %s\n", yesno(ee->waiting));
err_printf(m, " ring->head: 0x%08x\n", ee->cpu_ring_head);
err_printf(m, " ring->tail: 0x%08x\n", ee->cpu_ring_tail);
- err_printf(m, " hangcheck stall: %s\n", yesno(ee->hangcheck_stalled));
- err_printf(m, " hangcheck action: %s\n",
- hangcheck_action_to_str(ee->hangcheck_action));
- err_printf(m, " hangcheck action timestamp: %dms (%lu%s)\n",
- jiffies_to_msecs(ee->hangcheck_timestamp - epoch),
- ee->hangcheck_timestamp,
- ee->hangcheck_timestamp == epoch ? "; epoch" : "");
err_printf(m, " engine reset count: %u\n", ee->reset_count);
for (n = 0; n < ee->num_ports; n++) {
@@ -677,17 +670,6 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
jiffies_to_msecs(jiffies - error->capture),
jiffies_to_msecs(error->capture - error->epoch));
- for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
- if (error->engine[i].hangcheck_stalled &&
- error->engine[i].context.pid) {
- err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n",
- engine_name(m->i915, i),
- error->engine[i].context.comm,
- error->engine[i].context.pid,
- error->engine[i].context.ban_score,
- bannable(&error->engine[i].context));
- }
- }
err_printf(m, "Reset count: %u\n", error->reset_count);
err_printf(m, "Suspend count: %u\n", error->suspend_count);
err_printf(m, "Platform: %s\n", intel_platform_name(error->device_info.platform));
@@ -1123,40 +1105,6 @@ static u32 capture_error_bo(struct drm_i915_error_buffer *err,
return i;
}
-/* Generate a semi-unique error code. The code is not meant to have meaning, The
- * code's only purpose is to try to prevent false duplicated bug reports by
- * grossly estimating a GPU error state.
- *
- * TODO Ideally, hashing the batchbuffer would be a very nice way to determine
- * the hang if we could strip the GTT offset information from it.
- *
- * It's only a small step better than a random number in its current form.
- */
-static uint32_t i915_error_generate_code(struct drm_i915_private *dev_priv,
- struct i915_gpu_state *error,
- int *engine_id)
-{
- uint32_t error_code = 0;
- int i;
-
- /* IPEHR would be an ideal way to detect errors, as it's the gross
- * measure of "the command that hung." However, has some very common
- * synchronization commands which almost always appear in the case
- * strictly a client bug. Use instdone to differentiate those some.
- */
- for (i = 0; i < I915_NUM_ENGINES; i++) {
- if (error->engine[i].hangcheck_stalled) {
- if (engine_id)
- *engine_id = i;
-
- return error->engine[i].ipehr ^
- error->engine[i].instdone.instdone;
- }
- }
-
- return error_code;
-}
-
static void gem_record_fences(struct i915_gpu_state *error)
{
struct drm_i915_private *dev_priv = error->i915;
@@ -1317,9 +1265,6 @@ static void error_record_engine_registers(struct i915_gpu_state *error,
}
ee->idle = intel_engine_is_idle(engine);
- ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
- ee->hangcheck_action = engine->hangcheck.action;
- ee->hangcheck_stalled = engine->hangcheck.stalled;
ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error,
engine);
@@ -1757,31 +1702,35 @@ static void capture_reg_state(struct i915_gpu_state *error)
error->pgtbl_er = I915_READ(PGTBL_ER);
}
-static void i915_error_capture_msg(struct drm_i915_private *dev_priv,
- struct i915_gpu_state *error,
- u32 engine_mask,
- const char *error_msg)
+static const char *
+error_msg(struct i915_gpu_state *error,
+ unsigned long engine_mask,
+ const char *msg)
{
- u32 ecode;
- int engine_id = -1, len;
+ int id, len;
- ecode = i915_error_generate_code(dev_priv, error, &engine_id);
+ len = scnprintf(error->error_msg, sizeof(error->error_msg), "GPU hang");
- len = scnprintf(error->error_msg, sizeof(error->error_msg),
- "GPU HANG: ecode %d:%d:0x%08x",
- INTEL_GEN(dev_priv), engine_id, ecode);
+ for_each_set_bit(id, &engine_mask, BITS_PER_LONG) {
+ if (!error->engine[id].context.pid)
+ continue;
- if (engine_id != -1 && error->engine[engine_id].context.pid)
len += scnprintf(error->error_msg + len,
sizeof(error->error_msg) - len,
- ", in %s [%d]",
- error->engine[engine_id].context.comm,
- error->engine[engine_id].context.pid);
+ " in %s [%d]",
+ error->engine[id].context.comm,
+ error->engine[id].context.pid);
+ break;
+ }
- scnprintf(error->error_msg + len, sizeof(error->error_msg) - len,
- ", reason: %s, action: %s",
- error_msg,
- engine_mask ? "reset" : "continue");
+ if (msg) {
+ scnprintf(error->error_msg + len,
+ sizeof(error->error_msg) - len,
+ ", %s",
+ msg);
+ }
+
+ return error->error_msg;
}
static void capture_gen_state(struct i915_gpu_state *error)
@@ -1821,15 +1770,17 @@ static void capture_params(struct i915_gpu_state *error)
static unsigned long capture_find_epoch(const struct i915_gpu_state *error)
{
- unsigned long epoch = error->capture;
- int i;
+ struct intel_engine_cs *engine;
+ enum intel_engine_id id;
+ unsigned long epoch;
- for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
- const struct drm_i915_error_engine *ee = &error->engine[i];
+ epoch = error->capture;
+ for_each_engine(engine, error->i915, id) {
+ if (!engine->hangcheck)
+ continue;
- if (ee->hangcheck_stalled &&
- time_before(ee->hangcheck_timestamp, epoch))
- epoch = ee->hangcheck_timestamp;
+ if (time_before(engine->hangcheck->emitted_jiffies, epoch))
+ epoch = engine->hangcheck->emitted_jiffies;
}
return epoch;
@@ -1901,7 +1852,7 @@ i915_capture_gpu_state(struct drm_i915_private *i915)
* i915_capture_error_state - capture an error record for later analysis
* @i915: i915 device
* @engine_mask: the mask of engines triggering the hang
- * @error_msg: a message to insert into the error capture header
+ * @msg: supplementary message to attach to the error state
*
* Should be called when an error is detected (either a hang or an error
* interrupt) to capture error state from the time of the error. Fills
@@ -1909,8 +1860,8 @@ i915_capture_gpu_state(struct drm_i915_private *i915)
* to pick up.
*/
void i915_capture_error_state(struct drm_i915_private *i915,
- u32 engine_mask,
- const char *error_msg)
+ unsigned long engine_mask,
+ const char *msg)
{
static bool warned;
struct i915_gpu_state *error;
@@ -1926,8 +1877,7 @@ void i915_capture_error_state(struct drm_i915_private *i915,
if (IS_ERR(error))
return;
- i915_error_capture_msg(i915, error, engine_mask, error_msg);
- DRM_INFO("%s\n", error->error_msg);
+ dev_info(i915->drm.dev, "%s\n", error_msg(error, engine_mask, msg));
if (!error->simulated) {
spin_lock_irqsave(&i915->gpu_error.lock, flags);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
index ea3346d5acae..7f68db4d2f19 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -83,9 +83,6 @@ struct i915_gpu_state {
bool idle;
bool waiting;
int num_waiters;
- unsigned long hangcheck_timestamp;
- bool hangcheck_stalled;
- enum intel_engine_hangcheck_action hangcheck_action;
struct i915_address_space *vm;
int num_requests;
u32 reset_count;
@@ -304,9 +301,9 @@ __printf(2, 3)
void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
struct i915_gpu_state *i915_capture_gpu_state(struct drm_i915_private *i915);
-void i915_capture_error_state(struct drm_i915_private *dev_priv,
- u32 engine_mask,
- const char *error_msg);
+void i915_capture_error_state(struct drm_i915_private *i915,
+ unsigned long engine_mask,
+ const char *msg);
static inline struct i915_gpu_state *
i915_gpu_state_get(struct i915_gpu_state *gpu)
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 82f870c7a1e9..df81d53a9e28 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -489,7 +489,6 @@ void intel_engine_setup_common(struct intel_engine_cs *engine)
i915_timeline_set_subclass(&engine->timeline, TIMELINE_ENGINE);
intel_engine_init_execlist(engine);
- intel_engine_init_hangcheck(engine);
intel_engine_init_batch_pool(engine);
intel_engine_init_cmd_parser(engine);
}
@@ -1113,6 +1112,8 @@ void intel_engines_park(struct drm_i915_private *i915)
i915_gem_batch_pool_fini(&engine->batch_pool);
engine->execlists.no_priolist = false;
+
+ i915_request_put(fetch_and_zero(&engine->hangcheck));
}
}
@@ -1140,8 +1141,6 @@ void intel_engines_unpark(struct drm_i915_private *i915)
if (engine->unpark)
engine->unpark(engine);
-
- intel_engine_init_hangcheck(engine);
}
}
@@ -1433,8 +1432,8 @@ void intel_engine_dump(struct intel_engine_cs *engine,
drm_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x [%d ms]\n",
intel_engine_get_seqno(engine),
intel_engine_last_submit(engine),
- engine->hangcheck.seqno,
- jiffies_to_msecs(jiffies - engine->hangcheck.action_timestamp));
+ engine->hangcheck ? lower_32_bits(i915_request_global_seqno(engine->hangcheck)) : 0,
+ engine->hangcheck ? jiffies_to_msecs(jiffies - engine->hangcheck->emitted_jiffies) : -1);
drm_printf(m, "\tReset count: %d (global %d)\n",
i915_reset_engine_count(error, engine),
i915_reset_count(error));
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index 7dc11fcb13de..390f1e4418f3 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -25,215 +25,25 @@
#include "i915_drv.h"
#include "i915_reset.h"
-static bool instdone_unchanged(u32 current_instdone, u32 *old_instdone)
-{
- u32 tmp = current_instdone | *old_instdone;
- bool unchanged;
-
- unchanged = tmp == *old_instdone;
- *old_instdone |= tmp;
-
- return unchanged;
-}
-
-static bool subunits_stuck(struct intel_engine_cs *engine)
-{
- struct drm_i915_private *dev_priv = engine->i915;
- struct intel_instdone instdone;
- struct intel_instdone *accu_instdone = &engine->hangcheck.instdone;
- bool stuck;
- int slice;
- int subslice;
-
- if (engine->id != RCS)
- return true;
-
- intel_engine_get_instdone(engine, &instdone);
-
- /* There might be unstable subunit states even when
- * actual head is not moving. Filter out the unstable ones by
- * accumulating the undone -> done transitions and only
- * consider those as progress.
- */
- stuck = instdone_unchanged(instdone.instdone,
- &accu_instdone->instdone);
- stuck &= instdone_unchanged(instdone.slice_common,
- &accu_instdone->slice_common);
-
- for_each_instdone_slice_subslice(dev_priv, slice, subslice) {
- stuck &= instdone_unchanged(instdone.sampler[slice][subslice],
- &accu_instdone->sampler[slice][subslice]);
- stuck &= instdone_unchanged(instdone.row[slice][subslice],
- &accu_instdone->row[slice][subslice]);
- }
-
- return stuck;
-}
-
-static enum intel_engine_hangcheck_action
-head_stuck(struct intel_engine_cs *engine, u64 acthd)
-{
- if (acthd != engine->hangcheck.acthd) {
-
- /* Clear subunit states on head movement */
- memset(&engine->hangcheck.instdone, 0,
- sizeof(engine->hangcheck.instdone));
-
- return ENGINE_ACTIVE_HEAD;
- }
-
- if (!subunits_stuck(engine))
- return ENGINE_ACTIVE_SUBUNITS;
-
- return ENGINE_DEAD;
-}
-
-static enum intel_engine_hangcheck_action
-engine_stuck(struct intel_engine_cs *engine, u64 acthd)
-{
- struct drm_i915_private *dev_priv = engine->i915;
- enum intel_engine_hangcheck_action ha;
- u32 tmp;
-
- ha = head_stuck(engine, acthd);
- if (ha != ENGINE_DEAD)
- return ha;
-
- if (IS_GEN(dev_priv, 2))
- return ENGINE_DEAD;
-
- /* Is the chip hanging on a WAIT_FOR_EVENT?
- * If so we can simply poke the RB_WAIT bit
- * and break the hang. This should work on
- * all but the second generation chipsets.
- */
- tmp = I915_READ_CTL(engine);
- if (tmp & RING_WAIT) {
- i915_handle_error(dev_priv, BIT(engine->id), 0,
- "stuck wait on %s", engine->name);
- I915_WRITE_CTL(engine, tmp);
- return ENGINE_WAIT_KICK;
- }
-
- return ENGINE_DEAD;
-}
-
-static void hangcheck_load_sample(struct intel_engine_cs *engine,
- struct intel_engine_hangcheck *hc)
-{
- hc->acthd = intel_engine_get_active_head(engine);
- hc->seqno = intel_engine_get_seqno(engine);
-}
-
-static void hangcheck_store_sample(struct intel_engine_cs *engine,
- const struct intel_engine_hangcheck *hc)
-{
- engine->hangcheck.acthd = hc->acthd;
- engine->hangcheck.seqno = hc->seqno;
- engine->hangcheck.action = hc->action;
- engine->hangcheck.stalled = hc->stalled;
- engine->hangcheck.wedged = hc->wedged;
-}
-
-static enum intel_engine_hangcheck_action
-hangcheck_get_action(struct intel_engine_cs *engine,
- const struct intel_engine_hangcheck *hc)
-{
- if (engine->hangcheck.seqno != hc->seqno)
- return ENGINE_ACTIVE_SEQNO;
-
- if (intel_engine_is_idle(engine))
- return ENGINE_IDLE;
-
- return engine_stuck(engine, hc->acthd);
-}
-
-static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
- struct intel_engine_hangcheck *hc)
-{
- unsigned long timeout = I915_ENGINE_DEAD_TIMEOUT;
-
- hc->action = hangcheck_get_action(engine, hc);
-
- /* We always increment the progress
- * if the engine is busy and still processing
- * the same request, so that no single request
- * can run indefinitely (such as a chain of
- * batches). The only time we do not increment
- * the hangcheck score on this ring, if this
- * engine is in a legitimate wait for another
- * engine. In that case the waiting engine is a
- * victim and we want to be sure we catch the
- * right culprit. Then every time we do kick
- * the ring, make it as a progress as the seqno
- * advancement might ensure and if not, it
- * will catch the hanging engine.
- */
-
- switch (hc->action) {
- case ENGINE_IDLE:
- case ENGINE_ACTIVE_SEQNO:
- /* Clear head and subunit states on seqno movement */
- hc->acthd = 0;
-
- memset(&engine->hangcheck.instdone, 0,
- sizeof(engine->hangcheck.instdone));
-
- /* Intentional fall through */
- case ENGINE_WAIT_KICK:
- case ENGINE_WAIT:
- engine->hangcheck.action_timestamp = jiffies;
- break;
-
- case ENGINE_ACTIVE_HEAD:
- case ENGINE_ACTIVE_SUBUNITS:
- /*
- * Seqno stuck with still active engine gets leeway,
- * in hopes that it is just a long shader.
- */
- timeout = I915_SEQNO_DEAD_TIMEOUT;
- break;
-
- case ENGINE_DEAD:
- if (GEM_SHOW_DEBUG()) {
- struct drm_printer p = drm_debug_printer("hangcheck");
- intel_engine_dump(engine, &p, "%s\n", engine->name);
- }
- break;
-
- default:
- MISSING_CASE(hc->action);
- }
-
- hc->stalled = time_after(jiffies,
- engine->hangcheck.action_timestamp + timeout);
- hc->wedged = time_after(jiffies,
- engine->hangcheck.action_timestamp +
- I915_ENGINE_WEDGED_TIMEOUT);
-}
-
-static void hangcheck_declare_hang(struct drm_i915_private *i915,
- unsigned int hung,
- unsigned int stuck)
+static void
+hangcheck_declare_stall(struct drm_i915_private *i915, unsigned int stalled)
{
struct intel_engine_cs *engine;
- char msg[80];
unsigned int tmp;
+ char msg[80];
int len;
- /* If some rings hung but others were still busy, only
+ /*
+ * If some rings stalled but others were still busy, only
* blame the hanging rings in the synopsis.
*/
- if (stuck != hung)
- hung &= ~stuck;
- len = scnprintf(msg, sizeof(msg),
- "%s on ", stuck == hung ? "no progress" : "hang");
- for_each_engine_masked(engine, i915, hung, tmp)
+ len = scnprintf(msg, sizeof(msg), "stall on ");
+ for_each_engine_masked(engine, i915, stalled, tmp)
len += scnprintf(msg + len, sizeof(msg) - len,
"%s, ", engine->name);
msg[len-2] = '\0';
- return i915_handle_error(i915, hung, I915_ERROR_CAPTURE, "%s", msg);
+ return i915_handle_error(i915, stalled, I915_ERROR_CAPTURE, "%s", msg);
}
/*
@@ -246,64 +56,122 @@ static void hangcheck_declare_hang(struct drm_i915_private *i915,
*/
static void i915_hangcheck_elapsed(struct work_struct *work)
{
- struct drm_i915_private *dev_priv =
- container_of(work, typeof(*dev_priv),
+ struct drm_i915_private *i915 =
+ container_of(work, typeof(*i915),
gpu_error.hangcheck_work.work);
struct intel_engine_cs *engine;
enum intel_engine_id id;
- unsigned int hung = 0, stuck = 0, wedged = 0;
+ unsigned int inject = 0, stalled = 0, wedged = 0;
+ struct i915_wedge_me w;
if (!i915_modparams.enable_hangcheck)
return;
- if (!READ_ONCE(dev_priv->gt.awake))
+ if (!READ_ONCE(i915->gt.awake))
return;
- if (i915_terminally_wedged(&dev_priv->gpu_error))
- return;
+ for_each_engine(engine, i915, id) {
+ struct i915_request *rq;
- /* As enabling the GPU requires fairly extensive mmio access,
- * periodically arm the mmio checker to see if we are triggering
- * any invalid access.
- */
- intel_uncore_arm_unclaimed_mmio_detection(dev_priv);
+ /* Did our heartbeat stop? */
+ rq = engine->hangcheck;
+ if (rq) {
+ if (i915_request_completed(rq)) {
+ engine->hangcheck = NULL;
+ continue;
+ }
- for_each_engine(engine, dev_priv, id) {
- struct intel_engine_hangcheck hc;
+ if (time_after(jiffies,
+ rq->emitted_jiffies + I915_ENGINE_STALLED_TIMEOUT))
+ stalled |= intel_engine_flag(engine);
- hangcheck_load_sample(engine, &hc);
- hangcheck_accumulate_sample(engine, &hc);
- hangcheck_store_sample(engine, &hc);
+ if (time_after(jiffies,
+ rq->emitted_jiffies + I915_ENGINE_WEDGED_TIMEOUT))
+ wedged |= intel_engine_flag(engine);
- if (engine->hangcheck.stalled) {
- hung |= intel_engine_flag(engine);
- if (hc.action != ENGINE_DEAD)
- stuck |= intel_engine_flag(engine);
+ continue;
}
- if (engine->hangcheck.wedged)
- wedged |= intel_engine_flag(engine);
+ /* No heartbeat yet, is the engine active? */
+ spin_lock_irq(&engine->timeline.lock);
+ list_for_each_entry(rq, &engine->timeline.requests, link) {
+ if (i915_request_completed(rq))
+ continue;
+
+ /*
+ * Does the engine appear to be stalling?
+ *
+ * If a ready-to-run request picked at random from the
+ * queue has been stuck for a while, queue our own
+ * request to check the engine's pulse.
+ */
+ if (time_after(jiffies,
+ rq->emitted_jiffies + I915_ENGINE_STALLED_TIMEOUT))
+ inject |= intel_engine_flag(engine);
+
+ break;
+ }
+ spin_unlock_irq(&engine->timeline.lock);
}
if (wedged) {
- dev_err(dev_priv->drm.dev,
+ dev_err(i915->drm.dev,
"GPU recovery timed out,"
" cancelling all in-flight rendering.\n");
GEM_TRACE_DUMP();
- i915_gem_set_wedged(dev_priv);
+ i915_gem_set_wedged(i915);
}
- if (hung)
- hangcheck_declare_hang(dev_priv, hung, stuck);
+ if (i915_terminally_wedged(&i915->gpu_error))
+ return;
- /* Reset timer in case GPU hangs without another request being added */
- i915_queue_hangcheck(dev_priv);
-}
+ if (stalled) {
+ hangcheck_declare_stall(i915, stalled);
+ goto restart;
+ }
-void intel_engine_init_hangcheck(struct intel_engine_cs *engine)
-{
- memset(&engine->hangcheck, 0, sizeof(engine->hangcheck));
- engine->hangcheck.action_timestamp = jiffies;
+ if (!inject)
+ goto restart;
+
+ /*
+ * As enabling the GPU requires fairly extensive mmio access,
+ * periodically arm the mmio checker to see if we are triggering
+ * any invalid access.
+ */
+ intel_uncore_arm_unclaimed_mmio_detection(i915);
+
+ i915_wedge_on_timeout(&w, i915, 5 * HZ) {
+ mutex_lock(&i915->drm.struct_mutex);
+ for_each_engine(engine, i915, id) {
+ struct i915_sched_attr attr = {
+ .priority = INT_MAX
+ };
+ struct i915_request *rq;
+
+ if (!(inject & intel_engine_flag(engine)))
+ continue;
+
+ GEM_BUG_ON(engine->hangcheck);
+
+ rq = i915_request_alloc(engine, i915->kernel_context);
+ if (IS_ERR(rq))
+ continue;
+
+ engine->hangcheck = i915_request_get(rq);
+ i915_request_add(rq);
+
+ /* Priority boost for preemption past blocking CL kernels */
+ rcu_read_lock();
+ if (engine->schedule)
+ engine->schedule(engine->hangcheck, &attr);
+ rcu_read_unlock();
+ }
+ mutex_unlock(&i915->drm.struct_mutex);
+ }
+
+restart:
+ /* Reset timer in case GPU hangs without another request being added */
+ i915_queue_hangcheck(i915);
}
void intel_hangcheck_init(struct drm_i915_private *i915)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index be650704bab0..cd18f6fff265 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -57,39 +57,6 @@ struct intel_hw_status_page {
/* seqno size is actually only a uint32, but since we plan to use MI_FLUSH_DW to
* do the writes, and that must have qw aligned offsets, simply pretend it's 8b.
*/
-enum intel_engine_hangcheck_action {
- ENGINE_IDLE = 0,
- ENGINE_WAIT,
- ENGINE_ACTIVE_SEQNO,
- ENGINE_ACTIVE_HEAD,
- ENGINE_ACTIVE_SUBUNITS,
- ENGINE_WAIT_KICK,
- ENGINE_DEAD,
-};
-
-static inline const char *
-hangcheck_action_to_str(const enum intel_engine_hangcheck_action a)
-{
- switch (a) {
- case ENGINE_IDLE:
- return "idle";
- case ENGINE_WAIT:
- return "wait";
- case ENGINE_ACTIVE_SEQNO:
- return "active seqno";
- case ENGINE_ACTIVE_HEAD:
- return "active head";
- case ENGINE_ACTIVE_SUBUNITS:
- return "active subunits";
- case ENGINE_WAIT_KICK:
- return "wait kick";
- case ENGINE_DEAD:
- return "dead";
- }
-
- return "unknown";
-}
-
#define I915_MAX_SLICES 3
#define I915_MAX_SUBSLICES 8
@@ -117,17 +84,6 @@ struct intel_instdone {
u32 row[I915_MAX_SLICES][I915_MAX_SUBSLICES];
};
-struct intel_engine_hangcheck {
- u64 acthd;
- u32 seqno;
- enum intel_engine_hangcheck_action action;
- unsigned long action_timestamp;
- int deadlock;
- struct intel_instdone instdone;
- bool stalled:1;
- bool wedged:1;
-};
-
struct intel_ring {
struct i915_vma *vma;
void *vaddr;
@@ -514,7 +470,7 @@ struct intel_engine_cs {
/* status_notifier: list of callbacks for context-switch changes */
struct atomic_notifier_head context_status_notifier;
- struct intel_engine_hangcheck hangcheck;
+ struct i915_request *hangcheck;
#define I915_ENGINE_NEEDS_CMD_PARSER BIT(0)
#define I915_ENGINE_SUPPORTS_STATS BIT(1)
--
2.20.1
More information about the Intel-gfx-trybot
mailing list