[PATCH 8/8] drm/i915: Don't claim an unstarted request was guilty
Chris Wilson
chris at chris-wilson.co.uk
Wed Feb 6 18:01:10 UTC 2019
If we haven't even begun executing the payload of the stalled request,
then we should not claim that its userspace context was guilty of
submitting a hanging batch.
v2: Check for context corruption before trying to restart.
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
---
drivers/gpu/drm/i915/intel_lrc.c | 34 ++++++++++++++++++-
drivers/gpu/drm/i915/selftests/igt_spinner.c | 9 ++++-
.../gpu/drm/i915/selftests/intel_hangcheck.c | 6 ++++
3 files changed, 47 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 5e98fd79bd9d..5d5ce91a5dfa 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1878,6 +1878,23 @@ static void execlists_reset_prepare(struct intel_engine_cs *engine)
spin_unlock_irqrestore(&engine->timeline.lock, flags);
}
+static bool lrc_regs_ok(const struct i915_request *rq)
+{
+ const struct intel_ring *ring = rq->ring;
+ const u32 *regs = rq->hw_context->lrc_reg_state;
+
+ /* Quick spot check for the common signs of context corruption */
+
+ if (regs[CTX_RING_BUFFER_CONTROL + 1] !=
+ (RING_CTL_SIZE(ring->size) | RING_VALID))
+ return false;
+
+ if (regs[CTX_RING_BUFFER_START + 1] != i915_ggtt_offset(ring->vma))
+ return false;
+
+ return true;
+}
+
static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
{
struct intel_engine_execlists * const execlists = &engine->execlists;
@@ -1912,6 +1929,21 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
if (!rq)
goto out_unlock;
+ /*
+ * If this request hasn't started yet, e.g. it is waiting on a
+ * semaphore, we need to avoid skipping the request or else we
+ * break the signaling chain. However, if the context is corrupt
+ * the request will not restart and we will be stuck with a wedged
+ * device. Issuing a reset while the GPU is loading the context
+ * image quite often corrupts that context image, so we must
+ * check its state before relying on a replay.
+ *
+ * Otherwise, if we have not started yet, the request should replay
+ * perfectly and we do not need to flag the result as being erroneous.
+ */
+ if (!i915_request_started(rq) && lrc_regs_ok(rq))
+ goto out_unlock;
+
/*
* If the request was innocent, we leave the request in the ELSP
* and will try to replay it on restarting. The context image may
@@ -1924,7 +1956,7 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
* image back to the expected values to skip over the guilty request.
*/
i915_reset_request(rq, stalled);
- if (!stalled)
+ if (!stalled && lrc_regs_ok(rq))
goto out_unlock;
/*
diff --git a/drivers/gpu/drm/i915/selftests/igt_spinner.c b/drivers/gpu/drm/i915/selftests/igt_spinner.c
index 9ebd9225684e..86354e51bdd3 100644
--- a/drivers/gpu/drm/i915/selftests/igt_spinner.c
+++ b/drivers/gpu/drm/i915/selftests/igt_spinner.c
@@ -142,10 +142,17 @@ igt_spinner_create_request(struct igt_spinner *spin,
*batch++ = upper_32_bits(vma->node.start);
*batch++ = MI_BATCH_BUFFER_END; /* not reached */
- i915_gem_chipset_flush(spin->i915);
+ if (engine->emit_init_breadcrumb &&
+ rq->timeline->has_initial_breadcrumb) {
+ err = engine->emit_init_breadcrumb(rq);
+ if (err)
+ goto cancel_rq;
+ }
err = engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, 0);
+ i915_gem_chipset_flush(spin->i915);
+
cancel_rq:
if (err) {
i915_request_skip(rq, err);
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 4886fac12628..36c17bfe05a7 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -246,6 +246,12 @@ hang_create_request(struct hang *h, struct intel_engine_cs *engine)
if (INTEL_GEN(vm->i915) <= 5)
flags |= I915_DISPATCH_SECURE;
+ if (rq->engine->emit_init_breadcrumb) {
+ err = rq->engine->emit_init_breadcrumb(rq);
+ if (err)
+ goto cancel_rq;
+ }
+
err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);
cancel_rq:
--
2.20.1
More information about the Intel-gfx-trybot
mailing list