[PATCH 07/47] drm/i915: Don't claim an unstarted request was guilty

Fri Feb 8 20:52:19 UTC 2019

If we haven't even begun executing the payload of the stalled request,
then we should not claim that its userspace context was guilty of
submitting a hanging batch.

v2: Check for context corruption before trying to restart.
v3: Preserve semaphores on skipping requests (need to keep the timelines
intact).

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala at intel.com>
---
 drivers/gpu/drm/i915/intel_lrc.c              | 42 +++++++++++++++++--
 drivers/gpu/drm/i915/selftests/igt_spinner.c  |  7 ++++
 .../gpu/drm/i915/selftests/intel_hangcheck.c  |  6 +++
 3 files changed, 52 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 5e98fd79bd9d..1b567a3f006a 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1387,6 +1387,10 @@ static int gen8_emit_init_breadcrumb(struct i915_request *rq)
 	*cs++ = rq->fence.seqno - 1;
 
 	intel_ring_advance(rq, cs);
+
+	/* Record the updated position of the request's payload */
+	rq->infix = intel_ring_offset(rq, cs);
+
 	return 0;
 }
 
@@ -1878,6 +1882,23 @@ static void execlists_reset_prepare(struct intel_engine_cs *engine)
 	spin_unlock_irqrestore(&engine->timeline.lock, flags);
 }
 
+static bool lrc_regs_ok(const struct i915_request *rq)
+{
+	const struct intel_ring *ring = rq->ring;
+	const u32 *regs = rq->hw_context->lrc_reg_state;
+
+	/* Quick spot check for the common signs of context corruption */
+
+	if (regs[CTX_RING_BUFFER_CONTROL + 1] !=
+	    (RING_CTL_SIZE(ring->size) | RING_VALID))
+		return false;
+
+	if (regs[CTX_RING_BUFFER_START + 1] != i915_ggtt_offset(ring->vma))
+		return false;
+
+	return true;
+}
+
 static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
@@ -1912,6 +1933,21 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
 	if (!rq)
 		goto out_unlock;
 
+	/*
+	 * If this request hasn't started yet, e.g. it is waiting on a
+	 * semaphore, we need to avoid skipping the request or else we
+	 * break the signaling chain. However, if the context is corrupt
+	 * the request will not restart and we will be stuck with a wedged
+	 * device. It is quite often the case that if we issue a reset
+	 * while the GPU is loading the context image, that the context
+	 * image becomes corrupt.
+	 *
+	 * Otherwise, if we have not started yet, the request should replay
+	 * perfectly and we do not need to flag the result as being erroneous.
+	 */
+	if (!i915_request_started(rq) && lrc_regs_ok(rq))
+		goto out_unlock;
+
 	/*
 	 * If the request was innocent, we leave the request in the ELSP
 	 * and will try to replay it on restarting. The context image may
@@ -1924,7 +1960,7 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
 	 * image back to the expected values to skip over the guilty request.
 	 */
 	i915_reset_request(rq, stalled);
-	if (!stalled)
+	if (!stalled && lrc_regs_ok(rq))
 		goto out_unlock;
 
 	/*
@@ -1942,8 +1978,8 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
 		       engine->context_size - PAGE_SIZE);
 	}
 
-	/* Move the RING_HEAD onto the breadcrumb, past the hanging batch */
-	rq->ring->head = intel_ring_wrap(rq->ring, rq->postfix);
+	/* Rerun the request; its payload has been neutered (if guilty). */
+	rq->ring->head = intel_ring_wrap(rq->ring, rq->head);
 	intel_ring_update_space(rq->ring);
 
 	execlists_init_reg_state(regs, rq->gem_context, engine, rq->ring);
diff --git a/drivers/gpu/drm/i915/selftests/igt_spinner.c b/drivers/gpu/drm/i915/selftests/igt_spinner.c
index 9ebd9225684e..d0b93a3fbc54 100644
--- a/drivers/gpu/drm/i915/selftests/igt_spinner.c
+++ b/drivers/gpu/drm/i915/selftests/igt_spinner.c
@@ -144,6 +144,13 @@ igt_spinner_create_request(struct igt_spinner *spin,
 
 	i915_gem_chipset_flush(spin->i915);
 
+	if (engine->emit_init_breadcrumb &&
+	    rq->timeline->has_initial_breadcrumb) {
+		err = engine->emit_init_breadcrumb(rq);
+		if (err)
+			goto cancel_rq;
+	}
+
 	err = engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, 0);
 
 cancel_rq:
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 4886fac12628..92475596ff40 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -242,6 +242,12 @@ hang_create_request(struct hang *h, struct intel_engine_cs *engine)
 	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
 	i915_gem_chipset_flush(h->i915);
 
+	if (rq->engine->emit_init_breadcrumb) {
+		err = rq->engine->emit_init_breadcrumb(rq);
+		if (err)
+			goto cancel_rq;
+	}
+
 	flags = 0;
 	if (INTEL_GEN(vm->i915) <= 5)
 		flags |= I915_DISPATCH_SECURE;
-- 
2.20.1