[Intel-gfx] [PATCH v3] drm/i915/gt: Be defensive in the face of false CS events
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Fri Jul 10 13:43:24 UTC 2020
On 10/07/2020 14:31, Chris Wilson wrote:
> If the HW throws a curve ball and reports either en event before it is
> possible, or just a completely impossible event, we have to grin and
> bear it. The first few events, we will likely not notice as we would be
> expecting some event, but as soon as we stop expecting an event and yet
> they still keep coming, then we enter into undefined state territory.
> In which case, bail out, stop processing the events, and reset the
> engine and our set of queued requests to recover.
>
> The sporadic hangs and warnings will continue to plague CI, but at least
> system stability should not be compromised.
>
> v2: Commentary and force the reset-on-error.
> v3: Customised user facing message for forced resets from internal errors.
>
> Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/2045
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com> #v1
> ---
> drivers/gpu/drm/i915/gt/intel_engine_types.h | 4 ++
> drivers/gpu/drm/i915/gt/intel_gt_irq.c | 3 +-
> drivers/gpu/drm/i915/gt/intel_lrc.c | 45 +++++++++++++++++---
> 3 files changed, 45 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> index 490af81bd6f3..8de92fd7d392 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> @@ -177,8 +177,12 @@ struct intel_engine_execlists {
> * the first error interrupt, record the EIR and schedule the tasklet.
> * In the tasklet, we process the pending CS events to ensure we have
> * the guilty request, and then reset the engine.
> + *
> + * Low 16b are used by HW, with the upper 16b used as the enabling mask.
> + * Reserve the upper 16b for tracking internal errors.
> */
> u32 error_interrupt;
> +#define ERROR_CSB BIT(31)
>
> /**
> * @reset_ccid: Active CCID [EXECLISTS_STATUS_HI] at the time of reset
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_irq.c b/drivers/gpu/drm/i915/gt/intel_gt_irq.c
> index e1964cf40fd6..b05da68e52f4 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_irq.c
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_irq.c
> @@ -27,7 +27,8 @@ cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
> if (unlikely(iir & GT_CS_MASTER_ERROR_INTERRUPT)) {
> u32 eir;
>
> - eir = ENGINE_READ(engine, RING_EIR);
> + /* Upper 16b are the enabling mask, rsvd for internal errors */
> + eir = ENGINE_READ(engine, RING_EIR) & GENMASK(15, 0);
> ENGINE_TRACE(engine, "CS error: %x\n", eir);
>
> /* Disable the error interrupt until after the reset */
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index fbcfeaed6441..cd4262cc96e2 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -2568,6 +2568,25 @@ static void process_csb(struct intel_engine_cs *engine)
> if (unlikely(head == tail))
> return;
>
> + /*
> + * We will consume all events from HW, or at least pretend to.
> + *
> + * The sequence of events from the HW is deterministic, and derived
> + * from our writes to the ELSP, with a smidgen of variability for
> + * the arrival of the asynchronous requests wrt to the inflight
> + * execution. If the HW sends an event that does not correspond with
> + * the one we are expecting, we have to abandon all hope as we lose
> + * all tracking of what the engine is actually executing. We will
> + * only detect we are out of sequence with the HW when we get an
> + * 'impossible' event because we have already drained our own
> + * preemption/promotion queue. If this occurs, we know that we likely
> + * lost track of execution earlier and must unwind and restart, the
> + * simplest way is by stop processing the event queue and force the
> + * engine to reset.
> + */
> + execlists->csb_head = tail;
> + ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
> +
> /*
> * Hopefully paired with a wmb() in HW!
> *
> @@ -2577,8 +2596,6 @@ static void process_csb(struct intel_engine_cs *engine)
> * we perform the READ_ONCE(*csb_write).
> */
> rmb();
> -
> - ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
> do {
> bool promote;
>
> @@ -2613,6 +2630,11 @@ static void process_csb(struct intel_engine_cs *engine)
> if (promote) {
> struct i915_request * const *old = execlists->active;
>
> + if (GEM_WARN_ON(!*execlists->pending)) {
> + execlists->error_interrupt |= ERROR_CSB;
> + break;
> + }
> +
> ring_set_paused(engine, 0);
>
> /* Point active to the new ELSP; prevent overwriting */
> @@ -2635,7 +2657,10 @@ static void process_csb(struct intel_engine_cs *engine)
>
> WRITE_ONCE(execlists->pending[0], NULL);
> } else {
> - GEM_BUG_ON(!*execlists->active);
> + if (GEM_WARN_ON(!*execlists->active)) {
> + execlists->error_interrupt |= ERROR_CSB;
> + break;
> + }
>
> /* port0 completed, advanced to port1 */
> trace_ports(execlists, "completed", execlists->active);
> @@ -2686,7 +2711,6 @@ static void process_csb(struct intel_engine_cs *engine)
> }
> } while (head != tail);
>
> - execlists->csb_head = head;
> set_timeslice(engine);
>
> /*
> @@ -3117,9 +3141,18 @@ static void execlists_submission_tasklet(unsigned long data)
> process_csb(engine);
>
> if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
> + const char *msg;
> +
> + /* Generate the error message in priority wrt to the user! */
> + if (engine->execlists.error_interrupt & GENMASK(15, 0))
> + msg = "CS error"; /* thrown by a user payload */
> + else if (engine->execlists.error_interrupt & ERROR_CSB)
> + msg = "invalid CSB event";
> + else
> + msg = "internal error";
> +
> engine->execlists.error_interrupt = 0;
> - if (ENGINE_READ(engine, RING_ESR)) /* confirm the error */
> - execlists_reset(engine, "CS error");
> + execlists_reset(engine, msg);
> }
>
> if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
Regards,
Tvrtko
More information about the Intel-gfx
mailing list