[Intel-gfx] [PATCH] drm/i915/execlists: Always reset the context's RING registers

Chris Wilson chris at chris-wilson.co.uk
Mon Apr 8 15:38:59 UTC 2019


Quoting Chris Wilson (2019-04-08 14:40:25)
> During reset, we try and stop the active ring. This has the consequence
> that we often clobber the RING registers within the context image. When
> we find an active request, we update the context image to rerun that
> request (if it was guilty, we replace the hanging user payload with
> NOPs). However, we were ignoring an active context if the request had
> completed, with the consequence that the next submission on that request
> would start with RING_HEAD==0 and not the tail of the previous request,
> causing all requests still in the ring to be rerun. Rare, but
> occasionally seen within CI where we would spot that the context seqno
> would reverse and complain that we were retiring an incomplete request.
> 
>     <0> [412.390350]   <idle>-0       3d.s2 408373352us : __i915_request_submit: rcs0 fence 1e95b:3640 -> current 3638
>     <0> [412.390350]   <idle>-0       3d.s2 408373353us : __i915_request_submit: rcs0 fence 1e95b:3642 -> current 3638
>     <0> [412.390350]   <idle>-0       3d.s2 408373354us : __i915_request_submit: rcs0 fence 1e95b:3644 -> current 3638
>     <0> [412.390350]   <idle>-0       3d.s2 408373354us : __i915_request_submit: rcs0 fence 1e95b:3646 -> current 3638
>     <0> [412.390350]   <idle>-0       3d.s2 408373356us : __execlists_submission_tasklet: rcs0 in[0]:  ctx=2.1, fence 1e95b:3646 (current 3638), prio=4
>     <0> [412.390350] i915_sel-4613    0.... 408373374us : __i915_request_commit: rcs0 fence 1e95b:3648
>     <0> [412.390350] i915_sel-4613    0d..1 408373377us : process_csb: rcs0 cs-irq head=2, tail=3
>     <0> [412.390350] i915_sel-4613    0d..1 408373377us : process_csb: rcs0 csb[3]: status=0x00000001:0x00000000, active=0x1
>     <0> [412.390350] i915_sel-4613    0d..1 408373378us : __i915_request_submit: rcs0 fence 1e95b:3648 -> current 3638
>     <0> [412.390350]   <idle>-0       3..s1 408373378us : execlists_submission_tasklet: rcs0 awake?=1, active=5
>     <0> [412.390350] i915_sel-4613    0d..1 408373379us : __execlists_submission_tasklet: rcs0 in[0]:  ctx=2.2, fence 1e95b:3648 (current 3638), prio=4
>     <0> [412.390350] i915_sel-4613    0.... 408373381us : i915_reset_engine: rcs0 flags=4
>     <0> [412.390350] i915_sel-4613    0.... 408373382us : execlists_reset_prepare: rcs0: depth<-0
>     <0> [412.390350]   <idle>-0       3d.s2 408373390us : process_csb: rcs0 cs-irq head=3, tail=4
>     <0> [412.390350]   <idle>-0       3d.s2 408373390us : process_csb: rcs0 csb[4]: status=0x00008002:0x00000002, active=0x1
>     <0> [412.390350]   <idle>-0       3d.s2 408373390us : process_csb: rcs0 out[0]: ctx=2.2, fence 1e95b:3648 (current 3640), prio=4
>     <0> [412.390350] i915_sel-4613    0.... 408373401us : intel_engine_stop_cs: rcs0
>     <0> [412.390350] i915_sel-4613    0d..1 408373402us : process_csb: rcs0 cs-irq head=4, tail=4
>     <0> [412.390350] i915_sel-4613    0.... 408373403us : intel_gpu_reset: engine_mask=1
>     <0> [412.390350] i915_sel-4613    0d..1 408373408us : execlists_cancel_port_requests: rcs0:port0 fence 1e95b:3648, (current 3648)
>     <0> [412.390350] i915_sel-4613    0.... 408373442us : intel_engine_cancel_stop_cs: rcs0
>     <0> [412.390350] i915_sel-4613    0.... 408373442us : execlists_reset_finish: rcs0: depth->0
>     <0> [412.390350] ksoftirq-26      3..s. 408373442us : execlists_submission_tasklet: rcs0 awake?=1, active=0
>     <0> [412.390350] ksoftirq-26      3d.s1 408373443us : process_csb: rcs0 cs-irq head=5, tail=5
>     <0> [412.390350] i915_sel-4613    0.... 408373475us : i915_request_retire: rcs0 fence 1e95b:3640, current 3648
>     <0> [412.390350] i915_sel-4613    0.... 408373476us : i915_request_retire: __retire_engine_request(rcs0) fence 1e95b:3640, current 3648
>     <0> [412.390350] i915_sel-4613    0.... 408373494us : __i915_request_commit: rcs0 fence 1e95b:3650
>     <0> [412.390350] i915_sel-4613    0d..1 408373496us : process_csb: rcs0 cs-irq head=5, tail=5
>     <0> [412.390350] i915_sel-4613    0d..1 408373496us : __i915_request_submit: rcs0 fence 1e95b:3650 -> current 3648
>     <0> [412.390350] i915_sel-4613    0d..1 408373498us : __execlists_submission_tasklet: rcs0 in[0]:  ctx=2.1, fence 1e95b:3650 (current 3648), prio=6
>     <0> [412.390350] i915_sel-4613    0.... 408373500us : i915_request_retire_upto: rcs0 fence 1e95b:3648, current 3648
>     <0> [412.390350] i915_sel-4613    0.... 408373500us : i915_request_retire: rcs0 fence 1e95b:3642, current 3648
>     <0> [412.390350] i915_sel-4613    0.... 408373501us : i915_request_retire: __retire_engine_request(rcs0) fence 1e95b:3642, current 3648
>     <0> [412.390350] i915_sel-4613    0.... 408373514us : i915_request_retire: rcs0 fence 1e95b:3644, current 3648
>     <0> [412.390350] i915_sel-4613    0.... 408373515us : i915_request_retire: __retire_engine_request(rcs0) fence 1e95b:3644, current 3648
>     <0> [412.390350] i915_sel-4613    0.... 408373527us : i915_request_retire: rcs0 fence 1e95b:3646, current 3640
>     <0> [412.390350]   <idle>-0       3..s1 408373569us : execlists_submission_tasklet: rcs0 awake?=1, active=1
>     <0> [412.390350]   <idle>-0       3d.s2 408373569us : process_csb: rcs0 cs-irq head=5, tail=1
>     <0> [412.390350]   <idle>-0       3d.s2 408373570us : process_csb: rcs0 csb[0]: status=0x00000001:0x00000000, active=0x1
>     <0> [412.390350]   <idle>-0       3d.s2 408373570us : process_csb: rcs0 csb[1]: status=0x00000018:0x00000002, active=0x5
>     <0> [412.390350]   <idle>-0       3d.s2 408373570us : process_csb: rcs0 out[0]: ctx=2.1, fence 1e95b:3650 (current 3650), prio=6
>     <0> [412.390350]   <idle>-0       3d.s2 408373571us : process_csb: rcs0 completed ctx=2
>     <0> [412.390350] i915_sel-4613    0.... 408373621us : i915_request_retire: i915_request_retire:253 GEM_BUG_ON(!i915_request_completed(request))
> 
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala at linux.intel.com>
> ---
>  drivers/gpu/drm/i915/intel_lrc.c | 46 +++++++++++++++++++++-----------
>  1 file changed, 31 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 6931dbb2888c..d195b7a1dccf 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -1904,7 +1904,6 @@ static void execlists_reset_prepare(struct intel_engine_cs *engine)
>  
>         /* And flush any current direct submission. */
>         spin_lock_irqsave(&engine->timeline.lock, flags);
> -       process_csb(engine); /* drain preemption events */
>         spin_unlock_irqrestore(&engine->timeline.lock, flags);
>  }
>  
> @@ -1928,12 +1927,29 @@ static bool lrc_regs_ok(const struct i915_request *rq)
>  static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
>  {
>         struct intel_engine_execlists * const execlists = &engine->execlists;
> -       struct i915_request *rq;
> +       struct i915_request *rq = NULL;
> +       struct intel_context *ce;
>         unsigned long flags;
>         u32 *regs;
>  
>         spin_lock_irqsave(&engine->timeline.lock, flags);
>  
> +       GEM_TRACE("%s\n", engine->name);
> +       process_csb(engine); /* drain preemption events */
> +
> +       /* Following the reset, we need to reload the CSB read/write pointers */
> +       reset_csb_pointers(&engine->execlists);
> +
> +       /*
> +        * Save the currently executing context, even if we completed
> +        * its request, it was still running at the time of the
> +        * reset and will have been clobbered.
> +        */
> +       if (!port_isset(execlists->port))
> +               goto out_unlock;
> +
> +       ce = port_request(execlists->port)->hw_context;
> +
>         /*
>          * Catch up with any missed context-switch interrupts.
>          *
> @@ -1947,12 +1963,10 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
>  
>         /* Push back any incomplete requests for replay after the reset. */
>         rq = __unwind_incomplete_requests(engine);
> -
> -       /* Following the reset, we need to reload the CSB read/write pointers */
> -       reset_csb_pointers(&engine->execlists);
> -
>         if (!rq)
> -               goto out_unlock;
> +               goto out_replay;
> +
> +       GEM_BUG_ON(rq->hw_context != ce);

It took a few hours, but I eventually hit this. Which can be if we reset
just before a context switch -- I think ce is the accurate one here, and
we should disregard rq.
-Chris


More information about the Intel-gfx mailing list