[Intel-gfx] [FIXES 3/3] drm/i915/execlists: Move reset_active() from schedule-out to schedule-in
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Mon Nov 11 16:31:09 UTC 2019
On 11/11/2019 13:32, Chris Wilson wrote:
> The gem_ctx_persistence/smoketest was detecting an odd coherency issue
> inside the LRC context image; that the address of the ring buffer did
> not match our associated struct intel_ring. As we set the address into
> the context image when we pin the ring buffer into place before the
> context is active, that leaves the question of where did it get
> overwritten. Either the HW context save occurred after our pin which
> would imply that our idle barriers are broken, or we overwrote the
> context image ourselves. It is only in reset_active() where we dabble
> inside the context image outside of a serialised path from schedule-out;
> but we could equally perform the operation inside schedule-in which is
> then fully serialised with the context pin -- and remains serialised by
> the engine pulse with kill_context(). (The only downside, aside from
> doing more work inside the engine->active.lock, was the plan to merge
> all the reset paths into doing their context scrubbing on schedule-out
> needs more thought.)
So process_csb is not under the engine lock, and hence schedule_out can
race with schedule_in, meaning schedule_out should refrain from doing
non-trivial work.
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
Regards,
Tvrtko
>
> Fixes: d12acee84ffb ("drm/i915/execlists: Cancel banned contexts on schedule-out")
> Testcase: igt/gem_ctx_persistence/smoketest
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at linux.intel.com>
> ---
> drivers/gpu/drm/i915/gt/intel_lrc.c | 114 ++++++++++++++--------------
> 1 file changed, 57 insertions(+), 57 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index e57345795c08..4b6d9e6b1bfd 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -1042,6 +1042,59 @@ execlists_check_context(const struct intel_context *ce,
> WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
> }
>
> +static void restore_default_state(struct intel_context *ce,
> + struct intel_engine_cs *engine)
> +{
> + u32 *regs = ce->lrc_reg_state;
> +
> + if (engine->pinned_default_state)
> + memcpy(regs, /* skip restoring the vanilla PPHWSP */
> + engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
> + engine->context_size - PAGE_SIZE);
> +
> + execlists_init_reg_state(regs, ce, engine, ce->ring, false);
> +}
> +
> +static void reset_active(struct i915_request *rq,
> + struct intel_engine_cs *engine)
> +{
> + struct intel_context * const ce = rq->hw_context;
> + u32 head;
> +
> + /*
> + * The executing context has been cancelled. We want to prevent
> + * further execution along this context and propagate the error on
> + * to anything depending on its results.
> + *
> + * In __i915_request_submit(), we apply the -EIO and remove the
> + * requests' payloads for any banned requests. But first, we must
> + * rewind the context back to the start of the incomplete request so
> + * that we do not jump back into the middle of the batch.
> + *
> + * We preserve the breadcrumbs and semaphores of the incomplete
> + * requests so that inter-timeline dependencies (i.e other timelines)
> + * remain correctly ordered. And we defer to __i915_request_submit()
> + * so that all asynchronous waits are correctly handled.
> + */
> + GEM_TRACE("%s(%s): { rq=%llx:%lld }\n",
> + __func__, engine->name, rq->fence.context, rq->fence.seqno);
> +
> + /* On resubmission of the active request, payload will be scrubbed */
> + if (i915_request_completed(rq))
> + head = rq->tail;
> + else
> + head = active_request(ce->timeline, rq)->head;
> + ce->ring->head = intel_ring_wrap(ce->ring, head);
> + intel_ring_update_space(ce->ring);
> +
> + /* Scrub the context image to prevent replaying the previous batch */
> + restore_default_state(ce, engine);
> + __execlists_update_reg_state(ce, engine);
> +
> + /* We've switched away, so this should be a no-op, but intent matters */
> + ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
> +}
> +
> static inline struct intel_engine_cs *
> __execlists_schedule_in(struct i915_request *rq)
> {
> @@ -1050,8 +1103,11 @@ __execlists_schedule_in(struct i915_request *rq)
>
> intel_context_get(ce);
>
> + if (unlikely(i915_gem_context_is_banned(ce->gem_context)))
> + reset_active(rq, engine);
> +
> if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
> - execlists_check_context(ce, rq->engine);
> + execlists_check_context(ce, engine);
>
> if (ce->tag) {
> /* Use a fixed tag for OA and friends */
> @@ -1102,59 +1158,6 @@ static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
> tasklet_schedule(&ve->base.execlists.tasklet);
> }
>
> -static void restore_default_state(struct intel_context *ce,
> - struct intel_engine_cs *engine)
> -{
> - u32 *regs = ce->lrc_reg_state;
> -
> - if (engine->pinned_default_state)
> - memcpy(regs, /* skip restoring the vanilla PPHWSP */
> - engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
> - engine->context_size - PAGE_SIZE);
> -
> - execlists_init_reg_state(regs, ce, engine, ce->ring, false);
> -}
> -
> -static void reset_active(struct i915_request *rq,
> - struct intel_engine_cs *engine)
> -{
> - struct intel_context * const ce = rq->hw_context;
> - u32 head;
> -
> - /*
> - * The executing context has been cancelled. We want to prevent
> - * further execution along this context and propagate the error on
> - * to anything depending on its results.
> - *
> - * In __i915_request_submit(), we apply the -EIO and remove the
> - * requests' payloads for any banned requests. But first, we must
> - * rewind the context back to the start of the incomplete request so
> - * that we do not jump back into the middle of the batch.
> - *
> - * We preserve the breadcrumbs and semaphores of the incomplete
> - * requests so that inter-timeline dependencies (i.e other timelines)
> - * remain correctly ordered. And we defer to __i915_request_submit()
> - * so that all asynchronous waits are correctly handled.
> - */
> - GEM_TRACE("%s(%s): { rq=%llx:%lld }\n",
> - __func__, engine->name, rq->fence.context, rq->fence.seqno);
> -
> - /* On resubmission of the active request, payload will be scrubbed */
> - if (i915_request_completed(rq))
> - head = rq->tail;
> - else
> - head = active_request(ce->timeline, rq)->head;
> - ce->ring->head = intel_ring_wrap(ce->ring, head);
> - intel_ring_update_space(ce->ring);
> -
> - /* Scrub the context image to prevent replaying the previous batch */
> - restore_default_state(ce, engine);
> - __execlists_update_reg_state(ce, engine);
> -
> - /* We've switched away, so this should be a no-op, but intent matters */
> - ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
> -}
> -
> static inline void
> __execlists_schedule_out(struct i915_request *rq,
> struct intel_engine_cs * const engine)
> @@ -1165,9 +1168,6 @@ __execlists_schedule_out(struct i915_request *rq,
> execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
> intel_gt_pm_put(engine->gt);
>
> - if (unlikely(i915_gem_context_is_banned(ce->gem_context)))
> - reset_active(rq, engine);
> -
> /*
> * If this is part of a virtual engine, its next request may
> * have been blocked waiting for access to the active context.
>
More information about the Intel-gfx
mailing list