[Intel-gfx] [PATCH v2] drm/i915/execlists: Take a reference while capturing the guilty request
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Wed Jan 22 13:28:29 UTC 2020
On 22/01/2020 11:34, Chris Wilson wrote:
> Thanks to preempt-to-busy, we leave the request on the HW as we submit
> the preemption request. This means that the request may complete at any
> moment as we process HW events, and in particular the request may be
> retired as we are planning to capture it for a preemption timeout.
>
> Be more careful while obtaining the request to capture after a
> preemption timeout, and check to see if it completed before we were able
> to put it on the on-hold list. If we do see it did complete just before
> we capture the request, proclaim the preemption-timeout a false positive
> and pardon the reset as we should hit an arbitration point momentarily
> and so be able to process the preemption.
>
> Note that even after we move the request to be on hold it may be retired
> (as the reset to stop the HW comes after), so we do require to hold our
> own reference as we work on the request for capture (and all of the
> peeking at state within the request needs to be carefully protected).
>
> Fixes: 32ff621fd744 ("drm/i915/gt: Allow temporary suspension of inflight requests")
> Closes: https://gitlab.freedesktop.org/drm/intel/issues/997
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
> drivers/gpu/drm/i915/gt/intel_lrc.c | 39 +++++++++++++++++++++++------
> 1 file changed, 31 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index 3a30767ff0c4..c4826d49571f 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -2393,11 +2393,16 @@ static void __execlists_hold(struct i915_request *rq)
> } while (rq);
> }
>
> -static void execlists_hold(struct intel_engine_cs *engine,
> +static bool execlists_hold(struct intel_engine_cs *engine,
> struct i915_request *rq)
> {
> spin_lock_irq(&engine->active.lock);
>
> + if (!i915_request_completed(rq)) { /* too late! */
> + rq = NULL;
> + goto unlock;
> + }
> +
> /*
> * Transfer this request onto the hold queue to prevent it
> * being resumbitted to HW (and potentially completed) before we have
> @@ -2408,7 +2413,9 @@ static void execlists_hold(struct intel_engine_cs *engine,
> GEM_BUG_ON(rq->engine != engine);
> __execlists_hold(rq);
>
> +unlock:
> spin_unlock_irq(&engine->active.lock);
> + return rq;
> }
>
> static bool hold_request(const struct i915_request *rq)
> @@ -2524,6 +2531,7 @@ static void execlists_capture_work(struct work_struct *work)
>
> /* Return this request and all that depend upon it for signaling */
> execlists_unhold(engine, cap->rq);
> + i915_request_put(cap->rq);
>
> kfree(cap);
> }
> @@ -2560,12 +2568,12 @@ static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
> return NULL;
> }
>
> -static void execlists_capture(struct intel_engine_cs *engine)
> +static bool execlists_capture(struct intel_engine_cs *engine)
> {
> struct execlists_capture *cap;
>
> if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
> - return;
> + return true;
>
> /*
> * We need to _quickly_ capture the engine state before we reset.
> @@ -2574,13 +2582,17 @@ static void execlists_capture(struct intel_engine_cs *engine)
> */
> cap = capture_regs(engine);
> if (!cap)
> - return;
> + return true;
>
> cap->rq = execlists_active(&engine->execlists);
> GEM_BUG_ON(!cap->rq);
>
> + rcu_read_lock();
> cap->rq = active_request(cap->rq->context->timeline, cap->rq);
> - GEM_BUG_ON(!cap->rq);
> + cap->rq = i915_request_get_rcu(cap->rq);
> + rcu_read_unlock();
> + if (!cap->rq)
> + goto err_free;
>
> /*
> * Remove the request from the execlists queue, and take ownership
> @@ -2602,10 +2614,19 @@ static void execlists_capture(struct intel_engine_cs *engine)
> * simply hold that request accountable for being non-preemptible
> * long enough to force the reset.
> */
> - execlists_hold(engine, cap->rq);
> + if (!execlists_hold(engine, cap->rq))
> + goto err_rq;
>
> INIT_WORK(&cap->work, execlists_capture_work);
> schedule_work(&cap->work);
> + return true;
> +
> +err_rq:
> + i915_request_put(cap->rq);
> +err_free:
> + i915_gpu_coredump_put(cap->error);
> + kfree(cap);
> + return false;
> }
>
> static noinline void preempt_reset(struct intel_engine_cs *engine)
> @@ -2627,8 +2648,10 @@ static noinline void preempt_reset(struct intel_engine_cs *engine)
> jiffies_to_msecs(jiffies - engine->execlists.preempt.expires));
>
> ring_set_paused(engine, 1); /* Freeze the current request in place */
> - execlists_capture(engine);
> - intel_engine_reset(engine, "preemption time out");
> + if (execlists_capture(engine))
> + intel_engine_reset(engine, "preemption time out");
> + else
> + ring_set_paused(engine, 0);
>
> tasklet_enable(&engine->execlists.tasklet);
> clear_and_wake_up_bit(bit, lock);
>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
Regards,
Tvrtko
More information about the Intel-gfx
mailing list