[PATCH 12/57] drm/i915: Move context revocation to scheduler
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Fri Feb 5 16:36:16 UTC 2021
On 05/02/2021 01:18, Chris Wilson wrote:
> Centralise the means by which to remove a context from execution to the
> scheduler, allowing the backends to specialise as necessary. Note that
> without backend support, we can simplify the procedure to forcibly reset
> the HW to remove the context.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
> drivers/gpu/drm/i915/gem/i915_gem_context.c | 117 +-----------------
> .../drm/i915/gt/intel_execlists_submission.c | 46 +++++++
> drivers/gpu/drm/i915/i915_scheduler.c | 20 +++
> drivers/gpu/drm/i915/i915_scheduler_types.h | 5 +
> 4 files changed, 74 insertions(+), 114 deletions(-)
>
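The shape of the change, for anyone skimming: a new revoke_context vtable hook
on the scheduler, initialised to a generic fallback that backends such as
execlists override. A minimal standalone sketch of that hook-with-default
pattern, with illustrative names only (not the real i915 types):

/*
 * Standalone sketch of the hook-with-default pattern used below;
 * illustrative names, not the real i915 structures.
 * Compile: cc -o sketch sketch.c
 */
#include <errno.h>
#include <stdio.h>

struct context {
        const char *name;
        int active;
};

struct sched {
        void (*revoke_context)(struct context *ce, const char *whom,
                               int error);
};

/* Generic fallback: no graceful path, so force a reset if needed. */
static void default_revoke(struct context *ce, const char *whom, int error)
{
        if (whom && ce->active)
                printf("forced reset: context revoked from %s (%d)\n",
                       whom, error);
}

/* Backend override: try a graceful preemption pulse first. */
static void execlists_revoke(struct context *ce, const char *whom, int error)
{
        printf("pulse to evict %s (%d)\n", ce->name, error);
}

static void sched_init(struct sched *se)
{
        se->revoke_context = default_revoke; /* default until overridden */
}

int main(void)
{
        struct context ce = { "ctx0", 1 };
        struct sched se;

        sched_init(&se);
        se.revoke_context(&ce, "ctx0", -EIO);

        se.revoke_context = execlists_revoke; /* backend specialises */
        se.revoke_context(&ce, "ctx0", -EIO);
        return 0;
}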
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> index ca37d93ef5e7..be75f861db67 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> @@ -382,104 +382,9 @@ __context_engines_static(const struct i915_gem_context *ctx)
> return rcu_dereference_protected(ctx->engines, true);
> }
>
> -static void __reset_context(struct i915_gem_context *ctx,
> - struct intel_engine_cs *engine)
> -{
> - intel_gt_handle_error(engine->gt, engine->mask, 0,
> - "context closure in %s", ctx->name);
> -}
> -
> -static bool __cancel_engine(struct intel_engine_cs *engine)
> -{
> - /*
> - * Send a "high priority pulse" down the engine to cause the
> - * current request to be momentarily preempted. (If it fails to
> - * be preempted, it will be reset). As we have marked our context
> - * as banned, any incomplete request, including any running, will
> - * be skipped following the preemption.
> - *
> - * If there is no hangchecking (one of the reasons why we try to
> - * cancel the context) and no forced preemption, there may be no
> - * means by which we reset the GPU and evict the persistent hog.
> - * Ergo if we are unable to inject a preemptive pulse that can
> - * kill the banned context, we fall back to doing a local reset
> - * instead.
> - */
> - return intel_engine_pulse(engine) == 0;
> -}
> -
> -static bool
> -__active_engine(struct i915_request *rq, struct intel_engine_cs **active)
> -{
> - struct intel_engine_cs *engine, *locked;
> - bool ret = false;
> -
> - /*
> - * Serialise with __i915_request_submit() so that it sees
> - * the banned status, or we know the request is already inflight.
> - *
> - * Note that rq->engine is unstable, and so we double
> - * check that we have acquired the lock on the final engine.
> - */
> - locked = READ_ONCE(rq->engine);
> - spin_lock_irq(&locked->sched.lock);
> - while (unlikely(locked != (engine = READ_ONCE(rq->engine)))) {
> - spin_unlock(&locked->sched.lock);
> - locked = engine;
> - spin_lock(&locked->sched.lock);
> - }
> -
> - if (i915_request_is_active(rq)) {
> - if (!__i915_request_is_complete(rq))
> - *active = locked;
> - ret = true;
> - }
> -
> - spin_unlock_irq(&locked->sched.lock);
> -
> - return ret;
> -}
> -
> -static struct intel_engine_cs *active_engine(struct intel_context *ce)
> -{
> - struct intel_engine_cs *engine = NULL;
> - struct i915_request *rq;
> -
> - if (intel_context_has_inflight(ce))
> - return intel_context_inflight(ce);
> -
> - if (!ce->timeline)
> - return NULL;
> -
> - /*
> - * rq->link is only SLAB_TYPESAFE_BY_RCU, we need to hold a reference
> - * to the request to prevent it being transferred to a new timeline
> - * (and onto a new timeline->requests list).
> - */
> - rcu_read_lock();
> - list_for_each_entry_reverse(rq, &ce->timeline->requests, link) {
> - bool found;
> -
> - /* timeline is already completed up to this point? */
> - if (!i915_request_get_rcu(rq))
> - break;
> -
> - /* Check with the backend if the request is inflight */
> - found = true;
> - if (likely(rcu_access_pointer(rq->timeline) == ce->timeline))
> - found = __active_engine(rq, &engine);
> -
> - i915_request_put(rq);
> - if (found)
> - break;
> - }
> - rcu_read_unlock();
This moves and removes the above chunk of code. I assume
intel_context_inflight() should now be good enough, but the existing code
obviously does not trust it, and I can't figure out why. Do you remember the
reason?
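For reference while you look: the double-check loop exists because, as the
removed comment says, rq->engine is unstable, so the lock has to be chased
until it is taken on the final engine. A standalone sketch of that
lock-and-recheck idiom (illustrative types, nothing i915-specific):

/*
 * Sketch of the lock-and-recheck idiom from the removed
 * __active_engine(): the request's engine pointer may change while we
 * wait for the lock, so after locking we re-read it and chase until
 * the locked engine matches. Compile: cc -pthread -o chase chase.c
 */
#include <pthread.h>
#include <stdio.h>

struct engine {
        pthread_mutex_t lock;
        int id;
};

struct request {
        struct engine *engine; /* unstable: may be rewritten concurrently */
};

static struct engine *lock_request_engine(struct request *rq)
{
        struct engine *engine, *locked;

        locked = __atomic_load_n(&rq->engine, __ATOMIC_RELAXED);
        pthread_mutex_lock(&locked->lock);
        while (locked != (engine = __atomic_load_n(&rq->engine,
                                                   __ATOMIC_RELAXED))) {
                /* The request moved before we got the lock; chase it. */
                pthread_mutex_unlock(&locked->lock);
                locked = engine;
                pthread_mutex_lock(&locked->lock);
        }
        return locked; /* caller unlocks */
}

int main(void)
{
        struct engine e0 = { PTHREAD_MUTEX_INITIALIZER, 0 };
        struct request rq = { &e0 };
        struct engine *locked = lock_request_engine(&rq);

        printf("locked engine %d\n", locked->id);
        pthread_mutex_unlock(&locked->lock);
        return 0;
}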
Regards,
Tvrtko
> -
> - return engine;
> -}
> -
> static void kill_engines(struct i915_gem_engines *engines, bool ban)
> {
> + const int error = ban ? -EIO : -EAGAIN;
> struct i915_gem_engines_iter it;
> struct intel_context *ce;
>
> @@ -491,28 +396,12 @@ static void kill_engines(struct i915_gem_engines *engines, bool ban)
> * engines on which there are incomplete requests.
> */
> for_each_gem_engine(ce, engines, it) {
> - struct intel_engine_cs *engine;
> + struct i915_sched *se = intel_engine_get_scheduler(ce->engine);
>
> if (ban && intel_context_set_banned(ce))
> continue;
>
> - /*
> - * Check the current active state of this context; if we
> - * are currently executing on the GPU we need to evict
> - * ourselves. On the other hand, if we haven't yet been
> - * submitted to the GPU or if everything is complete,
> - * we have nothing to do.
> - */
> - engine = active_engine(ce);
> -
> - /* First attempt to gracefully cancel the context */
> - if (engine && !__cancel_engine(engine) && ban)
> - /*
> - * If we are unable to send a preemptive pulse to bump
> - * the context from the GPU, we have to resort to a full
> - * reset. We hope the collateral damage is worth it.
> - */
> - __reset_context(engines->ctx, engine);
> + se->revoke_context(ce, ban ? engines->ctx->name : NULL, error);
> }
> }
>
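To spell out the new calling convention: a ban maps to -EIO and passes the
context name so the backend may escalate to a reset, while a plain closure
maps to -EAGAIN and passes NULL so no reset is forced. A trivial standalone
sketch of that mapping (illustrative, not the kernel code):

/* Sketch of the (ban, error, force) mapping in kill_engines() above. */
#include <errno.h>
#include <stdio.h>

static void revoke(int ban, const char *ctx_name)
{
        const int error = ban ? -EIO : -EAGAIN;
        const char *force = ban ? ctx_name : NULL;

        printf("revoke: error=%d force=%s\n", error, force ? force : "none");
}

int main(void)
{
        revoke(1, "hog"); /* banned: backend may force a reset */
        revoke(0, "hog"); /* merely closed: graceful only */
        return 0;
}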
> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> index b2b9e5b889a0..d9fd3ad27433 100644
> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> @@ -114,6 +114,7 @@
> #include "gen8_engine_cs.h"
> #include "intel_breadcrumbs.h"
> #include "intel_context.h"
> +#include "intel_engine_heartbeat.h"
> #include "intel_engine_pm.h"
> #include "intel_engine_stats.h"
> #include "intel_execlists_submission.h"
> @@ -2830,6 +2831,50 @@ static bool execlists_is_executing(const struct i915_request *rq)
> return inflight;
> }
>
> +static bool __cancel_engine(struct intel_engine_cs *engine)
> +{
> + /*
> + * Send a "high priority pulse" down the engine to cause the
> + * current request to be momentarily preempted. (If it fails to
> + * be preempted, it will be reset). As we have marked our context
> + * as banned, any incomplete request, including any running, will
> + * be skipped following the preemption.
> + *
> + * If there is no hangchecking (one of the reasons why we try to
> + * cancel the context) and no forced preemption, there may be no
> + * means by which we reset the GPU and evict the persistent hog.
> + * Ergo if we are unable to inject a preemptive pulse that can
> + * kill the banned context, we fall back to doing a local reset
> + * instead.
> + */
> + return intel_engine_pulse(engine) == 0;
> +}
> +
> +static void
> +execlists_revoke_context(struct intel_context *ce, const char *force, int error)
> +{
> + struct intel_engine_cs *engine;
> +
> + /*
> + * Check the current active state of this context; if we
> + * are currently executing on the GPU we need to evict
> + * ourselves. On the other hand, if we haven't yet been
> + * submitted to the GPU or if everything is complete,
> + * we have nothing to do.
> + */
> + engine = intel_context_inflight(ce);
> +
> + /* First attempt to gracefully cancel the context */
> + if (engine && !__cancel_engine(engine) && force)
> + /*
> + * If we are unable to send a preemptive pulse to bump
> + * the context from the GPU, we have to resort to a full
> + * reset. We hope the collateral damage is worth it.
> + */
> + intel_gt_handle_error(engine->gt, engine->mask, 0,
> + "context revoked from %s", force);
> +}
> +
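And the execlists flow above, reduced to its decision: only an inflight
context needs evicting, a preemptive pulse is tried first, and a full reset
is the last resort, taken only when the caller named a culprit. A standalone
sketch (illustrative names only):

/* Sketch of the escalation in execlists_revoke_context() above. */
#include <stdbool.h>
#include <stdio.h>

static bool send_pulse(void)
{
        return false; /* pretend preemption is unavailable */
}

static void revoke(bool inflight, const char *force)
{
        if (!inflight)
                return; /* not on the HW: nothing to do */

        if (!send_pulse() && force)
                printf("full reset: context revoked from %s\n", force);
}

int main(void)
{
        revoke(true, "ctx0"); /* escalates to a reset */
        revoke(true, NULL);   /* pulse failed, but no forced reset */
        return 0;
}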
> static bool can_preempt(struct intel_engine_cs *engine)
> {
> if (INTEL_GEN(engine->i915) > 8)
> @@ -2968,6 +3013,7 @@ static void init_execlists(struct intel_engine_cs *engine)
>
> engine->sched.active_request = execlists_active_request;
> engine->sched.is_executing = execlists_is_executing;
> + engine->sched.revoke_context = execlists_revoke_context;
> tasklet_setup(&engine->sched.tasklet, execlists_submission_tasklet);
>
> timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
> index b83bbae75b8d..8effd92ea4f5 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.c
> +++ b/drivers/gpu/drm/i915/i915_scheduler.c
> @@ -158,6 +158,25 @@ i915_sched_default_active_request(struct i915_sched *se)
> return active;
> }
>
> +static bool context_active(struct intel_context *ce)
> +{
> + return i915_active_fence_isset(&ce->timeline->last_request);
> +}
> +
> +static void
> +i915_sched_default_revoke_context(struct intel_context *ce,
> + const char *force,
> + int error)
> +{
> + /*
> + * Without backend support, we cannot remove the context from the
> + * HW gracefully. All we can do is force a reset, as a last resort.
> + */
> + if (force && context_active(ce))
> + intel_gt_handle_error(ce->engine->gt, ce->engine->mask, 0,
> + "context revoked from %s", force);
> +}
> +
> void i915_sched_init(struct i915_sched *se,
> struct device *dev,
> const char *name,
> @@ -181,6 +200,7 @@ void i915_sched_init(struct i915_sched *se,
>
> se->submit_request = i915_request_enqueue;
> se->active_request = i915_sched_default_active_request;
> + se->revoke_context = i915_sched_default_revoke_context;
> }
>
> void i915_sched_park(struct i915_sched *se)
> diff --git a/drivers/gpu/drm/i915/i915_scheduler_types.h b/drivers/gpu/drm/i915/i915_scheduler_types.h
> index 8b688e2440f6..23413c09638e 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler_types.h
> +++ b/drivers/gpu/drm/i915/i915_scheduler_types.h
> @@ -14,6 +14,7 @@
> #include "i915_priolist_types.h"
>
> struct i915_request;
> +struct intel_context;
>
> /**
> * struct i915_sched - funnels requests towards hardware
> @@ -41,6 +42,10 @@ struct i915_sched {
>
> bool (*is_executing)(const struct i915_request *rq);
>
> + void (*revoke_context)(struct intel_context *ce,
> + const char *whom,
> + int error);
> +
> struct list_head requests; /* active request, on HW */
> struct list_head hold; /* ready requests, but on hold */
> /**
>