[PATCH 12/57] drm/i915: Move context revocation to scheduler

Tvrtko Ursulin tvrtko.ursulin at linux.intel.com
Fri Feb 5 16:36:16 UTC 2021


On 05/02/2021 01:18, Chris Wilson wrote:
> Centralise the means by which to remove a context from execution to the
> scheduler, allowing the backends to specialise as necessary. Note that
> without backend support, we can simplify the procedure to forcibly reset
> the HW to remove the context.
> 
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/gem/i915_gem_context.c   | 117 +-----------------
>   .../drm/i915/gt/intel_execlists_submission.c  |  46 +++++++
>   drivers/gpu/drm/i915/i915_scheduler.c         |  20 +++
>   drivers/gpu/drm/i915/i915_scheduler_types.h   |   5 +
>   4 files changed, 74 insertions(+), 114 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> index ca37d93ef5e7..be75f861db67 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> @@ -382,104 +382,9 @@ __context_engines_static(const struct i915_gem_context *ctx)
>   	return rcu_dereference_protected(ctx->engines, true);
>   }
>   
> -static void __reset_context(struct i915_gem_context *ctx,
> -			    struct intel_engine_cs *engine)
> -{
> -	intel_gt_handle_error(engine->gt, engine->mask, 0,
> -			      "context closure in %s", ctx->name);
> -}
> -
> -static bool __cancel_engine(struct intel_engine_cs *engine)
> -{
> -	/*
> -	 * Send a "high priority pulse" down the engine to cause the
> -	 * current request to be momentarily preempted. (If it fails to
> -	 * be preempted, it will be reset). As we have marked our context
> -	 * as banned, any incomplete request, including any running, will
> -	 * be skipped following the preemption.
> -	 *
> -	 * If there is no hangchecking (one of the reasons why we try to
> -	 * cancel the context) and no forced preemption, there may be no
> -	 * means by which we reset the GPU and evict the persistent hog.
> -	 * Ergo if we are unable to inject a preemptive pulse that can
> -	 * kill the banned context, we fallback to doing a local reset
> -	 * instead.
> -	 */
> -	return intel_engine_pulse(engine) == 0;
> -}
> -
> -static bool
> -__active_engine(struct i915_request *rq, struct intel_engine_cs **active)
> -{
> -	struct intel_engine_cs *engine, *locked;
> -	bool ret = false;
> -
> -	/*
> -	 * Serialise with __i915_request_submit() so that it sees
> -	 * is-banned?, or we know the request is already inflight.
> -	 *
> -	 * Note that rq->engine is unstable, and so we double
> -	 * check that we have acquired the lock on the final engine.
> -	 */
> -	locked = READ_ONCE(rq->engine);
> -	spin_lock_irq(&locked->sched.lock);
> -	while (unlikely(locked != (engine = READ_ONCE(rq->engine)))) {
> -		spin_unlock(&locked->sched.lock);
> -		locked = engine;
> -		spin_lock(&locked->sched.lock);
> -	}
> -
> -	if (i915_request_is_active(rq)) {
> -		if (!__i915_request_is_complete(rq))
> -			*active = locked;
> -		ret = true;
> -	}
> -
> -	spin_unlock_irq(&locked->sched.lock);
> -
> -	return ret;
> -}
> -
> -static struct intel_engine_cs *active_engine(struct intel_context *ce)
> -{
> -	struct intel_engine_cs *engine = NULL;
> -	struct i915_request *rq;
> -
> -	if (intel_context_has_inflight(ce))
> -		return intel_context_inflight(ce);
> -
> -	if (!ce->timeline)
> -		return NULL;
> -
> -	/*
> -	 * rq->link is only SLAB_TYPESAFE_BY_RCU, we need to hold a reference
> -	 * to the request to prevent it being transferred to a new timeline
> -	 * (and onto a new timeline->requests list).
> -	 */
> -	rcu_read_lock();
> -	list_for_each_entry_reverse(rq, &ce->timeline->requests, link) {
> -		bool found;
> -
> -		/* timeline is already completed upto this point? */
> -		if (!i915_request_get_rcu(rq))
> -			break;
> -
> -		/* Check with the backend if the request is inflight */
> -		found = true;
> -		if (likely(rcu_access_pointer(rq->timeline) == ce->timeline))
> -			found = __active_engine(rq, &engine);
> -
> -		i915_request_put(rq);
> -		if (found)
> -			break;
> -	}
> -	rcu_read_unlock();

Moves and removes the above chunk of code. I assume 
intel_context_inflight should now be good enough on its own, but the 
existing code obviously does not trust it, and I can't figure out why. 
Do you remember the reason?

Regards,

Tvrtko

> -
> -	return engine;
> -}
> -
>   static void kill_engines(struct i915_gem_engines *engines, bool ban)
>   {
> +	const int error = ban ? -EIO : -EAGAIN;
>   	struct i915_gem_engines_iter it;
>   	struct intel_context *ce;
>   
> @@ -491,28 +396,12 @@ static void kill_engines(struct i915_gem_engines *engines, bool ban)
>   	 * engines on which there are incomplete requests.
>   	 */
>   	for_each_gem_engine(ce, engines, it) {
> -		struct intel_engine_cs *engine;
> +		struct i915_sched *se = intel_engine_get_scheduler(ce->engine);
>   
>   		if (ban && intel_context_set_banned(ce))
>   			continue;
>   
> -		/*
> -		 * Check the current active state of this context; if we
> -		 * are currently executing on the GPU we need to evict
> -		 * ourselves. On the other hand, if we haven't yet been
> -		 * submitted to the GPU or if everything is complete,
> -		 * we have nothing to do.
> -		 */
> -		engine = active_engine(ce);
> -
> -		/* First attempt to gracefully cancel the context */
> -		if (engine && !__cancel_engine(engine) && ban)
> -			/*
> -			 * If we are unable to send a preemptive pulse to bump
> -			 * the context from the GPU, we have to resort to a full
> -			 * reset. We hope the collateral damage is worth it.
> -			 */
> -			__reset_context(engines->ctx, engine);
> +		se->revoke_context(ce, ban ? engines->ctx->name : NULL, error);
>   	}
>   }
>   
> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> index b2b9e5b889a0..d9fd3ad27433 100644
> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> @@ -114,6 +114,7 @@
>   #include "gen8_engine_cs.h"
>   #include "intel_breadcrumbs.h"
>   #include "intel_context.h"
> +#include "intel_engine_heartbeat.h"
>   #include "intel_engine_pm.h"
>   #include "intel_engine_stats.h"
>   #include "intel_execlists_submission.h"
> @@ -2830,6 +2831,50 @@ static bool execlists_is_executing(const struct i915_request *rq)
>   	return inflight;
>   }
>   
> +static bool __cancel_engine(struct intel_engine_cs *engine)
> +{
> +	/*
> +	 * Send a "high priority pulse" down the engine to cause the
> +	 * current request to be momentarily preempted. (If it fails to
> +	 * be preempted, it will be reset). As we have marked our context
> +	 * as banned, any incomplete request, including any running, will
> +	 * be skipped following the preemption.
> +	 *
> +	 * If there is no hangchecking (one of the reasons why we try to
> +	 * cancel the context) and no forced preemption, there may be no
> +	 * means by which we reset the GPU and evict the persistent hog.
> +	 * Ergo if we are unable to inject a preemptive pulse that can
> +	 * kill the banned context, we fallback to doing a local reset
> +	 * instead.
> +	 */
> +	return intel_engine_pulse(engine) == 0;
> +}
> +
> +static void
> +execlists_revoke_context(struct intel_context *ce, const char *force, int error)
> +{
> +	struct intel_engine_cs *engine;
> +
> +	/*
> +	 * Check the current active state of this context; if we
> +	 * are currently executing on the GPU we need to evict
> +	 * ourselves. On the other hand, if we haven't yet been
> +	 * submitted to the GPU or if everything is complete,
> +	 * we have nothing to do.
> +	 */
> +	engine = intel_context_inflight(ce);
> +
> +	/* First attempt to gracefully cancel the context */
> +	if (engine && !__cancel_engine(engine) && force)
> +		/*
> +		 * If we are unable to send a preemptive pulse to bump
> +		 * the context from the GPU, we have to resort to a full
> +		 * reset. We hope the collateral damage is worth it.
> +		 */
> +		intel_gt_handle_error(engine->gt, engine->mask, 0,
> +				      "context revoked from %s", force);
> +}
> +
>   static bool can_preempt(struct intel_engine_cs *engine)
>   {
>   	if (INTEL_GEN(engine->i915) > 8)
> @@ -2968,6 +3013,7 @@ static void init_execlists(struct intel_engine_cs *engine)
>   
>   	engine->sched.active_request = execlists_active_request;
>   	engine->sched.is_executing = execlists_is_executing;
> +	engine->sched.revoke_context = execlists_revoke_context;
>   	tasklet_setup(&engine->sched.tasklet, execlists_submission_tasklet);
>   
>   	timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
> index b83bbae75b8d..8effd92ea4f5 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.c
> +++ b/drivers/gpu/drm/i915/i915_scheduler.c
> @@ -158,6 +158,25 @@ i915_sched_default_active_request(struct i915_sched *se)
>   	return active;
>   }
>   
> +static bool context_active(struct intel_context *ce)
> +{
> +	return i915_active_fence_isset(&ce->timeline->last_request);
> +}
> +
> +static void
> +i915_sched_default_revoke_context(struct intel_context *ce,
> +				  const char *force,
> +				  int error)
> +{
> +	/*
> +	 * Without backend support, we cannot remove the context from the
> +	 * HW gracefully. All we can do is force a reset, as a last resort.
> +	 */
> +	if (force && context_active(ce))
> +		intel_gt_handle_error(ce->engine->gt, ce->engine->mask, 0,
> +				      "context revoked from %s", force);
> +}
> +
>   void i915_sched_init(struct i915_sched *se,
>   		     struct device *dev,
>   		     const char *name,
> @@ -181,6 +200,7 @@ void i915_sched_init(struct i915_sched *se,
>   
>   	se->submit_request = i915_request_enqueue;
>   	se->active_request = i915_sched_default_active_request;
> +	se->revoke_context = i915_sched_default_revoke_context;
>   }
>   
>   void i915_sched_park(struct i915_sched *se)
> diff --git a/drivers/gpu/drm/i915/i915_scheduler_types.h b/drivers/gpu/drm/i915/i915_scheduler_types.h
> index 8b688e2440f6..23413c09638e 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler_types.h
> +++ b/drivers/gpu/drm/i915/i915_scheduler_types.h
> @@ -14,6 +14,7 @@
>   #include "i915_priolist_types.h"
>   
>   struct i915_request;
> +struct intel_context;
>   
>   /**
>    * struct i915_sched - funnels requests towards hardware
> @@ -41,6 +42,10 @@ struct i915_sched {
>   
>   	bool (*is_executing)(const struct i915_request *rq);
>   
> +	void (*revoke_context)(struct intel_context *ce,
> +			       const char *whom,
> +			       int error);
> +
>   	struct list_head requests; /* active request, on HW */
>   	struct list_head hold; /* ready requests, but on hold */
>   	/**
> 


More information about the Intel-gfx-trybot mailing list