[PATCH 3/4] drm/i915/execlists: Fix execlists request cancellation corner case

John Harrison john.c.harrison at intel.com
Wed Jan 26 19:03:24 UTC 2022


On 1/24/2022 07:01, Matthew Brost wrote:
> More than 1 request can be submitted to a single ELSP at a time if
> multiple requests are ready run to on the same context. When a request
> is canceled it is marked bad, an idle pulse is triggered to the engine
> (high priority kernel request), the execlists scheduler sees that
> running request is bad and sets preemption timeout to minimum value (1
> ms). This fails to work if multiple requests are combined on the ELSP as
> only the most recent request is stored in the execlists schedule (the
> request stored in the ELSP isn't marked bad, thus preemption timeout
> isn't set to the minimum value). If the preempt timeout is configured to
> zero, the engine is permanently hung. This is shown by an upcoming
> selftest.
>
> To work around this, mark the idle pulse with a flag to force a preempt
> with the minimum value.
>
> Fixes: 38b237eab2bc7 ("drm/i915: Individual request cancellation")
> Signed-off-by: Matthew Brost <matthew.brost at intel.com>
> ---
>   .../gpu/drm/i915/gt/intel_engine_heartbeat.c  | 23 +++++++++++++++----
>   .../gpu/drm/i915/gt/intel_engine_heartbeat.h  |  1 +
>   .../drm/i915/gt/intel_execlists_submission.c  | 18 ++++++++++-----
>   drivers/gpu/drm/i915/i915_request.h           |  6 +++++
>   4 files changed, 38 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
> index a3698f611f457..efd1c719b4072 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
> @@ -243,7 +243,8 @@ void intel_engine_init_heartbeat(struct intel_engine_cs *engine)
>   	INIT_DELAYED_WORK(&engine->heartbeat.work, heartbeat);
>   }
>   
> -static int __intel_engine_pulse(struct intel_engine_cs *engine)
> +static int __intel_engine_pulse(struct intel_engine_cs *engine,
> +				bool force_preempt)
>   {
>   	struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER };
>   	struct intel_context *ce = engine->kernel_context;
> @@ -258,6 +259,8 @@ static int __intel_engine_pulse(struct intel_engine_cs *engine)
>   		return PTR_ERR(rq);
>   
>   	__set_bit(I915_FENCE_FLAG_SENTINEL, &rq->fence.flags);
> +	if (force_preempt)
> +		__set_bit(I915_FENCE_FLAG_FORCE_PREEMPT, &rq->fence.flags);
>   
>   	heartbeat_commit(rq, &attr);
>   	GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER);
> @@ -299,7 +302,7 @@ int intel_engine_set_heartbeat(struct intel_engine_cs *engine,
>   
>   		/* recheck current execution */
>   		if (intel_engine_has_preemption(engine)) {
> -			err = __intel_engine_pulse(engine);
> +			err = __intel_engine_pulse(engine, false);
>   			if (err)
>   				set_heartbeat(engine, saved);
>   		}
> @@ -312,7 +315,8 @@ int intel_engine_set_heartbeat(struct intel_engine_cs *engine,
>   	return err;
>   }
>   
> -int intel_engine_pulse(struct intel_engine_cs *engine)
> +static int _intel_engine_pulse(struct intel_engine_cs *engine,
> +			       bool force_preempt)
>   {
>   	struct intel_context *ce = engine->kernel_context;
>   	int err;
> @@ -325,7 +329,7 @@ int intel_engine_pulse(struct intel_engine_cs *engine)
>   
>   	err = -EINTR;
>   	if (!mutex_lock_interruptible(&ce->timeline->mutex)) {
> -		err = __intel_engine_pulse(engine);
> +		err = __intel_engine_pulse(engine, force_preempt);
>   		mutex_unlock(&ce->timeline->mutex);
>   	}
>   
> @@ -334,6 +338,17 @@ int intel_engine_pulse(struct intel_engine_cs *engine)
>   	return err;
>   }
>   
> +int intel_engine_pulse(struct intel_engine_cs *engine)
> +{
> +	return _intel_engine_pulse(engine, false);
> +}
> +
> +
> +int intel_engine_pulse_force_preempt(struct intel_engine_cs *engine)
> +{
> +	return _intel_engine_pulse(engine, true);
> +}
> +
>   int intel_engine_flush_barriers(struct intel_engine_cs *engine)
>   {
>   	struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
> index 5da6d809a87a2..d9c8386754cb3 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
> @@ -21,6 +21,7 @@ void intel_gt_park_heartbeats(struct intel_gt *gt);
>   void intel_gt_unpark_heartbeats(struct intel_gt *gt);
>   
>   int intel_engine_pulse(struct intel_engine_cs *engine);
> +int intel_engine_pulse_force_preempt(struct intel_engine_cs *engine);
>   int intel_engine_flush_barriers(struct intel_engine_cs *engine);
>   
>   #endif /* INTEL_ENGINE_HEARTBEAT_H */
> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> index 960a9aaf4f3a3..f0c2024058731 100644
> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> @@ -1222,26 +1222,29 @@ static void record_preemption(struct intel_engine_execlists *execlists)
>   }
>   
>   static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
> -					    const struct i915_request *rq)
> +					    const struct i915_request *rq,
> +					    bool force_preempt)
>   {
>   	if (!rq)
>   		return 0;
>   
>   	/* Force a fast reset for terminated contexts (ignoring sysfs!) */
> -	if (unlikely(intel_context_is_banned(rq->context) || bad_request(rq)))
> +	if (unlikely(intel_context_is_banned(rq->context) || bad_request(rq) ||
> +		     force_preempt))
>   		return 1;
>   
>   	return READ_ONCE(engine->props.preempt_timeout_ms);
>   }
>   
>   static void set_preempt_timeout(struct intel_engine_cs *engine,
> -				const struct i915_request *rq)
> +				const struct i915_request *rq,
> +				bool force_preempt)
>   {
>   	if (!intel_engine_has_preempt_reset(engine))
>   		return;
>   
>   	set_timer_ms(&engine->execlists.preempt,
> -		     active_preempt_timeout(engine, rq));
> +		     active_preempt_timeout(engine, rq, force_preempt));
>   }
>   
>   static bool completed(const struct i915_request *rq)
> @@ -1584,12 +1587,15 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   	    memcmp(active,
>   		   execlists->pending,
>   		   (port - execlists->pending) * sizeof(*port))) {
> +		bool force_preempt = test_bit(I915_FENCE_FLAG_FORCE_PREEMPT,
> +					      &last->fence.flags);
> +
>   		*port = NULL;
>   		while (port-- != execlists->pending)
>   			execlists_schedule_in(*port, port - execlists->pending);
>   
>   		WRITE_ONCE(execlists->yield, -1);
> -		set_preempt_timeout(engine, *active);
> +		set_preempt_timeout(engine, *active, force_preempt);
>   		execlists_submit_ports(engine);
>   	} else {
>   		ring_set_paused(engine, 0);
> @@ -2594,7 +2600,7 @@ static void execlists_context_cancel_request(struct intel_context *ce,
>   
>   	i915_request_active_engine(rq, &engine);
>   
> -	if (engine && intel_engine_pulse(engine))
> +	if (engine && intel_engine_pulse_force_preempt(engine))
>   		intel_gt_handle_error(engine->gt, engine->mask, 0,
>   				      "request cancellation by %s",
>   				      current->comm);
> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> index 28b1f9db54875..7e6312233d4c7 100644
> --- a/drivers/gpu/drm/i915/i915_request.h
> +++ b/drivers/gpu/drm/i915/i915_request.h
> @@ -170,6 +170,12 @@ enum {
>   	 * fence (dma_fence_array) and i915 generated for parallel submission.
>   	 */
>   	I915_FENCE_FLAG_COMPOSITE,
> +
> +	/*
> +	 * I915_FENCE_FLAG_FORCE_PREEMPT - Force preempt immediately regardless
> +	 * of preempt timeout configuration
> +	 */
> +	I915_FENCE_FLAG_FORCE_PREEMPT,
This would be execlist only? I'm a bit concerned about adding a global 
flag that cannot be implemented on current and future hardware.

John.

>   };
>   
>   /**



More information about the dri-devel mailing list