[Intel-gfx] [PATCH 4/4] drm/i915/gem: Always test execution status on closing the context

Tvrtko Ursulin tvrtko.ursulin at linux.intel.com
Thu Sep 24 14:26:56 UTC 2020


On 16/09/2020 10:42, Chris Wilson wrote:
> Verify that if a context is active at the time it is closed, that it is
> either persistent and preemptible (with hangcheck running) or it shall
> be removed from execution.
> 
> Fixes: 9a40bddd47ca ("drm/i915/gt: Expose heartbeat interval via sysfs")
> Testcase: igt/gem_ctx_persistence/heartbeat-close
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
> Cc: <stable at vger.kernel.org> # v5.7+
> ---
>   drivers/gpu/drm/i915/gem/i915_gem_context.c | 48 +++++----------------
>   1 file changed, 10 insertions(+), 38 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> index a548626fa8bc..4fd38101bb56 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> @@ -390,24 +390,6 @@ __context_engines_static(const struct i915_gem_context *ctx)
>   	return rcu_dereference_protected(ctx->engines, true);
>   }
>   
> -static bool __reset_engine(struct intel_engine_cs *engine)
> -{
> -	struct intel_gt *gt = engine->gt;
> -	bool success = false;
> -
> -	if (!intel_has_reset_engine(gt))
> -		return false;
> -
> -	if (!test_and_set_bit(I915_RESET_ENGINE + engine->id,
> -			      &gt->reset.flags)) {
> -		success = intel_engine_reset(engine, NULL) == 0;
> -		clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id,
> -				      &gt->reset.flags);
> -	}
> -
> -	return success;
> -}
> -
>   static void __reset_context(struct i915_gem_context *ctx,
>   			    struct intel_engine_cs *engine)
>   {
> @@ -431,12 +413,7 @@ static bool __cancel_engine(struct intel_engine_cs *engine)
>   	 * kill the banned context, we fallback to doing a local reset
>   	 * instead.
>   	 */
> -	if (IS_ACTIVE(CONFIG_DRM_I915_PREEMPT_TIMEOUT) &&
> -	    !intel_engine_pulse(engine))
> -		return true;
> -
> -	/* If we are unable to send a pulse, try resetting this engine. */
> -	return __reset_engine(engine);
> +	return intel_engine_pulse(engine) == 0;

Where is the path now which actually resets the currently running 
workload (engine) of a non-persistent context? Pulse will be sent and 
then rely on hangcheck but otherwise let it run?

Regards,

Tvrtko

>   }
>   
>   static bool
> @@ -506,7 +483,7 @@ static struct intel_engine_cs *active_engine(struct intel_context *ce)
>   	return engine;
>   }
>   
> -static void kill_engines(struct i915_gem_engines *engines)
> +static void kill_engines(struct i915_gem_engines *engines, bool ban)
>   {
>   	struct i915_gem_engines_iter it;
>   	struct intel_context *ce;
> @@ -521,7 +498,7 @@ static void kill_engines(struct i915_gem_engines *engines)
>   	for_each_gem_engine(ce, engines, it) {
>   		struct intel_engine_cs *engine;
>   
> -		if (intel_context_set_banned(ce))
> +		if (ban && intel_context_set_banned(ce))
>   			continue;
>   
>   		/*
> @@ -534,7 +511,7 @@ static void kill_engines(struct i915_gem_engines *engines)
>   		engine = active_engine(ce);
>   
>   		/* First attempt to gracefully cancel the context */
> -		if (engine && !__cancel_engine(engine))
> +		if (engine && !__cancel_engine(engine) && ban)
>   			/*
>   			 * If we are unable to send a preemptive pulse to bump
>   			 * the context from the GPU, we have to resort to a full
> @@ -544,8 +521,10 @@ static void kill_engines(struct i915_gem_engines *engines)
>   	}
>   }
>   
> -static void kill_stale_engines(struct i915_gem_context *ctx)
> +static void kill_context(struct i915_gem_context *ctx)
>   {
> +	bool ban = (!i915_gem_context_is_persistent(ctx) ||
> +		    !ctx->i915->params.enable_hangcheck);
>   	struct i915_gem_engines *pos, *next;
>   
>   	spin_lock_irq(&ctx->stale.lock);
> @@ -558,7 +537,7 @@ static void kill_stale_engines(struct i915_gem_context *ctx)
>   
>   		spin_unlock_irq(&ctx->stale.lock);
>   
> -		kill_engines(pos);
> +		kill_engines(pos, ban);
>   
>   		spin_lock_irq(&ctx->stale.lock);
>   		GEM_BUG_ON(i915_sw_fence_signaled(&pos->fence));
> @@ -570,11 +549,6 @@ static void kill_stale_engines(struct i915_gem_context *ctx)
>   	spin_unlock_irq(&ctx->stale.lock);
>   }
>   
> -static void kill_context(struct i915_gem_context *ctx)
> -{
> -	kill_stale_engines(ctx);
> -}
> -
>   static void engines_idle_release(struct i915_gem_context *ctx,
>   				 struct i915_gem_engines *engines)
>   {
> @@ -609,7 +583,7 @@ static void engines_idle_release(struct i915_gem_context *ctx,
>   
>   kill:
>   	if (list_empty(&engines->link)) /* raced, already closed */
> -		kill_engines(engines);
> +		kill_engines(engines, true);
>   
>   	i915_sw_fence_commit(&engines->fence);
>   }
> @@ -667,9 +641,7 @@ static void context_close(struct i915_gem_context *ctx)
>   	 * case we opt to forcibly kill off all remaining requests on
>   	 * context close.
>   	 */
> -	if (!i915_gem_context_is_persistent(ctx) ||
> -	    !ctx->i915->params.enable_hangcheck)
> -		kill_context(ctx);
> +	kill_context(ctx);
>   
>   	i915_gem_context_put(ctx);
>   }
> 


More information about the Intel-gfx mailing list