[Intel-gfx] [PATCH] drm/i915: Optionally disable automatic recovery after a GPU reset

Mika Kuoppala mika.kuoppala at linux.intel.com
Wed Oct 3 06:22:13 UTC 2018


Chris Wilson <chris at chris-wilson.co.uk> writes:

> Some clients, such as mesa, may only emit minimal incremental batches
> that rely on the logical context state from previous batches. They know
> that recovery is impossible after a hang as their required GPU state is
> lost, and that each in flight and subsequent batch will hang (resetting
> the context image back to default perpetuating the problem).
>
> To avoid getting into the state in the first place, we can allow clients
> to opt out of automatic recovery and elect to ban any guilty context
> following a hang. This prevents the continual stream of hangs and allows
> the client to recreate their context and rebuild the state from scratch.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Kenneth Graunke <kenneth at whitecape.org>
> Cc: Mika Kuoppala <mika.kuoppala at intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem.c         |  3 ++-
>  drivers/gpu/drm/i915/i915_gem_context.c | 12 ++++++++++++
>  drivers/gpu/drm/i915/i915_gem_context.h | 16 ++++++++++++++++
>  include/uapi/drm/i915_drm.h             | 20 ++++++++++++++++++++
>  4 files changed, 50 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 7d45e71100bc..eee06d90d460 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3018,7 +3018,8 @@ static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
>  
>  	bannable = i915_gem_context_is_bannable(ctx);
>  	score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
> -	banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
> +	banned = (i915_gem_context_is_unrecoverable(ctx) ||
> +		  score >= CONTEXT_SCORE_BAN_THRESHOLD);

We treat banned contexts rather aggressively on client level
banning scoring.

Should we give some leeway if a client tells it don't
need recovery, instead of being more harsh on them?

As with this, third hang would lead to client ban.

>  
>  	/* Cool contexts don't accumulate client ban score */
>  	if (!bannable)
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 8cbe58070561..2d5e4119786a 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -878,6 +878,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>  	case I915_CONTEXT_PARAM_BANNABLE:
>  		args->value = i915_gem_context_is_bannable(ctx);
>  		break;
> +	case I915_CONTEXT_PARAM_RECOVERABLE:
> +		args->value = !i915_gem_context_is_unrecoverable(ctx);

Pondering here why not the internal coding be also
i915_gem_context_is_recoverable(ctx). Atleast in here
would read better.

-Mika

> +		break;
>  	case I915_CONTEXT_PARAM_PRIORITY:
>  		args->value = ctx->sched.priority >> I915_USER_PRIORITY_SHIFT;
>  		break;
> @@ -933,6 +936,15 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
>  			i915_gem_context_clear_bannable(ctx);
>  		break;
>  
> +	case I915_CONTEXT_PARAM_RECOVERABLE:
> +		if (args->size)
> +			ret = -EINVAL;
> +		else if (args->value)
> +			i915_gem_context_clear_unrecoverable(ctx);
> +		else
> +			i915_gem_context_set_unrecoverable(ctx);
> +		break;
> +
>  	case I915_CONTEXT_PARAM_PRIORITY:
>  		{
>  			s64 priority = args->value;
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
> index 08165f6a0a84..2d6b8b0307e5 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.h
> +++ b/drivers/gpu/drm/i915/i915_gem_context.h
> @@ -123,6 +123,7 @@ struct i915_gem_context {
>  #define UCONTEXT_NO_ZEROMAP		0
>  #define UCONTEXT_NO_ERROR_CAPTURE	1
>  #define UCONTEXT_BANNABLE		2
> +#define UCONTEXT_NO_RECOVERY		3
>  
>  	/**
>  	 * @flags: small set of booleans
> @@ -247,6 +248,21 @@ static inline void i915_gem_context_clear_bannable(struct i915_gem_context *ctx)
>  	clear_bit(UCONTEXT_BANNABLE, &ctx->user_flags);
>  }
>  
> +static inline bool i915_gem_context_is_unrecoverable(const struct i915_gem_context *ctx)
> +{
> +	return test_bit(UCONTEXT_NO_RECOVERY, &ctx->user_flags);
> +}
> +
> +static inline void i915_gem_context_set_unrecoverable(struct i915_gem_context *ctx)
> +{
> +	set_bit(UCONTEXT_NO_RECOVERY, &ctx->user_flags);
> +}
> +
> +static inline void i915_gem_context_clear_unrecoverable(struct i915_gem_context *ctx)
> +{
> +	clear_bit(UCONTEXT_NO_RECOVERY, &ctx->user_flags);
> +}
> +
>  static inline bool i915_gem_context_is_banned(const struct i915_gem_context *ctx)
>  {
>  	return test_bit(CONTEXT_BANNED, &ctx->flags);
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 298b2e197744..eeb258a12b95 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -1486,6 +1486,26 @@ struct drm_i915_gem_context_param {
>  #define   I915_CONTEXT_MAX_USER_PRIORITY	1023 /* inclusive */
>  #define   I915_CONTEXT_DEFAULT_PRIORITY		0
>  #define   I915_CONTEXT_MIN_USER_PRIORITY	-1023 /* inclusive */
> +
> +/*
> + * Not all clients may want to attempt automatic recover of a context after
> + * a hang (for example, some clients may only submit very small incremental
> + * batches relying on known logical state of previous batches which will never
> + * recover correctly and each attempt will hang), and so would prefer that
> + * the context is forever banned instead.
> + *
> + * If set to false (0), after a reset, subsequent (and in flight) rendering
> + * from this context is discarded, and the client will need to create a new
> + * context to use instead.
> + *
> + * If set to true (1), the kernel will automatically attempt to recover the
> + * context by skipping the hanging batch and executing the next batch starting
> + * from the default context state (discarding the incomplete logical context
> + * state lost due to the reset).
> + *
> + * On creation, all new contexts are marked as recoverable.
> + */
> +#define I915_CONTEXT_PARAM_RECOVERABLE	0x7
>  	__u64 value;
>  };
>  
> -- 
> 2.19.0
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx


More information about the Intel-gfx mailing list