[Intel-gfx] [PATCH] drm/i915: Optionally disable automatic recovery after a GPU reset
Mika Kuoppala
mika.kuoppala at linux.intel.com
Wed Oct 3 06:22:13 UTC 2018
Chris Wilson <chris at chris-wilson.co.uk> writes:
> Some clients, such as mesa, may only emit minimal incremental batches
> that rely on the logical context state from previous batches. They know
> that recovery is impossible after a hang as their required GPU state is
> lost, and that each in flight and subsequent batch will hang (resetting
> the context image back to default perpetuating the problem).
>
> To avoid getting into the state in the first place, we can allow clients
> to opt out of automatic recovery and elect to ban any guilty context
> following a hang. This prevents the continual stream of hangs and allows
> the client to recreate their context and rebuild the state from scratch.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Kenneth Graunke <kenneth at whitecape.org>
> Cc: Mika Kuoppala <mika.kuoppala at intel.com>
> ---
> drivers/gpu/drm/i915/i915_gem.c | 3 ++-
> drivers/gpu/drm/i915/i915_gem_context.c | 12 ++++++++++++
> drivers/gpu/drm/i915/i915_gem_context.h | 16 ++++++++++++++++
> include/uapi/drm/i915_drm.h | 20 ++++++++++++++++++++
> 4 files changed, 50 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 7d45e71100bc..eee06d90d460 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3018,7 +3018,8 @@ static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
>
> bannable = i915_gem_context_is_bannable(ctx);
> score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
> - banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
> + banned = (i915_gem_context_is_unrecoverable(ctx) ||
> + score >= CONTEXT_SCORE_BAN_THRESHOLD);
We treat banned contexts rather aggressively at the client-level
ban scoring.
Should we give some leeway if a client tells us it doesn't
need recovery, instead of being harsher on it?
As it stands, a third hang would lead to a client ban.
>
> /* Cool contexts don't accumulate client ban score */
> if (!bannable)
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 8cbe58070561..2d5e4119786a 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -878,6 +878,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
> case I915_CONTEXT_PARAM_BANNABLE:
> args->value = i915_gem_context_is_bannable(ctx);
> break;
> + case I915_CONTEXT_PARAM_RECOVERABLE:
> + args->value = !i915_gem_context_is_unrecoverable(ctx);
Pondering here why the internal helper isn't also named
i915_gem_context_is_recoverable(ctx). At least here it
would read better.
-Mika
> + break;
> case I915_CONTEXT_PARAM_PRIORITY:
> args->value = ctx->sched.priority >> I915_USER_PRIORITY_SHIFT;
> break;
> @@ -933,6 +936,15 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
> i915_gem_context_clear_bannable(ctx);
> break;
>
> + case I915_CONTEXT_PARAM_RECOVERABLE:
> + if (args->size)
> + ret = -EINVAL;
> + else if (args->value)
> + i915_gem_context_clear_unrecoverable(ctx);
> + else
> + i915_gem_context_set_unrecoverable(ctx);
> + break;
> +
> case I915_CONTEXT_PARAM_PRIORITY:
> {
> s64 priority = args->value;
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
> index 08165f6a0a84..2d6b8b0307e5 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.h
> +++ b/drivers/gpu/drm/i915/i915_gem_context.h
> @@ -123,6 +123,7 @@ struct i915_gem_context {
> #define UCONTEXT_NO_ZEROMAP 0
> #define UCONTEXT_NO_ERROR_CAPTURE 1
> #define UCONTEXT_BANNABLE 2
> +#define UCONTEXT_NO_RECOVERY 3
>
> /**
> * @flags: small set of booleans
> @@ -247,6 +248,21 @@ static inline void i915_gem_context_clear_bannable(struct i915_gem_context *ctx)
> clear_bit(UCONTEXT_BANNABLE, &ctx->user_flags);
> }
>
> +static inline bool i915_gem_context_is_unrecoverable(const struct i915_gem_context *ctx)
> +{
> + return test_bit(UCONTEXT_NO_RECOVERY, &ctx->user_flags);
> +}
> +
> +static inline void i915_gem_context_set_unrecoverable(struct i915_gem_context *ctx)
> +{
> + set_bit(UCONTEXT_NO_RECOVERY, &ctx->user_flags);
> +}
> +
> +static inline void i915_gem_context_clear_unrecoverable(struct i915_gem_context *ctx)
> +{
> + clear_bit(UCONTEXT_NO_RECOVERY, &ctx->user_flags);
> +}
> +
> static inline bool i915_gem_context_is_banned(const struct i915_gem_context *ctx)
> {
> return test_bit(CONTEXT_BANNED, &ctx->flags);
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 298b2e197744..eeb258a12b95 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -1486,6 +1486,26 @@ struct drm_i915_gem_context_param {
> #define I915_CONTEXT_MAX_USER_PRIORITY 1023 /* inclusive */
> #define I915_CONTEXT_DEFAULT_PRIORITY 0
> #define I915_CONTEXT_MIN_USER_PRIORITY -1023 /* inclusive */
> +
> +/*
> + * Not all clients may want to attempt automatic recover of a context after
> + * a hang (for example, some clients may only submit very small incremental
> + * batches relying on known logical state of previous batches which will never
> + * recover correctly and each attempt will hang), and so would prefer that
> + * the context is forever banned instead.
> + *
> + * If set to false (0), after a reset, subsequent (and in flight) rendering
> + * from this context is discarded, and the client will need to create a new
> + * context to use instead.
> + *
> + * If set to true (1), the kernel will automatically attempt to recover the
> + * context by skipping the hanging batch and executing the next batch starting
> + * from the default context state (discarding the incomplete logical context
> + * state lost due to the reset).
> + *
> + * On creation, all new contexts are marked as recoverable.
> + */
> +#define I915_CONTEXT_PARAM_RECOVERABLE 0x7
> __u64 value;
> };
>
> --
> 2.19.0
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
More information about the Intel-gfx
mailing list