[Intel-gfx] [PATCH 08/10] drm/i915: Cancel non-persistent contexts on close

Tvrtko Ursulin tvrtko.ursulin at linux.intel.com
Fri Oct 11 13:55:00 UTC 2019


On 10/10/2019 08:14, Chris Wilson wrote:
> Normally, we rely on our hangcheck to prevent persistent batches from
> hogging the GPU. However, if the user disables hangcheck, this mechanism
> breaks down. Despite our insistence that this is unsafe, the users are
> equally insistent that they want to use endless batches and will disable
> the hangcheck mechanism. We are looking at perhaps replacing hangcheck
> with a softer mechanism, that sends a pulse down the engine to check if
> it is well. We can use the same preemptive pulse to flush an active
> persistent context off the GPU upon context close, preventing resources
> being lost and unkillable requests remaining on the GPU after process
> termination. To avoid changing the ABI and accidentally breaking
> existing userspace, we make the persistence of a context explicit and
> enable it by default (matching current ABI). Userspace can opt out of
> persistent mode (forcing requests to be cancelled when the context is
> closed by process termination or explicitly) by a context parameter. To
> facilitate existing use-cases of disabling hangcheck, if the modparam is
> disabled (i915.enable_hangcheck=0), we disable persistence mode by
> default.  (Note, one of the outcomes for supporting endless mode will be
> the removal of hangchecking, at which point opting into persistent mode
> will be mandatory, or perhaps the default, controlled by cgroups.)
> 
> v2: Check for hangchecking at context termination, so that we are not
> left with undying contexts from a crafty user.
> 
> Testcase: igt/gem_ctx_persistence
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
> Cc: Michał Winiarski <michal.winiarski at intel.com>
> Cc: Jon Bloomfield <jon.bloomfield at intel.com>
> Reviewed-by: Jon Bloomfield <jon.bloomfield at intel.com>
> ---
>   drivers/gpu/drm/i915/gem/i915_gem_context.c   | 132 ++++++++++++++++++
>   drivers/gpu/drm/i915/gem/i915_gem_context.h   |  15 ++
>   .../gpu/drm/i915/gem/i915_gem_context_types.h |   1 +
>   .../gpu/drm/i915/gem/selftests/mock_context.c |   2 +
>   include/uapi/drm/i915_drm.h                   |  15 ++
>   5 files changed, 165 insertions(+)
> 
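Not part of the patch, but to double check my reading of the proposed
uAPI: from userspace the opt-out would look roughly like the sketch
below. (The helper name is mine; igt/gem_ctx_persistence is the
authoritative consumer.)

#include <errno.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

/* Opt a context in or out of persistence; new contexts default to on. */
static int context_set_persistence(int i915_fd, __u32 ctx_id, __u64 value)
{
	struct drm_i915_gem_context_param p = {
		.ctx_id = ctx_id,
		.param = I915_CONTEXT_PARAM_PERSISTENCE,
		.value = value,
	};

	/*
	 * Per __context_set_persistence() in the patch: enabling fails
	 * with -EINVAL when i915.enable_hangcheck=0, and disabling fails
	 * with -ENODEV without I915_SCHEDULER_CAP_PREEMPTION.
	 */
	if (ioctl(i915_fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p))
		return -errno;

	return 0;
}
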
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> index 5d8221c7ba83..46e5b3b53288 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> @@ -70,6 +70,7 @@
>   #include <drm/i915_drm.h>
>   
>   #include "gt/intel_lrc_reg.h"
> +#include "gt/intel_engine_heartbeat.h"
>   #include "gt/intel_engine_user.h"
>   
>   #include "i915_gem_context.h"
> @@ -269,6 +270,78 @@ void i915_gem_context_release(struct kref *ref)
>   		schedule_work(&gc->free_work);
>   }
>   
> +static inline struct i915_gem_engines *
> +__context_engines_static(struct i915_gem_context *ctx)
> +{
> +	return rcu_dereference_protected(ctx->engines, true);
> +}
> +
> +static void kill_context(struct i915_gem_context *ctx)
> +{
> +	intel_engine_mask_t tmp, active, reset;
> +	struct intel_gt *gt = &ctx->i915->gt;
> +	struct i915_gem_engines_iter it;
> +	struct intel_engine_cs *engine;
> +	struct intel_context *ce;
> +
> +	/*
> +	 * If we are already banned, it was due to a guilty request causing
> +	 * a reset and the entire context being evicted from the GPU.
> +	 */
> +	if (i915_gem_context_is_banned(ctx))
> +		return;
> +
> +	i915_gem_context_set_banned(ctx);
> +
> +	/*
> +	 * Map the user's engine back to the actual engines; one virtual
> +	 * engine will be mapped to multiple engines, and using ctx->engine[]
> +	 * the same engine may have multiple instances in the user's map.
> +	 * However, we only care about pending requests, so only include
> +	 * engines on which there are incomplete requests.
> +	 */
> +	active = 0;
> +	for_each_gem_engine(ce, __context_engines_static(ctx), it) {
> +		struct dma_fence *fence;
> +
> +		if (!ce->timeline)
> +			continue;
> +
> +		fence = i915_active_fence_get(&ce->timeline->last_request);
> +		if (!fence)
> +			continue;
> +
> +		engine = to_request(fence)->engine;
> +		if (HAS_EXECLISTS(gt->i915))
> +			engine = intel_context_inflight(ce);

Okay, preemption implies execlists; I was confused for a moment.

When can engine be NULL here?

> +		if (engine)
> +			active |= engine->mask;
> +
> +		dma_fence_put(fence);
> +	}
> +
> +	/*
> +	 * Send a "high priority pulse" down the engine to cause the
> +	 * current request to be momentarily preempted. (If it fails to
> +	 * be preempted, it will be reset). As we have marked our context
> +	 * as banned, any incomplete request, including any currently
> +	 * running, will be skipped following the preemption.
> +	 */
> +	reset = 0;
> +	for_each_engine_masked(engine, gt->i915, active, tmp)
> +		if (intel_engine_pulse(engine))
> +			reset |= engine->mask;

What if we were able to send a pulse, but the hog cannot be preempted 
and hangcheck is obviously disabled - who will do the reset?

> +
> +	/*
> +	 * If we are unable to send a preemptive pulse to bump
> +	 * the context from the GPU, we have to resort to a full
> +	 * reset. We hope the collateral damage is worth it.
> +	 */
> +	if (reset)
> +		intel_gt_handle_error(gt, reset, 0,
> +				      "context closure in %s", ctx->name);
> +}
> +
>   static void context_close(struct i915_gem_context *ctx)
>   {
>   	struct i915_address_space *vm;
> @@ -291,9 +364,47 @@ static void context_close(struct i915_gem_context *ctx)
>   	lut_close(ctx);
>   
>   	mutex_unlock(&ctx->mutex);
> +
> +	/*
> +	 * If the user has disabled hangchecking, we can not be sure that
> +	 * the batches will ever complete after the context is closed,
> +	 * keep the context and all resources pinned forever. So in this

s/keep/keeping/, I think.

> +	 * case we opt to forcibly kill off all remaining requests on
> +	 * context close.
> +	 */
> +	if (!i915_gem_context_is_persistent(ctx) ||
> +	    !i915_modparams.enable_hangcheck)
> +		kill_context(ctx);
> +
>   	i915_gem_context_put(ctx);
>   }
>   
> +static int __context_set_persistence(struct i915_gem_context *ctx, bool state)
> +{
> +	if (i915_gem_context_is_persistent(ctx) == state)
> +		return 0;
> +
> +	if (state) {
> +		/*
> +		 * Only contexts that are short-lived [that will expire or be
> +		 * reset] are allowed to survive past termination. We require
> +		 * hangcheck to ensure that the persistent requests are healthy.
> +		 */
> +		if (!i915_modparams.enable_hangcheck)
> +			return -EINVAL;
> +
> +		i915_gem_context_set_persistence(ctx);
> +	} else {
> +		/* To cancel a context we use "preempt-to-idle" */
> +		if (!(ctx->i915->caps.scheduler & I915_SCHEDULER_CAP_PREEMPTION))
> +			return -ENODEV;
> +
> +		i915_gem_context_clear_persistence(ctx);
> +	}
> +
> +	return 0;
> +}
> +
>   static struct i915_gem_context *
>   __create_context(struct drm_i915_private *i915)
>   {
> @@ -328,6 +439,7 @@ __create_context(struct drm_i915_private *i915)
>   
>   	i915_gem_context_set_bannable(ctx);
>   	i915_gem_context_set_recoverable(ctx);
> +	__context_set_persistence(ctx, true /* cgroup hook? */);
>   
>   	for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp); i++)
>   		ctx->hang_timestamp[i] = jiffies - CONTEXT_FAST_HANG_JIFFIES;
> @@ -484,6 +596,7 @@ i915_gem_context_create_kernel(struct drm_i915_private *i915, int prio)
>   		return ctx;
>   
>   	i915_gem_context_clear_bannable(ctx);
> +	i915_gem_context_set_persistence(ctx);
>   	ctx->sched.priority = I915_USER_PRIORITY(prio);
>   
>   	GEM_BUG_ON(!i915_gem_context_is_kernel(ctx));
> @@ -1594,6 +1707,16 @@ get_engines(struct i915_gem_context *ctx,
>   	return err;
>   }
>   
> +static int
> +set_persistence(struct i915_gem_context *ctx,
> +		const struct drm_i915_gem_context_param *args)
> +{
> +	if (args->size)
> +		return -EINVAL;
> +
> +	return __context_set_persistence(ctx, args->value);
> +}
> +
>   static int ctx_setparam(struct drm_i915_file_private *fpriv,
>   			struct i915_gem_context *ctx,
>   			struct drm_i915_gem_context_param *args)
> @@ -1671,6 +1794,10 @@ static int ctx_setparam(struct drm_i915_file_private *fpriv,
>   		ret = set_engines(ctx, args);
>   		break;
>   
> +	case I915_CONTEXT_PARAM_PERSISTENCE:
> +		ret = set_persistence(ctx, args);
> +		break;
> +
>   	case I915_CONTEXT_PARAM_BAN_PERIOD:
>   	default:
>   		ret = -EINVAL;
> @@ -2123,6 +2250,11 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>   		ret = get_engines(ctx, args);
>   		break;
>   
> +	case I915_CONTEXT_PARAM_PERSISTENCE:
> +		args->size = 0;
> +		args->value = i915_gem_context_is_persistent(ctx);
> +		break;
> +
>   	case I915_CONTEXT_PARAM_BAN_PERIOD:
>   	default:
>   		ret = -EINVAL;
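
And the read-back side, for completeness (again my sketch, not code
from the patch):

#include <errno.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

/* Query whether a context is persistent; 1 is the default for new ones. */
static int context_get_persistence(int i915_fd, __u32 ctx_id)
{
	struct drm_i915_gem_context_param p = {
		.ctx_id = ctx_id,
		.param = I915_CONTEXT_PARAM_PERSISTENCE,
	};

	if (ioctl(i915_fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &p))
		return -errno;

	return p.value;
}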
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.h b/drivers/gpu/drm/i915/gem/i915_gem_context.h
> index 9234586830d1..2eec035382a2 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.h
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.h
> @@ -76,6 +76,21 @@ static inline void i915_gem_context_clear_recoverable(struct i915_gem_context *c
>   	clear_bit(UCONTEXT_RECOVERABLE, &ctx->user_flags);
>   }
>   
> +static inline bool i915_gem_context_is_persistent(const struct i915_gem_context *ctx)
> +{
> +	return test_bit(UCONTEXT_PERSISTENCE, &ctx->user_flags);
> +}
> +
> +static inline void i915_gem_context_set_persistence(struct i915_gem_context *ctx)
> +{
> +	set_bit(UCONTEXT_PERSISTENCE, &ctx->user_flags);
> +}
> +
> +static inline void i915_gem_context_clear_persistence(struct i915_gem_context *ctx)
> +{
> +	clear_bit(UCONTEXT_PERSISTENCE, &ctx->user_flags);
> +}
> +
>   static inline bool i915_gem_context_is_banned(const struct i915_gem_context *ctx)
>   {
>   	return test_bit(CONTEXT_BANNED, &ctx->flags);
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> index ab8e1367dfc8..a3ecd19f2303 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> @@ -137,6 +137,7 @@ struct i915_gem_context {
>   #define UCONTEXT_NO_ERROR_CAPTURE	1
>   #define UCONTEXT_BANNABLE		2
>   #define UCONTEXT_RECOVERABLE		3
> +#define UCONTEXT_PERSISTENCE		4
>   
>   	/**
>   	 * @flags: small set of booleans
> diff --git a/drivers/gpu/drm/i915/gem/selftests/mock_context.c b/drivers/gpu/drm/i915/gem/selftests/mock_context.c
> index 74ddd682c9cd..29b8984f0e47 100644
> --- a/drivers/gpu/drm/i915/gem/selftests/mock_context.c
> +++ b/drivers/gpu/drm/i915/gem/selftests/mock_context.c
> @@ -22,6 +22,8 @@ mock_context(struct drm_i915_private *i915,
>   	INIT_LIST_HEAD(&ctx->link);
>   	ctx->i915 = i915;
>   
> +	i915_gem_context_set_persistence(ctx);
> +
>   	mutex_init(&ctx->engines_mutex);
>   	e = default_engines(ctx);
>   	if (IS_ERR(e))
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 30c542144016..eb9e704d717a 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -1565,6 +1565,21 @@ struct drm_i915_gem_context_param {
>    *   i915_context_engines_bond (I915_CONTEXT_ENGINES_EXT_BOND)
>    */
>   #define I915_CONTEXT_PARAM_ENGINES	0xa
> +
> +/*
> + * I915_CONTEXT_PARAM_PERSISTENCE:
> + *
> + * Allow the context and active rendering to survive the process until
> + * completion. Persistence allows fire-and-forget clients to queue up a
> + * bunch of work, hand the output over to a display server and then quit.
> + * If the context is not marked as persistent, upon closing (either via
> + * an explicit DRM_I915_GEM_CONTEXT_DESTROY or implicitly from file closure
> + * or process termination), the context and any outstanding requests will be
> + * cancelled (and exported fences for cancelled requests marked as -EIO).
> + *
> + * By default, new contexts allow persistence.
> + */
> +#define I915_CONTEXT_PARAM_PERSISTENCE	0xb
>   /* Must be kept compact -- no holes and well documented */
>   
>   	__u64 value;
> 
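On the uapi comment: the "exported fences ... marked as -EIO" behaviour
should be directly observable by clients. Something like the sketch
below (mine, assuming the request's fence was exported as a sync_file
via execbuf) could assert it in the test:

#include <linux/sync_file.h>
#include <sys/ioctl.h>

/*
 * Sketch: after closing a non-persistent context, a sync_file exported
 * for a still-incomplete request should report an error status.
 */
static int sync_file_status(int sync_fd)
{
	struct sync_file_info info = { };

	if (ioctl(sync_fd, SYNC_IOC_FILE_INFO, &info))
		return -1;

	/* 0 = active, 1 = signalled, negative = error (-EIO expected) */
	return info.status;
}
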

Regards,

Tvrtko

