[Intel-gfx] [PATCH 5/5] drm/i915: Cancel non-persistent contexts on close

Tue Aug 6 14:26:07 UTC 2019

> -----Original Message-----
> From: Chris Wilson <chris at chris-wilson.co.uk>
> Sent: Tuesday, August 6, 2019 6:47 AM
> To: intel-gfx at lists.freedesktop.org
> Cc: Chris Wilson <chris at chris-wilson.co.uk>; Joonas Lahtinen
> <joonas.lahtinen at linux.intel.com>; Winiarski, Michal
> <michal.winiarski at intel.com>; Bloomfield, Jon <jon.bloomfield at intel.com>
> Subject: [PATCH 5/5] drm/i915: Cancel non-persistent contexts on close
> 
> Normally, we rely on our hangcheck to prevent persistent batches from
> hogging the GPU. However, if the user disables hangcheck, this mechanism
> breaks down. Despite our insistence that this is unsafe, the users are
> equally insistent that they want to use endless batches and will disable
> the hangcheck mechanism. We are looking are perhaps replacing hangcheck
> with a softer mechanism, that sends a pulse down the engine to check if
> it is well. We can use the same preemptive pulse to flush an active
> persistent context off the GPU upon context close, preventing resources
> being lost and unkillable requests remaining on the GPU, after process
> termination. To avoid changing the ABI and accidentally breaking
> existing userspace, we make the persistence of a context explicit and
> enable it by default. Userspace can opt out of persistent mode (forcing
> requests to be cancelled when the context is closed by process
> termination or explicitly) by a context parameter, or to facilitate
> existing use-cases by disabling hangcheck (i915.enable_hangcheck=0).
> (Note, one of the outcomes for supporting endless mode will be the
> removal of hangchecking, at which point opting into persistent mode will
> be mandatory, or maybe the default.)
> 
> Testcase: igt/gem_ctx_persistence
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
> Cc: Michał Winiarski <michal.winiarski at intel.com>
> Cc: Jon Bloomfield <jon.bloomfield at intel.com>
> 
> ---
> Same sort of caveats as for hangcheck, a few corner cases need
> struct_mutex and some preallocation.
> ---
>  drivers/gpu/drm/i915/Makefile                 |  3 +-
>  drivers/gpu/drm/i915/gem/i915_gem_context.c   | 79 +++++++++++++++++++
>  drivers/gpu/drm/i915/gem/i915_gem_context.h   | 15 ++++
>  .../gpu/drm/i915/gem/i915_gem_context_types.h |  1 +
>  .../gpu/drm/i915/gt/intel_engine_heartbeat.c  | 53 +++++++++++++
>  .../gpu/drm/i915/gt/intel_engine_heartbeat.h  | 14 ++++
>  include/uapi/drm/i915_drm.h                   | 15 ++++
>  7 files changed, 179 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
>  create mode 100644 drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
> 
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index a1016858d014..42417d87496e 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -71,9 +71,10 @@ obj-y += gt/
>  gt-y += \
>  	gt/intel_breadcrumbs.o \
>  	gt/intel_context.o \
> -	gt/intel_engine_pool.o \
>  	gt/intel_engine_cs.o \
> +	gt/intel_engine_heartbeat.o \
>  	gt/intel_engine_pm.o \
> +	gt/intel_engine_pool.o \
>  	gt/intel_gt.o \
>  	gt/intel_gt_pm.o \
>  	gt/intel_hangcheck.o \
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c
> b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> index 64f7a533e886..5718b74f95b7 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> @@ -70,6 +70,7 @@
>  #include <drm/i915_drm.h>
> 
>  #include "gt/intel_lrc_reg.h"
> +#include "gt/intel_engine_heartbeat.h"
> 
>  #include "i915_gem_context.h"
>  #include "i915_globals.h"
> @@ -373,6 +374,45 @@ void i915_gem_context_release(struct kref *ref)
>  		queue_work(i915->wq, &i915->contexts.free_work);
>  }
> 
> +static void kill_context(struct i915_gem_context *ctx)
> +{
> +	struct i915_gem_engines_iter it;
> +	struct intel_engine_cs *engine;
> +	intel_engine_mask_t tmp, active;
> +	struct intel_context *ce;
> +
> +	if (i915_gem_context_is_banned(ctx))
> +		return;
> +
> +	i915_gem_context_set_banned(ctx);
> +
> +	active = 0;
> +	for_each_gem_engine(ce, i915_gem_context_lock_engines(ctx), it) {
> +		struct i915_request *rq;
> +
> +		if (!ce->ring)
> +			continue;
> +
> +		rq = i915_active_request_get_unlocked(&ce->ring->timeline-
> >last_request);
> +		if (!rq)
> +			continue;
> +
> +		active |= rq->engine->mask;
> +		i915_request_put(rq);
> +	}
> +	i915_gem_context_unlock_engines(ctx);
> +
> +	/*
> +	 * Send a "high priority pulse" down the engine to cause the
> +	 * current request to be momentarily preempted. (If it fails to
> +	 * be preempted, it will be reset). As we have marked our context
> +	 * as banned, any incomplete request, including any running, will
> +	 * be skipped.
> +	 */
> +	for_each_engine_masked(engine, ctx->i915, active, tmp)
> +		intel_engine_pulse(engine);
> +}
> +
>  static void context_close(struct i915_gem_context *ctx)
>  {
>  	mutex_lock(&ctx->mutex);
> @@ -394,6 +434,15 @@ static void context_close(struct i915_gem_context
> *ctx)
>  	lut_close(ctx);
> 
>  	mutex_unlock(&ctx->mutex);
> +
> +	/*
> +	 * If the user has disabled hangchecking, we can not be sure that
> +	 * the batches will ever complete and let the context be freed.
> +	 * Forcibly kill off any remaining requests in this case.
> +	 */

Persistence is independent of hangcheck which makes the above comment unrepresentative of the below code.
Do we even need a comment here, it looks self-explanatory? Instead maybe comment the defaulting state in __createContext() to make the connection between hangcheck and persistence?

> +	if (!i915_gem_context_is_persistent(ctx))
> +		kill_context(ctx);
> +
>  	i915_gem_context_put(ctx);
>  }
> 
> @@ -433,6 +482,8 @@ __create_context(struct drm_i915_private *i915)
> 
>  	i915_gem_context_set_bannable(ctx);
>  	i915_gem_context_set_recoverable(ctx);
> +	if (i915_modparams.enable_hangcheck)
> +		i915_gem_context_set_persistence(ctx);
> 
>  	ctx->ring_size = 4 * PAGE_SIZE;
> 
> @@ -1745,6 +1796,25 @@ get_engines(struct i915_gem_context *ctx,
>  	return err;
>  }
> 
> +static int
> +set_persistence(struct i915_gem_context *ctx,
> +		const struct drm_i915_gem_context_param *args)
> +{
> +	if (args->size)
> +		return -EINVAL;
> +
> +	if (!args->value) {
> +		i915_gem_context_clear_persistence(ctx);
> +		return 0;
> +	}
> +
> +	if (!(ctx->i915->caps.scheduler &
> I915_SCHEDULER_CAP_PREEMPTION))
> +		return -ENODEV;
> +
> +	i915_gem_context_set_persistence(ctx);
> +	return 0;
> +}
> +
>  static int ctx_setparam(struct drm_i915_file_private *fpriv,
>  			struct i915_gem_context *ctx,
>  			struct drm_i915_gem_context_param *args)
> @@ -1822,6 +1892,10 @@ static int ctx_setparam(struct
> drm_i915_file_private *fpriv,
>  		ret = set_engines(ctx, args);
>  		break;
> 
> +	case I915_CONTEXT_PARAM_PERSISTENCE:
> +		ret = set_persistence(ctx, args);
> +		break;
> +
>  	case I915_CONTEXT_PARAM_BAN_PERIOD:
>  	default:
>  		ret = -EINVAL;
> @@ -2278,6 +2352,11 @@ int i915_gem_context_getparam_ioctl(struct
> drm_device *dev, void *data,
>  		ret = get_engines(ctx, args);
>  		break;
> 
> +	case I915_CONTEXT_PARAM_PERSISTENCE:
> +		args->size = 0;
> +		args->value = i915_gem_context_is_persistent(ctx);
> +		break;
> +
>  	case I915_CONTEXT_PARAM_BAN_PERIOD:
>  	default:
>  		ret = -EINVAL;
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.h
> b/drivers/gpu/drm/i915/gem/i915_gem_context.h
> index 106e2ccf7a4c..1834d1df2ac9 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.h
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.h
> @@ -74,6 +74,21 @@ static inline void
> i915_gem_context_clear_recoverable(struct i915_gem_context *c
>  	clear_bit(UCONTEXT_RECOVERABLE, &ctx->user_flags);
>  }
> 
> +static inline bool i915_gem_context_is_persistent(const struct
> i915_gem_context *ctx)
> +{
> +	return test_bit(UCONTEXT_PERSISTENCE, &ctx->user_flags);
> +}
> +
> +static inline void i915_gem_context_set_persistence(struct i915_gem_context
> *ctx)
> +{
> +	set_bit(UCONTEXT_PERSISTENCE, &ctx->user_flags);
> +}
> +
> +static inline void i915_gem_context_clear_persistence(struct
> i915_gem_context *ctx)
> +{
> +	clear_bit(UCONTEXT_PERSISTENCE, &ctx->user_flags);
> +}
> +
>  static inline bool i915_gem_context_is_banned(const struct i915_gem_context
> *ctx)
>  {
>  	return test_bit(CONTEXT_BANNED, &ctx->flags);
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> index a02d98494078..bf41b43ffa9a 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> @@ -137,6 +137,7 @@ struct i915_gem_context {
>  #define UCONTEXT_NO_ERROR_CAPTURE	1
>  #define UCONTEXT_BANNABLE		2
>  #define UCONTEXT_RECOVERABLE		3
> +#define UCONTEXT_PERSISTENCE		4
> 
>  	/**
>  	 * @flags: small set of booleans
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
> b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
> new file mode 100644
> index 000000000000..0c0632a423a7
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
> @@ -0,0 +1,53 @@
> +/*
> + * SPDX-License-Identifier: MIT
> + *
> + * Copyright © 2019 Intel Corporation
> + */
> +
> +#include "i915_request.h"
> +
> +#include "intel_context.h"
> +#include "intel_engine_heartbeat.h"
> +#include "intel_engine_pm.h"
> +#include "intel_engine.h"
> +#include "intel_gt.h"
> +
> +void intel_engine_pulse(struct intel_engine_cs *engine)
> +{
> +	struct intel_context *ce = engine->kernel_context;
> +	struct i915_sched_attr attr = { .priority = INT_MAX };
> +	struct i915_request *rq;
> +	int err = 0;
> +
> +	GEM_BUG_ON(!engine->schedule);
> +
> +	if (!intel_engine_pm_get_if_awake(engine))
> +		return;
> +
> +	mutex_lock(&ce->ring->timeline->mutex);
> +
> +	intel_context_enter(ce);
> +	rq = __i915_request_create(ce, GFP_NOWAIT | __GFP_NOWARN);
> +	intel_context_exit(ce);
> +	if (IS_ERR(rq)) {
> +		err = PTR_ERR(rq);
> +		goto out_unlock;
> +	}
> +	i915_request_get(rq);
> +
> +	rq->flags |= I915_REQUEST_SENTINEL;
> +	__i915_request_commit(rq);
> +
> +	local_bh_disable();
> +	engine->schedule(rq, &attr);
> +	local_bh_enable();
> +
> +	i915_request_put(rq);
> +
> +out_unlock:
> +	mutex_unlock(&ce->ring->timeline->mutex);
> +	intel_context_timeline_unlock(ce);
> +	intel_engine_pm_put(engine);
> +	if (err) /* XXX must not be allowed to fail */
> +		DRM_ERROR("Failed to ping %s, err=%d\n", engine->name,
> err);
> +}
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
> b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
> new file mode 100644
> index 000000000000..86761748dc21
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
> @@ -0,0 +1,14 @@
> +/*
> + * SPDX-License-Identifier: MIT
> + *
> + * Copyright © 2019 Intel Corporation
> + */
> +
> +#ifndef INTEL_ENGINE_HEARTBEAT_H
> +#define INTEL_ENGINE_HEARTBEAT_H
> +
> +struct intel_engine_cs;
> +
> +void intel_engine_pulse(struct intel_engine_cs *engine);
> +
> +#endif /* INTEL_ENGINE_HEARTBEAT_H */
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 469dc512cca3..dbc8691d75d0 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -1565,6 +1565,21 @@ struct drm_i915_gem_context_param {
>   *   i915_context_engines_bond (I915_CONTEXT_ENGINES_EXT_BOND)
>   */
>  #define I915_CONTEXT_PARAM_ENGINES	0xa
> +
> +/*
> + * I915_CONTEXT_PARAM_PERSISTENCE:
> + *
> + * Allow the context and active rendering to survive the process until
> + * completion. Persistence allows fire-and-forget clients to queue up a
> + * bunch of work, hand the output over to a display server and the quit.
> + * If the context is not marked as persistent, upon closing (either via
> + * an explicit DRM_I915_GEM_CONTEXT_DESTROY or implicitly from file
> closure
> + * or process termination), the context and any outstanding requests will be
> + * cancelled (and exported fences for cancelled requests marked as -EIO).
> + *
> + * By default, new contexts allow persistence.
> + */
> +#define I915_CONTEXT_PARAM_PERSISTENCE	0xb
>  /* Must be kept compact -- no holes and well documented */
> 
>  	__u64 value;
> --
> 2.23.0.rc1