[Intel-gfx] [PATCH 5/5] drm/i915: Cancel non-persistent contexts on close

Tue Aug 6 13:47:25 UTC 2019

Normally, we rely on our hangcheck to prevent persistent batches from
hogging the GPU. However, if the user disables hangcheck, this mechanism
breaks down. Despite our insistence that this is unsafe, the users are
equally insistent that they want to use endless batches and will disable
the hangcheck mechanism. We are looking are perhaps replacing hangcheck
with a softer mechanism, that sends a pulse down the engine to check if
it is well. We can use the same preemptive pulse to flush an active
persistent context off the GPU upon context close, preventing resources
being lost and unkillable requests remaining on the GPU, after process
termination. To avoid changing the ABI and accidentally breaking
existing userspace, we make the persistence of a context explicit and
enable it by default. Userspace can opt out of persistent mode (forcing
requests to be cancelled when the context is closed by process
termination or explicitly) by a context parameter, or to facilitate
existing use-cases by disabling hangcheck (i915.enable_hangcheck=0).
(Note, one of the outcomes for supporting endless mode will be the
removal of hangchecking, at which point opting into persistent mode will
be mandatory, or maybe the default.)

Testcase: igt/gem_ctx_persistence
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
Cc: Michał Winiarski <michal.winiarski at intel.com>
Cc: Jon Bloomfield <jon.bloomfield at intel.com>

---
Same sort of caveats as for hangcheck, a few corner cases need
struct_mutex and some preallocation.
---
 drivers/gpu/drm/i915/Makefile                 |  3 +-
 drivers/gpu/drm/i915/gem/i915_gem_context.c   | 79 +++++++++++++++++++
 drivers/gpu/drm/i915/gem/i915_gem_context.h   | 15 ++++
 .../gpu/drm/i915/gem/i915_gem_context_types.h |  1 +
 .../gpu/drm/i915/gt/intel_engine_heartbeat.c  | 53 +++++++++++++
 .../gpu/drm/i915/gt/intel_engine_heartbeat.h  | 14 ++++
 include/uapi/drm/i915_drm.h                   | 15 ++++
 7 files changed, 179 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
 create mode 100644 drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index a1016858d014..42417d87496e 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -71,9 +71,10 @@ obj-y += gt/
 gt-y += \
 	gt/intel_breadcrumbs.o \
 	gt/intel_context.o \
-	gt/intel_engine_pool.o \
 	gt/intel_engine_cs.o \
+	gt/intel_engine_heartbeat.o \
 	gt/intel_engine_pm.o \
+	gt/intel_engine_pool.o \
 	gt/intel_gt.o \
 	gt/intel_gt_pm.o \
 	gt/intel_hangcheck.o \
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index 64f7a533e886..5718b74f95b7 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -70,6 +70,7 @@
 #include <drm/i915_drm.h>
 
 #include "gt/intel_lrc_reg.h"
+#include "gt/intel_engine_heartbeat.h"
 
 #include "i915_gem_context.h"
 #include "i915_globals.h"
@@ -373,6 +374,45 @@ void i915_gem_context_release(struct kref *ref)
 		queue_work(i915->wq, &i915->contexts.free_work);
 }
 
+static void kill_context(struct i915_gem_context *ctx)
+{
+	struct i915_gem_engines_iter it;
+	struct intel_engine_cs *engine;
+	intel_engine_mask_t tmp, active;
+	struct intel_context *ce;
+
+	if (i915_gem_context_is_banned(ctx))
+		return;
+
+	i915_gem_context_set_banned(ctx);
+
+	active = 0;
+	for_each_gem_engine(ce, i915_gem_context_lock_engines(ctx), it) {
+		struct i915_request *rq;
+
+		if (!ce->ring)
+			continue;
+
+		rq = i915_active_request_get_unlocked(&ce->ring->timeline->last_request);
+		if (!rq)
+			continue;
+
+		active |= rq->engine->mask;
+		i915_request_put(rq);
+	}
+	i915_gem_context_unlock_engines(ctx);
+
+	/*
+	 * Send a "high priority pulse" down the engine to cause the
+	 * current request to be momentarily preempted. (If it fails to
+	 * be preempted, it will be reset). As we have marked our context
+	 * as banned, any incomplete request, including any running, will
+	 * be skipped.
+	 */
+	for_each_engine_masked(engine, ctx->i915, active, tmp)
+		intel_engine_pulse(engine);
+}
+
 static void context_close(struct i915_gem_context *ctx)
 {
 	mutex_lock(&ctx->mutex);
@@ -394,6 +434,15 @@ static void context_close(struct i915_gem_context *ctx)
 	lut_close(ctx);
 
 	mutex_unlock(&ctx->mutex);
+
+	/*
+	 * If the user has disabled hangchecking, we can not be sure that
+	 * the batches will ever complete and let the context be freed.
+	 * Forcibly kill off any remaining requests in this case.
+	 */
+	if (!i915_gem_context_is_persistent(ctx))
+		kill_context(ctx);
+
 	i915_gem_context_put(ctx);
 }
 
@@ -433,6 +482,8 @@ __create_context(struct drm_i915_private *i915)
 
 	i915_gem_context_set_bannable(ctx);
 	i915_gem_context_set_recoverable(ctx);
+	if (i915_modparams.enable_hangcheck)
+		i915_gem_context_set_persistence(ctx);
 
 	ctx->ring_size = 4 * PAGE_SIZE;
 
@@ -1745,6 +1796,25 @@ get_engines(struct i915_gem_context *ctx,
 	return err;
 }
 
+static int
+set_persistence(struct i915_gem_context *ctx,
+		const struct drm_i915_gem_context_param *args)
+{
+	if (args->size)
+		return -EINVAL;
+
+	if (!args->value) {
+		i915_gem_context_clear_persistence(ctx);
+		return 0;
+	}
+
+	if (!(ctx->i915->caps.scheduler & I915_SCHEDULER_CAP_PREEMPTION))
+		return -ENODEV;
+
+	i915_gem_context_set_persistence(ctx);
+	return 0;
+}
+
 static int ctx_setparam(struct drm_i915_file_private *fpriv,
 			struct i915_gem_context *ctx,
 			struct drm_i915_gem_context_param *args)
@@ -1822,6 +1892,10 @@ static int ctx_setparam(struct drm_i915_file_private *fpriv,
 		ret = set_engines(ctx, args);
 		break;
 
+	case I915_CONTEXT_PARAM_PERSISTENCE:
+		ret = set_persistence(ctx, args);
+		break;
+
 	case I915_CONTEXT_PARAM_BAN_PERIOD:
 	default:
 		ret = -EINVAL;
@@ -2278,6 +2352,11 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 		ret = get_engines(ctx, args);
 		break;
 
+	case I915_CONTEXT_PARAM_PERSISTENCE:
+		args->size = 0;
+		args->value = i915_gem_context_is_persistent(ctx);
+		break;
+
 	case I915_CONTEXT_PARAM_BAN_PERIOD:
 	default:
 		ret = -EINVAL;
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.h b/drivers/gpu/drm/i915/gem/i915_gem_context.h
index 106e2ccf7a4c..1834d1df2ac9 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.h
@@ -74,6 +74,21 @@ static inline void i915_gem_context_clear_recoverable(struct i915_gem_context *c
 	clear_bit(UCONTEXT_RECOVERABLE, &ctx->user_flags);
 }
 
+static inline bool i915_gem_context_is_persistent(const struct i915_gem_context *ctx)
+{
+	return test_bit(UCONTEXT_PERSISTENCE, &ctx->user_flags);
+}
+
+static inline void i915_gem_context_set_persistence(struct i915_gem_context *ctx)
+{
+	set_bit(UCONTEXT_PERSISTENCE, &ctx->user_flags);
+}
+
+static inline void i915_gem_context_clear_persistence(struct i915_gem_context *ctx)
+{
+	clear_bit(UCONTEXT_PERSISTENCE, &ctx->user_flags);
+}
+
 static inline bool i915_gem_context_is_banned(const struct i915_gem_context *ctx)
 {
 	return test_bit(CONTEXT_BANNED, &ctx->flags);
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
index a02d98494078..bf41b43ffa9a 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
@@ -137,6 +137,7 @@ struct i915_gem_context {
 #define UCONTEXT_NO_ERROR_CAPTURE	1
 #define UCONTEXT_BANNABLE		2
 #define UCONTEXT_RECOVERABLE		3
+#define UCONTEXT_PERSISTENCE		4
 
 	/**
 	 * @flags: small set of booleans
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
new file mode 100644
index 000000000000..0c0632a423a7
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
@@ -0,0 +1,53 @@
+/*
+ * SPDX-License-Identifier: MIT
+ *
+ * Copyright © 2019 Intel Corporation
+ */
+
+#include "i915_request.h"
+
+#include "intel_context.h"
+#include "intel_engine_heartbeat.h"
+#include "intel_engine_pm.h"
+#include "intel_engine.h"
+#include "intel_gt.h"
+
+void intel_engine_pulse(struct intel_engine_cs *engine)
+{
+	struct intel_context *ce = engine->kernel_context;
+	struct i915_sched_attr attr = { .priority = INT_MAX };
+	struct i915_request *rq;
+	int err = 0;
+
+	GEM_BUG_ON(!engine->schedule);
+
+	if (!intel_engine_pm_get_if_awake(engine))
+		return;
+
+	mutex_lock(&ce->ring->timeline->mutex);
+
+	intel_context_enter(ce);
+	rq = __i915_request_create(ce, GFP_NOWAIT | __GFP_NOWARN);
+	intel_context_exit(ce);
+	if (IS_ERR(rq)) {
+		err = PTR_ERR(rq);
+		goto out_unlock;
+	}
+	i915_request_get(rq);
+
+	rq->flags |= I915_REQUEST_SENTINEL;
+	__i915_request_commit(rq);
+
+	local_bh_disable();
+	engine->schedule(rq, &attr);
+	local_bh_enable();
+
+	i915_request_put(rq);
+
+out_unlock:
+	mutex_unlock(&ce->ring->timeline->mutex);
+	intel_context_timeline_unlock(ce);
+	intel_engine_pm_put(engine);
+	if (err) /* XXX must not be allowed to fail */
+		DRM_ERROR("Failed to ping %s, err=%d\n", engine->name, err);
+}
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
new file mode 100644
index 000000000000..86761748dc21
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
@@ -0,0 +1,14 @@
+/*
+ * SPDX-License-Identifier: MIT
+ *
+ * Copyright © 2019 Intel Corporation
+ */
+
+#ifndef INTEL_ENGINE_HEARTBEAT_H
+#define INTEL_ENGINE_HEARTBEAT_H
+
+struct intel_engine_cs;
+
+void intel_engine_pulse(struct intel_engine_cs *engine);
+
+#endif /* INTEL_ENGINE_HEARTBEAT_H */
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 469dc512cca3..dbc8691d75d0 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1565,6 +1565,21 @@ struct drm_i915_gem_context_param {
  *   i915_context_engines_bond (I915_CONTEXT_ENGINES_EXT_BOND)
  */
 #define I915_CONTEXT_PARAM_ENGINES	0xa
+
+/*
+ * I915_CONTEXT_PARAM_PERSISTENCE:
+ *
+ * Allow the context and active rendering to survive the process until
+ * completion. Persistence allows fire-and-forget clients to queue up a
+ * bunch of work, hand the output over to a display server and the quit.
+ * If the context is not marked as persistent, upon closing (either via
+ * an explicit DRM_I915_GEM_CONTEXT_DESTROY or implicitly from file closure
+ * or process termination), the context and any outstanding requests will be
+ * cancelled (and exported fences for cancelled requests marked as -EIO).
+ *
+ * By default, new contexts allow persistence.
+ */
+#define I915_CONTEXT_PARAM_PERSISTENCE	0xb
 /* Must be kept compact -- no holes and well documented */
 
 	__u64 value;
-- 
2.23.0.rc1