[Intel-gfx] [PATCH] drm/i915: Cancel persistent contexts if !hangcheck

Chris Wilson chris at chris-wilson.co.uk
Mon Aug 5 22:19:56 UTC 2019


Normally, we rely on our hangcheck to prevent persistent batches from
hogging the GPU. However, if the user disables hangcheck, this mechanism
breaks down. Despite our insistence that this is unsafe, the users are
equally insistent that they want to use endless batches and will disable
the hangcheck mechanism. We are looking at perhaps replacing hangcheck
with a softer mechanism, that sends a pulse down the engine to check if
it is well. We can use the same preemptive pulse to flush an active
persistent context off the GPU upon context close, preventing resources
being lost and unkillable requests remaining on the GPU, after process
termination.

XXX Fixup banned contexts on schedule-out, so that we don't jump back
into the middle of an infinite loop on resume.

XXX This does need a preempt-to-idle style of pulse, I need to set a
flag on the request to prevent queuing into the second port, so that we
have a context-out event where we can patch up the context image.

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
Cc: Michał Winiarski <michal.winiarski at intel.com>
Cc: Jon Bloomfield <jon.bloomfield at intel.com>

---
Same sort of caveats as for hangcheck, a few corner cases need
struct_mutex and some preallocation.
---
 drivers/gpu/drm/i915/Makefile                 |  3 +-
 drivers/gpu/drm/i915/gem/i915_gem_context.c   | 46 ++++++++++++++++
 .../gpu/drm/i915/gt/intel_engine_heartbeat.c  | 53 +++++++++++++++++++
 .../gpu/drm/i915/gt/intel_engine_heartbeat.h  | 14 +++++
 4 files changed, 115 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
 create mode 100644 drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index a1016858d014..455f5c6213ee 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -71,8 +71,9 @@ obj-y += gt/
 gt-y += \
 	gt/intel_breadcrumbs.o \
 	gt/intel_context.o \
-	gt/intel_engine_pool.o \
 	gt/intel_engine_cs.o \
+	gt/intel_engine_heartbeat.o \
+	gt/intel_engine_pool.o \
 	gt/intel_engine_pm.o \
 	gt/intel_gt.o \
 	gt/intel_gt_pm.o \
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index 64f7a533e886..bdf21cf9ce3c 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -70,6 +70,7 @@
 #include <drm/i915_drm.h>
 
 #include "gt/intel_lrc_reg.h"
+#include "gt/intel_engine_heartbeat.h"
 
 #include "i915_gem_context.h"
 #include "i915_globals.h"
@@ -373,6 +374,42 @@ void i915_gem_context_release(struct kref *ref)
 		queue_work(i915->wq, &i915->contexts.free_work);
 }
 
+/* Ban @ctx, then pulse each engine named by its last tracked requests. */
+static void kill_context(struct i915_gem_context *ctx)
+{
+	struct i915_gem_engines_iter it;
+	struct intel_engine_cs *engine;
+	intel_engine_mask_t busy = 0, tmp;
+	struct intel_context *ce;
+
+	if (i915_gem_context_is_banned(ctx))
+		return;
+
+	i915_gem_context_set_banned(ctx);
+
+	/* Collect the engine of each context's last tracked request, if any. */
+	for_each_gem_engine(ce, i915_gem_context_lock_engines(ctx), it) {
+		struct i915_request *last;
+
+		last = i915_active_request_get_unlocked(&ce->ring->timeline->last_request);
+		if (last) {
+			busy |= last->engine->mask;
+			i915_request_put(last);
+		}
+	}
+	i915_gem_context_unlock_engines(ctx);
+
+	/*
+	 * Pulse each of those engines at high priority so that whatever
+	 * is currently executing gets momentarily preempted (and reset
+	 * should it fail to be preempted). The context is banned by
+	 * now, so any of its incomplete requests, including a running
+	 * one, will be skipped.
+	 */
+	for_each_engine_masked(engine, ctx->i915, busy, tmp)
+		intel_engine_pulse(engine);
+}
+
 static void context_close(struct i915_gem_context *ctx)
 {
 	mutex_lock(&ctx->mutex);
@@ -394,6 +431,15 @@ static void context_close(struct i915_gem_context *ctx)
 	lut_close(ctx);
 
 	mutex_unlock(&ctx->mutex);
+
+	/*
+	 * If the user has disabled hangchecking, we can not be sure that
+	 * the batches will ever complete and let the context be freed.
+	 * Forcibly kill off any remaining requests in this case.
+	 */
+	if (!i915_modparams.enable_hangcheck)
+		kill_context(ctx);
+
 	i915_gem_context_put(ctx);
 }
 
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
new file mode 100644
index 000000000000..0c6ea9750048
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
@@ -0,0 +1,53 @@
+/*
+ * SPDX-License-Identifier: MIT
+ *
+ * Copyright © 2019 Intel Corporation
+ */
+
+#include "i915_request.h"
+
+#include "intel_context.h"
+#include "intel_engine_heartbeat.h"
+#include "intel_engine_pm.h"
+#include "intel_engine.h"
+#include "intel_gt.h"
+
+/* Queue a maximum-priority kernel-context request on @engine (a "pulse"). */
+void intel_engine_pulse(struct intel_engine_cs *engine)
+{
+	struct intel_context *ce = engine->kernel_context;
+	struct i915_sched_attr attr = { .priority = INT_MAX };
+	struct i915_request *rq;
+	int err = 0; /* read after out_unlock on the success path too */
+
+	GEM_BUG_ON(!engine->schedule);
+
+	if (!intel_engine_pm_get_if_awake(engine))
+		return;
+
+	mutex_lock(&ce->ring->timeline->mutex);
+
+	intel_context_enter(ce);
+	rq = __i915_request_create(ce, GFP_NOWAIT | __GFP_NOWARN);
+	intel_context_exit(ce);
+	if (IS_ERR(rq)) {
+		err = PTR_ERR(rq);
+		goto out_unlock;
+	}
+	i915_request_get(rq);
+
+	engine->wakeref_serial = engine->serial + 1;
+	__i915_request_commit(rq);
+
+	local_bh_disable();
+	engine->schedule(rq, &attr);
+	local_bh_enable();
+	i915_request_put(rq);
+
+out_unlock:
+	/* Locked once above, so unlock once: a second unlock would be a bug. */
+	mutex_unlock(&ce->ring->timeline->mutex);
+	intel_engine_pm_put(engine);
+	if (err) /* XXX must not be allowed to fail */
+		DRM_ERROR("Failed to ping %s, err=%d\n", engine->name, err);
+}
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
new file mode 100644
index 000000000000..86761748dc21
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.h
@@ -0,0 +1,14 @@
+/*
+ * SPDX-License-Identifier: MIT
+ *
+ * Copyright © 2019 Intel Corporation
+ */
+
+#ifndef INTEL_ENGINE_HEARTBEAT_H
+#define INTEL_ENGINE_HEARTBEAT_H
+
+struct intel_engine_cs;
+
+void intel_engine_pulse(struct intel_engine_cs *engine);
+
+#endif /* INTEL_ENGINE_HEARTBEAT_H */
-- 
2.23.0.rc1



More information about the Intel-gfx mailing list