[PATCH 27/75] drm/i915: Move context revocation to scheduler

Tue Feb 2 00:35:53 UTC 2021

Centralise the means by which to remove a context from execution to the
scheduler, allowing the backends to specialise as necessary. Note that
without backend support, we can simplify the procedure to forcibly reset
the HW to remove the context.

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/gem/i915_gem_context.c   | 117 +-----------------
 .../drm/i915/gt/intel_execlists_submission.c  |  46 +++++++
 drivers/gpu/drm/i915/i915_scheduler.c         |  19 +++
 drivers/gpu/drm/i915/i915_scheduler_types.h   |   5 +
 4 files changed, 73 insertions(+), 114 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index ca37d93ef5e7..be75f861db67 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -382,104 +382,9 @@ __context_engines_static(const struct i915_gem_context *ctx)
 	return rcu_dereference_protected(ctx->engines, true);
 }
 
-static void __reset_context(struct i915_gem_context *ctx,
-			    struct intel_engine_cs *engine)
-{
-	intel_gt_handle_error(engine->gt, engine->mask, 0,
-			      "context closure in %s", ctx->name);
-}
-
-static bool __cancel_engine(struct intel_engine_cs *engine)
-{
-	/*
-	 * Send a "high priority pulse" down the engine to cause the
-	 * current request to be momentarily preempted. (If it fails to
-	 * be preempted, it will be reset). As we have marked our context
-	 * as banned, any incomplete request, including any running, will
-	 * be skipped following the preemption.
-	 *
-	 * If there is no hangchecking (one of the reasons why we try to
-	 * cancel the context) and no forced preemption, there may be no
-	 * means by which we reset the GPU and evict the persistent hog.
-	 * Ergo if we are unable to inject a preemptive pulse that can
-	 * kill the banned context, we fallback to doing a local reset
-	 * instead.
-	 */
-	return intel_engine_pulse(engine) == 0;
-}
-
-static bool
-__active_engine(struct i915_request *rq, struct intel_engine_cs **active)
-{
-	struct intel_engine_cs *engine, *locked;
-	bool ret = false;
-
-	/*
-	 * Serialise with __i915_request_submit() so that it sees
-	 * is-banned?, or we know the request is already inflight.
-	 *
-	 * Note that rq->engine is unstable, and so we double
-	 * check that we have acquired the lock on the final engine.
-	 */
-	locked = READ_ONCE(rq->engine);
-	spin_lock_irq(&locked->sched.lock);
-	while (unlikely(locked != (engine = READ_ONCE(rq->engine)))) {
-		spin_unlock(&locked->sched.lock);
-		locked = engine;
-		spin_lock(&locked->sched.lock);
-	}
-
-	if (i915_request_is_active(rq)) {
-		if (!__i915_request_is_complete(rq))
-			*active = locked;
-		ret = true;
-	}
-
-	spin_unlock_irq(&locked->sched.lock);
-
-	return ret;
-}
-
-static struct intel_engine_cs *active_engine(struct intel_context *ce)
-{
-	struct intel_engine_cs *engine = NULL;
-	struct i915_request *rq;
-
-	if (intel_context_has_inflight(ce))
-		return intel_context_inflight(ce);
-
-	if (!ce->timeline)
-		return NULL;
-
-	/*
-	 * rq->link is only SLAB_TYPESAFE_BY_RCU, we need to hold a reference
-	 * to the request to prevent it being transferred to a new timeline
-	 * (and onto a new timeline->requests list).
-	 */
-	rcu_read_lock();
-	list_for_each_entry_reverse(rq, &ce->timeline->requests, link) {
-		bool found;
-
-		/* timeline is already completed upto this point? */
-		if (!i915_request_get_rcu(rq))
-			break;
-
-		/* Check with the backend if the request is inflight */
-		found = true;
-		if (likely(rcu_access_pointer(rq->timeline) == ce->timeline))
-			found = __active_engine(rq, &engine);
-
-		i915_request_put(rq);
-		if (found)
-			break;
-	}
-	rcu_read_unlock();
-
-	return engine;
-}
-
 static void kill_engines(struct i915_gem_engines *engines, bool ban)
 {
+	const int error = ban ? -EIO : -EAGAIN;
 	struct i915_gem_engines_iter it;
 	struct intel_context *ce;
 
@@ -491,28 +396,12 @@ static void kill_engines(struct i915_gem_engines *engines, bool ban)
 	 * engines on which there are incomplete requests.
 	 */
 	for_each_gem_engine(ce, engines, it) {
-		struct intel_engine_cs *engine;
+		struct i915_sched *se = intel_engine_get_scheduler(ce->engine);
 
 		if (ban && intel_context_set_banned(ce))
 			continue;
 
-		/*
-		 * Check the current active state of this context; if we
-		 * are currently executing on the GPU we need to evict
-		 * ourselves. On the other hand, if we haven't yet been
-		 * submitted to the GPU or if everything is complete,
-		 * we have nothing to do.
-		 */
-		engine = active_engine(ce);
-
-		/* First attempt to gracefully cancel the context */
-		if (engine && !__cancel_engine(engine) && ban)
-			/*
-			 * If we are unable to send a preemptive pulse to bump
-			 * the context from the GPU, we have to resort to a full
-			 * reset. We hope the collateral damage is worth it.
-			 */
-			__reset_context(engines->ctx, engine);
+		se->revoke_context(ce, ban ? engines->ctx->name : NULL, error);
 	}
 }
 
diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index b1a79e637621..339bac693b97 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -114,6 +114,7 @@
 #include "gen8_engine_cs.h"
 #include "intel_breadcrumbs.h"
 #include "intel_context.h"
+#include "intel_engine_heartbeat.h"
 #include "intel_engine_pm.h"
 #include "intel_engine_stats.h"
 #include "intel_execlists_submission.h"
@@ -2810,6 +2811,50 @@ static bool execlists_is_executing(const struct i915_request *rq)
 	return inflight;
 }
 
+static bool __cancel_engine(struct intel_engine_cs *engine)
+{
+	/*
+	 * Send a "high priority pulse" down the engine to cause the
+	 * current request to be momentarily preempted. (If it fails to
+	 * be preempted, it will be reset). As we have marked our context
+	 * as banned, any incomplete request, including any running, will
+	 * be skipped following the preemption.
+	 *
+	 * If there is no hangchecking (one of the reasons why we try to
+	 * cancel the context) and no forced preemption, there may be no
+	 * means by which we reset the GPU and evict the persistent hog.
+	 * Ergo if we are unable to inject a preemptive pulse that can
+	 * kill the banned context, we fallback to doing a local reset
+	 * instead.
+	 */
+	return intel_engine_pulse(engine) == 0;
+}
+
+static void
+execlists_revoke_context(struct intel_context *ce, const char *force, int error)
+{
+	struct intel_engine_cs *engine;
+
+	/*
+	 * Check the current active state of this context; if we
+	 * are currently executing on the GPU we need to evict
+	 * ourselves. On the other hand, if we haven't yet been
+	 * submitted to the GPU or if everything is complete,
+	 * we have nothing to do.
+	 */
+	engine = intel_context_inflight(ce);
+
+	/* First attempt to gracefully cancel the context */
+	if (engine && !__cancel_engine(engine) && force)
+		/*
+		 * If we are unable to send a preemptive pulse to bump
+		 * the context from the GPU, we have to resort to a full
+		 * reset. We hope the collateral damage is worth it.
+		 */
+		intel_gt_handle_error(engine->gt, engine->mask, 0,
+				      "context revoked from %s", force);
+}
+
 static bool can_preempt(struct intel_engine_cs *engine)
 {
 	if (INTEL_GEN(engine->i915) > 8)
@@ -2947,6 +2992,7 @@ static void init_execlists(struct intel_engine_cs *engine)
 
 	engine->sched.active_request = execlists_active_request;
 	engine->sched.is_executing = execlists_is_executing;
+	engine->sched.revoke_context = execlists_revoke_context;
 	tasklet_setup(&engine->sched.tasklet, execlists_submission_tasklet);
 
 	timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
index 427b48b52191..86cdcf531fc0 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.c
+++ b/drivers/gpu/drm/i915/i915_scheduler.c
@@ -155,6 +155,24 @@ static const struct i915_request *active_request(struct i915_sched *se)
 	return active;
 }
 
+static bool context_active(struct intel_context *ce)
+{
+	return i915_active_fence_isset(&ce->timeline->last_request);
+}
+
+static void revoke_context(struct intel_context *ce,
+			   const char *force,
+			   int error)
+{
+	/*
+	 * Without backend support, we cannot remove the context from the
+	 * HW gracefully. All we can do is force a reset, as a last resort.
+	 */
+	if (force && context_active(ce))
+		intel_gt_handle_error(ce->engine->gt, ce->engine->mask, 0,
+				      "context revoked from %s", force);
+}
+
 static bool not_executing(const struct i915_request *rq)
 {
 	return false;
@@ -184,6 +202,7 @@ void i915_sched_init(struct i915_sched *se,
 	se->submit_request = i915_request_enqueue;
 	se->active_request = active_request;
 	se->is_executing = not_executing;
+	se->revoke_context = revoke_context;
 }
 
 void i915_sched_park(struct i915_sched *se)
diff --git a/drivers/gpu/drm/i915/i915_scheduler_types.h b/drivers/gpu/drm/i915/i915_scheduler_types.h
index 9a9b8e0d78ae..51599d4a022d 100644
--- a/drivers/gpu/drm/i915/i915_scheduler_types.h
+++ b/drivers/gpu/drm/i915/i915_scheduler_types.h
@@ -14,6 +14,7 @@
 #include "i915_priolist_types.h"
 
 struct i915_request;
+struct intel_context;
 
 /**
  * struct i915_sched - funnels requests towards hardware
@@ -41,6 +42,10 @@ struct i915_sched {
 
 	bool (*is_executing)(const struct i915_request *rq);
 
+	void (*revoke_context)(struct intel_context *ce,
+			       const char *whom,
+			       int error);
+
 	struct list_head requests; /* active request, on HW */
 	struct list_head hold; /* ready requests, but on hold */
 	/**
-- 
2.20.1