[PATCH 4/4] drm/i915/execlists: Defer schedule_out until after the next dequeue

Chris Wilson chris at chris-wilson.co.uk
Wed Jun 10 15:16:05 UTC 2020


Inside schedule_out, we do extra work upon idling the context, such as
updating the runtime, kicking off retires and kicking virtual engines.
However, if we are processing a series of single requests per context,
we may find ourselves scheduling out the context only to immediately
schedule it back in during the next dequeue. That is extra work we can
avoid by keeping the context marked as inflight across the dequeue.
This becomes more significant later on for minimising virtual engine
misses.
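
As a rough illustration of the scheme, here is a standalone toy model
(not i915 code; names such as stage_completion() and post_process() are
invented for the sketch): completed entries are parked in a post[]
array and only released after the next dequeue has had a chance to
resubmit the same context, so a context that is immediately rescheduled
never sees its inflight count drop to zero.

#include <stdio.h>
#include <string.h>

#define MAX_PORTS 2

struct ctx {
	const char *name;
	int inflight;	/* outstanding submissions for this context */
};

/* completed-but-not-yet-released entries, NULL terminated */
static struct ctx *post[2 * MAX_PORTS + 1];
static struct ctx **inactive = post;

static void schedule_out(struct ctx *ce)
{
	if (!--ce->inflight)
		printf("%s idled: update runtime, kick retire\n", ce->name);
}

/* process_csb() analogue: stage the completion instead of releasing */
static void stage_completion(struct ctx *ce)
{
	*inactive++ = ce;
}

/* post_process_csb() analogue: release everything staged so far */
static void post_process(void)
{
	struct ctx **port;

	for (port = post; *port; port++)
		schedule_out(*port);
	memset(post, 0, sizeof(post));
	inactive = post;
}

int main(void)
{
	struct ctx a = { "ctx-A", 1 };

	stage_completion(&a);	/* CS completion event arrives */
	a.inflight++;		/* dequeue resubmits ctx-A before the flush */
	post_process();		/* ctx-A stayed inflight: no idle work done */

	stage_completion(&a);	/* second completion, nothing resubmitted */
	post_process();		/* now ctx-A really idles */
	return 0;
}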

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/gt/intel_context_types.h |  4 +-
 drivers/gpu/drm/i915/gt/intel_engine_cs.c     |  2 +
 drivers/gpu/drm/i915/gt/intel_engine_types.h  | 13 +++++++
 drivers/gpu/drm/i915/gt/intel_lrc.c           | 37 +++++++++++++++----
 4 files changed, 47 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
index 4954b0df4864..b63db45bab7b 100644
--- a/drivers/gpu/drm/i915/gt/intel_context_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
@@ -45,8 +45,8 @@ struct intel_context {
 
 	struct intel_engine_cs *engine;
 	struct intel_engine_cs *inflight;
-#define intel_context_inflight(ce) ptr_mask_bits(READ_ONCE((ce)->inflight), 2)
-#define intel_context_inflight_count(ce) ptr_unmask_bits(READ_ONCE((ce)->inflight), 2)
+#define intel_context_inflight(ce) ptr_mask_bits(READ_ONCE((ce)->inflight), 3)
+#define intel_context_inflight_count(ce) ptr_unmask_bits(READ_ONCE((ce)->inflight), 3)
 
 	struct i915_address_space *vm;
 	struct i915_gem_context __rcu *gem_context;
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 0a05301e00fb..72f0029d490b 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -515,6 +515,8 @@ void intel_engine_init_execlists(struct intel_engine_cs *engine)
 	memset(execlists->pending, 0, sizeof(execlists->pending));
 	execlists->active =
 		memset(execlists->inflight, 0, sizeof(execlists->inflight));
+	execlists->inactive =
+		memset(execlists->post, 0, sizeof(execlists->post));
 
 	execlists->queue_priority_hint = INT_MIN;
 	execlists->queue = RB_ROOT_CACHED;
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index 073c3769e8cc..31cf60cef5a8 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -208,6 +208,10 @@ struct intel_engine_execlists {
 	 * @active: the currently known context executing on HW
 	 */
 	struct i915_request * const *active;
+	/**
+	 * @inactive: the next vacant slot for completed context switches
+	 */
+	struct i915_request **inactive;
 	/**
 	 * @inflight: the set of contexts submitted and acknowleged by HW
 	 *
@@ -225,6 +229,15 @@ struct intel_engine_execlists {
 	 * preemption or idle-to-active event.
 	 */
 	struct i915_request *pending[EXECLIST_MAX_PORTS + 1];
+	/**
+	 * @post: the set of completed context switches
+	 *
+	 * Since we may want to stagger the processing of the CS switches
+	 * with the next submission, so that the contexts are notionally
+	 * kept in flight across the dequeue, we defer scheduling out the
+	 * completed context switches.
+	 */
+	struct i915_request *post[2 * EXECLIST_MAX_PORTS + 1];
 
 	/**
 	 * @port_mask: number of execlist ports - 1
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index e866b8d721ed..f92670236a5a 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -2055,9 +2055,10 @@ static void set_preempt_timeout(struct intel_engine_cs *engine,
 		     active_preempt_timeout(engine, rq));
 }
 
-static inline void clear_ports(struct i915_request **ports, int count)
+static inline struct i915_request **
+clear_ports(struct i915_request **ports, int count)
 {
-	memset_p((void **)ports, NULL, count);
+	return memset_p((void **)ports, NULL, count);
 }
 
 static void execlists_dequeue(struct intel_engine_cs *engine)
@@ -2455,6 +2456,11 @@ cancel_port_requests(struct intel_engine_execlists * const execlists)
 {
 	struct i915_request * const *port;
 
+	for (port = execlists->post; *port; port++)
+		execlists_schedule_out(*port);
+	execlists->inactive =
+		clear_ports(execlists->post, ARRAY_SIZE(execlists->post));
+
 	for (port = execlists->pending; *port; port++)
 		execlists_schedule_out(*port);
 	clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
@@ -2622,7 +2628,7 @@ static void process_csb(struct intel_engine_cs *engine)
 			/* cancel old inflight, prepare for switch */
 			trace_ports(execlists, "preempted", old);
 			while (*old)
-				execlists_schedule_out(*old++);
+				*execlists->inactive++ = *old++;
 
 			/* switch pending to inflight */
 			GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
@@ -2679,7 +2685,7 @@ static void process_csb(struct intel_engine_cs *engine)
 					     regs[CTX_RING_TAIL]);
 			}
 
-			execlists_schedule_out(*execlists->active++);
+			*execlists->inactive++ = *execlists->active++;
 
 			GEM_BUG_ON(execlists->active - execlists->inflight >
 				   execlists_num_ports(execlists));
@@ -2703,6 +2709,20 @@ static void process_csb(struct intel_engine_cs *engine)
 	invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
 }
 
+static void post_process_csb(struct intel_engine_cs *engine)
+{
+	struct intel_engine_execlists * const el = &engine->execlists;
+	struct i915_request **port;
+
+	GEM_BUG_ON(el->post[2 * EXECLIST_MAX_PORTS]);
+	if (!el->post[0])
+		return;
+
+	for (port = el->post; *port; port++)
+		execlists_schedule_out(*port);
+	el->inactive = clear_ports(el->post, port - el->post);
+}
+
 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
 {
 	lockdep_assert_held(&engine->active.lock);
@@ -3133,6 +3153,8 @@ static void execlists_submission_tasklet(unsigned long data)
 		if (unlikely(timeout && preempt_timeout(engine)))
 			execlists_reset(engine, "preemption time out");
 	}
+
+	post_process_csb(engine);
 }
 
 static void __execlists_kick(struct intel_engine_execlists *execlists)
@@ -3165,9 +3187,9 @@ static void queue_request(struct intel_engine_cs *engine,
 
 static void __submit_queue_imm(struct intel_engine_cs *engine)
 {
-	struct intel_engine_execlists * const execlists = &engine->execlists;
+	struct intel_engine_execlists * const el = &engine->execlists;
 
-	if (reset_in_progress(execlists))
+	if (reset_in_progress(el))
 		return; /* defer until we restart the engine following reset */
 
 	__execlists_submission_tasklet(engine);
@@ -3196,7 +3218,7 @@ static void flush_csb(struct intel_engine_cs *engine)
 {
 	struct intel_engine_execlists *el = &engine->execlists;
 
-	if (READ_ONCE(el->pending[0]) && tasklet_trylock(&el->tasklet)) {
+	if (!READ_ONCE(el->post[0]) && tasklet_trylock(&el->tasklet)) {
 		if (!reset_in_progress(el))
 			process_csb(engine);
 		tasklet_unlock(&el->tasklet);
@@ -4183,6 +4205,7 @@ static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
 	mb();
 
 	process_csb(engine); /* drain preemption events */
+	post_process_csb(engine);
 
 	/* Following the reset, we need to reload the CSB read/write pointers */
 	reset_csb_pointers(engine);
-- 
2.20.1


