[PATCH 10/11] drm/i915/execlists: Preemption!

Chris Wilson chris at chris-wilson.co.uk
Fri Sep 22 21:56:57 UTC 2017


When we write to ELSP, it triggers a context preemption at the earliest
arbitration point (3DPRIMITIVE, some PIPE_CONTROLs, a few other
operations and the explicit MI_ARB_CHECK). If the write is for the same
context, it triggers a LITE_RESTORE where only the RING_TAIL is updated
(we currently use this to chain requests from the same context
together, avoiding bubbles). However, if it is for a different context,
a full context switch is performed: the hardware starts executing the
new context and saves the image of the old one for later execution.
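
For illustration only (not part of this patch), the ELSP write sequence
is roughly the following, mirroring execlists_submit_ports() below;
write_elsp_example() and its desc[] argument are hypothetical names:

	static void write_elsp_example(struct intel_engine_cs *engine,
				       const u64 desc[2])
	{
		u32 __iomem *elsp = engine->i915->regs +
			i915_mmio_reg_offset(RING_ELSP(engine));

		/* port 1 (secondary) is written first, port 0 last */
		writel(upper_32_bits(desc[1]), elsp);
		writel(lower_32_bits(desc[1]), elsp);
		writel(upper_32_bits(desc[0]), elsp);
		/* this final write triggers the switch/preemption */
		writel(lower_32_bits(desc[0]), elsp);
	}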

Previously we avoided preemption by only submitting a new context when
the old one was idle. Now we wish to embrace it: if the new request has
a higher priority than the currently executing request, we write to the
ELSP regardless, thus triggering preemption. In the context-switch
interrupt handler, we therefore need to check whether the old context
completed or whether we merely switched to the new context
preemptively. In the dequeue function (responsible for deciding which
request executes next), we need to note when we are about to cause a
preemption and move all the preempted requests back onto the queue for
later re-execution. After that we can proceed as normal.
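
As a condensed sketch (taken from the execlists_dequeue() hunk in this
patch), the unwind step walks the in-flight requests newest first,
unsubmits anything not yet completed and puts it back onto the priority
queue so it is picked up again once the preemption has taken effect:

	list_for_each_entry_safe_reverse(rq, rn,
					 &engine->timeline->requests,
					 link) {
		struct i915_priolist *p;

		if (i915_gem_request_completed(rq))
			break; /* everything older has finished */

		__i915_gem_request_unsubmit(rq);
		unwind_wa_tail(rq); /* drop the lite-restore padding */

		p = lookup_priolist(engine, &rq->priotree,
				    rq->priotree.priority);
		list_add(&rq->priotree.link,
			 &ptr_mask_bits(p, 1)->requests);
	}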

The current heuristic for deciding when to preempt is that the new
request must have a higher priority than the currently executing
request, and that priority must be greater than 0 (i.e. a privileged
priority).
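
In code, this is the check applied in execlists_dequeue() below: the
head of the priority queue must beat both the executing request's
priority and 0.

	if (p->priority > max(last->priotree.priority, 0)) {
		/* unwind the in-flight requests and resubmit
		 * via the execlists->preempt ports */
	}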

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
Cc: Michal Winiarski <michal.winiarski at intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
Cc: Arkadiusz Hiler <arkadiusz.hiler at intel.com>
Cc: Mika Kuoppala <mika.kuoppala at intel.com>
Cc: Ben Widawsky <benjamin.widawsky at intel.com>
Cc: Zhenyu Wang <zhenyuw at linux.intel.com>
Cc: Zhi Wang <zhi.a.wang at intel.com>
---
 drivers/gpu/drm/i915/i915_reg.h         |   5 +
 drivers/gpu/drm/i915/intel_lrc.c        | 188 +++++++++++++++++++++++++-------
 drivers/gpu/drm/i915/intel_ringbuffer.h |   2 +-
 3 files changed, 153 insertions(+), 42 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 82f36dd0cd94..02b77ff2cbf2 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -525,7 +525,12 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define   MI_SEMAPHORE_TARGET(engine)	((engine)<<15)
 #define MI_SEMAPHORE_WAIT	MI_INSTR(0x1c, 2) /* GEN8+ */
 #define   MI_SEMAPHORE_POLL		(1<<15)
+#define   MI_SEMAPHORE_SAD_GT_SDD	(0<<12)
 #define   MI_SEMAPHORE_SAD_GTE_SDD	(1<<12)
+#define   MI_SEMAPHORE_SAD_LT_SDD	(2<<12)
+#define   MI_SEMAPHORE_SAD_LTE_SDD	(3<<12)
+#define   MI_SEMAPHORE_SAD_EQ_SDD	(4<<12)
+#define   MI_SEMAPHORE_SAD_NEQ_SDD	(5<<12)
 #define MI_STORE_DWORD_IMM	MI_INSTR(0x20, 1)
 #define MI_STORE_DWORD_IMM_GEN4	MI_INSTR(0x20, 2)
 #define   MI_MEM_VIRTUAL	(1 << 22) /* 945,g33,965 */
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 9010ae310b18..c9f74ae284e8 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -392,9 +392,9 @@ static u64 execlists_update_context(struct drm_i915_gem_request *rq)
 	return ce->lrc_desc;
 }
 
-static void execlists_submit_ports(struct intel_engine_cs *engine)
+static void execlists_submit_ports(struct intel_engine_cs *engine,
+				   struct execlist_port *port)
 {
-	struct execlist_port *port = engine->execlists.port;
 	u32 __iomem *elsp =
 		engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
 	unsigned int n;
@@ -451,25 +451,28 @@ static void port_assign(struct execlist_port *port,
 	port_set(port, port_pack(i915_gem_request_get(rq), port_count(port)));
 }
 
+static void unwind_wa_tail(struct drm_i915_gem_request *rq)
+{
+	rq->tail = intel_ring_wrap(rq->ring,
+				   rq->wa_tail - WA_TAIL_DWORDS*sizeof(u32));
+	assert_ring_tail_valid(rq->ring, rq->tail);
+}
+
+static void gpu_sema_set(struct intel_engine_cs *engine, bool state)
+{
+	u32 *gpu_sema = &engine->status_page.page_addr[I915_GEM_HWS_INDEX + 2];
+	WRITE_ONCE(*gpu_sema, state);
+}
+
 static void execlists_dequeue(struct intel_engine_cs *engine)
 {
-	struct drm_i915_gem_request *last;
 	struct intel_engine_execlists * const execlists = &engine->execlists;
-	struct execlist_port *port = execlists->port;
-	const struct execlist_port * const last_port =
-		&execlists->port[execlists->port_mask];
+	struct execlist_port *ports = execlists->port;
+	struct execlist_port *port = ports;
+	struct drm_i915_gem_request *last = port_request(port);
 	struct rb_node *rb;
 	bool submit = false;
-
-	last = port_request(port);
-	if (last)
-		/* WaIdleLiteRestore:bdw,skl
-		 * Apply the wa NOOPs to prevent ring:HEAD == req:TAIL
-		 * as we resubmit the request. See gen8_emit_breadcrumb()
-		 * for where we prepare the padding after the end of the
-		 * request.
-		 */
-		last->tail = last->wa_tail;
+	bool once = last;
 
 	/* Hardware submission is through 2 ports. Conceptually each port
 	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
@@ -499,6 +502,66 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
 		struct drm_i915_gem_request *rq, *rn;
 
+		if (once) {
+			/*
+			 * Don't resubmit or switch until all outstanding
+			 * preemptions (lite-restore) are seen. Then we
+			 * know the next preemption status we see corresponds
+			 * to this ELSP update.
+			 */
+			if (port_count(&port[0]) > 1)
+				goto done;
+
+			if (p->priority > max(last->priotree.priority, 0)) {
+				/* Suspend breadcrumb updates from the gpu */
+				gpu_sema_set(engine, 1);
+				readl(engine->i915->regs +
+				      i915_mmio_reg_offset(RING_ACTHD(engine->mmio_base)));
+
+				list_for_each_entry_safe_reverse(rq, rn,
+								 &engine->timeline->requests,
+								 link) {
+					struct i915_priolist *p;
+
+					if (i915_gem_request_completed(rq))
+						break;
+
+					__i915_gem_request_unsubmit(rq);
+					unwind_wa_tail(rq);
+
+					p = lookup_priolist(engine,
+							    &rq->priotree,
+							    rq->priotree.priority);
+					GEM_BUG_ON(ptr_unmask_bits(p, 1));
+					list_add(&rq->priotree.link,
+						 &ptr_mask_bits(p, 1)->requests);
+				}
+
+				/*
+				 * Submit via our second stack so that we don't
+				 * lose the inflight stack until we know the
+				 * GPU has completed the preemption switch.
+				 */
+				ports = execlists->preempt;
+				port = ports;
+				last = NULL;
+			} else {
+				/* WaIdleLiteRestore:bdw,skl
+				 * Apply the wa NOOPs to prevent
+				 * ring:HEAD == req:TAIL as we resubmit the
+				 * request. See gen8_emit_breadcrumb() for
+				 * where we prepare the padding after the
+				 * end of the request.
+				 */
+				last->tail = last->wa_tail;
+
+				if (port_count(&port[1]))
+					goto done;
+			}
+
+			once = false;
+		}
+
 		list_for_each_entry_safe(rq, rn, &p->requests, priotree.link) {
 			/*
 			 * Can we combine this request with the current port?
@@ -517,7 +580,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 				 * combine this request with the last, then we
 				 * are done.
 				 */
-				if (port == last_port) {
+				if (port == &ports[execlists->port_mask]) {
 					__list_del_many(&p->requests,
 							&rq->priotree.link);
 					goto done;
@@ -566,17 +629,27 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 	spin_unlock_irq(&engine->timeline->lock);
 
 	if (submit)
-		execlists_submit_ports(engine);
+		execlists_submit_ports(engine, ports);
 }
 
-static void execlist_cancel_port_requests(struct intel_engine_execlists *execlists)
+static void
+__execlist_cancel_port_requests(struct execlist_port *port,
+				unsigned int num_ports)
 {
-	unsigned int i;
-
-	for (i = 0; i < ARRAY_SIZE(execlists->port); i++)
-		i915_gem_request_put(port_request(&execlists->port[i]));
+	while (num_ports-- && port_isset(port)) {
+		i915_gem_request_put(port_request(port));
+		memset(port, 0, sizeof(*port));
+		port++;
+	}
+}
 
-	memset(execlists->port, 0, sizeof(execlists->port));
+static void
+execlist_cancel_port_requests(struct intel_engine_execlists *execlists)
+{
+	__execlist_cancel_port_requests(execlists->port,
+					ARRAY_SIZE(execlists->port));
+	__execlist_cancel_port_requests(execlists->preempt,
+					ARRAY_SIZE(execlists->preempt));
 }
 
 static void execlists_cancel_requests(struct intel_engine_cs *engine)
@@ -623,7 +696,8 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
 
 	execlists->queue = RB_ROOT;
 	execlists->first = NULL;
-	GEM_BUG_ON(port_isset(&execlists->port[0]));
+	GEM_BUG_ON(port_isset(execlists->port));
+	GEM_BUG_ON(port_isset(execlists->preempt));
 
 	/*
 	 * The port is checked prior to scheduling a tasklet, but
@@ -636,11 +710,15 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
 	spin_unlock_irqrestore(&engine->timeline->lock, flags);
 }
 
-static bool execlists_elsp_ready(const struct intel_engine_cs *engine)
+static void switch_to_preempt(struct intel_engine_cs *engine)
 {
-	const struct execlist_port *port = engine->execlists.port;
+	struct intel_engine_execlists *execlists = &engine->execlists;
 
-	return port_count(&port[0]) + port_count(&port[1]) < 2;
+	memcpy(execlists->port, execlists->preempt, sizeof(execlists->preempt));
+	memset(execlists->preempt, 0, sizeof(execlists->preempt));
+
+	/* Restart breadcrumbs from the gpu */
+	gpu_sema_set(engine, 0);
 }
 
 /*
@@ -651,7 +729,7 @@ static void intel_lrc_irq_handler(unsigned long data)
 {
 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
 	struct intel_engine_execlists * const execlists = &engine->execlists;
-	struct execlist_port *port = execlists->port;
+	struct execlist_port * const port = execlists->port;
 	struct drm_i915_private *dev_priv = engine->i915;
 
 	/* We can skip acquiring intel_runtime_pm_get() here as it was taken
@@ -739,6 +817,14 @@ static void intel_lrc_irq_handler(unsigned long data)
 			/* Check the context/desc id for this event matches */
 			GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);
 
+			if (status & GEN8_CTX_STATUS_PREEMPTED &&
+			    port_isset(execlists->preempt)) {
+				__execlist_cancel_port_requests(execlists->port,
+								ARRAY_SIZE(execlists->port));
+				switch_to_preempt(engine);
+				continue;
+			}
+
 			rq = port_unpack(port, &count);
 			GEM_BUG_ON(count == 0);
 			if (--count == 0) {
@@ -755,6 +841,9 @@ static void intel_lrc_irq_handler(unsigned long data)
 				port_set(port, port_pack(rq, count));
 			}
 
+			if (!port_isset(port) && port_isset(execlists->preempt))
+				switch_to_preempt(engine);
+
 			/* After the final element, the hw should be idle */
 			GEM_BUG_ON(port_count(port) == 0 &&
 				   !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));
@@ -767,7 +856,7 @@ static void intel_lrc_irq_handler(unsigned long data)
 		}
 	}
 
-	if (execlists_elsp_ready(engine))
+	if (!port_isset(execlists->preempt))
 		execlists_dequeue(engine);
 
 	intel_uncore_forcewake_put(dev_priv, execlists->fw_domains);
@@ -780,7 +869,7 @@ static void insert_request(struct intel_engine_cs *engine,
 	struct i915_priolist *p = lookup_priolist(engine, pt, prio);
 
 	list_add_tail(&pt->link, &ptr_mask_bits(p, 1)->requests);
-	if (ptr_unmask_bits(p, 1) && execlists_elsp_ready(engine))
+	if (ptr_unmask_bits(p, 1))
 		tasklet_hi_schedule(&engine->execlists.irq_tasklet);
 }
 
@@ -906,8 +995,6 @@ static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
 	}
 
 	spin_unlock_irq(&engine->timeline->lock);
-
-	/* XXX Do we need to preempt to make room for us and our deps? */
 }
 
 static struct intel_ring *
@@ -1103,6 +1190,8 @@ static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
 				       i915_ggtt_offset(engine->scratch) +
 				       2 * CACHELINE_BYTES);
 
+	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
+
 	/* Pad to end of cacheline */
 	while ((unsigned long)batch % CACHELINE_BYTES)
 		*batch++ = MI_NOOP;
@@ -1118,6 +1207,8 @@ static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
 
 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
 {
+	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
+
 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
 
@@ -1163,6 +1254,8 @@ static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
 		*batch++ = 0;
 	}
 
+	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
+
 	/* Pad to end of cacheline */
 	while ((unsigned long)batch % CACHELINE_BYTES)
 		*batch++ = MI_NOOP;
@@ -1430,10 +1523,7 @@ static void reset_common_ring(struct intel_engine_cs *engine,
 	intel_ring_update_space(request->ring);
 
 	/* Reset WaIdleLiteRestore:bdw,skl as well */
-	request->tail =
-		intel_ring_wrap(request->ring,
-				request->wa_tail - WA_TAIL_DWORDS*sizeof(u32));
-	assert_ring_tail_valid(request->ring, request->tail);
+	unwind_wa_tail(request);
 }
 
 static int intel_logical_ring_emit_pdps(struct drm_i915_gem_request *req)
@@ -1629,8 +1719,8 @@ static int gen8_emit_flush_render(struct drm_i915_gem_request *request,
  */
 static void gen8_emit_wa_tail(struct drm_i915_gem_request *request, u32 *cs)
 {
-	*cs++ = MI_NOOP;
-	*cs++ = MI_NOOP;
+	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
+	*cs++ = MI_ARB_CHECK; /* preemption point *between* requests */
 	request->wa_tail = intel_ring_offset(request, cs);
 }
 
@@ -1639,6 +1729,14 @@ static void gen8_emit_breadcrumb(struct drm_i915_gem_request *request, u32 *cs)
 	/* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */
 	BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5));
 
+	*cs++ = MI_SEMAPHORE_WAIT |
+		MI_SEMAPHORE_POLL |
+		MI_SEMAPHORE_GLOBAL_GTT |
+		MI_SEMAPHORE_SAD_EQ_SDD;
+	*cs++ = 0; /* continue if zero (preempt == 0) */
+	*cs++ = intel_hws_seqno_address(request->engine) + 8;
+	*cs++ = 0;
+
 	*cs++ = (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW;
 	*cs++ = intel_hws_seqno_address(request->engine) | MI_FLUSH_DW_USE_GTT;
 	*cs++ = 0;
@@ -1651,7 +1749,7 @@ static void gen8_emit_breadcrumb(struct drm_i915_gem_request *request, u32 *cs)
 	gen8_emit_wa_tail(request, cs);
 }
 
-static const int gen8_emit_breadcrumb_sz = 6 + WA_TAIL_DWORDS;
+static const int gen8_emit_breadcrumb_sz = 10 + WA_TAIL_DWORDS;
 
 static void gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request,
 					u32 *cs)
@@ -1659,6 +1757,14 @@ static void gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request,
 	/* We're using qword write, seqno should be aligned to 8 bytes. */
 	BUILD_BUG_ON(I915_GEM_HWS_INDEX & 1);
 
+	*cs++ = MI_SEMAPHORE_WAIT |
+		MI_SEMAPHORE_POLL |
+		MI_SEMAPHORE_GLOBAL_GTT |
+		MI_SEMAPHORE_SAD_EQ_SDD;
+	*cs++ = 0; /* continue if zero */
+	*cs++ = intel_hws_seqno_address(request->engine) + 8;
+	*cs++ = 0;
+
 	/* w/a for post sync ops following a GPGPU operation we
 	 * need a prior CS_STALL, which is emitted by the flush
 	 * following the batch.
@@ -1679,7 +1785,7 @@ static void gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request,
 	gen8_emit_wa_tail(request, cs);
 }
 
-static const int gen8_emit_breadcrumb_render_sz = 8 + WA_TAIL_DWORDS;
+static const int gen8_emit_breadcrumb_render_sz = 12 + WA_TAIL_DWORDS;
 
 static int gen8_init_rcs_context(struct drm_i915_gem_request *req)
 {
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 56d7ae9f298b..7ac040dbf0bc 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -236,7 +236,7 @@ struct intel_engine_execlists {
 		GEM_DEBUG_DECL(u32 context_id);
 
 #define EXECLIST_MAX_PORTS 2
-	} port[EXECLIST_MAX_PORTS];
+	} port[EXECLIST_MAX_PORTS], preempt[EXECLIST_MAX_PORTS];
 
 	/**
 	 * @port_mask: number of execlist ports - 1
-- 
2.14.1


