[PATCH 12/13] drm/i915/execlists: Preemption!

Chris Wilson chris at chris-wilson.co.uk
Sun Sep 24 16:40:54 UTC 2017


When we write to ELSP, it triggers a context preemption at the earliest
arbitration point (3DPRIMITIVE, some PIPECONTROLs, a few other
operations and the explicit MI_ARB_CHECK). If this is to the same
context, it triggers a LITE_RESTORE where the RING_TAIL is merely
updated (used currently to chain requests from the same context
together, avoiding bubbles). However, if it is to a different context, a
full context-switch is performed and it will start to execute the new
context, saving the image of the old for later execution.
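
As a rough illustration (a standalone sketch with assumed names, not the
driver's own types), the hardware's choice between the two cases comes down
to whether the newly submitted descriptor matches the one already executing:

    #include <stdbool.h>
    #include <stdint.h>

    struct elsp_entry {
            uint64_t lrc_desc;   /* context descriptor submitted via ELSP */
            uint32_t ring_tail;  /* RING_TAIL carried by this submission */
    };

    /* Same descriptor: the hardware only reloads RING_TAIL (lite-restore);
     * otherwise it saves the old context image and switches to the new one.
     */
    static bool is_lite_restore(const struct elsp_entry *active,
                                const struct elsp_entry *next)
    {
            return active && active->lrc_desc == next->lrc_desc;
    }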

Previously we avoided preemption by only submitting a new context when
the old was idle. But now we wish to embrace it, and if the new request has
a higher priority than the currently executing request, we write to the
ELSP regardless, thus triggering preemption. In the context-switch
interrupt handler, we therefore need to check whether the old context
was completed or whether we just switched to the new context
preemptively. In the dequeue function (responsible for deciding who
executes next), we need to take note of when we will cause a preemption
and move all the preempted requests back onto the execution list. After
that we can proceed as normal.
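
For reference, a minimal standalone sketch of the new test in the
context-switch interrupt handler (PREEMPT_ID comes from this patch; the
helper and its parameters are made up for illustration): an ACTIVE_IDLE
event reported against the dedicated preempt context means the ports were
preempted rather than completed, so the in-flight requests must be unwound
back onto the queue:

    #include <stdbool.h>
    #include <stdint.h>

    #define PREEMPT_ID 0x1  /* hw_id reserved for the preempt context */

    /* True when a CSB event reports the preempt context going idle, i.e.
     * the preemption we requested has finished and the previously running
     * requests should be moved back onto the execution queue.
     */
    static bool preempt_complete(uint32_t csb_status, uint32_t csb_ctx_id,
                                 uint32_t status_active_idle)
    {
            return (csb_status & status_active_idle) &&
                   csb_ctx_id == PREEMPT_ID;
    }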

The current heuristic for deciding when to preempt is that the new request
must be of higher priority than the currently executing request, and must
carry a privileged priority greater than 0. Note that the scheduler remains
unfair!
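
In code terms the decision in execlists_dequeue() boils down to the
following comparison (a simplified standalone sketch; need_preempt() is not
a helper added by this patch):

    #include <stdbool.h>

    /* Preempt only if the best queued priority beats both the running
     * request's priority and the default priority (0), so requests at or
     * below the default level never trigger a preemption.
     */
    static bool need_preempt(int queue_prio, int active_prio)
    {
            return queue_prio > (active_prio > 0 ? active_prio : 0);
    }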

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
Cc: Michal Winiarski <michal.winiarski at intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
Cc: Arkadiusz Hiler <arkadiusz.hiler at intel.com>
Cc: Mika Kuoppala <mika.kuoppala at intel.com>
Cc: Ben Widawsky <benjamin.widawsky at intel.com>
Cc: Zhenyu Wang <zhenyuw at linux.intel.com>
Cc: Zhi Wang <zhi.a.wang at intel.com>
---
 drivers/gpu/drm/i915/i915_irq.c         |   6 +-
 drivers/gpu/drm/i915/intel_lrc.c        | 138 +++++++++++++++++++++++---------
 drivers/gpu/drm/i915/intel_ringbuffer.h |   2 +
 3 files changed, 106 insertions(+), 40 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index af82bd721dbc..20c1fd905110 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1350,10 +1350,8 @@ gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir, int test_shift)
 	bool tasklet = false;
 
 	if (iir & (GT_CONTEXT_SWITCH_INTERRUPT << test_shift)) {
-		if (port_count(&execlists->port[0])) {
-			__set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
-			tasklet = true;
-		}
+		__set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
+		tasklet = true;
 	}
 
 	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift)) {
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 48df54923b2b..2781e4b92b67 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -208,8 +208,8 @@
 
 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
-
 #define WA_TAIL_DWORDS 2
+#define PREEMPT_ID 0x1
 
 static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
 					    struct intel_engine_cs *engine);
@@ -484,24 +484,14 @@ static void port_assign(struct execlist_port *port,
 
 static void execlists_dequeue(struct intel_engine_cs *engine)
 {
-	struct drm_i915_gem_request *last;
 	struct intel_engine_execlists * const execlists = &engine->execlists;
 	struct execlist_port *port = execlists->port;
-	const struct execlist_port * const last_port =
+	struct execlist_port * const last_port =
 		&execlists->port[execlists->port_mask];
+	struct drm_i915_gem_request *last = port_request(port);
 	struct rb_node *rb;
 	bool submit = false;
 
-	last = port_request(port);
-	if (last)
-		/* WaIdleLiteRestore:bdw,skl
-		 * Apply the wa NOOPs to prevent ring:HEAD == req:TAIL
-		 * as we resubmit the request. See gen8_emit_breadcrumb()
-		 * for where we prepare the padding after the end of the
-		 * request.
-		 */
-		last->tail = last->wa_tail;
-
 	/* Hardware submission is through 2 ports. Conceptually each port
 	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
 	 * static for a context, and unique to each, so we only execute
@@ -526,7 +516,61 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 	spin_lock_irq(&engine->timeline->lock);
 	rb = execlists->first;
 	GEM_BUG_ON(rb_first(&execlists->queue) != rb);
-	while (rb) {
+	if (!rb)
+		goto unlock;
+
+	if (last) {
+		/*
+		 * Don't resubmit or switch until all outstanding
+		 * preemptions (lite-restore) are seen. Then we
+		 * know the next preemption status we see corresponds
+		 * to this ELSP update.
+		 */
+		if (port_count(&port[0]) > 1)
+			goto unlock;
+
+		if (rb_entry(rb, struct i915_priolist, node)->priority >
+		    max(last->priotree.priority, 0)) {
+			struct intel_context *ce = &engine->i915->preempt_context->engine[engine->id];
+			u32 __iomem *elsp =
+				engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
+			unsigned int n;
+
+			memset(ce->ring->vaddr + ce->ring->tail, 0, 8);
+			ce->ring->tail += 8;
+			ce->ring->tail &= (ce->ring->size - 1);
+			ce->lrc_reg_state[CTX_RING_TAIL+1] = ce->ring->tail;
+
+			/*
+			 * Switch to our empty preempt context so
+			 * the state of the GPU is known.
+			 */
+			for (n = execlists_num_ports(&engine->execlists); --n; ) {
+				writel(0, elsp);
+				writel(0, elsp);
+			}
+			writel(upper_32_bits(ce->lrc_desc), elsp);
+			writel(lower_32_bits(ce->lrc_desc), elsp);
+
+			GEM_BUG_ON(engine->i915->preempt_context->hw_id != PREEMPT_ID);
+			execlists->preempt = true;
+			goto unlock;
+		} else {
+			if (port_count(&port[1]))
+				goto unlock;
+
+			/* WaIdleLiteRestore:bdw,skl
+			 * Apply the wa NOOPs to prevent
+			 * ring:HEAD == req:TAIL as we resubmit the
+			 * request. See gen8_emit_breadcrumb() for
+			 * where we prepare the padding after the
+			 * end of the request.
+			 */
+			last->tail = last->wa_tail;
+		}
+	}
+
+	do {
 		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
 		struct drm_i915_gem_request *rq, *rn;
 
@@ -589,25 +633,29 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 		INIT_LIST_HEAD(&p->requests);
 		if (p->priority != I915_PRIORITY_NORMAL)
 			kmem_cache_free(engine->i915->priorities, p);
-	}
+	} while (rb);
 done:
 	execlists->first = rb;
 	if (submit)
 		port_assign(port, last);
+unlock:
 	spin_unlock_irq(&engine->timeline->lock);
 
 	if (submit)
 		execlists_submit_ports(engine);
 }
 
-static void execlist_cancel_port_requests(struct intel_engine_execlists *execlists)
+static void
+execlist_cancel_port_requests(struct intel_engine_execlists *execlists)
 {
-	unsigned int i;
-
-	for (i = 0; i < ARRAY_SIZE(execlists->port); i++)
-		i915_gem_request_put(port_request(&execlists->port[i]));
+	struct execlist_port *port = execlists->port;
+	unsigned int num_ports = ARRAY_SIZE(execlists->port);
 
-	memset(execlists->port, 0, sizeof(execlists->port));
+	while (num_ports-- && port_isset(port)) {
+		i915_gem_request_put(port_request(port));
+		memset(port, 0, sizeof(*port));
+		port++;
+	}
 }
 
 static void execlists_cancel_requests(struct intel_engine_cs *engine)
@@ -654,7 +702,7 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
 
 	execlists->queue = RB_ROOT;
 	execlists->first = NULL;
-	GEM_BUG_ON(port_isset(&execlists->port[0]));
+	GEM_BUG_ON(port_isset(execlists->port));
 
 	/*
 	 * The port is checked prior to scheduling a tasklet, but
@@ -667,13 +715,6 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
 	spin_unlock_irqrestore(&engine->timeline->lock, flags);
 }
 
-static bool execlists_elsp_ready(const struct intel_engine_cs *engine)
-{
-	const struct execlist_port *port = engine->execlists.port;
-
-	return port_count(&port[0]) + port_count(&port[1]) < 2;
-}
-
 /*
  * Check the unread Context Status Buffers and manage the submission of new
  * contexts to the ELSP accordingly.
@@ -682,7 +723,7 @@ static void intel_lrc_irq_handler(unsigned long data)
 {
 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
 	struct intel_engine_execlists * const execlists = &engine->execlists;
-	struct execlist_port *port = execlists->port;
+	struct execlist_port * const port = execlists->port;
 	struct drm_i915_private *dev_priv = engine->i915;
 
 	/* We can skip acquiring intel_runtime_pm_get() here as it was taken
@@ -767,6 +808,29 @@ static void intel_lrc_irq_handler(unsigned long data)
 			if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
 				continue;
 
+			if (status & GEN8_CTX_STATUS_ACTIVE_IDLE &&
+			    buf[2*head + 1] == PREEMPT_ID) {
+				unsigned long flags;
+
+				execlist_cancel_port_requests(execlists);
+
+				spin_lock_irqsave(&engine->timeline->lock,
+						  flags);
+
+				unwind_incomplete_requests(engine);
+
+				spin_unlock_irqrestore(&engine->timeline->lock,
+						       flags);
+
+				GEM_BUG_ON(!execlists->preempt);
+				execlists->preempt = false;
+				continue;
+			}
+
+			if (status & GEN8_CTX_STATUS_PREEMPTED &&
+			    execlists->preempt)
+				continue;
+
 			/* Check the context/desc id for this event matches */
 			GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);
 
@@ -798,7 +862,7 @@ static void intel_lrc_irq_handler(unsigned long data)
 		}
 	}
 
-	if (execlists_elsp_ready(engine))
+	if (!execlists->preempt)
 		execlists_dequeue(engine);
 
 	intel_uncore_forcewake_put(dev_priv, execlists->fw_domains);
@@ -811,7 +875,7 @@ static void insert_request(struct intel_engine_cs *engine,
 	struct i915_priolist *p = lookup_priolist(engine, pt, prio);
 
 	list_add_tail(&pt->link, &ptr_mask_bits(p, 1)->requests);
-	if (ptr_unmask_bits(p, 1) && execlists_elsp_ready(engine))
+	if (ptr_unmask_bits(p, 1))
 		tasklet_hi_schedule(&engine->execlists.irq_tasklet);
 }
 
@@ -939,8 +1003,6 @@ static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
 	}
 
 	spin_unlock_irq(&engine->timeline->lock);
-
-	/* XXX Do we need to preempt to make room for us and our deps? */
 }
 
 static struct intel_ring *
@@ -1136,6 +1198,8 @@ static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
 				       i915_ggtt_offset(engine->scratch) +
 				       2 * CACHELINE_BYTES);
 
+	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
+
 	/* Pad to end of cacheline */
 	while ((unsigned long)batch % CACHELINE_BYTES)
 		*batch++ = MI_NOOP;
@@ -1151,6 +1215,8 @@ static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
 
 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
 {
+	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
+
 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
 
@@ -1196,6 +1262,8 @@ static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
 		*batch++ = 0;
 	}
 
+	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
+
 	/* Pad to end of cacheline */
 	while ((unsigned long)batch % CACHELINE_BYTES)
 		*batch++ = MI_NOOP;
@@ -1665,7 +1733,6 @@ static void gen8_emit_breadcrumb(struct drm_i915_gem_request *request, u32 *cs)
 
 	gen8_emit_wa_tail(request, cs);
 }
-
 static const int gen8_emit_breadcrumb_sz = 6 + WA_TAIL_DWORDS;
 
 static void gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request,
@@ -1693,7 +1760,6 @@ static void gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request,
 
 	gen8_emit_wa_tail(request, cs);
 }
-
 static const int gen8_emit_breadcrumb_render_sz = 8 + WA_TAIL_DWORDS;
 
 static int gen8_init_rcs_context(struct drm_i915_gem_request *req)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 56d7ae9f298b..891d9555dce6 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -238,6 +238,8 @@ struct intel_engine_execlists {
 #define EXECLIST_MAX_PORTS 2
 	} port[EXECLIST_MAX_PORTS];
 
+	bool preempt;
+
 	/**
 	 * @port_mask: number of execlist ports - 1
 	 */
-- 
2.14.1


