[PATCH 74/74] preemption

Sun Jul 16 19:01:15 UTC 2017

---
 drivers/gpu/drm/i915/intel_lrc.c        | 144 +++++++++++++++++++++-----------
 drivers/gpu/drm/i915/intel_ringbuffer.h |   2 +-
 2 files changed, 97 insertions(+), 49 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 6215123b2943..b0410544d684 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -397,9 +397,9 @@ static u64 execlists_update_context(struct drm_i915_gem_request *rq)
 	return ce->lrc_desc;
 }
 
-static void execlists_submit_ports(struct intel_engine_cs *engine)
+static void execlists_submit_ports(struct intel_engine_cs *engine,
+				   struct execlist_port *port)
 {
-	struct execlist_port *port = engine->execlist_port;
 	u32 __iomem *elsp =
 		engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
 	unsigned int n;
@@ -458,22 +458,12 @@ static void port_assign(struct execlist_port *port,
 
 static void execlists_dequeue(struct intel_engine_cs *engine)
 {
-	struct drm_i915_gem_request *last;
-	struct execlist_port *port = engine->execlist_port;
+	struct execlist_port *ports = engine->execlist_port;
+	struct execlist_port *port = ports;
+	struct drm_i915_gem_request *last = port_request(port);
 	struct rb_node *rb;
 	bool submit = false;
-
-	last = port_request(port);
-	if (last)
-		/* WaIdleLiteRestore:bdw,skl
-		 * Apply the wa NOOPs to prevent ring:HEAD == req:TAIL
-		 * as we resubmit the request. See gen8_emit_breadcrumb()
-		 * for where we prepare the padding after the end of the
-		 * request.
-		 */
-		last->tail = last->wa_tail;
-
-	GEM_BUG_ON(port_isset(&port[1]));
+	bool once = last;
 
 	/* Hardware submission is through 2 ports. Conceptually each port
 	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
@@ -503,6 +493,48 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
 		struct drm_i915_gem_request *rq, *rn;
 
+		if (once) {
+			if (port_count(&port[0]) > 1)
+				goto done;
+
+			if (p->priority > max(last->priotree.priority, 0)) {
+				list_for_each_entry_safe_reverse(rq, rn,
+								 &engine->timeline->requests,
+								 link) {
+					struct i915_priolist *p;
+
+					if (i915_gem_request_completed(rq))
+						break;
+
+					__i915_gem_request_unsubmit(rq);
+
+					p = lookup_priolist(engine,
+							    &rq->priotree,
+							    rq->priotree.priority);
+					list_add(&rq->priotree.link,
+						 &ptr_mask_bits(p, 1)->requests);
+				}
+
+				ports = engine->execlist_preempt;
+				port = ports;
+				last = NULL;
+			} else {
+				/* WaIdleLiteRestore:bdw,skl
+				 * Apply the wa NOOPs to prevent
+				 * ring:HEAD == req:TAIL as we resubmit the
+				 * request. See gen8_emit_breadcrumb() for
+				 * where we prepare the padding after the
+				 * end of the request.
+				 */
+				last->tail = last->wa_tail;
+			}
+
+			if (port_count(&port[1]))
+				goto done;
+
+			once = false;
+		}
+
 		list_for_each_entry_safe(rq, rn, &p->requests, priotree.link) {
 			/*
 			 * Can we combine this request with the current port?
@@ -521,7 +553,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 				 * combine this request with the last, then we
 				 * are done.
 				 */
-				if (port != engine->execlist_port) {
+				if (port != ports) {
 					__list_del_many(&p->requests,
 							&rq->priotree.link);
 					goto done;
@@ -568,14 +600,16 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 	spin_unlock_irq(&engine->timeline->lock);
 
 	if (submit)
-		execlists_submit_ports(engine);
+		execlists_submit_ports(engine, ports);
 }
 
-static bool execlists_elsp_ready(const struct intel_engine_cs *engine)
+static void switch_to_preempt(struct intel_engine_cs *engine)
 {
-	const struct execlist_port *port = engine->execlist_port;
-
-	return port_count(&port[0]) + port_count(&port[1]) < 2;
+	memcpy(engine->execlist_port,
+	       engine->execlist_preempt,
+	       sizeof(engine->execlist_preempt));
+	memset(engine->execlist_preempt, 0,
+	       sizeof(engine->execlist_preempt));
 }
 
 /*
@@ -585,7 +619,7 @@ static bool execlists_elsp_ready(const struct intel_engine_cs *engine)
 static void intel_lrc_irq_handler(unsigned long data)
 {
 	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
-	struct execlist_port *port = engine->execlist_port;
+	struct execlist_port * const port = engine->execlist_port;
 	struct drm_i915_private *dev_priv = engine->i915;
 
 	/* We can skip acquiring intel_runtime_pm_get() here as it was taken
@@ -674,6 +708,24 @@ static void intel_lrc_irq_handler(unsigned long data)
 			/* Check the context/desc id for this event matches */
 			GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);
 
+			if (status & GEN8_CTX_STATUS_PREEMPTED &&
+			    !(status & GEN8_CTX_STATUS_LITE_RESTORE)) {
+				int i;
+
+				GEM_BUG_ON(!port_isset(port));
+				GEM_BUG_ON(!port_isset(engine->execlist_preempt));
+				for (i = 0; i < ARRAY_SIZE(engine->execlist_port); i++) {
+					if (!port_isset(&port[i]))
+						break;
+
+					rq = port_request(&port[i]);
+					i915_gem_request_put(rq);
+				}
+
+				switch_to_preempt(engine);
+				continue;
+			}
+
 			rq = port_unpack(port, &count);
 			GEM_BUG_ON(count == 0);
 			if (--count == 0) {
@@ -691,6 +743,10 @@ static void intel_lrc_irq_handler(unsigned long data)
 				port_set(port, port_pack(rq, count));
 			}
 
+			if (!port_isset(port) &&
+			    port_isset(engine->execlist_preempt))
+				switch_to_preempt(engine);
+
 			/* After the final element, the hw should be idle */
 			GEM_BUG_ON(port_count(port) == 0 &&
 				   !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));
@@ -703,7 +759,7 @@ static void intel_lrc_irq_handler(unsigned long data)
 		}
 	}
 
-	if (execlists_elsp_ready(engine))
+	if (!port_isset(engine->execlist_preempt))
 		execlists_dequeue(engine);
 
 	intel_uncore_forcewake_put(dev_priv, engine->fw_domains);
@@ -716,7 +772,7 @@ static void insert_request(struct intel_engine_cs *engine,
 	struct i915_priolist *p = lookup_priolist(engine, pt, prio);
 
 	list_add_tail(&pt->link, &ptr_mask_bits(p, 1)->requests);
-	if (ptr_unmask_bits(p, 1) && execlists_elsp_ready(engine))
+	if (ptr_unmask_bits(p, 1) && !port_isset(engine->execlist_preempt))
 		tasklet_hi_schedule(&engine->irq_tasklet);
 }
 
@@ -837,8 +893,6 @@ static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
 	}
 
 	spin_unlock_irq(&engine->timeline->lock);
-
-	/* XXX Do we need to preempt to make room for us and our deps? */
 }
 
 static struct intel_ring *
@@ -1075,17 +1129,11 @@ static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
  *  This batch is terminated with MI_BATCH_BUFFER_END and so we need not add padding
  *  to align it with cacheline as padding after MI_BATCH_BUFFER_END is redundant.
  */
-static u32 *gen8_init_perctx_bb(struct intel_engine_cs *engine, u32 *batch)
-{
-	/* WaDisableCtxRestoreArbitration:bdw,chv */
-	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
-	*batch++ = MI_BATCH_BUFFER_END;
-
-	return batch;
-}
 
 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
 {
+	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
+
 	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
 	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
 
@@ -1138,13 +1186,6 @@ static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
 	return batch;
 }
 
-static u32 *gen9_init_perctx_bb(struct intel_engine_cs *engine, u32 *batch)
-{
-	*batch++ = MI_BATCH_BUFFER_END;
-
-	return batch;
-}
-
 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
 
 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
@@ -1199,11 +1240,11 @@ static int intel_init_workaround_bb(struct intel_engine_cs *engine)
 	switch (INTEL_GEN(engine->i915)) {
 	case 9:
 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
-		wa_bb_fn[1] = gen9_init_perctx_bb;
+		wa_bb_fn[1] = NULL;
 		break;
 	case 8:
 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
-		wa_bb_fn[1] = gen8_init_perctx_bb;
+		wa_bb_fn[1] = NULL;
 		break;
 	default:
 		MISSING_CASE(INTEL_GEN(engine->i915));
@@ -1272,6 +1313,9 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
 	clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
 	engine->csb_head = -1;
 
+	if (port_isset(engine->execlist_preempt))
+		switch_to_preempt(engine);
+
 	submit = false;
 	for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++) {
 		if (!port_isset(&port[n]))
@@ -1287,7 +1331,7 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
 	}
 
 	if (submit && !i915.enable_guc_submission)
-		execlists_submit_ports(engine);
+		execlists_submit_ports(engine, port);
 
 	return 0;
 }
@@ -1447,10 +1491,12 @@ static int gen8_emit_bb_start(struct drm_i915_gem_request *req,
 		req->ctx->ppgtt->pd_dirty_rings &= ~intel_engine_flag(req->engine);
 	}
 
-	cs = intel_ring_begin(req, 4);
+	cs = intel_ring_begin(req, 6);
 	if (IS_ERR(cs))
 		return PTR_ERR(cs);
 
+	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
+
 	/* FIXME(BDW): Address space and security selectors. */
 	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
 		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8)) |
@@ -1458,6 +1504,8 @@ static int gen8_emit_bb_start(struct drm_i915_gem_request *req,
 	*cs++ = lower_32_bits(offset);
 	*cs++ = upper_32_bits(offset);
 	*cs++ = MI_NOOP;
+
+	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
 	intel_ring_advance(req, cs);
 
 	return 0;
@@ -1601,7 +1649,7 @@ static void gen8_emit_breadcrumb(struct drm_i915_gem_request *request, u32 *cs)
 	*cs++ = 0;
 	*cs++ = request->global_seqno;
 	*cs++ = MI_USER_INTERRUPT;
-	*cs++ = MI_NOOP;
+	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
 	request->tail = intel_ring_offset(request, cs);
 	assert_ring_tail_valid(request->ring, request->tail);
 
@@ -1629,7 +1677,7 @@ static void gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request,
 	/* We're thrashing one dword of HWS. */
 	*cs++ = 0;
 	*cs++ = MI_USER_INTERRUPT;
-	*cs++ = MI_NOOP;
+	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
 	request->tail = intel_ring_offset(request, cs);
 	assert_ring_tail_valid(request->ring, request->tail);
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 5d73e0a406d5..72545eb7a6e4 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -389,7 +389,7 @@ struct intel_engine_cs {
 #define port_isset(p) ((p)->request_count)
 #define port_index(p, e) ((p) - (e)->execlist_port)
 		GEM_DEBUG_DECL(u32 context_id);
-	} execlist_port[2];
+	} execlist_port[2], execlist_preempt[2];
 	struct rb_root execlist_queue;
 	struct rb_node *execlist_first;
 	unsigned int fw_domains;
-- 
2.13.2