[Intel-gfx] [RFC 4/4] drm/i915/execlists: Preemption!
Chris Wilson
chris at chris-wilson.co.uk
Mon Jul 17 08:42:35 UTC 2017
When we write to ELSP, it triggers a context preemption at the earliest
arbitration point (3DPRIMITIVE, some PIPECONTROLs, a few other
operations and the explicit MI_ARB_CHECK). If this is to the same
context, it triggers a LITE_RESTORE where the RING_TAIL is merely
updated (used currently to chain requests from the same context
together, avoiding bubbles). However, if it is to a different context, a
full context switch is performed: the hardware starts executing the new
context, saving the image of the old one for later execution.
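As a rough illustration only (not part of the patch, names invented
here), the distinction the hardware makes on an ELSP write can be
modelled as a comparison of context descriptors:

  #include <stdbool.h>
  #include <stdint.h>

  /* Illustrative model: an ELSP write naming the context that is
   * already running results in a LITE_RESTORE (RING_TAIL reload
   * only); naming a different context forces a full context switch. */
  static bool is_lite_restore(uint64_t submitted_desc, uint64_t running_desc)
  {
          return submitted_desc == running_desc;
  }
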
Previously we avoided preemption by only submitting a new context when
the old was idle. But now we wish to embrace it, and if the new request has
a higher priority than the currently executing request, we write to the
ELSP regardless, thus triggering preemption. In the context-switch
interrupt handler, we therefore need to check whether the old context
was completed or whether we just switched to the new context
preemptively. In the dequeue function (responsible for deciding who
executes next), we need to take note of when we will cause a preemption
and move all the preempted requests back onto the execution queue. After
that we can proceed as normal.
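A rough user-space model of that interrupt-handler decision (the status
bits and port structures below are simplified stand-ins; the real code
is the GEN8_CTX_STATUS_PREEMPTED handling added to
intel_lrc_irq_handler() in the diff):

  #include <string.h>

  #define CTX_STATUS_PREEMPTED    (1u << 0) /* stand-in for GEN8_CTX_STATUS_PREEMPTED */
  #define CTX_STATUS_LITE_RESTORE (1u << 1) /* stand-in for GEN8_CTX_STATUS_LITE_RESTORE */

  struct port_model { void *request; };

  struct engine_model {
          struct port_model port[2];    /* contexts the hardware is running */
          struct port_model preempt[2]; /* contexts we asked it to preempt to */
  };

  static void handle_csb_event(struct engine_model *e, unsigned int status)
  {
          if ((status & CTX_STATUS_PREEMPTED) &&
              !(status & CTX_STATUS_LITE_RESTORE)) {
                  /* The old contexts were switched out, not completed:
                   * release them (reference dropping omitted here) and
                   * promote the preempt ports to the active ports. */
                  memcpy(e->port, e->preempt, sizeof(e->preempt));
                  memset(e->preempt, 0, sizeof(e->preempt));
                  return;
          }
          /* Otherwise the event is a normal completion or lite restore
           * and we fall through to the usual request retirement. */
  }
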
The current heuristic for deciding when to preempt is simply that the
new request is of higher priority than the currently executing request,
and has a privileged priority greater than 0.
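A minimal model of that check, matching the test added to
execlists_dequeue() below (the standalone function and its name are for
illustration only):

  #include <stdbool.h>

  /* Preempt only if the queued request outranks both the currently
   * executing request and the default priority of 0. */
  static inline bool should_preempt(int queued_prio, int executing_prio)
  {
          return queued_prio > (executing_prio > 0 ? executing_prio : 0);
  }
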
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
Cc: Michal Winiarski <michal.winiarski at intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
Cc: Arkadiusz Hiler <arkadiusz.hiler at intel.com>
Cc: Mika Kuoppala <mika.kuoppala at intel.com>
Cc: Ben Widawsky <benjamin.widawsky at intel.com>
---
drivers/gpu/drm/i915/intel_lrc.c | 146 ++++++++++++++++++++++++--------
drivers/gpu/drm/i915/intel_ringbuffer.h | 2 +-
2 files changed, 110 insertions(+), 38 deletions(-)
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index d227480b3a26..fe037bb9644c 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -397,9 +397,9 @@ static u64 execlists_update_context(struct drm_i915_gem_request *rq)
return ce->lrc_desc;
}
-static void execlists_submit_ports(struct intel_engine_cs *engine)
+static void execlists_submit_ports(struct intel_engine_cs *engine,
+ struct execlist_port *port)
{
- struct execlist_port *port = engine->execlist_port;
u32 __iomem *elsp =
engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
unsigned int n;
@@ -456,24 +456,21 @@ static void port_assign(struct execlist_port *port,
port_set(port, port_pack(i915_gem_request_get(rq), port_count(port)));
}
+static void unwind_wa_tail(struct drm_i915_gem_request *rq)
+{
+ rq->tail = intel_ring_wrap(rq->ring,
+ rq->wa_tail - WA_TAIL_DWORDS*sizeof(u32));
+ assert_ring_tail_valid(rq->ring, rq->tail);
+}
+
static void execlists_dequeue(struct intel_engine_cs *engine)
{
- struct drm_i915_gem_request *last;
- struct execlist_port *port = engine->execlist_port;
+ struct execlist_port *ports = engine->execlist_port;
+ struct execlist_port *port = ports;
+ struct drm_i915_gem_request *last = port_request(port);
struct rb_node *rb;
bool submit = false;
-
- last = port_request(port);
- if (last)
- /* WaIdleLiteRestore:bdw,skl
- * Apply the wa NOOPs to prevent ring:HEAD == req:TAIL
- * as we resubmit the request. See gen8_emit_breadcrumb()
- * for where we prepare the padding after the end of the
- * request.
- */
- last->tail = last->wa_tail;
-
- GEM_BUG_ON(port_isset(&port[1]));
+ bool once = last;
/* Hardware submission is through 2 ports. Conceptually each port
* has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
@@ -503,6 +500,49 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
struct drm_i915_gem_request *rq, *rn;
+ if (once) {
+ if (port_count(&port[0]) > 1)
+ goto done;
+
+ if (p->priority > max(last->priotree.priority, 0)) {
+ list_for_each_entry_safe_reverse(rq, rn,
+ &engine->timeline->requests,
+ link) {
+ struct i915_priolist *p;
+
+ if (i915_gem_request_completed(rq))
+ break;
+
+ __i915_gem_request_unsubmit(rq);
+ unwind_wa_tail(rq);
+
+ p = lookup_priolist(engine,
+ &rq->priotree,
+ rq->priotree.priority);
+ list_add(&rq->priotree.link,
+ &ptr_mask_bits(p, 1)->requests);
+ }
+
+ ports = engine->execlist_preempt;
+ port = ports;
+ last = NULL;
+ } else {
+ /* WaIdleLiteRestore:bdw,skl
+ * Apply the wa NOOPs to prevent
+ * ring:HEAD == req:TAIL as we resubmit the
+ * request. See gen8_emit_breadcrumb() for
+ * where we prepare the padding after the
+ * end of the request.
+ */
+ last->tail = last->wa_tail;
+
+ if (port_count(&port[1]))
+ goto done;
+ }
+
+ once = false;
+ }
+
list_for_each_entry_safe(rq, rn, &p->requests, priotree.link) {
/*
* Can we combine this request with the current port?
@@ -521,7 +561,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
* combine this request with the last, then we
* are done.
*/
- if (port != engine->execlist_port) {
+ if (port != ports) {
__list_del_many(&p->requests,
&rq->priotree.link);
goto done;
@@ -568,14 +608,16 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
spin_unlock_irq(&engine->timeline->lock);
if (submit)
- execlists_submit_ports(engine);
+ execlists_submit_ports(engine, ports);
}
-static bool execlists_elsp_ready(const struct intel_engine_cs *engine)
+static void switch_to_preempt(struct intel_engine_cs *engine)
{
- const struct execlist_port *port = engine->execlist_port;
-
- return port_count(&port[0]) + port_count(&port[1]) < 2;
+ memcpy(engine->execlist_port,
+ engine->execlist_preempt,
+ sizeof(engine->execlist_preempt));
+ memset(engine->execlist_preempt, 0,
+ sizeof(engine->execlist_preempt));
}
/*
@@ -585,7 +627,7 @@ static bool execlists_elsp_ready(const struct intel_engine_cs *engine)
static void intel_lrc_irq_handler(unsigned long data)
{
struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
- struct execlist_port *port = engine->execlist_port;
+ struct execlist_port * const port = engine->execlist_port;
struct drm_i915_private *dev_priv = engine->i915;
/* We can skip acquiring intel_runtime_pm_get() here as it was taken
@@ -674,6 +716,24 @@ static void intel_lrc_irq_handler(unsigned long data)
/* Check the context/desc id for this event matches */
GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);
+ if (status & GEN8_CTX_STATUS_PREEMPTED &&
+ !(status & GEN8_CTX_STATUS_LITE_RESTORE)) {
+ int i;
+
+ GEM_BUG_ON(!port_isset(port));
+ GEM_BUG_ON(!port_isset(engine->execlist_preempt));
+ for (i = 0; i < ARRAY_SIZE(engine->execlist_port); i++) {
+ if (!port_isset(&port[i]))
+ break;
+
+ rq = port_request(&port[i]);
+ i915_gem_request_put(rq);
+ }
+
+ switch_to_preempt(engine);
+ continue;
+ }
+
rq = port_unpack(port, &count);
GEM_BUG_ON(count == 0);
if (--count == 0) {
@@ -691,6 +751,10 @@ static void intel_lrc_irq_handler(unsigned long data)
port_set(port, port_pack(rq, count));
}
+ if (!port_isset(port) &&
+ port_isset(engine->execlist_preempt))
+ switch_to_preempt(engine);
+
/* After the final element, the hw should be idle */
GEM_BUG_ON(port_count(port) == 0 &&
!(status & GEN8_CTX_STATUS_ACTIVE_IDLE));
@@ -703,7 +767,7 @@ static void intel_lrc_irq_handler(unsigned long data)
}
}
- if (execlists_elsp_ready(engine))
+ if (!port_isset(engine->execlist_preempt))
execlists_dequeue(engine);
intel_uncore_forcewake_put(dev_priv, engine->fw_domains);
@@ -716,7 +780,7 @@ static void insert_request(struct intel_engine_cs *engine,
struct i915_priolist *p = lookup_priolist(engine, pt, prio);
list_add_tail(&pt->link, &ptr_mask_bits(p, 1)->requests);
- if (ptr_unmask_bits(p, 1) && execlists_elsp_ready(engine))
+ if (ptr_unmask_bits(p, 1) && !port_isset(engine->execlist_preempt))
tasklet_hi_schedule(&engine->irq_tasklet);
}
@@ -837,8 +901,6 @@ static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
}
spin_unlock_irq(&engine->timeline->lock);
-
- /* XXX Do we need to preempt to make room for us and our deps? */
}
static struct intel_ring *
@@ -1053,6 +1115,9 @@ static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
i915_ggtt_offset(engine->scratch) +
2 * CACHELINE_BYTES);
+ /* WaDisableCtxRestoreArbitration:bdw,chv */
+ *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
+
/* Pad to end of cacheline */
while ((unsigned long)batch % CACHELINE_BYTES)
*batch++ = MI_NOOP;
@@ -1077,8 +1142,6 @@ static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
*/
static u32 *gen8_init_perctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
- /* WaDisableCtxRestoreArbitration:bdw,chv */
- *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
*batch++ = MI_BATCH_BUFFER_END;
return batch;
@@ -1086,6 +1149,8 @@ static u32 *gen8_init_perctx_bb(struct intel_engine_cs *engine, u32 *batch)
static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
+ *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
+
/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
batch = gen8_emit_flush_coherentl3_wa(engine, batch);
@@ -1131,6 +1196,8 @@ static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
*batch++ = 0;
}
+ *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
+
/* Pad to end of cacheline */
while ((unsigned long)batch % CACHELINE_BYTES)
*batch++ = MI_NOOP;
@@ -1271,6 +1338,8 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
engine->csb_head = -1;
+ GEM_BUG_ON(port_isset(engine->execlist_preempt));
+
submit = false;
for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++) {
if (!port_isset(&port[n]))
@@ -1286,7 +1355,7 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
}
if (submit && !i915.enable_guc_submission)
- execlists_submit_ports(engine);
+ execlists_submit_ports(engine, port);
return 0;
}
@@ -1340,6 +1409,10 @@ static void reset_common_ring(struct intel_engine_cs *engine,
* guessing the missed context-switch events by looking at what
* requests were completed.
*/
+
+ if (port_isset(engine->execlist_preempt))
+ switch_to_preempt(engine); /* XXX fubar XXX */
+
if (!request) {
for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++)
i915_gem_request_put(port_request(&port[n]));
@@ -1388,10 +1461,7 @@ static void reset_common_ring(struct intel_engine_cs *engine,
intel_ring_update_space(request->ring);
/* Reset WaIdleLiteRestore:bdw,skl as well */
- request->tail =
- intel_ring_wrap(request->ring,
- request->wa_tail - WA_TAIL_DWORDS*sizeof(u32));
- assert_ring_tail_valid(request->ring, request->tail);
+ unwind_wa_tail(request);
}
static int intel_logical_ring_emit_pdps(struct drm_i915_gem_request *req)
@@ -1450,13 +1520,15 @@ static int gen8_emit_bb_start(struct drm_i915_gem_request *req,
if (IS_ERR(cs))
return PTR_ERR(cs);
+ *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
+
/* FIXME(BDW): Address space and security selectors. */
*cs++ = MI_BATCH_BUFFER_START_GEN8 |
(flags & I915_DISPATCH_SECURE ? 0 : BIT(8)) |
(flags & I915_DISPATCH_RS ? MI_BATCH_RESOURCE_STREAMER : 0);
*cs++ = lower_32_bits(offset);
*cs++ = upper_32_bits(offset);
- *cs++ = MI_NOOP;
+
intel_ring_advance(req, cs);
return 0;
@@ -1585,8 +1657,8 @@ static int gen8_emit_flush_render(struct drm_i915_gem_request *request,
*/
static void gen8_emit_wa_tail(struct drm_i915_gem_request *request, u32 *cs)
{
- *cs++ = MI_NOOP;
- *cs++ = MI_NOOP;
+ *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
+ *cs++ = MI_ARB_CHECK; /* preemption point *between* requests */
request->wa_tail = intel_ring_offset(request, cs);
}
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index a182da7eb9a9..109d64daf5d9 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -387,7 +387,7 @@ struct intel_engine_cs {
#define port_isset(p) ((p)->request_count)
#define port_index(p, e) ((p) - (e)->execlist_port)
GEM_DEBUG_DECL(u32 context_id);
- } execlist_port[2];
+ } execlist_port[2], execlist_preempt[2];
struct rb_root execlist_queue;
struct rb_node *execlist_first;
unsigned int fw_domains;
--
2.13.2