[Intel-gfx] [RFC 4/4] drm/i915/execlists: Preemption!
Chris Wilson
chris at chris-wilson.co.uk
Mon Jul 17 12:58:48 UTC 2017
Quoting Chris Wilson (2017-07-17 10:46:23)
> Quoting Chris Wilson (2017-07-17 09:42:35)
> > @@ -503,6 +500,49 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> > struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> > struct drm_i915_gem_request *rq, *rn;
> >
> > + if (once) {
> > + if (port_count(&port[0]) > 1)
> > + goto done;
> > +
> > + if (p->priority > max(last->priotree.priority, 0)) {
> > + list_for_each_entry_safe_reverse(rq, rn,
> > + &engine->timeline->requests,
> > + link) {
> > + struct i915_priolist *p;
> > +
> > + if (i915_gem_request_completed(rq))
> > + break;
> > +
> > + __i915_gem_request_unsubmit(rq);
> > + unwind_wa_tail(rq);
>
> Fwiw, this is the flaw in this approach. As we decide to move the
> request back to the execution queue, it may complete. If we try to
> reexecute it, its ring->tail will be less than RING_HEAD, telling the hw
> to execute everything after it again.
>
> Michal's approach was to use a preemptive switch to a dummy context and
> then, once we knew the hw wasn't executing any of the other requests, he
> would unsubmit them and recompute the desired order. I've yet to see
> another solution for a cheaper barrier between hw/sw, as otherwise we
> must deliberately insert a stall to do preemption. :|
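To make the failure mode concrete, here is a throwaway user-space model of
the HEAD/TAIL arithmetic (illustrative only, not driver code; the ring size
and the numbers below are made up):

/*
 * Not driver code: a tiny model of why reloading a stale ring->tail is
 * dangerous.  The hw treats the ring as non-empty whenever HEAD != TAIL,
 * so a TAIL behind HEAD makes it run from HEAD all the way around the
 * ring until it reaches that TAIL again, i.e. it re-executes everything
 * after the completed request.
 */
#include <stdio.h>

#define RING_SIZE 16 /* made up; power-of-two, in dwords */

static unsigned int dwords_pending(unsigned int head, unsigned int tail)
{
	/* the hw executes from HEAD up to TAIL, modulo the ring size */
	return (tail - head) & (RING_SIZE - 1);
}

int main(void)
{
	unsigned int head = 12;      /* hw HEAD after the request completed */
	unsigned int stale_tail = 8; /* ring->tail recorded before completion */

	printf("resubmitting with a stale tail: hw sees %u dwords pending\n",
	       dwords_pending(head, stale_tail));
	return 0;
}

With those made-up numbers the model reports 12 dwords pending, i.e. the hw
believes nearly the whole ring needs executing again.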
My first thought for closing the race is to ensure that the gpu doesn't
advance across a breadcrumb while we are processing the list:
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index c712d01f92ab..18a08c701604 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -412,7 +412,12 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
#define MI_SEMAPHORE_TARGET(engine) ((engine)<<15)
#define MI_SEMAPHORE_WAIT MI_INSTR(0x1c, 2) /* GEN8+ */
#define MI_SEMAPHORE_POLL (1<<15)
+#define MI_SEMAPHORE_SAD_GT_SDD (0<<12)
#define MI_SEMAPHORE_SAD_GTE_SDD (1<<12)
+#define MI_SEMAPHORE_SAD_LT_SDD (2<<12)
+#define MI_SEMAPHORE_SAD_LTE_SDD (3<<12)
+#define MI_SEMAPHORE_SAD_EQ_SDD (4<<12)
+#define MI_SEMAPHORE_SAD_NEQ_SDD (5<<12)
#define MI_STORE_DWORD_IMM MI_INSTR(0x20, 1)
#define MI_STORE_DWORD_IMM_GEN4 MI_INSTR(0x20, 2)
#define MI_MEM_VIRTUAL (1 << 22) /* 945,g33,965 */
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index fe037bb9644c..c84e485cb323 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -505,6 +505,12 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
goto done;
if (p->priority > max(last->priotree.priority, 0)) {
+ u32 *gpu_sema = &engine->status_page.page_addr[I915_GEM_HWS_INDEX + 2];
+ /* Suspend updates from the gpu */
+ WRITE_ONCE(*gpu_sema, 1);
+ readl(engine->i915->regs +
+ i915_mmio_reg_offset(RING_ACTHD(engine->mmio_base)));
+
list_for_each_entry_safe_reverse(rq, rn,
&engine->timeline->requests,
link) {
@@ -613,11 +619,16 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
static void switch_to_preempt(struct intel_engine_cs *engine)
{
+ u32 *gpu_sema = &engine->status_page.page_addr[I915_GEM_HWS_INDEX + 2];
+
memcpy(engine->execlist_port,
engine->execlist_preempt,
sizeof(engine->execlist_preempt));
memset(engine->execlist_preempt, 0,
sizeof(engine->execlist_preempt));
+
+ /* Restart updates from the gpu */
+ WRITE_ONCE(*gpu_sema, 0);
}
/*
@@ -1667,6 +1678,14 @@ static void gen8_emit_breadcrumb(struct drm_i915_gem_request *request, u32 *cs)
/* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */
BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5));
+ *cs++ = MI_SEMAPHORE_WAIT |
+ MI_SEMAPHORE_POLL |
+ MI_SEMAPHORE_GLOBAL_GTT |
+ MI_SEMAPHORE_SAD_EQ_SDD;
+ *cs++ = 0; /* continue if zero (preempt == 0) */
+ *cs++ = intel_hws_seqno_address(request->engine) + 8;
+ *cs++ = 0;
+
*cs++ = (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW;
*cs++ = intel_hws_seqno_address(request->engine) | MI_FLUSH_DW_USE_GTT;
*cs++ = 0;
@@ -1679,7 +1698,7 @@ static void gen8_emit_breadcrumb(struct drm_i915_gem_request *request, u32 *cs)
gen8_emit_wa_tail(request, cs);
}
-static const int gen8_emit_breadcrumb_sz = 6 + WA_TAIL_DWORDS;
+static const int gen8_emit_breadcrumb_sz = 10 + WA_TAIL_DWORDS;
static void gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request,
u32 *cs)
@@ -1687,6 +1706,14 @@ static void gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request,
/* We're using qword write, seqno should be aligned to 8 bytes. */
BUILD_BUG_ON(I915_GEM_HWS_INDEX & 1);
+ *cs++ = MI_SEMAPHORE_WAIT |
+ MI_SEMAPHORE_POLL |
+ MI_SEMAPHORE_GLOBAL_GTT |
+ MI_SEMAPHORE_SAD_EQ_SDD;
+ *cs++ = 0; /* continue if zero */
+ *cs++ = intel_hws_seqno_address(request->engine) + 8;
+ *cs++ = 0;
+
/* w/a for post sync ops following a GPGPU operation we
* need a prior CS_STALL, which is emitted by the flush
* following the batch.
@@ -1707,7 +1734,7 @@ static void gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request,
gen8_emit_wa_tail(request, cs);
}
-static const int gen8_emit_breadcrumb_render_sz = 8 + WA_TAIL_DWORDS;
+static const int gen8_emit_breadcrumb_render_sz = 12 + WA_TAIL_DWORDS;
static int gen8_init_rcs_context(struct drm_i915_gem_request *req)
{
This cuts down the race to a concurrent memory op between the cpu and gpu.
The mmio read there is to hand-wave that away, by delaying us just long
enough to allow the GPU write to complete. Though even a no-op
MI_SEMAPHORE_WAIT has a just-about measurable impact on no-op batches.
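To spell out the handshake the patch sets up, here is a user-space model
(illustrative only, not driver code; names and values are made up):

/*
 * Not driver code: the "gpu" thread stands in for the MI_SEMAPHORE_WAIT
 * polling HWS_INDEX + 2 before the breadcrumb store; main() stands in for
 * execlists_dequeue() suspending and resuming it around the unwind.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint hws_sema;  /* models the dword at HWS_INDEX + 2 */
static atomic_uint hws_seqno; /* models the breadcrumb seqno dword */

static void *gpu_breadcrumb(void *arg)
{
	(void)arg;

	/* MI_SEMAPHORE_WAIT | POLL | SAD_EQ_SDD, with SDD == 0 */
	while (atomic_load(&hws_sema) != 0)
		; /* the hw keeps polling until the cpu releases it */

	atomic_store(&hws_seqno, 1); /* the seqno store the cpu must not miss */
	return NULL;
}

int main(void)
{
	pthread_t gpu;

	atomic_store(&hws_sema, 1); /* suspend breadcrumb advance... */
	pthread_create(&gpu, NULL, gpu_breadcrumb, NULL);

	/* ...walk and unwind the request list, knowing hws_seqno is frozen */
	printf("seqno while suspended: %u\n", atomic_load(&hws_seqno));

	atomic_store(&hws_sema, 0); /* ...then let the breadcrumb complete */
	pthread_join(gpu, NULL);
	printf("seqno after release: %u\n", atomic_load(&hws_seqno));
	return 0;
}

The model of course sidesteps what's left of the race: the gpu may sample
the semaphore as 0 just before the cpu flips it to 1, which is exactly the
window the mmio read above is papering over.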
-Chris