[Intel-gfx] [RFC 4/4] drm/i915/execlists: Preemption!
Chris Wilson
chris at chris-wilson.co.uk
Mon Jul 17 12:58:48 UTC 2017
Quoting Chris Wilson (2017-07-17 10:46:23)
> Quoting Chris Wilson (2017-07-17 09:42:35)
> > @@ -503,6 +500,49 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> > struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> > struct drm_i915_gem_request *rq, *rn;
> >
> > + if (once) {
> > + if (port_count(&port[0]) > 1)
> > + goto done;
> > +
> > + if (p->priority > max(last->priotree.priority, 0)) {
> > + list_for_each_entry_safe_reverse(rq, rn,
> > + &engine->timeline->requests,
> > + link) {
> > + struct i915_priolist *p;
> > +
> > + if (i915_gem_request_completed(rq))
> > + break;
> > +
> > + __i915_gem_request_unsubmit(rq);
> > + unwind_wa_tail(rq);
>
> Fwiw, this is the flaw in this approach. As we decide to move the
> request back to the execution queue, it may complete. If we try to
> reexecute it, its ring->tail will be less than RING_HEAD, telling the hw
> to execute everything after it again.
>
> Michal's approach was to use a preemptive switch to a dummy context and
> then, once we knew the hw wasn't executing any of the other requests, he
> would unsubmit them and recompute the desired order. I've yet to see
> another solution for a cheaper barrier between hw/sw, as otherwise we
> must deliberately insert a stall to do preemption. :|
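To make the failure mode concrete, here is a throwaway user-space model of
the HEAD/TAIL arithmetic (illustrative only, not driver code; the ring size
and the numbers below are made up):

/*
 * Not driver code: a tiny model of why reloading a stale ring->tail is
 * dangerous.  The hw treats the ring as non-empty whenever HEAD != TAIL,
 * so a TAIL behind HEAD makes it run from HEAD all the way around the
 * ring until it reaches that TAIL again, i.e. it re-executes everything
 * after the completed request.
 */
#include <stdio.h>

#define RING_SIZE 16 /* made up; power-of-two, in dwords */

static unsigned int dwords_pending(unsigned int head, unsigned int tail)
{
	/* the hw executes from HEAD up to TAIL, modulo the ring size */
	return (tail - head) & (RING_SIZE - 1);
}

int main(void)
{
	unsigned int head = 12;      /* hw HEAD after the request completed */
	unsigned int stale_tail = 8; /* ring->tail recorded before completion */

	printf("resubmitting with a stale tail: hw sees %u dwords pending\n",
	       dwords_pending(head, stale_tail));
	return 0;
}

With those made-up numbers the model reports 12 dwords pending, i.e. the hw
believes nearly the whole ring needs executing again.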
My first thought for closing the race is to ensure that the gpu doesn't
advance across a breadcrumb while we are processing the list:
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index c712d01f92ab..18a08c701604 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -412,7 +412,12 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
#define MI_SEMAPHORE_TARGET(engine) ((engine)<<15)
#define MI_SEMAPHORE_WAIT MI_INSTR(0x1c, 2) /* GEN8+ */
#define MI_SEMAPHORE_POLL (1<<15)
+#define MI_SEMAPHORE_SAD_GT_SDD (0<<12)
#define MI_SEMAPHORE_SAD_GTE_SDD (1<<12)
+#define MI_SEMAPHORE_SAD_LT_SDD (2<<12)
+#define MI_SEMAPHORE_SAD_LTE_SDD (3<<12)
+#define MI_SEMAPHORE_SAD_EQ_SDD (4<<12)
+#define MI_SEMAPHORE_SAD_NEQ_SDD (5<<12)
#define MI_STORE_DWORD_IMM MI_INSTR(0x20, 1)
#define MI_STORE_DWORD_IMM_GEN4 MI_INSTR(0x20, 2)
#define MI_MEM_VIRTUAL (1 << 22) /* 945,g33,965 */
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index fe037bb9644c..c84e485cb323 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -505,6 +505,12 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
goto done;
if (p->priority > max(last->priotree.priority, 0)) {
+ u32 *gpu_sema = &engine->status_page.page_addr[I915_GEM_HWS_INDEX + 2];
+ /* Suspend updates from the gpu */
+ WRITE_ONCE(*gpu_sema, 1);
+ readl(engine->i915->regs +
+ i915_mmio_reg_offset(RING_ACTHD(engine->mmio_base)));
+
list_for_each_entry_safe_reverse(rq, rn,
&engine->timeline->requests,
link) {
@@ -613,11 +619,16 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
static void switch_to_preempt(struct intel_engine_cs *engine)
{
+ u32 *gpu_sema = &engine->status_page.page_addr[I915_GEM_HWS_INDEX + 2];
+
memcpy(engine->execlist_port,
engine->execlist_preempt,
sizeof(engine->execlist_preempt));
memset(engine->execlist_preempt, 0,
sizeof(engine->execlist_preempt));
+
+ /* Restart updates from the gpu */
+ WRITE_ONCE(*gpu_sema, 0);
}
/*
@@ -1667,6 +1678,14 @@ static void gen8_emit_breadcrumb(struct drm_i915_gem_request *request, u32 *cs)
/* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */
BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5));
+ *cs++ = MI_SEMAPHORE_WAIT |
+ MI_SEMAPHORE_POLL |
+ MI_SEMAPHORE_GLOBAL_GTT |
+ MI_SEMAPHORE_SAD_EQ_SDD;
+ *cs++ = 0; /* continue if zero (preempt == 0) */
+ *cs++ = intel_hws_seqno_address(request->engine) + 8;
+ *cs++ = 0;
+
*cs++ = (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW;
*cs++ = intel_hws_seqno_address(request->engine) | MI_FLUSH_DW_USE_GTT;
*cs++ = 0;
@@ -1679,7 +1698,7 @@ static void gen8_emit_breadcrumb(struct drm_i915_gem_request *request, u32 *cs)
gen8_emit_wa_tail(request, cs);
}
-static const int gen8_emit_breadcrumb_sz = 6 + WA_TAIL_DWORDS;
+static const int gen8_emit_breadcrumb_sz = 10 + WA_TAIL_DWORDS;
static void gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request,
u32 *cs)
@@ -1687,6 +1706,14 @@ static void gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request,
/* We're using qword write, seqno should be aligned to 8 bytes. */
BUILD_BUG_ON(I915_GEM_HWS_INDEX & 1);
+ *cs++ = MI_SEMAPHORE_WAIT |
+ MI_SEMAPHORE_POLL |
+ MI_SEMAPHORE_GLOBAL_GTT |
+ MI_SEMAPHORE_SAD_EQ_SDD;
+ *cs++ = 0; /* continue if zero */
+ *cs++ = intel_hws_seqno_address(request->engine) + 8;
+ *cs++ = 0;
+
/* w/a for post sync ops following a GPGPU operation we
* need a prior CS_STALL, which is emitted by the flush
* following the batch.
@@ -1707,7 +1734,7 @@ static void gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request,
gen8_emit_wa_tail(request, cs);
}
-static const int gen8_emit_breadcrumb_render_sz = 8 + WA_TAIL_DWORDS;
+static const int gen8_emit_breadcrumb_render_sz = 12 + WA_TAIL_DWORDS;
static int gen8_init_rcs_context(struct drm_i915_gem_request *req)
{
This cuts down the race to a concurrent memory op between the cpu and gpu.
The mmio read there is to hand-wave that away, by delaying us just long
enough to allow the GPU write to complete. Though even a no-op
MI_SEMAPHORE_WAIT has a just-about measurable impact on no-op batches.
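To spell out the handshake the patch sets up, here is a user-space model
(illustrative only, not driver code; names and values are made up):

/*
 * Not driver code: the "gpu" thread stands in for the MI_SEMAPHORE_WAIT
 * polling HWS_INDEX + 2 before the breadcrumb store; main() stands in for
 * execlists_dequeue() suspending and resuming it around the unwind.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint hws_sema;  /* models the dword at HWS_INDEX + 2 */
static atomic_uint hws_seqno; /* models the breadcrumb seqno dword */

static void *gpu_breadcrumb(void *arg)
{
	(void)arg;

	/* MI_SEMAPHORE_WAIT | POLL | SAD_EQ_SDD, with SDD == 0 */
	while (atomic_load(&hws_sema) != 0)
		; /* the hw keeps polling until the cpu releases it */

	atomic_store(&hws_seqno, 1); /* the seqno store the cpu must not miss */
	return NULL;
}

int main(void)
{
	pthread_t gpu;

	atomic_store(&hws_sema, 1); /* suspend breadcrumb advance... */
	pthread_create(&gpu, NULL, gpu_breadcrumb, NULL);

	/* ...walk and unwind the request list, knowing hws_seqno is frozen */
	printf("seqno while suspended: %u\n", atomic_load(&hws_seqno));

	atomic_store(&hws_sema, 0); /* ...then let the breadcrumb complete */
	pthread_join(gpu, NULL);
	printf("seqno after release: %u\n", atomic_load(&hws_seqno));
	return 0;
}

The model of course sidesteps what's left of the race: the gpu may sample
the semaphore as 0 just before the cpu flips it to 1, which is exactly the
window the mmio read above is papering over.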
-Chris