Mesa (master): i965: Emit VF cache invalidates for 48-bit addressing bugs with softpin.

Tue May 22 17:05:10 UTC 2018

Module: Mesa
Branch: master
Commit: 92f01fc5f914fd500497d0c3aed75f3ac8dc054d
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=92f01fc5f914fd500497d0c3aed75f3ac8dc054d

Author: Kenneth Graunke <kenneth at whitecape.org>
Date:   Mon Apr  9 15:39:56 2018 -0700

i965: Emit VF cache invalidates for 48-bit addressing bugs with softpin.

We'd like to start using soft-pin to assign BO addresses up front, and
never move them again.  Our previous plan for dealing with 48-bit VF
cache bugs was to relocate vertex buffers to the low 4GB, so we'd never
have addresses that alias in the low 32 bits.  But that requires moving
buffers dynamically.

This patch tracks the last seen BO address for each vertex/index buffer,
and emits a VF cache invalidate if the high bits change.  (Ideally, we
won't hit this case very often.)  This should work for the soft-pin
case, but unfortunately won't work in the relocation case, as we don't
actually know the addresses.  So, we have to use both methods.

v2: Mention that the cache uses a <VertexBufferIndex, Address> tuple
    more explicitly (suggested by Scott).  Mention "single batch" too
    (suggested by Chris).

Reviewed-by: Scott D Phillips <scott.d.phillips at intel.com>

---

 src/mesa/drivers/dri/i965/brw_context.h       |  6 +++
 src/mesa/drivers/dri/i965/genX_state_upload.c | 63 +++++++++++++++++++++++++++
 2 files changed, 69 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 0844400bc5..773f104824 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -966,6 +966,9 @@ struct brw_context
        * These bitfields indicate which workarounds are needed.
        */
       uint8_t attrib_wa_flags[VERT_ATTRIB_MAX];
+
+      /* High bits of the last seen vertex buffer address (for workarounds). */
+      uint16_t last_bo_high_bits[33];
    } vb;
 
    struct {
@@ -986,6 +989,9 @@ struct brw_context
        * referencing the same index buffer.
        */
       unsigned int start_vertex_offset;
+
+      /* High bits of the last seen index buffer address (for workarounds). */
+      uint16_t last_bo_high_bits;
    } ib;
 
    /* Active vertex program:
diff --git a/src/mesa/drivers/dri/i965/genX_state_upload.c b/src/mesa/drivers/dri/i965/genX_state_upload.c
index 6178bfa3f8..4f44b9965e 100644
--- a/src/mesa/drivers/dri/i965/genX_state_upload.c
+++ b/src/mesa/drivers/dri/i965/genX_state_upload.c
@@ -480,6 +480,65 @@ upload_format_size(uint32_t upload_format)
    }
 }
 
+static UNUSED uint16_t
+pinned_bo_high_bits(struct brw_bo *bo)
+{
+   return (bo->kflags & EXEC_OBJECT_PINNED) ? bo->gtt_offset >> 32ull : 0;
+}
+
+/* The VF cache designers apparently cut corners, and made the cache key's
+ * <VertexBufferIndex, Memory Address> tuple only consider the bottom 32 bits
+ * of the address.  If you happen to have two vertex buffers which get placed
+ * exactly 4 GiB apart and use them in back-to-back draw calls, you can get
+ * collisions.  (These collisions can happen within a single batch.)
+ *
+ * In the soft-pin world, we'd like to assign addresses up front, and never
+ * move buffers.  So, we need to do a VF cache invalidate if the buffer for
+ * a particular VB slot has different [48:32] address bits than the last one.
+ *
+ * In the relocation world, we have no idea what the addresses will be, so
+ * we can't apply this workaround.  Instead, we tell the kernel to move it
+ * to the low 4GB regardless.
+ */
+static void
+vf_invalidate_for_vb_48bit_transitions(struct brw_context *brw)
+{
+#if GEN_GEN >= 8
+   bool need_invalidate = true;
+   unsigned i;
+
+   for (i = 0; i < brw->vb.nr_buffers; i++) {
+      uint16_t high_bits = pinned_bo_high_bits(brw->vb.buffers[i].bo);
+
+      if (high_bits != brw->vb.last_bo_high_bits[i]) {
+         need_invalidate = true;
+         brw->vb.last_bo_high_bits[i] = high_bits;
+      }
+   }
+
+   /* Don't bother with draw parameter buffers - those are generated by
+    * the driver so we can select a consistent memory zone.
+    */
+
+   if (need_invalidate) {
+      brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE);
+   }
+#endif
+}
+
+static void
+vf_invalidate_for_ib_48bit_transition(struct brw_context *brw)
+{
+#if GEN_GEN >= 8
+   uint16_t high_bits = pinned_bo_high_bits(brw->ib.bo);
+
+   if (high_bits != brw->ib.last_bo_high_bits) {
+      brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE);
+      brw->ib.last_bo_high_bits = high_bits;
+   }
+#endif
+}
+
 static void
 genX(emit_vertices)(struct brw_context *brw)
 {
@@ -594,6 +653,8 @@ genX(emit_vertices)(struct brw_context *brw)
    const unsigned nr_buffers = brw->vb.nr_buffers +
       uses_draw_params + uses_derived_draw_params;
 
+   vf_invalidate_for_vb_48bit_transitions(brw);
+
    if (nr_buffers) {
       assert(nr_buffers <= (GEN_GEN >= 6 ? 33 : 17));
 
@@ -886,6 +947,8 @@ genX(emit_index_buffer)(struct brw_context *brw)
    if (index_buffer == NULL)
       return;
 
+   vf_invalidate_for_ib_48bit_transition(brw);
+
    brw_batch_emit(brw, GENX(3DSTATE_INDEX_BUFFER), ib) {
 #if GEN_GEN < 8 && !GEN_IS_HASWELL
       ib.CutIndexEnable = brw->prim_restart.enable_cut_index;