[Mesa-dev] [RFC 11/12] anv/cmd_buffer: Emit instanced draws for multiple views

Fri Mar 24 12:53:58 UTC 2017

On Wed, 2017-03-22 at 21:01 -0700, Jason Ekstrand wrote:
---
 src/intel/vulkan/anv_private.h     |   6 ++
 src/intel/vulkan/genX_cmd_buffer.c | 141
++++++++++++++++++++++++++++++++++++-
 src/intel/vulkan/genX_pipeline.c   |  10 ++-
 3 files changed, 152 insertions(+), 5 deletions(-)

diff --git a/src/intel/vulkan/anv_private.h
b/src/intel/vulkan/anv_private.h
index b402bc3..253dce2 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -1980,6 +1980,12 @@ struct anv_subpass {
    bool                                         has_resolve;
 };
 
+static inline unsigned
+anv_subpass_view_count(const struct anv_subpass *subpass)
+{
+   return MAX2(1, _mesa_bitcount(subpass->view_mask));
+}
+
 enum anv_subpass_usage {
    ANV_SUBPASS_USAGE_DRAW =         (1 << 0),
    ANV_SUBPASS_USAGE_INPUT =        (1 << 1),
diff --git a/src/intel/vulkan/genX_cmd_buffer.c
b/src/intel/vulkan/genX_cmd_buffer.c
index aafe7fd..8c21c33 100644
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -26,6 +26,7 @@
 
 #include "anv_private.h"
 #include "vk_format_info.h"
+#include "util/vk_util.h"
 
 #include "common/gen_l3_config.h"
 #include "genxml/gen_macros.h"
@@ -50,6 +51,17 @@ emit_lri(struct anv_batch *batch, uint32_t reg,
uint32_t imm)
    }
 }
 
+#if GEN_IS_HASWELL || GEN_GEN >= 8
+static void
+emit_lrr(struct anv_batch *batch, uint32_t dst, uint32_t src)
+{
+   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
+      lrr.SourceRegisterAddress        = src;
+      lrr.DestinationRegisterAddress   = dst;
+   }
+}
+#endif
+
 void
 genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer
*cmd_buffer)
 {
@@ -1525,7 +1537,13 @@ genX(cmd_buffer_flush_state)(struct
anv_cmd_buffer *cmd_buffer)
             .MemoryObjectControlState = GENX(MOCS),
 #else
             .BufferAccessType = pipeline->instancing_enable[vb] ?
INSTANCEDATA : VERTEXDATA,
-            .InstanceDataStepRate = 1,
+            /* Our implementation of VK_KHR_multiview uses
instancing to draw
+             * the different views.  If the client asks for
instancing, we
+             * need to use the Instance Data Step Rate to ensure
that we
+             * repeat the client's per-instance data once for each
view.
+             */
+            .InstanceDataStepRate =
+               _mesa_bitcount(pipeline->subpass->view_mask),

mmm... shouldn't this be:

.InstanceDataStepRate = anv_subpass_view_count(pipeline->subpass);

so that we set this to 1 when multiview is not in use? (view_mask == 0)

             .VertexBufferMemoryObjectControlState = GENX(MOCS),
 #endif
 
@@ -1715,6 +1733,11 @@ void genX(CmdDraw)(
    if (vs_prog_data->uses_drawid)
       emit_draw_index(cmd_buffer, 0);
 
+   /* Our implementation of VK_KHR_multiview uses instancing to draw
the
+    * different views.  We need to multiply instanceCount by the
view count.
+    */
+   instanceCount *= anv_subpass_view_count(cmd_buffer-
>state.subpass);
+
    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
       prim.VertexAccessType         = SEQUENTIAL;
       prim.PrimitiveTopologyType    = pipeline->topology;
@@ -1748,6 +1771,11 @@ void genX(CmdDrawIndexed)(
    if (vs_prog_data->uses_drawid)
       emit_draw_index(cmd_buffer, 0);
 
+   /* Our implementation of VK_KHR_multiview uses instancing to draw
the
+    * different views.  We need to multiply instanceCount by the
view count.
+    */
+   instanceCount *= anv_subpass_view_count(cmd_buffer-
>state.subpass);
+
    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
       prim.VertexAccessType         = RANDOM;
       prim.PrimitiveTopologyType    = pipeline->topology;
@@ -1767,6 +1795,90 @@ void genX(CmdDrawIndexed)(
 #define GEN7_3DPRIM_START_INSTANCE      0x243C
 #define GEN7_3DPRIM_BASE_VERTEX         0x2440
 
+/* MI_MATH only exists on Haswell+ */
+#if GEN_IS_HASWELL || GEN_GEN >= 8
+
+#define alu_opcode(v)   __gen_uint((v),  20, 31)
+#define alu_operand1(v) __gen_uint((v),  10, 19)
+#define alu_operand2(v) __gen_uint((v),   0,  9)
+#define alu(opcode, operand1, operand2) \
+   alu_opcode(opcode) | alu_operand1(operand1) |
alu_operand2(operand2)
+
+#define OPCODE_NOOP      0x000
+#define OPCODE_LOAD      0x080
+#define OPCODE_LOADINV   0x480
+#define OPCODE_LOAD0     0x081
+#define OPCODE_LOAD1     0x481
+#define OPCODE_ADD       0x100
+#define OPCODE_SUB       0x101
+#define OPCODE_AND       0x102
+#define OPCODE_OR        0x103
+#define OPCODE_XOR       0x104
+#define OPCODE_STORE     0x180
+#define OPCODE_STOREINV  0x580
+
+#define OPERAND_R0   0x00
+#define OPERAND_R1   0x01
+#define OPERAND_R2   0x02
+#define OPERAND_R3   0x03
+#define OPERAND_R4   0x04
+#define OPERAND_SRCA 0x20
+#define OPERAND_SRCB 0x21
+#define OPERAND_ACCU 0x31
+#define OPERAND_ZF   0x32
+#define OPERAND_CF   0x33
+
+#define CS_GPR(n) (0x2600 + (n) * 8)
+
+/* Emit dwords to multiply GPR0 by N */
+static void
+build_alu_multiply_gpr0(uint32_t *dw, unsigned *dw_count, uint32_t
N)
+{
+   VK_OUTARRAY_MAKE(out, dw, dw_count);
+
+#define append_alu(opcode, operand1, operand2) \
+   vk_outarray_append(&out, alu_dw) *alu_dw = alu(opcode, operand1,
operand2)
+
+   assert(N > 0);
+   unsigned top_bit = 31 - __builtin_clz(N);
+   for (int i = top_bit - 1; i >= 0; i--) {
+      /* We get our initial data in GPR0 and we write the final data
out to
+       * GPR0 but we use GPR1 as our scratch register.
+       */
+      unsigned src_reg = i == top_bit - 1 ? OPERAND_R0 : OPERAND_R1;
+      unsigned dst_reg = i == 0 ? OPERAND_R0 : OPERAND_R1;
+
+      /* Shift the current value left by 1 */
+      append_alu(OPCODE_LOAD, OPERAND_SRCA, src_reg);
+      append_alu(OPCODE_LOAD, OPERAND_SRCB, src_reg);
+      append_alu(OPCODE_AND, 0, 0);
+
+      if (N & (1 << i)) {
+         /* Store ACCU to R1 and add R0 to R1 */
+         append_alu(OPCODE_STORE, OPERAND_R1, OPERAND_ACCU);
+         append_alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R0);
+         append_alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R1);
+         append_alu(OPCODE_AND, 0, 0);
+      }
+
+      append_alu(OPCODE_STORE, dst_reg, OPERAND_ACCU);
+   }
+
+#undef append_alu
+}
+
+static void
+emit_mul_gpr0(struct anv_batch *batch, uint32_t N)
+{
+   uint32_t num_dwords;
> +   build_alu_multiply_gpr0(NULL, &num_dwords, N);

I don't get this, it seems that the expectation of this code is
that build_alu_multiply_gpr0() fills in num_dwords  when we call it
with NULL in the first argument, but I don't see where it does that,
what am I missing?

+ 
+   uint32_t *dw = anv_batch_emitn(batch, 1 + num_dwords,
GENX(MI_MATH));
+   build_alu_multiply_gpr0(dw, &num_dwords, N);
+}
+
+#endif /* GEN_IS_HASWELL || GEN_GEN >= 8 */
+
 static void
 load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
                          struct anv_buffer *buffer, uint64_t offset,
@@ -1777,7 +1889,22 @@ load_indirect_parameters(struct anv_cmd_buffer
*cmd_buffer,
    uint32_t bo_offset = buffer->offset + offset;
 
    emit_lrm(batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
-   emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
+
+   unsigned view_count = anv_subpass_view_count(cmd_buffer-
>state.subpass);
+   if (view_count > 1) {
+#if GEN_IS_HASWELL || GEN_GEN >= 8
+      emit_lrm(batch, CS_GPR(0), bo, bo_offset + 4);
+      emit_mul_gpr0(batch, view_count);
+      emit_lrr(batch, GEN7_3DPRIM_INSTANCE_COUNT, CS_GPR(0));
+#else
+      anv_finishme("Multiview + indirect draw requires MI_MATH\n"
+                   "MI_MATH is not supported on Ivy Bridge");
+      emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset +
4);
+#endif
+   } else {
+      emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset +
4);
+   }
+
    emit_lrm(batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
 
    if (indexed) {
@@ -2531,6 +2658,16 @@ genX(cmd_buffer_set_subpass)(struct
anv_cmd_buffer *cmd_buffer,
 
    cmd_buffer->state.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
 
+   /* Our implementation of VK_KHR_multiview uses instancing to draw
the
+    * different views.  If the client asks for instancing, we need
to use the
+    * Instance Data Step Rate to ensure that we repeat the client's
+    * per-instance data once for each view.  Since this bit is in
+    * VERTEX_BUFFER_STATE on gen7, we need to dirty vertex buffers
at the top
+    * of each subpass.
+    */
+   if (GEN_GEN == 7)
+      cmd_buffer->state.vb_dirty |= ~0;
> +

Shouldn't we do:

if (GEN_GEN == 7 && subpass->view_mask != 0)
    cmd_buffer->state.vb_dirty |= ~0;

So this doesn't affect cases that don't use multiview?

>     /* Perform transitions to the subpass layout before any writes
> have
     * occurred.
     */
diff --git a/src/intel/vulkan/genX_pipeline.c
b/src/intel/vulkan/genX_pipeline.c
index 85a9e4f..9aaa8a0 100644
--- a/src/intel/vulkan/genX_pipeline.c
+++ b/src/intel/vulkan/genX_pipeline.c
@@ -155,9 +155,13 @@ emit_vertex_input(struct anv_pipeline *pipeline,
       anv_batch_emit(&pipeline->batch, GENX(3DSTATE_VF_INSTANCING),
vfi) {
          vfi.InstancingEnable = pipeline->instancing_enable[desc-
>binding];
          vfi.VertexElementIndex = slot;
-         /* Vulkan so far doesn't have an instance divisor, so
-          * this is always 1 (ignored if not instancing). */
-         vfi.InstanceDataStepRate = 1;
+         /* Our implementation of VK_KHX_multiview uses instancing
to draw
+          * the different views.  If the client asks for instancing,
we
+          * need to use the Instance Data Step Rate to ensure that
we
+          * repeat the client's per-instance data once for each
view.
+          */
+         vfi.InstanceDataStepRate =
> +            _mesa_bitcount(pipeline->subpass->view_mask);

Same comment as above, shouldn't this be:

vfi.InstanceDataStepRate = anv_subpass_view_count(pipeline->subpass);

instead?

>        }
 #endif
    }