[Mesa-dev] [RFC 11/12] anv/cmd_buffer: Emit instanced draws for multiple views
Iago Toral
itoral at igalia.com
Mon Mar 27 12:33:07 UTC 2017
On Fri, 2017-03-24 at 16:57 -0700, Jason Ekstrand wrote:
> On Fri, Mar 24, 2017 at 5:53 AM, Iago Toral <itoral at igalia.com>
> wrote:
> > On Wed, 2017-03-22 at 21:01 -0700, Jason Ekstrand wrote:
> > ---
> > src/intel/vulkan/anv_private.h | 6 ++
> > src/intel/vulkan/genX_cmd_buffer.c | 141
> > ++++++++++++++++++++++++++++++++++++-
> > src/intel/vulkan/genX_pipeline.c | 10 ++-
> > 3 files changed, 152 insertions(+), 5 deletions(-)
> >
> >
> > diff --git a/src/intel/vulkan/anv_private.h
> > b/src/intel/vulkan/anv_private.h
> > index b402bc3..253dce2 100644
> > --- a/src/intel/vulkan/anv_private.h
> > +++ b/src/intel/vulkan/anv_private.h
> > @@ -1980,6 +1980,12 @@ struct anv_subpass {
> > bool has_resolve;
> > };
> >
> > +static inline unsigned
> > +anv_subpass_view_count(const struct anv_subpass *subpass)
> > +{
> > + return MAX2(1, _mesa_bitcount(subpass->view_mask));
> > +}
> > +
> > enum anv_subpass_usage {
> > ANV_SUBPASS_USAGE_DRAW = (1 << 0),
> > ANV_SUBPASS_USAGE_INPUT = (1 << 1),
> > diff --git a/src/intel/vulkan/genX_cmd_buffer.c
> > b/src/intel/vulkan/genX_cmd_buffer.c
> > index aafe7fd..8c21c33 100644
> > --- a/src/intel/vulkan/genX_cmd_buffer.c
> > +++ b/src/intel/vulkan/genX_cmd_buffer.c
> > @@ -26,6 +26,7 @@
> >
> > #include "anv_private.h"
> > #include "vk_format_info.h"
> > +#include "util/vk_util.h"
> >
> > #include "common/gen_l3_config.h"
> > #include "genxml/gen_macros.h"
> > @@ -50,6 +51,17 @@ emit_lri(struct anv_batch *batch, uint32_t reg,
> > uint32_t imm)
> > }
> > }
> >
> > +#if GEN_IS_HASWELL || GEN_GEN >= 8
> > +static void
> > +emit_lrr(struct anv_batch *batch, uint32_t dst, uint32_t src)
> > +{
> > + anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
> > + lrr.SourceRegisterAddress = src;
> > + lrr.DestinationRegisterAddress = dst;
> > + }
> > +}
> > +#endif
> > +
> > void
> > genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer
> > *cmd_buffer)
> > {
> > @@ -1525,7 +1537,13 @@ genX(cmd_buffer_flush_state)(struct
> > anv_cmd_buffer *cmd_buffer)
> > .MemoryObjectControlState = GENX(MOCS),
> > #else
> > .BufferAccessType = pipeline->instancing_enable[vb] ?
> > INSTANCEDATA : VERTEXDATA,
> > - .InstanceDataStepRate = 1,
> > + /* Our implementation of VK_KHR_multiview uses
> > instancing to draw
> > + * the different views. If the client asks for
> > instancing, we
> > + * need to use the Instance Data Step Rate to ensure
> > that we
> > + * repeat the client's per-instance data once for each
> > view.
> > + */
> > + .InstanceDataStepRate =
> > + _mesa_bitcount(pipeline->subpass->view_mask),
> >
> > mmm... shouldn't this be:
> >
> > .InstanceDataStepRate = anv_subpass_view_count(pipeline->subpass);
> >
> > so that we set this to 1 when multiview is not in use? (view_mask
> > == 0)
> Good call! You're absolutely right. This line is older than
> anv_subpass_view_count, I think.
>
> >
> > .VertexBufferMemoryObjectControlState = GENX(MOCS),
> > #endif
> >
> > @@ -1715,6 +1733,11 @@ void genX(CmdDraw)(
> > if (vs_prog_data->uses_drawid)
> > emit_draw_index(cmd_buffer, 0);
> >
> > + /* Our implementation of VK_KHR_multiview uses instancing to
> > draw
> > the
> > + * different views. We need to multiply instanceCount by the
> > view count.
> > + */
> > + instanceCount *= anv_subpass_view_count(cmd_buffer-
> > >state.subpass);
> > +
> > anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
> > prim.VertexAccessType = SEQUENTIAL;
> > prim.PrimitiveTopologyType = pipeline->topology;
> > @@ -1748,6 +1771,11 @@ void genX(CmdDrawIndexed)(
> > if (vs_prog_data->uses_drawid)
> > emit_draw_index(cmd_buffer, 0);
> >
> > + /* Our implementation of VK_KHR_multiview uses instancing to
> > draw
> > the
> > + * different views. We need to multiply instanceCount by the
> > view count.
> > + */
> > + instanceCount *= anv_subpass_view_count(cmd_buffer-
> > >state.subpass);
> > +
> > anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
> > prim.VertexAccessType = RANDOM;
> > prim.PrimitiveTopologyType = pipeline->topology;
> > @@ -1767,6 +1795,90 @@ void genX(CmdDrawIndexed)(
> > #define GEN7_3DPRIM_START_INSTANCE 0x243C
> > #define GEN7_3DPRIM_BASE_VERTEX 0x2440
> >
> > +/* MI_MATH only exists on Haswell+ */
> > +#if GEN_IS_HASWELL || GEN_GEN >= 8
> > +
> > +#define alu_opcode(v) __gen_uint((v), 20, 31)
> > +#define alu_operand1(v) __gen_uint((v), 10, 19)
> > +#define alu_operand2(v) __gen_uint((v), 0, 9)
> > +#define alu(opcode, operand1, operand2) \
> > + alu_opcode(opcode) | alu_operand1(operand1) |
> > alu_operand2(operand2)
> > +
> > +#define OPCODE_NOOP 0x000
> > +#define OPCODE_LOAD 0x080
> > +#define OPCODE_LOADINV 0x480
> > +#define OPCODE_LOAD0 0x081
> > +#define OPCODE_LOAD1 0x481
> > +#define OPCODE_ADD 0x100
> > +#define OPCODE_SUB 0x101
> > +#define OPCODE_AND 0x102
> > +#define OPCODE_OR 0x103
> > +#define OPCODE_XOR 0x104
> > +#define OPCODE_STORE 0x180
> > +#define OPCODE_STOREINV 0x580
> > +
> > +#define OPERAND_R0 0x00
> > +#define OPERAND_R1 0x01
> > +#define OPERAND_R2 0x02
> > +#define OPERAND_R3 0x03
> > +#define OPERAND_R4 0x04
> > +#define OPERAND_SRCA 0x20
> > +#define OPERAND_SRCB 0x21
> > +#define OPERAND_ACCU 0x31
> > +#define OPERAND_ZF 0x32
> > +#define OPERAND_CF 0x33
> > +
> > +#define CS_GPR(n) (0x2600 + (n) * 8)
> > +
> > +/* Emit dwords to multiply GPR0 by N */
> > +static void
> > +build_alu_multiply_gpr0(uint32_t *dw, unsigned *dw_count, uint32_t
> > N)
> > +{
> > + VK_OUTARRAY_MAKE(out, dw, dw_count);
> > +
> > +#define append_alu(opcode, operand1, operand2) \
> > + vk_outarray_append(&out, alu_dw) *alu_dw = alu(opcode,
> > operand1,
> > operand2)
> > +
> > + assert(N > 0);
> > + unsigned top_bit = 31 - __builtin_clz(N);
> > + for (int i = top_bit - 1; i >= 0; i--) {
> > + /* We get our initial data in GPR0 and we write the final
> > data
> > out to
> > + * GPR0 but we use GPR1 as our scratch register.
> > + */
> > + unsigned src_reg = i == top_bit - 1 ? OPERAND_R0 :
> > OPERAND_R1;
> > + unsigned dst_reg = i == 0 ? OPERAND_R0 : OPERAND_R1;
> > +
> > + /* Shift the current value left by 1 */
> > + append_alu(OPCODE_LOAD, OPERAND_SRCA, src_reg);
> > + append_alu(OPCODE_LOAD, OPERAND_SRCB, src_reg);
> > + append_alu(OPCODE_AND, 0, 0);
> > +
> > + if (N & (1 << i)) {
> > + /* Store ACCU to R1 and add R0 to R1 */
> > + append_alu(OPCODE_STORE, OPERAND_R1, OPERAND_ACCU);
> > + append_alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R0);
> > + append_alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R1);
> > + append_alu(OPCODE_AND, 0, 0);
> > + }
> > +
> > + append_alu(OPCODE_STORE, dst_reg, OPERAND_ACCU);
> > + }
> > +
> > +#undef append_alu
> > +}
> > +
> > +static void
> > +emit_mul_gpr0(struct anv_batch *batch, uint32_t N)
> > +{
> > + uint32_t num_dwords;
> > > + build_alu_multiply_gpr0(NULL, &num_dwords, N);
> >
> > I don't get this, it seems that the expectation of this code is
> > that build_alu_multiply_gpr0() fills in num_dwords when we call it
> > with NULL in the first argument, but I don't see where it does
> > that,
> > what am I missing?
> build_alu_multiply_gpr0 has the same semantics as the Vulkan query
> functions that return arrays. If you call it with NULL, it returns
> the number of dwords required. Otherwise, it assumes num_dwords is
> the number of dwords that it has available to write. I made use of
> vk_outarray to get this behavior. Yeah, it's a bit deceptive, but it
> worked out really well. :-) Feel free to tell me that it's a bit too
> clever and that I should make it simpler.
No, it is okay, I understood what the intended behavior was but somehow
I didn't see clearly how the macros in vk-util achieved that and got
confused, I looked at it more carefully now and I think it is clear and
makes the implementation easier too.
> >
> > +
> >
> > + uint32_t *dw = anv_batch_emitn(batch, 1 + num_dwords,
> >
> > GENX(MI_MATH));
> >
> > + build_alu_multiply_gpr0(dw, &num_dwords, N);
> >
> > +}
> >
> > +
> >
> > +#endif /* GEN_IS_HASWELL || GEN_GEN >= 8 */
> >
> > +
> >
> > static void
> >
> > load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
> >
> >                          struct anv_buffer *buffer, uint64_t offset,
> >
> > @@ -1777,7 +1889,22 @@ load_indirect_parameters(struct anv_cmd_buffer
> >
> > *cmd_buffer,
> >
> > uint32_t bo_offset = buffer->offset + offset;
> >
> >
> >
> > emit_lrm(batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
> >
> > - emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
> >
> > +
> >
> > + unsigned view_count = anv_subpass_view_count(cmd_buffer->state.subpass);
> >
> > + if (view_count > 1) {
> >
> > +#if GEN_IS_HASWELL || GEN_GEN >= 8
> >
> > + emit_lrm(batch, CS_GPR(0), bo, bo_offset + 4);
> >
> > + emit_mul_gpr0(batch, view_count);
> >
> > + emit_lrr(batch, GEN7_3DPRIM_INSTANCE_COUNT, CS_GPR(0));
> >
> > +#else
> >
> > + anv_finishme("Multiview + indirect draw requires MI_MATH\n"
> >
> > + "MI_MATH is not supported on Ivy Bridge");
> >
> > + emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset +
> >
> > 4);
> >
> > +#endif
> >
> > + } else {
> >
> > + emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset +
> >
> > 4);
> >
> > + }
> >
> > +
> >
> > emit_lrm(batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
> >
> >
> >
> > if (indexed) {
> >
> > @@ -2531,6 +2658,16 @@ genX(cmd_buffer_set_subpass)(struct anv_cmd_buffer *cmd_buffer,
> >
> >
> >
> > cmd_buffer->state.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
> >
> >
> >
> > + /* Our implementation of VK_KHR_multiview uses instancing to draw
> >
> > the
> >
> > + * different views. If the client asks for instancing, we need
> >
> > to use the
> >
> > + * Instance Data Step Rate to ensure that we repeat the client's
> >
> > + * per-instance data once for each view. Since this bit is in
> >
> > + * VERTEX_BUFFER_STATE on gen7, we need to dirty vertex buffers
> >
> > at the top
> >
> > + * of each subpass.
> >
> > + */
> >
> > + if (GEN_GEN == 7)
> >
> > + cmd_buffer->state.vb_dirty |= ~0;
> >
> > > +
> >
> >
> > Shouldn't we do:
> >
> >
> > if (GEN_GEN == 7 && subpass->view_mask != 0)
> >
> > cmd_buffer->state.vb_dirty |= ~0;
> >
> >
> > So this doesn't affect cases that don't use multiview?
> Yes, we could do that because we're guaranteed that, for each render pass, either all subpass view masks are 0 or all are non-zero. This seems safer though and I doubt flushing vertex buffers at the subpass boundary will hurt anyone.
Right, makes sense.
Reviewed-by: Iago Toral Quiroga <itoral at igalia.com>
> >
> > > /* Perform transitions to the subpass layout before any writes
> >
> > > have
> >
> > * occurred.
> >
> > */
> >
> > diff --git a/src/intel/vulkan/genX_pipeline.c
> >
> > b/src/intel/vulkan/genX_pipeline.c
> >
> > index 85a9e4f..9aaa8a0 100644
> >
> > --- a/src/intel/vulkan/genX_pipeline.c
> >
> > +++ b/src/intel/vulkan/genX_pipeline.c
> >
> > @@ -155,9 +155,13 @@ emit_vertex_input(struct anv_pipeline *pipeline,
> >
> > anv_batch_emit(&> > pipeline->batch, GENX(3DSTATE_VF_INSTANCING),
> >
> > vfi) {
> >
> > vfi.InstancingEnable = pipeline->instancing_enable[desc->binding];
> >
> > vfi.VertexElementIndex = slot;
> >
> > - /* Vulkan so far doesn't have an instance divisor, so
> >
> > - * this is always 1 (ignored if not instancing). */
> >
> > - vfi.InstanceDataStepRate = 1;
> >
> > + /* Our implementation of VK_KHX_multiview uses instancing
> >
> > to draw
> >
> > + * the different views. If the client asks for instancing,
> >
> > we
> >
> > + * need to use the Instance Data Step Rate to ensure that
> >
> > we
> >
> > + * repeat the client's per-instance data once for each
> >
> > view.
> >
> > + */
> >
> > + vfi.InstanceDataStepRate =
> >
> > + _mesa_bitcount(pipeline->subpass->view_mask);
> >
> >
> > Same comment as above, shouldn't this be:
> >
> >
> > vfi.InstanceDataStepRate = anv_subpass_view_count(pipeline->subpass);
> > +
> >
> > + uint32_t *dw = anv_batch_emitn(batch, 1 + num_dwords,
> >
> > GENX(MI_MATH));
> >
> > + build_alu_multiply_gpr0(dw, &num_dwords, N);
> >
> > +}
> >
> > +
> >
> > +#endif /* GEN_IS_HASWELL || GEN_GEN >= 8 */
> >
> > +
> >
> > static void
> >
> > load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
> >
> >                          struct anv_buffer *buffer, uint64_t offset,
> >
> > @@ -1777,7 +1889,22 @@ load_indirect_parameters(struct anv_cmd_buffer
> >
> > *cmd_buffer,
> >
> > uint32_t bo_offset = buffer->offset + offset;
> >
> >
> >
> > emit_lrm(batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
> >
> > - emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
> >
> > +
> >
> > + unsigned view_count = anv_subpass_view_count(cmd_buffer->state.subpass);
> >
> > + if (view_count > 1) {
> >
> > +#if GEN_IS_HASWELL || GEN_GEN >= 8
> >
> > + emit_lrm(batch, CS_GPR(0), bo, bo_offset + 4);
> >
> > + emit_mul_gpr0(batch, view_count);
> >
> > + emit_lrr(batch, GEN7_3DPRIM_INSTANCE_COUNT, CS_GPR(0));
> >
> > +#else
> >
> > + anv_finishme("Multiview + indirect draw requires MI_MATH\n"
> >
> > + "MI_MATH is not supported on Ivy Bridge");
> >
> > + emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset +
> >
> > 4);
> >
> > +#endif
> >
> > + } else {
> >
> > + emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset +
> >
> > 4);
> >
> > + }
> >
> > +
> >
> > emit_lrm(batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
> >
> >
> >
> > if (indexed) {
> >
> > @@ -2531,6 +2658,16 @@ genX(cmd_buffer_set_subpass)(struct anv_cmd_buffer *cmd_buffer,
> >
> >
> >
> > cmd_buffer->state.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
> >
> >
> >
> > + /* Our implementation of VK_KHR_multiview uses instancing to draw
> >
> > the
> >
> > + * different views. If the client asks for instancing, we need
> >
> > to use the
> >
> > + * Instance Data Step Rate to ensure that we repeat the client's
> >
> > + * per-instance data once for each view. Since this bit is in
> >
> > + * VERTEX_BUFFER_STATE on gen7, we need to dirty vertex buffers
> >
> > at the top
> >
> > + * of each subpass.
> >
> > + */
> >
> > + if (GEN_GEN == 7)
> >
> > + cmd_buffer->state.vb_dirty |= ~0;
> >
> > > +
> >
> >
> > Shouldn't we do:
> >
> >
> > if (GEN_GEN == 7 && subpass->view_mask != 0)
> >
> > cmd_buffer->state.vb_dirty |= ~0;
> >
> >
> > So this doesn't affect cases that don't use multiview?
> Yes, we could do that because we're guaranteed that, for each render pass, either all subpass view masks are 0 or all are non-zero. This seems safer though and I doubt flushing vertex buffers at the subpass boundary will hurt anyone.
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20170327/c015c2ad/attachment-0001.html>
More information about the mesa-dev
mailing list