[Mesa-dev] [RFC 11/12] anv/cmd_buffer: Emit instanced draws for multiple views
Jason Ekstrand
jason at jlekstrand.net
Mon Mar 27 14:46:19 UTC 2017
On Mon, Mar 27, 2017 at 5:33 AM, Iago Toral <itoral at igalia.com> wrote:
> On Fri, 2017-03-24 at 16:57 -0700, Jason Ekstrand wrote:
>
> On Fri, Mar 24, 2017 at 5:53 AM, Iago Toral <itoral at igalia.com> wrote:
>
> On Wed, 2017-03-22 at 21:01 -0700, Jason Ekstrand wrote:
> ---
> src/intel/vulkan/anv_private.h | 6 ++
> src/intel/vulkan/genX_cmd_buffer.c | 141
> ++++++++++++++++++++++++++++++++++++-
> src/intel/vulkan/genX_pipeline.c | 10 ++-
> 3 files changed, 152 insertions(+), 5 deletions(-)
>
>
> diff --git a/src/intel/vulkan/anv_private.h
> b/src/intel/vulkan/anv_private.h
> index b402bc3..253dce2 100644
> --- a/src/intel/vulkan/anv_private.h
> +++ b/src/intel/vulkan/anv_private.h
> @@ -1980,6 +1980,12 @@ struct anv_subpass {
> bool has_resolve;
> };
>
> +static inline unsigned
> +anv_subpass_view_count(const struct anv_subpass *subpass)
> +{
> + return MAX2(1, _mesa_bitcount(subpass->view_mask));
> +}
> +
> enum anv_subpass_usage {
> ANV_SUBPASS_USAGE_DRAW = (1 << 0),
> ANV_SUBPASS_USAGE_INPUT = (1 << 1),
> diff --git a/src/intel/vulkan/genX_cmd_buffer.c
> b/src/intel/vulkan/genX_cmd_buffer.c
> index aafe7fd..8c21c33 100644
> --- a/src/intel/vulkan/genX_cmd_buffer.c
> +++ b/src/intel/vulkan/genX_cmd_buffer.c
> @@ -26,6 +26,7 @@
>
> #include "anv_private.h"
> #include "vk_format_info.h"
> +#include "util/vk_util.h"
>
> #include "common/gen_l3_config.h"
> #include "genxml/gen_macros.h"
> @@ -50,6 +51,17 @@ emit_lri(struct anv_batch *batch, uint32_t reg,
> uint32_t imm)
> }
> }
>
> +#if GEN_IS_HASWELL || GEN_GEN >= 8
> +static void
> +emit_lrr(struct anv_batch *batch, uint32_t dst, uint32_t src)
> +{
> + anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
> + lrr.SourceRegisterAddress = src;
> + lrr.DestinationRegisterAddress = dst;
> + }
> +}
> +#endif
> +
> void
> genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer
> *cmd_buffer)
> {
> @@ -1525,7 +1537,13 @@ genX(cmd_buffer_flush_state)(struct
> anv_cmd_buffer *cmd_buffer)
> .MemoryObjectControlState = GENX(MOCS),
> #else
> .BufferAccessType = pipeline->instancing_enable[vb] ?
> INSTANCEDATA : VERTEXDATA,
> - .InstanceDataStepRate = 1,
> + /* Our implementation of VK_KHR_multiview uses
> instancing to draw
> + * the different views. If the client asks for
> instancing, we
> + * need to use the Instance Data Step Rate to ensure
> that we
> + * repeat the client's per-instance data once for each
> view.
> + */
> + .InstanceDataStepRate =
> + _mesa_bitcount(pipeline->subpass->view_mask),
>
> mmm... shouldn't this be:
>
> .InstanceDataStepRate = anv_subpass_view_count(pipeline->subpass);
>
> so that we set this to 1 when multiview is not in use? (view_mask == 0)
>
>
> Good call! You're absolutely right. This line is older than
> anv_subpass_view_count, I think.
>
>
>
> .VertexBufferMemoryObjectControlState = GENX(MOCS),
> #endif
>
> @@ -1715,6 +1733,11 @@ void genX(CmdDraw)(
> if (vs_prog_data->uses_drawid)
> emit_draw_index(cmd_buffer, 0);
>
> + /* Our implementation of VK_KHR_multiview uses instancing to draw
> the
> + * different views. We need to multiply instanceCount by the
> view count.
> + */
> + instanceCount *= anv_subpass_view_count(cmd_buffer-
> >state.subpass);
> +
> anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
> prim.VertexAccessType = SEQUENTIAL;
> prim.PrimitiveTopologyType = pipeline->topology;
> @@ -1748,6 +1771,11 @@ void genX(CmdDrawIndexed)(
> if (vs_prog_data->uses_drawid)
> emit_draw_index(cmd_buffer, 0);
>
> + /* Our implementation of VK_KHR_multiview uses instancing to draw
> the
> + * different views. We need to multiply instanceCount by the
> view count.
> + */
> + instanceCount *= anv_subpass_view_count(cmd_buffer-
> >state.subpass);
> +
> anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
> prim.VertexAccessType = RANDOM;
> prim.PrimitiveTopologyType = pipeline->topology;
> @@ -1767,6 +1795,90 @@ void genX(CmdDrawIndexed)(
> #define GEN7_3DPRIM_START_INSTANCE 0x243C
> #define GEN7_3DPRIM_BASE_VERTEX 0x2440
>
> +/* MI_MATH only exists on Haswell+ */
> +#if GEN_IS_HASWELL || GEN_GEN >= 8
> +
> +#define alu_opcode(v) __gen_uint((v), 20, 31)
> +#define alu_operand1(v) __gen_uint((v), 10, 19)
> +#define alu_operand2(v) __gen_uint((v), 0, 9)
> +#define alu(opcode, operand1, operand2) \
> + alu_opcode(opcode) | alu_operand1(operand1) |
> alu_operand2(operand2)
> +
> +#define OPCODE_NOOP 0x000
> +#define OPCODE_LOAD 0x080
> +#define OPCODE_LOADINV 0x480
> +#define OPCODE_LOAD0 0x081
> +#define OPCODE_LOAD1 0x481
> +#define OPCODE_ADD 0x100
> +#define OPCODE_SUB 0x101
> +#define OPCODE_AND 0x102
> +#define OPCODE_OR 0x103
> +#define OPCODE_XOR 0x104
> +#define OPCODE_STORE 0x180
> +#define OPCODE_STOREINV 0x580
> +
> +#define OPERAND_R0 0x00
> +#define OPERAND_R1 0x01
> +#define OPERAND_R2 0x02
> +#define OPERAND_R3 0x03
> +#define OPERAND_R4 0x04
> +#define OPERAND_SRCA 0x20
> +#define OPERAND_SRCB 0x21
> +#define OPERAND_ACCU 0x31
> +#define OPERAND_ZF 0x32
> +#define OPERAND_CF 0x33
> +
> +#define CS_GPR(n) (0x2600 + (n) * 8)
> +
> +/* Emit dwords to multiply GPR0 by N */
> +static void
> +build_alu_multiply_gpr0(uint32_t *dw, unsigned *dw_count, uint32_t
> N)
> +{
> + VK_OUTARRAY_MAKE(out, dw, dw_count);
> +
> +#define append_alu(opcode, operand1, operand2) \
> + vk_outarray_append(&out, alu_dw) *alu_dw = alu(opcode, operand1,
> operand2)
> +
> + assert(N > 0);
> + unsigned top_bit = 31 - __builtin_clz(N);
> + for (int i = top_bit - 1; i >= 0; i--) {
> + /* We get our initial data in GPR0 and we write the final data
> out to
> + * GPR0 but we use GPR1 as our scratch register.
> + */
> + unsigned src_reg = i == top_bit - 1 ? OPERAND_R0 : OPERAND_R1;
> + unsigned dst_reg = i == 0 ? OPERAND_R0 : OPERAND_R1;
> +
> + /* Shift the current value left by 1 */
> + append_alu(OPCODE_LOAD, OPERAND_SRCA, src_reg);
> + append_alu(OPCODE_LOAD, OPERAND_SRCB, src_reg);
> + append_alu(OPCODE_AND, 0, 0);
> +
> + if (N & (1 << i)) {
> + /* Store ACCU to R1 and add R0 to R1 */
> + append_alu(OPCODE_STORE, OPERAND_R1, OPERAND_ACCU);
> + append_alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R0);
> + append_alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R1);
> + append_alu(OPCODE_AND, 0, 0);
> + }
> +
> + append_alu(OPCODE_STORE, dst_reg, OPERAND_ACCU);
> + }
> +
> +#undef append_alu
> +}
> +
> +static void
> +emit_mul_gpr0(struct anv_batch *batch, uint32_t N)
> +{
> + uint32_t num_dwords;
> > + build_alu_multiply_gpr0(NULL, &num_dwords, N);
>
> I don't get this, it seems that the expectation of this code is
> that build_alu_multiply_gpr0() fills in num_dwords when we call it
> with NULL in the first argument, but I don't see where it does that,
> what am I missing?
>
>
> build_alu_multiply_gpr0 has the same semantics as the Vulkan query
> functions that return arrays. If you call it with NULL, it returns the
> number of dwords required. Otherwise, it assumes num_dwords is the number
> of dwords that it has available to write. I made use of vk_outarray to get
> this behavior. Yeah, it's a bit deceptive, but it worked out really well.
> :-) Feel free to tell me that it's a bit too clever and that I should make
> it simpler.
>
>
> No, it is okay, I understood what the intended behavior was but somehow I
> didn't see clearly how the macros in vk-util achieved that and got
> confused, I looked at it more carefully now and I think it is clear and
> make the implementation easier too.
>
> +
> + uint32_t *dw = anv_batch_emitn(batch, 1 + num_dwords,
> GENX(MI_MATH));
> + build_alu_multiply_gpr0(dw, &num_dwords, N);
> +}
> +
> +#endif /* GEN_IS_HASWELL || GEN_GEN >= 8 */
> +
> static void
> load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
> struct anv_buffer *buffer, uint64_t offset,
> @@ -1777,7 +1889,22 @@ load_indirect_parameters(struct anv_cmd_buffer
> *cmd_buffer,
> uint32_t bo_offset = buffer->offset + offset;
>
> emit_lrm(batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
> - emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
> +
> + unsigned view_count = anv_subpass_view_count(cmd_buffer-
> >state.subpass);
> + if (view_count > 1) {
> +#if GEN_IS_HASWELL || GEN_GEN >= 8
> + emit_lrm(batch, CS_GPR(0), bo, bo_offset + 4);
> + emit_mul_gpr0(batch, view_count);
> + emit_lrr(batch, GEN7_3DPRIM_INSTANCE_COUNT, CS_GPR(0));
> +#else
> + anv_finishme("Multiview + indirect draw requires MI_MATH\n"
> + "MI_MATH is not supported on Ivy Bridge");
> + emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset +
> 4);
> +#endif
> + } else {
> + emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset +
> 4);
> + }
> +
> emit_lrm(batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
>
> if (indexed) {
> @@ -2531,6 +2658,16 @@ genX(cmd_buffer_set_subpass)(struct
> anv_cmd_buffer *cmd_buffer,
>
> cmd_buffer->state.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
>
> + /* Our implementation of VK_KHR_multiview uses instancing to draw
> the
> + * different views. If the client asks for instancing, we need
> to use the
> + * Instance Data Step Rate to ensure that we repeat the client's
> + * per-instance data once for each view. Since this bit is in
> + * VERTEX_BUFFER_STATE on gen7, we need to dirty vertex buffers
> at the top
> + * of each subpass.
> + */
> + if (GEN_GEN == 7)
> + cmd_buffer->state.vb_dirty |= ~0;
> > +
>
> Shouldn't we do:
>
> if (GEN_GEN == 7 && subpass->view_mask != 0)
> cmd_buffer->state.vb_dirty |= ~0;
>
> So this doesn't affect cases that don't use multiview?
>
>
> Yes, we could do that because we're guaranteed that, for each render pass,
> either all subpass view masks are 0 or all are non-zero. This seems safer
> though and I doubt flushing vertex buffers at the subpass boundary will
> hurt anyone.
>
>
> Right, makes sense.
>
> Reviewed-by: Iago Toral Quiroga <itoral at igalia.com>
>
Thanks!
> > /* Perform transitions to the subpass layout before any writes
> > have
> * occurred.
> */
> diff --git a/src/intel/vulkan/genX_pipeline.c
> b/src/intel/vulkan/genX_pipeline.c
> index 85a9e4f..9aaa8a0 100644
> --- a/src/intel/vulkan/genX_pipeline.c
> +++ b/src/intel/vulkan/genX_pipeline.c
> @@ -155,9 +155,13 @@ emit_vertex_input(struct anv_pipeline *pipeline,
> anv_batch_emit(&pipeline->batch, GENX(3DSTATE_VF_INSTANCING),
> vfi) {
> vfi.InstancingEnable = pipeline->instancing_enable[desc-
> >binding];
> vfi.VertexElementIndex = slot;
> - /* Vulkan so far doesn't have an instance divisor, so
> - * this is always 1 (ignored if not instancing). */
> - vfi.InstanceDataStepRate = 1;
> + /* Our implementation of VK_KHX_multiview uses instancing
> to draw
> + * the different views. If the client asks for instancing,
> we
> + * need to use the Instance Data Step Rate to ensure that
> we
> + * repeat the client's per-instance data once for each
> view.
> + */
> + vfi.InstanceDataStepRate =
> > + _mesa_bitcount(pipeline->subpass->view_mask);
>
> Same comment as above, shouldn't this be:
>
> vfi.InstanceDataStepRate = anv_subpass_view_count(pipeline->subpass);
>
>
> +
> + uint32_t *dw = anv_batch_emitn(batch, 1 + num_dwords,
> GENX(MI_MATH));
> + build_alu_multiply_gpr0(dw, &num_dwords, N);
> +}
> +
> +#endif /* GEN_IS_HASWELL || GEN_GEN >= 8 */
> +
> static void
> load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
> struct anv_buffer *buffer, uint64_t offset,
> @@ -1777,7 +1889,22 @@ load_indirect_parameters(struct anv_cmd_buffer
> *cmd_buffer,
> uint32_t bo_offset = buffer->offset + offset;
>
> emit_lrm(batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
> - emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
> +
> + unsigned view_count = anv_subpass_view_count(cmd_buffer-
> >state.subpass);
> + if (view_count > 1) {
> +#if GEN_IS_HASWELL || GEN_GEN >= 8
> + emit_lrm(batch, CS_GPR(0), bo, bo_offset + 4);
> + emit_mul_gpr0(batch, view_count);
> + emit_lrr(batch, GEN7_3DPRIM_INSTANCE_COUNT, CS_GPR(0));
> +#else
> + anv_finishme("Multiview + indirect draw requires MI_MATH\n"
> + "MI_MATH is not supported on Ivy Bridge");
> + emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset +
> 4);
> +#endif
> + } else {
> + emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset +
> 4);
> + }
> +
> emit_lrm(batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
>
> if (indexed) {
> @@ -2531,6 +2658,16 @@ genX(cmd_buffer_set_subpass)(struct
> anv_cmd_buffer *cmd_buffer,
>
> cmd_buffer->state.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
>
> + /* Our implementation of VK_KHR_multiview uses instancing to draw
> the
> + * different views. If the client asks for instancing, we need
> to use the
> + * Instance Data Step Rate to ensure that we repeat the client's
> + * per-instance data once for each view. Since this bit is in
> + * VERTEX_BUFFER_STATE on gen7, we need to dirty vertex buffers
> at the top
> + * of each subpass.
> + */
> + if (GEN_GEN == 7)
> + cmd_buffer->state.vb_dirty |= ~0;
> > +
>
> Shouldn't we do:
>
> if (GEN_GEN == 7 && subpass->view_mask != 0)
> cmd_buffer->state.vb_dirty |= ~0;
>
> So this doesn't affect cases that don't use multiview?
>
>
> Yes, we could do that because we're guaranteed that, for each render pass,
> either all subpass view masks are 0 or all are non-zero. This seems safer
> though and I doubt flushing vertex buffers at the subpass boundary will
> hurt anyone.
>
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20170327/8c85af80/attachment-0001.html>
More information about the mesa-dev
mailing list