<html><head></head><body><div>On Fri, 2017-03-24 at 16:57 -0700, Jason Ekstrand wrote:</div><blockquote type="cite"><div dir="ltr"><div class="gmail_extra"><div class="gmail_quote">On Fri, Mar 24, 2017 at 5:53 AM, Iago Toral <<a href="mailto:itoral@igalia.com" target="_blank">itoral@igalia.com</a>> wrote: <blockquote type="cite"><div class="HOEnZb"><div class="h5">On Wed, 2017-03-22 at 21:01 -0700, Jason Ekstrand wrote: --- src/intel/vulkan/anv_private.h | 6 ++ src/intel/vulkan/genX_cmd_buffer.c | 141 ++++++++++++++++++++++++++++++++++++- src/intel/vulkan/genX_pipeline.c | 10 ++- 3 files changed, 152 insertions(+), 5 deletions(-) diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index b402bc3..253dce2 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -1980,6 +1980,12 @@ struct anv_subpass { bool has_resolve; }; +static inline unsigned +anv_subpass_view_count(const struct anv_subpass *subpass) +{ + return MAX2(1, _mesa_bitcount(subpass->view_mask)); +} + enum anv_subpass_usage { ANV_SUBPASS_USAGE_DRAW = (1 << 0), ANV_SUBPASS_USAGE_INPUT = (1 << 1), diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index aafe7fd..8c21c33 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -26,6 +26,7 @@ #include "anv_private.h" #include "vk_format_info.h" +#include "util/vk_util.h" #include "common/gen_l3_config.h" #include "genxml/gen_macros.h" @@ -50,6 +51,17 @@ emit_lri(struct anv_batch *batch, uint32_t reg, uint32_t imm) } } +#if GEN_IS_HASWELL || GEN_GEN >= 8 +static void +emit_lrr(struct anv_batch *batch, uint32_t dst, uint32_t src) +{ + anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_REG), lrr) { + lrr.SourceRegisterAddress = src; + lrr.DestinationRegisterAddress = dst; + } +} +#endif + void genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer) { @@ -1525,7 +1537,13 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer) .MemoryObjectControlState = GENX(MOCS), #else .BufferAccessType = pipeline->instancing_enable[vb] ? INSTANCEDATA : VERTEXDATA, - .InstanceDataStepRate = 1, + /* Our implementation of VK_KHR_multiview uses instancing to draw + * the different views. If the client asks for instancing, we + * need to use the Instance Data Step Rate to ensure that we + * repeat the client's per-instance data once for each view. + */ + .InstanceDataStepRate = + _mesa_bitcount(pipeline->subpass->view_mask), </div></div>mmm... shouldn't this be: .InstanceDataStepRate = anv_subpass_view_count(pipeline->subpass); so that we set this to 1 when multiview is not in use? (view_mask == 0) </blockquote><div> </div><div>Good call! You're absolutely right. This line is older than anv_subpass_view_count, I think. </div><div> </div><blockquote type="cite"> <div><div class="h5"> .VertexBufferMemoryObjectControlState = GENX(MOCS), #endif @@ -1715,6 +1733,11 @@ void genX(CmdDraw)( if (vs_prog_data->uses_drawid) emit_draw_index(cmd_buffer, 0); + /* Our implementation of VK_KHR_multiview uses instancing to draw the + * different views. We need to multiply instanceCount by the view count. + */ + instanceCount *= anv_subpass_view_count(cmd_buffer- >state.subpass); + anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { prim.VertexAccessType = SEQUENTIAL; prim.PrimitiveTopologyType = pipeline->topology; @@ -1748,6 +1771,11 @@ void genX(CmdDrawIndexed)( if (vs_prog_data->uses_drawid) emit_draw_index(cmd_buffer, 0); + /* Our implementation of VK_KHR_multiview uses instancing to draw the + * different views. We need to multiply instanceCount by the view count. + */ + instanceCount *= anv_subpass_view_count(cmd_buffer- >state.subpass); + anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) { prim.VertexAccessType = RANDOM; prim.PrimitiveTopologyType = pipeline->topology; @@ -1767,6 +1795,90 @@ void genX(CmdDrawIndexed)( #define GEN7_3DPRIM_START_INSTANCE 0x243C #define GEN7_3DPRIM_BASE_VERTEX 0x2440 +/* MI_MATH only exists on Haswell+ */ +#if GEN_IS_HASWELL || GEN_GEN >= 8 + +#define alu_opcode(v) __gen_uint((v), 20, 31) +#define alu_operand1(v) __gen_uint((v), 10, 19) +#define alu_operand2(v) __gen_uint((v), 0, 9) +#define alu(opcode, operand1, operand2) \ + alu_opcode(opcode) | alu_operand1(operand1) | alu_operand2(operand2) + +#define OPCODE_NOOP 0x000 +#define OPCODE_LOAD 0x080 +#define OPCODE_LOADINV 0x480 +#define OPCODE_LOAD0 0x081 +#define OPCODE_LOAD1 0x481 +#define OPCODE_ADD 0x100 +#define OPCODE_SUB 0x101 +#define OPCODE_AND 0x102 +#define OPCODE_OR 0x103 +#define OPCODE_XOR 0x104 +#define OPCODE_STORE 0x180 +#define OPCODE_STOREINV 0x580 + +#define OPERAND_R0 0x00 +#define OPERAND_R1 0x01 +#define OPERAND_R2 0x02 +#define OPERAND_R3 0x03 +#define OPERAND_R4 0x04 +#define OPERAND_SRCA 0x20 +#define OPERAND_SRCB 0x21 +#define OPERAND_ACCU 0x31 +#define OPERAND_ZF 0x32 +#define OPERAND_CF 0x33 + +#define CS_GPR(n) (0x2600 + (n) * 8) + +/* Emit dwords to multiply GPR0 by N */ +static void +build_alu_multiply_gpr0(uint32_t *dw, unsigned *dw_count, uint32_t N) +{ + VK_OUTARRAY_MAKE(out, dw, dw_count); + +#define append_alu(opcode, operand1, operand2) \ + vk_outarray_append(&out, alu_dw) *alu_dw = alu(opcode, operand1, operand2) + + assert(N > 0); + unsigned top_bit = 31 - __builtin_clz(N); + for (int i = top_bit - 1; i >= 0; i--) { + /* We get our initial data in GPR0 and we write the final data out to + * GPR0 but we use GPR1 as our scratch register. + */ + unsigned src_reg = i == top_bit - 1 ? OPERAND_R0 : OPERAND_R1; + unsigned dst_reg = i == 0 ? OPERAND_R0 : OPERAND_R1; + + /* Shift the current value left by 1 */ + append_alu(OPCODE_LOAD, OPERAND_SRCA, src_reg); + append_alu(OPCODE_LOAD, OPERAND_SRCB, src_reg); + append_alu(OPCODE_AND, 0, 0); + + if (N & (1 << i)) { + /* Store ACCU to R1 and add R0 to R1 */ + append_alu(OPCODE_STORE, OPERAND_R1, OPERAND_ACCU); + append_alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R0); + append_alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R1); + append_alu(OPCODE_AND, 0, 0); + } + + append_alu(OPCODE_STORE, dst_reg, OPERAND_ACCU); + } + +#undef append_alu +} + +static void +emit_mul_gpr0(struct anv_batch *batch, uint32_t N) +{ + uint32_t num_dwords; > + build_alu_multiply_gpr0(NULL, &num_dwords, N); </div></div>I don't get this, it seems that the expectation of this code is that build_alu_multiply_gpr0() fills in num_dwords when we call it with NULL in the first argument, but I don't see where it does that, what am I missing? </blockquote><div> </div><div>build_alu_multiply_gpr0 has the same semantics as the Vulkan query functions that return arrays. If you call it with NULL, it returns the number of dwords required. Otherwise, it assumes num_dwords is the number of dwords that it has available to write. I made use of vk_outarray to get this behavior. Yeah, it's a bit deceptive, but it worked out really well. :-) Feel free to tell me that it's a bit too clever and that I should make it simpler. </div></div></div></div></blockquote><div> </div><div>No, it is okay, I understood what the intended behavior was but somehow I didn't see clearly how the macros in vk-util achieved that and got confused, I looked at it more carefully now and I think it is clear and make the implementation easier too.</div><div> </div><blockquote type="cite"><div dir="ltr"><div class="gmail_extra"><div class="gmail_quote"><blockquote type="cite"><div><div class="h5"> + + uint32_t *dw = anv_batch_emitn(batch, 1 + num_dwords, GENX(MI_MATH)); + build_alu_multiply_gpr0(dw, &num_dwords, N); +} + +#endif /* GEN_IS_HASWELL || GEN_GEN >= 8 */ + static void load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer, struct anv_buffer *buffer, uint64_t offset, @@ -1777,7 +1889,22 @@ load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer, uint32_t bo_offset = buffer->offset + offset; emit_lrm(batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset); - emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4); + + unsigned view_count = anv_subpass_view_count(cmd_buffer- >state.subpass); + if (view_count > 1) { +#if GEN_IS_HASWELL || GEN_GEN >= 8 + emit_lrm(batch, CS_GPR(0), bo, bo_offset + 4); + emit_mul_gpr0(batch, view_count); + emit_lrr(batch, GEN7_3DPRIM_INSTANCE_COUNT, CS_GPR(0)); +#else + anv_finishme("Multiview + indirect draw requires MI_MATH\n" + "MI_MATH is not supported on Ivy Bridge"); + emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4); +#endif + } else { + emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4); + } + emit_lrm(batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8); if (indexed) { @@ -2531,6 +2658,16 @@ genX(cmd_buffer_set_subpass)(struct anv_cmd_buffer *cmd_buffer, cmd_buffer->state.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS; + /* Our implementation of VK_KHR_multiview uses instancing to draw the + * different views. If the client asks for instancing, we need to use the + * Instance Data Step Rate to ensure that we repeat the client's + * per-instance data once for each view. Since this bit is in + * VERTEX_BUFFER_STATE on gen7, we need to dirty vertex buffers at the top + * of each subpass. + */ + if (GEN_GEN == 7) + cmd_buffer->state.vb_dirty |= ~0; > + </div></div>Shouldn't we do: if (GEN_GEN == 7 && subpass->view_mask != 0) cmd_buffer->state.vb_dirty |= ~0; So this doesn't affect cases that don't use multiview? </blockquote><div> </div>Yes, we could do that because we're guaranteed that, for each render pass, either all subpass view masks are 0 or all are non-zero. This seems safer though and I doubt flushing vertex buffers at the subpass boundary will hurt anyone. </div></div></div></blockquote><div> </div><div>Right, makes sense.</div><div> </div><div>Reviewed-by: Iago Toral Quiroga <<a href="itoral@igalia.com>">itoral@igalia.com></a></div><div> </div><blockquote type="cite"><div dir="ltr"><div class="gmail_extra"><div class="gmail_quote"><blockquote type="cite"><div><div class="h5"> > /* Perform transitions to the subpass layout before any writes > have * occurred. */ diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c index 85a9e4f..9aaa8a0 100644 --- a/src/intel/vulkan/genX_pipeline.c +++ b/src/intel/vulkan/genX_pipeline.c @@ -155,9 +155,13 @@ emit_vertex_input(struct anv_pipeline *pipeline, anv_batch_emit(&pipeline->batch, GENX(3DSTATE_VF_INSTANCING), vfi) { vfi.InstancingEnable = pipeline->instancing_enable[desc- >binding]; vfi.VertexElementIndex = slot; - /* Vulkan so far doesn't have an instance divisor, so - * this is always 1 (ignored if not instancing). */ - vfi.InstanceDataStepRate = 1; + /* Our implementation of VK_KHX_multiview uses instancing to draw + * the different views. If the client asks for instancing, we + * need to use the Instance Data Step Rate to ensure that we + * repeat the client's per-instance data once for each view. + */ + vfi.InstanceDataStepRate = > + _mesa_bitcount(pipeline->subpass->view_mask); </div></div>Same comment as above, shouldn't this be: vfi.InstanceDataStepRate = anv_subpass_view_count(pipeline->subpass); </blockquote><div> </div><div class="gmail_quote"><blockquote type="cite"><div><div class="h5"> + + uint32_t *dw = anv_batch_emitn(batch, 1 + num_dwords, GENX(MI_MATH)); + build_alu_multiply_gpr0(dw, &num_dwords, N); +} + +#endif /* GEN_IS_HASWELL || GEN_GEN >= 8 */ + static void load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer, struct anv_buffer *buffer, uint64_t offset, @@ -1777,7 +1889,22 @@ load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer, uint32_t bo_offset = buffer->offset + offset; emit_lrm(batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset); - emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4); + + unsigned view_count = anv_subpass_view_count(cmd_buffer- >state.subpass); + if (view_count > 1) { +#if GEN_IS_HASWELL || GEN_GEN >= 8 + emit_lrm(batch, CS_GPR(0), bo, bo_offset + 4); + emit_mul_gpr0(batch, view_count); + emit_lrr(batch, GEN7_3DPRIM_INSTANCE_COUNT, CS_GPR(0)); +#else + anv_finishme("Multiview + indirect draw requires MI_MATH\n" + "MI_MATH is not supported on Ivy Bridge"); + emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4); +#endif + } else { + emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4); + } + emit_lrm(batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8); if (indexed) { @@ -2531,6 +2658,16 @@ genX(cmd_buffer_set_subpass)(struct anv_cmd_buffer *cmd_buffer, cmd_buffer->state.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS; + /* Our implementation of VK_KHR_multiview uses instancing to draw the + * different views. If the client asks for instancing, we need to use the + * Instance Data Step Rate to ensure that we repeat the client's + * per-instance data once for each view. Since this bit is in + * VERTEX_BUFFER_STATE on gen7, we need to dirty vertex buffers at the top + * of each subpass. + */ + if (GEN_GEN == 7) + cmd_buffer->state.vb_dirty |= ~0; > + </div></div>Shouldn't we do: if (GEN_GEN == 7 && subpass->view_mask != 0) cmd_buffer->state.vb_dirty |= ~0; So this doesn't affect cases that don't use multiview? </blockquote><div> </div>Yes, we could do that because we're guaranteed that, for each render pass, either all subpass view masks are 0 or all are non-zero. This seems safer though and I doubt flushing vertex buffers at the subpass boundary will hurt anyone. </div><div> </div></div></div></div> </blockquote><div> </div></body></html>