<div dir="ltr"><div class="gmail_quote"><div dir="ltr">On Wed, Oct 17, 2018 at 6:59 AM Danylo Piliaiev &lt;<a href="mailto:danylo.piliaiev@gmail.com">danylo.piliaiev@gmail.com</a>&gt; wrote:<br></div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">Signed-off-by: Danylo Piliaiev &lt;<a href="mailto:danylo.piliaiev@globallogic.com" target="_blank">danylo.piliaiev@globallogic.com</a>&gt;<br>
---<br>
 src/intel/vulkan/anv_extensions.py |   1 +<br>
 src/intel/vulkan/genX_cmd_buffer.c | 155 +++++++++++++++++++++++++++++<br>
 2 files changed, 156 insertions(+)<br>
<br>
diff --git a/src/intel/vulkan/anv_extensions.py b/src/intel/vulkan/anv_extensions.py<br>
index d4915c9501..7f44da6648 100644<br>
--- a/src/intel/vulkan/anv_extensions.py<br>
+++ b/src/intel/vulkan/anv_extensions.py<br>
@@ -113,6 +113,7 @@ EXTENSIONS = [<br>
     Extension('VK_KHR_xlib_surface',                      6, 'VK_USE_PLATFORM_XLIB_KHR'),<br>
     Extension('VK_KHR_multiview',                         1, True),<br>
     Extension('VK_KHR_display',                          23, 'VK_USE_PLATFORM_DISPLAY_KHR'),<br>
+    Extension('VK_KHR_draw_indirect_count',               1, 'device->info.gen >= 8 || device->info.is_haswell'),<br>
     Extension('VK_EXT_acquire_xlib_display',              1, 'VK_USE_PLATFORM_XLIB_XRANDR_EXT'),<br>
     Extension('VK_EXT_debug_report',                      8, True),<br>
     Extension('VK_EXT_direct_mode_display',               1, 'VK_USE_PLATFORM_DISPLAY_KHR'),<br>
diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c<br>
index 43a02f2256..d7b94efd19 100644<br>
--- a/src/intel/vulkan/genX_cmd_buffer.c<br>
+++ b/src/intel/vulkan/genX_cmd_buffer.c<br>
@@ -2982,6 +2982,161 @@ void genX(CmdDrawIndexedIndirect)(<br>
    }<br>
 }<br>
<br>
+#if GEN_IS_HASWELL || GEN_GEN >= 8<br>
+static void<br>
+emit_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,<br>
+                          struct anv_address count_address,<br>
+                          uint32_t draw_index)<br>
+{<br>
+   /* Upload the current draw count from the draw parameters buffer to<br>
+    * MI_PREDICATE_SRC0.<br>
+    */<br>
+   emit_lrr(&cmd_buffer->batch, MI_PREDICATE_SRC0, CS_GPR(MI_ALU_REG14));<br></blockquote><div><br></div><div>Do we also need to set MI_PREDICATE_SRC0 + 4 to 0?  I suspect we do.</div><div><br></div><div>Also, we can likely save some batch space if we have a "prepare" function which sets MI_PREDICATE_SRC0, SRC0 + 4, and SRC1 + 4 once, and then only emit one LOAD_REGISTER_IMM and the MI_PREDICATE per draw.  For lots of primitives, those extra three MI_LOAD_REGISTER_* commands will add up.<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+<br>
+   /* Upload the index of the current primitive to MI_PREDICATE_SRC1. */<br>
+   emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1, draw_index);<br>
+   emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1 + 4, 0);<br>
+<br>
+   if (draw_index == 0) {<br>
+       anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {<br>
+          mip.LoadOperation    = LOAD_LOADINV;<br>
+          mip.CombineOperation = COMBINE_SET;<br>
+          mip.CompareOperation = COMPARE_SRCS_EQUAL;<br>
+       }<br>
+   } else {<br>
+       /* While draw_index < draw_count the predicate's result will be<br>
+        *  (draw_index == draw_count) ^ TRUE = TRUE<br>
+        * When draw_index == draw_count the result is<br>
+        *  (TRUE) ^ TRUE = FALSE<br>
+        * After this all results will be:<br>
+        *  (FALSE) ^ FALSE = FALSE<br>
+        */<br>
+       anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {<br>
+          mip.LoadOperation    = LOAD_LOAD;<br>
+          mip.CombineOperation = COMBINE_XOR;<br>
+          mip.CompareOperation = COMPARE_SRCS_EQUAL;<br>
+       }<br>
+   }<br>
+}<br>
+<br>
+void genX(CmdDrawIndirectCountKHR)(<br>
+    VkCommandBuffer                             commandBuffer,<br>
+    VkBuffer                                    _buffer,<br>
+    VkDeviceSize                                offset,<br>
+    VkBuffer                                    _countBuffer,<br>
+    VkDeviceSize                                countBufferOffset,<br>
+    uint32_t                                    maxDrawCount,<br>
+    uint32_t                                    stride)<br>
+{<br>
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);<br>
+   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);<br>
+   ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);<br>
+   struct anv_cmd_state *cmd_state = &cmd_buffer->state;<br>
+   struct anv_pipeline *pipeline = cmd_state->gfx.base.pipeline;<br>
+   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);<br>
+<br>
+   if (anv_batch_has_error(&cmd_buffer->batch))<br>
+      return;<br>
+<br>
+   genX(cmd_buffer_flush_state)(cmd_buffer);<br>
+<br>
+   struct anv_address count_address =<br>
+      anv_address_add(count_buffer->address, countBufferOffset);<br>
+<br>
+   /* Needed to ensure the memory is coherent for the MI_LOAD_REGISTER_MEM<br>
+    * command when loading the values into the predicate source registers.<br>
+    */<br>
+   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {<br>
+     pc.PipeControlFlushEnable = true;<br>
+   }<br></blockquote><div><br></div><div>Have you seen this be an actual problem?  If not, why do you believe the flush is needed?  A documentation citation would be nice.<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+<br>
+   emit_lrm(&cmd_buffer->batch, CS_GPR(MI_ALU_REG14), count_address);<br>
+   emit_lri(&cmd_buffer->batch, CS_GPR(MI_ALU_REG14) + 4, 0);<br>
+<br>
+   for (uint32_t i = 0; i < maxDrawCount; i++) {<br>
+      struct anv_address draw = anv_address_add(buffer->address, offset);<br>
+<br>
+      emit_draw_count_predicate(cmd_buffer, count_address, i);<br>
+<br>
+      if (vs_prog_data->uses_firstvertex ||<br>
+          vs_prog_data->uses_baseinstance)<br>
+         emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8));<br>
+      if (vs_prog_data->uses_drawid)<br>
+         emit_draw_index(cmd_buffer, i);<br>
+<br>
+      load_indirect_parameters(cmd_buffer, draw, false);<br>
+<br>
+      anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {<br>
+         prim.IndirectParameterEnable  = true;<br>
+         prim.PredicateEnable          = true;<br>
+         prim.VertexAccessType         = SEQUENTIAL;<br>
+         prim.PrimitiveTopologyType    = pipeline->topology;<br>
+      }<br>
+<br>
+      offset += stride;<br>
+   }<br>
+}<br>
+<br>
+void genX(CmdDrawIndexedIndirectCountKHR)(<br>
+    VkCommandBuffer                             commandBuffer,<br>
+    VkBuffer                                    _buffer,<br>
+    VkDeviceSize                                offset,<br>
+    VkBuffer                                    _countBuffer,<br>
+    VkDeviceSize                                countBufferOffset,<br>
+    uint32_t                                    maxDrawCount,<br>
+    uint32_t                                    stride)<br>
+{<br>
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);<br>
+   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);<br>
+   ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);<br>
+   struct anv_cmd_state *cmd_state = &cmd_buffer->state;<br>
+   struct anv_pipeline *pipeline = cmd_state->gfx.base.pipeline;<br>
+   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);<br>
+<br>
+   if (anv_batch_has_error(&cmd_buffer->batch))<br>
+      return;<br>
+<br>
+   genX(cmd_buffer_flush_state)(cmd_buffer);<br>
+<br>
+   struct anv_address count_address =<br>
+      anv_address_add(count_buffer->address, countBufferOffset);<br>
+<br>
+   /* Needed to ensure the memory is coherent for the MI_LOAD_REGISTER_MEM<br>
+    * command when loading the values into the predicate source registers.<br>
+    */<br>
+   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {<br>
+     pc.PipeControlFlushEnable = true;<br>
+   }<br>
+<br>
+   emit_lrm(&cmd_buffer->batch, CS_GPR(MI_ALU_REG14), count_address);<br>
+   emit_lri(&cmd_buffer->batch, CS_GPR(MI_ALU_REG14) + 4, 0);<br>
+<br>
+   for (uint32_t i = 0; i < maxDrawCount; i++) {<br>
+      struct anv_address draw = anv_address_add(buffer->address, offset);<br>
+<br>
+      emit_draw_count_predicate(cmd_buffer, count_address, i);<br>
+<br>
+      /* TODO: We need to stomp base vertex to 0 somehow */<br>
+      if (vs_prog_data->uses_firstvertex ||<br>
+          vs_prog_data->uses_baseinstance)<br>
+         emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12));<br>
+      if (vs_prog_data->uses_drawid)<br>
+         emit_draw_index(cmd_buffer, i);<br>
+<br>
+      load_indirect_parameters(cmd_buffer, draw, true);<br>
+<br>
+      anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {<br>
+         prim.IndirectParameterEnable  = true;<br>
+         prim.PredicateEnable          = true;<br>
+         prim.VertexAccessType         = RANDOM;<br>
+         prim.PrimitiveTopologyType    = pipeline->topology;<br>
+      }<br>
+<br>
+      offset += stride;<br>
+   }<br>
+}<br>
+#endif<br>
+<br>
 static VkResult<br>
 flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer)<br>
 {<br>
-- <br>
2.18.0<br>
<br>
</blockquote></div></div>