<div dir="ltr"><div class="gmail_quote"><div dir="ltr">On Mon, Oct 15, 2018 at 8:34 AM Lionel Landwerlin <<a href="mailto:lionel.g.landwerlin@intel.com">lionel.g.landwerlin@intel.com</a>> wrote:<br></div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">On 13/10/2018 14:09, Jason Ekstrand wrote:<br>
> ---<br>
>   src/intel/vulkan/anv_cmd_buffer.c  |  29 +++++++<br>
>   src/intel/vulkan/anv_device.c      |  24 ++++++<br>
>   src/intel/vulkan/anv_extensions.py |   2 +-<br>
>   src/intel/vulkan/anv_pipeline.c    |  10 ++-<br>
>   src/intel/vulkan/anv_private.h     |  13 +++<br>
>   src/intel/vulkan/genX_cmd_buffer.c | 125 +++++++++++++++++++++++++++++<br>
>   src/intel/vulkan/genX_pipeline.c   | 122 ++++++++++++++++++++++++++++<br>
>   7 files changed, 323 insertions(+), 2 deletions(-)<br>
<br>
<br>
...<br>
<br>
<br>
>      uint32_t                                     topology;<br>
>   <br>
> diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c<br>
> index c3a7e5c83c3..90469abbf21 100644<br>
> --- a/src/intel/vulkan/genX_cmd_buffer.c<br>
> +++ b/src/intel/vulkan/genX_cmd_buffer.c<br>
> @@ -2571,6 +2571,30 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)<br>
>   <br>
>      cmd_buffer->state.gfx.vb_dirty &= ~vb_emit;<br>
>   <br>
> +#if GEN_GEN >= 8<br>
> +   if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_XFB_ENABLE) {<br>
> +      /* We don't need any per-buffer dirty tracking because you're not<br>
> +       * allowed to bind different XFB buffers while XFB is enabled.<br>
> +       */<br>
> +      for (unsigned idx = 0; idx < MAX_XFB_BUFFERS; idx++) {<br>
> +         struct anv_xfb_binding *xfb = &cmd_buffer->state.xfb_bindings[idx];<br>
> +         anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) {<br>
> +            sob.SOBufferIndex = idx;<br>
> +<br>
> +            if (cmd_buffer->state.xfb_enabled && xfb->buffer) {<br>
> +               sob.SOBufferEnable = true;<br>
> +               sob.SOBufferMOCS = cmd_buffer->device->default_mocs,<br>
> +               sob.StreamOffsetWriteEnable = false;<br>
> +               sob.SurfaceBaseAddress = anv_address_add(xfb->buffer->address,<br>
> +                                                        xfb->offset);<br>
> +               /* Size is in DWords - 1 */<br>
> +               sob.SurfaceSize = xfb->size / 4 - 1;<br>
> +            }<br>
> +         }<br>
<br>
<br>
Apparently documentation says we need a PIPE_CONTROL with CS Stall bit <br>
set after 3DSTATE_SO_BUFFER.<br></blockquote><div><br></div><div>So it does.  I've added it for GEN_GEN >= 10.</div><div><br></div><div>--Jason<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
> +      }<br>
> +   }<br>
> +#endif<br>
> +<br>
>      if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {<br>
>         anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch);<br>
>   <br>
> @@ -2970,6 +2994,107 @@ void genX(CmdDrawIndexedIndirect)(<br>
>      }<br>
>   }<br>
>   <br>
> +void genX(CmdBeginTransformFeedbackEXT)(<br>
> +    VkCommandBuffer                             commandBuffer,<br>
> +    uint32_t                                    firstCounterBuffer,<br>
> +    uint32_t                                    counterBufferCount,<br>
> +    const VkBuffer*                             pCounterBuffers,<br>
> +    const VkDeviceSize*                         pCounterBufferOffsets)<br>
> +{<br>
> +   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);<br>
> +<br>
> +   assert(firstCounterBuffer < MAX_XFB_BUFFERS);<br>
> +   assert(counterBufferCount < MAX_XFB_BUFFERS);<br>
> +   assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);<br>
> +<br>
> +   /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:<br>
> +    *<br>
> +    *    "Software must ensure that no HW stream output operations can be in<br>
> +    *    process or otherwise pending at the point that the MI_LOAD/STORE<br>
> +    *    commands are processed. This will likely require a pipeline flush."<br>
> +    */<br>
> +   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;<br>
> +   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);<br>
> +<br>
> +   for (uint32_t idx = 0; idx < MAX_XFB_BUFFERS; idx++) {<br>
> +      /* If we have a counter buffer, this is a resume so we need to load the<br>
> +       * value into the streamout offset register.  Otherwise, this is a begin<br>
> +       * and we need to reset it to zero.<br>
> +       */<br>
> +      if (pCounterBuffers &&<br>
> +          idx >= firstCounterBuffer &&<br>
> +          idx - firstCounterBuffer < counterBufferCount &&<br>
> +          pCounterBuffers[idx - firstCounterBuffer] != VK_NULL_HANDLE) {<br>
> +         uint32_t cb_idx = idx - firstCounterBuffer;<br>
> +         ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);<br>
> +         uint64_t offset = pCounterBufferOffsets ?<br>
> +                           pCounterBufferOffsets[cb_idx] : 0;<br>
> +<br>
> +         anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {<br>
> +            lrm.RegisterAddress  = GENX(SO_WRITE_OFFSET0_num) + idx * 4;<br>
> +            lrm.MemoryAddress    = anv_address_add(counter_buffer->address,<br>
> +                                                   offset);<br>
> +         }<br>
> +      } else {<br>
> +         anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {<br>
> +            lri.RegisterOffset   = GENX(SO_WRITE_OFFSET0_num) + idx * 4;<br>
> +            lri.DataDWord        = 0;<br>
> +         }<br>
> +      }<br>
> +   }<br>
> +<br>
> +   cmd_buffer->state.xfb_enabled = true;<br>
> +   cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;<br>
> +}<br>
> +<br>
> +void genX(CmdEndTransformFeedbackEXT)(<br>
> +    VkCommandBuffer                             commandBuffer,<br>
> +    uint32_t                                    firstCounterBuffer,<br>
> +    uint32_t                                    counterBufferCount,<br>
> +    const VkBuffer*                             pCounterBuffers,<br>
> +    const VkDeviceSize*                         pCounterBufferOffsets)<br>
> +{<br>
> +   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);<br>
> +<br>
> +   assert(firstCounterBuffer < MAX_XFB_BUFFERS);<br>
> +   assert(counterBufferCount < MAX_XFB_BUFFERS);<br>
> +   assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);<br>
> +<br>
> +   /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:<br>
> +    *<br>
> +    *    "Software must ensure that no HW stream output operations can be in<br>
> +    *    process or otherwise pending at the point that the MI_LOAD/STORE<br>
> +    *    commands are processed. This will likely require a pipeline flush."<br>
> +    */<br>
> +   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;<br>
> +   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);<br>
> +<br>
> +   for (uint32_t cb_idx = 0; cb_idx < counterBufferCount; cb_idx++) {<br>
> +      unsigned idx = firstCounterBuffer + cb_idx;<br>
> +<br>
> +      /* If we have a counter buffer, store the current value of the streamout<br>
> +       * offset register into it so that a later begin can load it back and<br>
> +       * resume from there.  Without a counter buffer there is nothing to save.<br>
> +       */<br>
> +      if (pCounterBuffers &&<br>
> +          cb_idx < counterBufferCount &&<br>
> +          pCounterBuffers[cb_idx] != VK_NULL_HANDLE) {<br>
> +         ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);<br>
> +         uint64_t offset = pCounterBufferOffsets ?<br>
> +                           pCounterBufferOffsets[cb_idx] : 0;<br>
> +<br>
> +         anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {<br>
> +            srm.MemoryAddress    = anv_address_add(counter_buffer->address,<br>
> +                                                   offset);<br>
> +            srm.RegisterAddress  = GENX(SO_WRITE_OFFSET0_num) + idx * 4;<br>
> +         }<br>
> +      }<br>
> +   }<br>
> +<br>
> +   cmd_buffer->state.xfb_enabled = false;<br>
> +   cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;<br>
> +}<br>
> +<br>
>   static VkResult<br>
>   flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer)<br>
>   {<br>
> diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c<br>
> index 9595a7133ae..5dd78a18fb5 100644<br>
> --- a/src/intel/vulkan/genX_pipeline.c<br>
> +++ b/src/intel/vulkan/genX_pipeline.c<br>
> @@ -28,6 +28,7 @@<br>
>   <br>
>   #include "common/gen_l3_config.h"<br>
>   #include "common/gen_sample_positions.h"<br>
> +#include "nir/nir_xfb_info.h"<br>
>   #include "vk_util.h"<br>
>   #include "vk_format_info.h"<br>
>   <br>
> @@ -1097,9 +1098,130 @@ static void<br>
>   emit_3dstate_streamout(struct anv_pipeline *pipeline,<br>
>                          const VkPipelineRasterizationStateCreateInfo *rs_info)<br>
>   {<br>
> +#if GEN_GEN >= 8<br>
> +   const struct brw_vue_prog_data *prog_data =<br>
> +      anv_pipeline_get_last_vue_prog_data(pipeline);<br>
> +   const struct brw_vue_map *vue_map = &prog_data->vue_map;<br>
> +#endif<br>
> +<br>
> +   nir_xfb_info *xfb_info;<br>
> +   if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY))<br>
> +      xfb_info = pipeline->shaders[MESA_SHADER_GEOMETRY]->xfb_info;<br>
> +   else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))<br>
> +      xfb_info = pipeline->shaders[MESA_SHADER_TESS_EVAL]->xfb_info;<br>
> +   else<br>
> +      xfb_info = pipeline->shaders[MESA_SHADER_VERTEX]->xfb_info;<br>
> +<br>
> +   pipeline->xfb_used = xfb_info ? xfb_info->buffers_written : 0;<br>
> +<br>
>      anv_batch_emit(&pipeline->batch, GENX(3DSTATE_STREAMOUT), so) {<br>
>         so.RenderingDisable = rs_info->rasterizerDiscardEnable;<br>
> +<br>
> +#if GEN_GEN >= 8<br>
> +      if (xfb_info) {<br>
> +         so.SOFunctionEnable = true;<br>
> +<br>
> +         const VkPipelineRasterizationStateStreamCreateInfoEXT *stream_info =<br>
> +            vk_find_struct_const(rs_info, PIPELINE_RASTERIZATION_STATE_STREAM_CREATE_INFO_EXT);<br>
> +         so.RenderStreamSelect = stream_info ?<br>
> +                                 stream_info->rasterizationStream : 0;<br>
> +<br>
> +         so.Buffer0SurfacePitch = xfb_info->strides[0];<br>
> +         so.Buffer1SurfacePitch = xfb_info->strides[1];<br>
> +         so.Buffer2SurfacePitch = xfb_info->strides[2];<br>
> +         so.Buffer3SurfacePitch = xfb_info->strides[3];<br>
> +<br>
> +         int urb_entry_read_offset = 0;<br>
> +         int urb_entry_read_length =<br>
> +            (prog_data->vue_map.num_slots + 1) / 2 - urb_entry_read_offset;<br>
> +<br>
> +         /* We always read the whole vertex.  This could be reduced at some<br>
> +          * point by reading less and offsetting the register index in the<br>
> +          * SO_DECLs.<br>
> +          */<br>
> +         so.Stream0VertexReadOffset = urb_entry_read_offset;<br>
> +         so.Stream0VertexReadLength = urb_entry_read_length - 1;<br>
> +         so.Stream1VertexReadOffset = urb_entry_read_offset;<br>
> +         so.Stream1VertexReadLength = urb_entry_read_length - 1;<br>
> +         so.Stream2VertexReadOffset = urb_entry_read_offset;<br>
> +         so.Stream2VertexReadLength = urb_entry_read_length - 1;<br>
> +         so.Stream3VertexReadOffset = urb_entry_read_offset;<br>
> +         so.Stream3VertexReadLength = urb_entry_read_length - 1;<br>
> +      }<br>
> +#endif /* GEN_GEN >= 8 */<br>
> +   }<br>
> +<br>
> +#if GEN_GEN >= 8<br>
> +   if (xfb_info) {<br>
> +      struct GENX(SO_DECL) so_decl[MAX_XFB_STREAMS][128];<br>
> +      int next_offset[MAX_XFB_BUFFERS] = {0, 0, 0, 0};<br>
> +      int decls[MAX_XFB_STREAMS] = {0, 0, 0, 0};<br>
> +<br>
> +      memset(so_decl, 0, sizeof(so_decl));<br>
> +<br>
> +      for (unsigned i = 0; i < xfb_info->output_count; i++) {<br>
> +         const nir_xfb_output_info *output = &xfb_info->outputs[i];<br>
> +         unsigned buffer = output->buffer;<br>
> +         unsigned stream = xfb_info->buffer_to_stream[buffer];<br>
> +<br>
> +         /* Our hardware is unusual in that it requires us to program SO_DECLs<br>
> +          * for fake "hole" components, rather than simply taking the offset<br>
> +          * for each real varying.  Each hole can have size 1, 2, 3, or 4; we<br>
> +          * program as many size = 4 holes as we can, then a final hole to<br>
> +          * accommodate the final 1, 2, or 3 remaining.<br>
> +          */<br>
> +         int hole_dwords = (output->offset - next_offset[buffer]) / 4;<br>
> +         while (hole_dwords > 0) {<br>
> +            so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {<br>
> +               .HoleFlag = 1,<br>
> +               .OutputBufferSlot = buffer,<br>
> +               .ComponentMask = (1 << MIN2(hole_dwords, 4)) - 1,<br>
> +            };<br>
> +            hole_dwords -= 4;<br>
> +         }<br>
> +<br>
> +         next_offset[buffer] = output->offset +<br>
> +                               __builtin_popcount(output->component_mask) * 4;<br>
> +<br>
> +         so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {<br>
> +            .OutputBufferSlot = buffer,<br>
> +            .RegisterIndex = vue_map->varying_to_slot[output->location],<br>
> +            .ComponentMask = output->component_mask,<br>
> +         };<br>
> +      }<br>
> +<br>
> +      int max_decls = 0;<br>
> +      for (unsigned s = 0; s < MAX_XFB_STREAMS; s++)<br>
> +         max_decls = MAX2(max_decls, decls[s]);<br>
> +<br>
> +      uint8_t sbs[MAX_XFB_STREAMS] = { };<br>
> +      for (unsigned b = 0; b < MAX_XFB_BUFFERS; b++) {<br>
> +         if (xfb_info->buffers_written & (1 << b))<br>
> +            sbs[xfb_info->buffer_to_stream[b]] |= 1 << b;<br>
> +      }<br>
> +<br>
> +      uint32_t *dw = anv_batch_emitn(&pipeline->batch, 3 + 2 * max_decls,<br>
> +                                     GENX(3DSTATE_SO_DECL_LIST),<br>
> +                                     .StreamtoBufferSelects0 = sbs[0],<br>
> +                                     .StreamtoBufferSelects1 = sbs[1],<br>
> +                                     .StreamtoBufferSelects2 = sbs[2],<br>
> +                                     .StreamtoBufferSelects3 = sbs[3],<br>
> +                                     .NumEntries0 = decls[0],<br>
> +                                     .NumEntries1 = decls[1],<br>
> +                                     .NumEntries2 = decls[2],<br>
> +                                     .NumEntries3 = decls[3]);<br>
> +<br>
> +      for (int i = 0; i < max_decls; i++) {<br>
> +         GENX(SO_DECL_ENTRY_pack)(NULL, dw + 3 + i * 2,<br>
> +            &(struct GENX(SO_DECL_ENTRY)) {<br>
> +               .Stream0Decl = so_decl[0][i],<br>
> +               .Stream1Decl = so_decl[1][i],<br>
> +               .Stream2Decl = so_decl[2][i],<br>
> +               .Stream3Decl = so_decl[3][i],<br>
> +            });<br>
> +      }<br>
>      }<br>
> +#endif /* GEN_GEN >= 8 */<br>
>   }<br>
>   <br>
>   static uint32_t<br>
<br>
<br>
</blockquote></div></div>