[Mesa-dev] [PATCH 08/13] anv: Implement the basic form of VK_EXT_transform_feedback

Mon Oct 15 15:38:54 UTC 2018

On Mon, Oct 15, 2018 at 8:34 AM Lionel Landwerlin <
lionel.g.landwerlin at intel.com> wrote:

> On 13/10/2018 14:09, Jason Ekstrand wrote:
> > ---
> >   src/intel/vulkan/anv_cmd_buffer.c  |  29 +++++++
> >   src/intel/vulkan/anv_device.c      |  24 ++++++
> >   src/intel/vulkan/anv_extensions.py |   2 +-
> >   src/intel/vulkan/anv_pipeline.c    |  10 ++-
> >   src/intel/vulkan/anv_private.h     |  13 +++
> >   src/intel/vulkan/genX_cmd_buffer.c | 125 +++++++++++++++++++++++++++++
> >   src/intel/vulkan/genX_pipeline.c   | 122 ++++++++++++++++++++++++++++
> >   7 files changed, 323 insertions(+), 2 deletions(-)
>
>
> ...
>
>
> >      uint32_t                                     topology;
> >
> > diff --git a/src/intel/vulkan/genX_cmd_buffer.c
> b/src/intel/vulkan/genX_cmd_buffer.c
> > index c3a7e5c83c3..90469abbf21 100644
> > --- a/src/intel/vulkan/genX_cmd_buffer.c
> > +++ b/src/intel/vulkan/genX_cmd_buffer.c
> > @@ -2571,6 +2571,30 @@ genX(cmd_buffer_flush_state)(struct
> anv_cmd_buffer *cmd_buffer)
> >
> >      cmd_buffer->state.gfx.vb_dirty &= ~vb_emit;
> >
> > +#if GEN_GEN >= 8
> > +   if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_XFB_ENABLE) {
> > +      /* We don't need any per-buffer dirty tracking because you're not
> > +       * allowed to bind different XFB buffers while XFB is enabled.
> > +       */
> > +      for (unsigned idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
> > +         struct anv_xfb_binding *xfb =
> &cmd_buffer->state.xfb_bindings[idx];
> > +         anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER),
> sob) {
> > +            sob.SOBufferIndex = idx;
> > +
> > +            if (cmd_buffer->state.xfb_enabled && xfb->buffer) {
> > +               sob.SOBufferEnable = true;
> > +               sob.SOBufferMOCS = cmd_buffer->device->default_mocs,
> > +               sob.StreamOffsetWriteEnable = false;
> > +               sob.SurfaceBaseAddress =
> anv_address_add(xfb->buffer->address,
> > +                                                        xfb->offset);
> > +               /* Size is in DWords - 1 */
> > +               sob.SurfaceSize = xfb->size / 4 - 1;
> > +            }
> > +         }
>
>
> Apparently the documentation says we need a PIPE_CONTROL with the CS Stall
> bit set after 3DSTATE_SO_BUFFER.
>

So it does.  I've added it for GEN_GEN >= 10.

--Jason

> > +      }
> > +   }
> > +#endif
> > +
> >      if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
> >         anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->batch);
> >
> > @@ -2970,6 +2994,107 @@ void genX(CmdDrawIndexedIndirect)(
> >      }
> >   }
> >
> > +void genX(CmdBeginTransformFeedbackEXT)(
> > +    VkCommandBuffer                             commandBuffer,
> > +    uint32_t                                    firstCounterBuffer,
> > +    uint32_t                                    counterBufferCount,
> > +    const VkBuffer*                             pCounterBuffers,
> > +    const VkDeviceSize*                         pCounterBufferOffsets)
> > +{
> > +   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
> > +
> > +   assert(firstCounterBuffer < MAX_XFB_BUFFERS);
> > +   assert(counterBufferCount < MAX_XFB_BUFFERS);
> > +   assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
> > +
> > +   /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
> > +    *
> > +    *    "Software must ensure that no HW stream output operations can
> be in
> > +    *    process or otherwise pending at the point that the
> MI_LOAD/STORE
> > +    *    commands are processed. This will likely require a pipeline
> flush."
> > +    */
> > +   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
> > +   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
> > +
> > +   for (uint32_t idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
> > +      /* If we have a counter buffer, this is a resume so we need to
> load the
> > +       * value into the streamout offset register.  Otherwise, this is
> a begin
> > +       * and we need to reset it to zero.
> > +       */
> > +      if (pCounterBuffers &&
> > +          idx >= firstCounterBuffer &&
> > +          idx - firstCounterBuffer < counterBufferCount &&
> > +          pCounterBuffers[idx - firstCounterBuffer] != VK_NULL_HANDLE) {
> > +         uint32_t cb_idx = idx - firstCounterBuffer;
> > +         ANV_FROM_HANDLE(anv_buffer, counter_buffer,
> pCounterBuffers[cb_idx]);
> > +         uint64_t offset = pCounterBufferOffsets ?
> > +                           pCounterBufferOffsets[cb_idx] : 0;
> > +
> > +         anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM),
> lrm) {
> > +            lrm.RegisterAddress  = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
> > +            lrm.MemoryAddress    =
> anv_address_add(counter_buffer->address,
> > +                                                   offset);
> > +         }
> > +      } else {
> > +         anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM),
> lri) {
> > +            lri.RegisterOffset   = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
> > +            lri.DataDWord        = 0;
> > +         }
> > +      }
> > +   }
> > +
> > +   cmd_buffer->state.xfb_enabled = true;
> > +   cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
> > +}
> > +
> > +void genX(CmdEndTransformFeedbackEXT)(
> > +    VkCommandBuffer                             commandBuffer,
> > +    uint32_t                                    firstCounterBuffer,
> > +    uint32_t                                    counterBufferCount,
> > +    const VkBuffer*                             pCounterBuffers,
> > +    const VkDeviceSize*                         pCounterBufferOffsets)
> > +{
> > +   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
> > +
> > +   assert(firstCounterBuffer < MAX_XFB_BUFFERS);
> > +   assert(counterBufferCount < MAX_XFB_BUFFERS);
> > +   assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
> > +
> > +   /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
> > +    *
> > +    *    "Software must ensure that no HW stream output operations can
> be in
> > +    *    process or otherwise pending at the point that the
> MI_LOAD/STORE
> > +    *    commands are processed. This will likely require a pipeline
> flush."
> > +    */
> > +   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
> > +   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
> > +
> > +   for (uint32_t cb_idx = 0; cb_idx < counterBufferCount; cb_idx++) {
> > +      unsigned idx = firstCounterBuffer + cb_idx;
> > +
> > +      /* If we have a counter buffer, store the current value of the
> > +       * streamout offset register into it so that a later begin can
> > +       * resume from it.  Otherwise, there is nothing to save.
> > +       */
> > +      if (pCounterBuffers &&
> > +          cb_idx < counterBufferCount &&
> > +          pCounterBuffers[cb_idx] != VK_NULL_HANDLE) {
> > +         ANV_FROM_HANDLE(anv_buffer, counter_buffer,
> pCounterBuffers[cb_idx]);
> > +         uint64_t offset = pCounterBufferOffsets ?
> > +                           pCounterBufferOffsets[cb_idx] : 0;
> > +
> > +         anv_batch_emit(&cmd_buffer->batch,
> GENX(MI_STORE_REGISTER_MEM), srm) {
> > +            srm.MemoryAddress    =
> anv_address_add(counter_buffer->address,
> > +                                                   offset);
> > +            srm.RegisterAddress  = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
> > +         }
> > +      }
> > +   }
> > +
> > +   cmd_buffer->state.xfb_enabled = false;
> > +   cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
> > +}
> > +
> >   static VkResult
> >   flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer)
> >   {
> > diff --git a/src/intel/vulkan/genX_pipeline.c
> b/src/intel/vulkan/genX_pipeline.c
> > index 9595a7133ae..5dd78a18fb5 100644
> > --- a/src/intel/vulkan/genX_pipeline.c
> > +++ b/src/intel/vulkan/genX_pipeline.c
> > @@ -28,6 +28,7 @@
> >
> >   #include "common/gen_l3_config.h"
> >   #include "common/gen_sample_positions.h"
> > +#include "nir/nir_xfb_info.h"
> >   #include "vk_util.h"
> >   #include "vk_format_info.h"
> >
> > @@ -1097,9 +1098,130 @@ static void
> >   emit_3dstate_streamout(struct anv_pipeline *pipeline,
> >                          const VkPipelineRasterizationStateCreateInfo
> *rs_info)
> >   {
> > +#if GEN_GEN >= 8
> > +   const struct brw_vue_prog_data *prog_data =
> > +      anv_pipeline_get_last_vue_prog_data(pipeline);
> > +   const struct brw_vue_map *vue_map = &prog_data->vue_map;
> > +#endif
> > +
> > +   nir_xfb_info *xfb_info;
> > +   if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY))
> > +      xfb_info = pipeline->shaders[MESA_SHADER_GEOMETRY]->xfb_info;
> > +   else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
> > +      xfb_info = pipeline->shaders[MESA_SHADER_TESS_EVAL]->xfb_info;
> > +   else
> > +      xfb_info = pipeline->shaders[MESA_SHADER_VERTEX]->xfb_info;
> > +
> > +   pipeline->xfb_used = xfb_info ? xfb_info->buffers_written : 0;
> > +
> >      anv_batch_emit(&pipeline->batch, GENX(3DSTATE_STREAMOUT), so) {
> >         so.RenderingDisable = rs_info->rasterizerDiscardEnable;
> > +
> > +#if GEN_GEN >= 8
> > +      if (xfb_info) {
> > +         so.SOFunctionEnable = true;
> > +
> > +         const VkPipelineRasterizationStateStreamCreateInfoEXT
> *stream_info =
> > +            vk_find_struct_const(rs_info,
> PIPELINE_RASTERIZATION_STATE_STREAM_CREATE_INFO_EXT);
> > +         so.RenderStreamSelect = stream_info ?
> > +                                 stream_info->rasterizationStream : 0;
> > +
> > +         so.Buffer0SurfacePitch = xfb_info->strides[0];
> > +         so.Buffer1SurfacePitch = xfb_info->strides[1];
> > +         so.Buffer2SurfacePitch = xfb_info->strides[2];
> > +         so.Buffer3SurfacePitch = xfb_info->strides[3];
> > +
> > +         int urb_entry_read_offset = 0;
> > +         int urb_entry_read_length =
> > +            (prog_data->vue_map.num_slots + 1) / 2 -
> urb_entry_read_offset;
> > +
> > +         /* We always read the whole vertex.  This could be reduced at
> some
> > +          * point by reading less and offsetting the register index in
> the
> > +          * SO_DECLs.
> > +          */
> > +         so.Stream0VertexReadOffset = urb_entry_read_offset;
> > +         so.Stream0VertexReadLength = urb_entry_read_length - 1;
> > +         so.Stream1VertexReadOffset = urb_entry_read_offset;
> > +         so.Stream1VertexReadLength = urb_entry_read_length - 1;
> > +         so.Stream2VertexReadOffset = urb_entry_read_offset;
> > +         so.Stream2VertexReadLength = urb_entry_read_length - 1;
> > +         so.Stream3VertexReadOffset = urb_entry_read_offset;
> > +         so.Stream3VertexReadLength = urb_entry_read_length - 1;
> > +      }
> > +#endif /* GEN_GEN >= 8 */
> > +   }
> > +
> > +#if GEN_GEN >= 8
> > +   if (xfb_info) {
> > +      struct GENX(SO_DECL) so_decl[MAX_XFB_STREAMS][128];
> > +      int next_offset[MAX_XFB_BUFFERS] = {0, 0, 0, 0};
> > +      int decls[MAX_XFB_STREAMS] = {0, 0, 0, 0};
> > +
> > +      memset(so_decl, 0, sizeof(so_decl));
> > +
> > +      for (unsigned i = 0; i < xfb_info->output_count; i++) {
> > +         const nir_xfb_output_info *output = &xfb_info->outputs[i];
> > +         unsigned buffer = output->buffer;
> > +         unsigned stream = xfb_info->buffer_to_stream[buffer];
> > +
> > +         /* Our hardware is unusual in that it requires us to program
> SO_DECLs
> > +          * for fake "hole" components, rather than simply taking the
> offset
> > +          * for each real varying.  Each hole can have size 1, 2, 3, or
> 4; we
> > +          * program as many size = 4 holes as we can, then a final hole
> to
> > +          * accommodate the final 1, 2, or 3 remaining.
> > +          */
> > +         int hole_dwords = (output->offset - next_offset[buffer]) / 4;
> > +         while (hole_dwords > 0) {
> > +            so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {
> > +               .HoleFlag = 1,
> > +               .OutputBufferSlot = buffer,
> > +               .ComponentMask = (1 << MIN2(hole_dwords, 4)) - 1,
> > +            };
> > +            hole_dwords -= 4;
> > +         }
> > +
> > +         next_offset[buffer] = output->offset +
> > +
>  __builtin_popcount(output->component_mask) * 4;
> > +
> > +         so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {
> > +            .OutputBufferSlot = buffer,
> > +            .RegisterIndex = vue_map->varying_to_slot[output->location],
> > +            .ComponentMask = output->component_mask,
> > +         };
> > +      }
> > +
> > +      int max_decls = 0;
> > +      for (unsigned s = 0; s < MAX_XFB_STREAMS; s++)
> > +         max_decls = MAX2(max_decls, decls[s]);
> > +
> > +      uint8_t sbs[MAX_XFB_STREAMS] = { };
> > +      for (unsigned b = 0; b < MAX_XFB_BUFFERS; b++) {
> > +         if (xfb_info->buffers_written & (1 << b))
> > +            sbs[xfb_info->buffer_to_stream[b]] |= 1 << b;
> > +      }
> > +
> > +      uint32_t *dw = anv_batch_emitn(&pipeline->batch, 3 + 2 *
> max_decls,
> > +                                     GENX(3DSTATE_SO_DECL_LIST),
> > +                                     .StreamtoBufferSelects0 = sbs[0],
> > +                                     .StreamtoBufferSelects1 = sbs[1],
> > +                                     .StreamtoBufferSelects2 = sbs[2],
> > +                                     .StreamtoBufferSelects3 = sbs[3],
> > +                                     .NumEntries0 = decls[0],
> > +                                     .NumEntries1 = decls[1],
> > +                                     .NumEntries2 = decls[2],
> > +                                     .NumEntries3 = decls[3]);
> > +
> > +      for (int i = 0; i < max_decls; i++) {
> > +         GENX(SO_DECL_ENTRY_pack)(NULL, dw + 3 + i * 2,
> > +            &(struct GENX(SO_DECL_ENTRY)) {
> > +               .Stream0Decl = so_decl[0][i],
> > +               .Stream1Decl = so_decl[1][i],
> > +               .Stream2Decl = so_decl[2][i],
> > +               .Stream3Decl = so_decl[3][i],
> > +            });
> > +      }
> >      }
> > +#endif /* GEN_GEN >= 8 */
> >   }
> >
> >   static uint32_t
>
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20181015/949d32a6/attachment-0001.html>