[Mesa-dev] [Mesa-stable] [PATCH v3] radv: make sure to wait for CP DMA when needed

Dylan Baker dylan at pnwbakers.com
Wed Jul 11 16:52:56 UTC 2018


Quoting Samuel Pitoiset (2018-07-09 09:02:58)
> This might fix some synchronization issues. I don't know if
> that will affect performance but it's required for correctness.
> 
> v3: - wait for CP DMA in CmdPipelineBarrier()
>     - clear the busy value when CP_DMA_SYNC is requested
> v2: - wait for CP DMA in CmdWaitEvents()
>     - track if CP DMA is used
> 
> CC: <mesa-stable at lists.freedesktop.org>
> Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
> ---
>  src/amd/vulkan/radv_cmd_buffer.c | 15 +++++++++++++
>  src/amd/vulkan/radv_private.h    |  5 +++++
>  src/amd/vulkan/si_cmd_buffer.c   | 36 ++++++++++++++++++++++++++++----
>  3 files changed, 52 insertions(+), 4 deletions(-)
> 
> diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
> index 9da42fe03e..5dbdb3d996 100644
> --- a/src/amd/vulkan/radv_cmd_buffer.c
> +++ b/src/amd/vulkan/radv_cmd_buffer.c
> @@ -2596,6 +2596,11 @@ VkResult radv_EndCommandBuffer(
>                 si_emit_cache_flush(cmd_buffer);
>         }
>  
> +       /* Make sure CP DMA is idle at the end of IBs because the kernel
> +        * doesn't wait for it.
> +        */
> +       si_cp_dma_wait_for_idle(cmd_buffer);
> +
>         vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
>  
>         if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs))
> @@ -4242,6 +4247,11 @@ radv_barrier(struct radv_cmd_buffer *cmd_buffer,
>                                              0);
>         }
>  
> +       /* Make sure CP DMA is idle because the driver might have performed a
> +        * DMA operation for copying or filling buffers/images.
> +        */
> +       si_cp_dma_wait_for_idle(cmd_buffer);
> +
>         cmd_buffer->state.flush_bits |= dst_flush_bits;
>  }
>  
> @@ -4292,6 +4302,11 @@ static void write_event(struct radv_cmd_buffer *cmd_buffer,
>                 VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT |
>                 VK_PIPELINE_STAGE_VERTEX_INPUT_BIT;
>  
> +       /* Make sure CP DMA is idle because the driver might have performed a
> +        * DMA operation for copying or filling buffers/images.
> +        */
> +       si_cp_dma_wait_for_idle(cmd_buffer);
> +
>         /* TODO: Emit EOS events for syncing PS/CS stages. */
>  
>         if (!(stageMask & ~top_of_pipe_flags)) {
> diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
> index 4e4b3a6037..2400de49a2 100644
> --- a/src/amd/vulkan/radv_private.h
> +++ b/src/amd/vulkan/radv_private.h
> @@ -979,6 +979,9 @@ struct radv_cmd_state {
>         uint32_t last_num_instances;
>         uint32_t last_first_instance;
>         uint32_t last_vertex_offset;
> +
> +       /* Whether CP DMA is busy/idle. */
> +       bool dma_is_busy;
>  };
>  
>  struct radv_cmd_pool {
> @@ -1091,6 +1094,8 @@ void si_cp_dma_prefetch(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
>                          unsigned size);
>  void si_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
>                             uint64_t size, unsigned value);
> +void si_cp_dma_wait_for_idle(struct radv_cmd_buffer *cmd_buffer);
> +
>  void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer);
>  bool
>  radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer,
> diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c
> index 454fd8c39c..6d566a918d 100644
> --- a/src/amd/vulkan/si_cmd_buffer.c
> +++ b/src/amd/vulkan/si_cmd_buffer.c
> @@ -1040,7 +1040,6 @@ static void si_emit_cp_dma(struct radv_cmd_buffer *cmd_buffer,
>         struct radeon_cmdbuf *cs = cmd_buffer->cs;
>         uint32_t header = 0, command = 0;
>  
> -       assert(size);
>         assert(size <= cp_dma_max_byte_count(cmd_buffer));
>  
>         radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 9);
> @@ -1099,9 +1098,14 @@ static void si_emit_cp_dma(struct radv_cmd_buffer *cmd_buffer,
>          * indices. If we wanted to execute CP DMA in PFP, this packet
>          * should precede it.
>          */
> -       if ((flags & CP_DMA_SYNC) && cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
> -               radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
> -               radeon_emit(cs, 0);
> +       if (flags & CP_DMA_SYNC) {
> +               if (cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
> +                       radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
> +                       radeon_emit(cs, 0);
> +               }
> +
> +               /* CP will see the sync flag and wait for all DMAs to complete. */
> +               cmd_buffer->state.dma_is_busy = false;
>         }
>  
>         if (unlikely(cmd_buffer->device->trace_bo))
> @@ -1165,6 +1169,8 @@ void si_cp_dma_buffer_copy(struct radv_cmd_buffer *cmd_buffer,
>         uint64_t main_src_va, main_dest_va;
>         uint64_t skipped_size = 0, realign_size = 0;
>  
> +       /* Assume that we are not going to sync after the last DMA operation. */
> +       cmd_buffer->state.dma_is_busy = true;
>  
>         if (cmd_buffer->device->physical_device->rad_info.family <= CHIP_CARRIZO ||
>             cmd_buffer->device->physical_device->rad_info.family == CHIP_STONEY) {
> @@ -1228,6 +1234,9 @@ void si_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
>  
>         assert(va % 4 == 0 && size % 4 == 0);
>  
> +       /* Assume that we are not going to sync after the last DMA operation. */
> +       cmd_buffer->state.dma_is_busy = true;
> +
>         while (size) {
>                 unsigned byte_count = MIN2(size, cp_dma_max_byte_count(cmd_buffer));
>                 unsigned dma_flags = CP_DMA_CLEAR;
> @@ -1243,6 +1252,25 @@ void si_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
>         }
>  }
>  
> +void si_cp_dma_wait_for_idle(struct radv_cmd_buffer *cmd_buffer)
> +{
> +       if (cmd_buffer->device->physical_device->rad_info.chip_class < CIK)
> +               return;
> +
> +       if (!cmd_buffer->state.dma_is_busy)
> +               return;
> +
> +       /* Issue a dummy DMA that copies zero bytes.
> +        *
> +        * The DMA engine will see that there's no work to do and skip this
> +        * DMA request, however, the CP will see the sync flag and still wait
> +        * for all DMAs to complete.
> +        */
> +       si_emit_cp_dma(cmd_buffer, 0, 0, 0, CP_DMA_SYNC);
> +
> +       cmd_buffer->state.dma_is_busy = false;
> +}
> +
>  /* For MSAA sample positions. */
>  #define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y)  \
>         (((s0x) & 0xf) | (((unsigned)(s0y) & 0xf) << 4) |                  \

Hi Samuel,

I'm trying to get this to apply to 18.1 and having a heck of a time, git can't
figure out how to apply this because of 8339ba827bf3f18463824206347665a45989b99e
(radv: optimize vkCmd{Set,Reset}Event() a little bit), which doesn't seem
suitable for stable and has other dependencies. I'm not confident in my ability
to rebase this, if you'd still like this in 18.1, can you provide a patch that
applies to the staging/18.1 branch in the main gitlab repo?

Thanks,
Dylan
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 228 bytes
Desc: signature
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20180711/2bd703b3/attachment-0001.sig>


More information about the mesa-dev mailing list