[Mesa-stable] [PATCH v3] radv: make sure to wait for CP DMA when needed

Samuel Pitoiset samuel.pitoiset at gmail.com
Thu Jul 12 08:03:35 UTC 2018



On 07/11/2018 06:52 PM, Dylan Baker wrote:
> Quoting Samuel Pitoiset (2018-07-09 09:02:58)
>> This might fix some synchronization issues. I don't know if
>> that will affect performance but it's required for correctness.
>>
>> v3: - wait for CP DMA in CmdPipelineBarrier()
>>      - clear the busy value when CP_DMA_SYNC is requested
>> v2: - wait for CP DMA in CmdWaitEvents()
>>      - track if CP DMA is used
>>
>> CC: <mesa-stable at lists.freedesktop.org>
>> Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
>> ---
>>   src/amd/vulkan/radv_cmd_buffer.c | 15 +++++++++++++
>>   src/amd/vulkan/radv_private.h    |  5 +++++
>>   src/amd/vulkan/si_cmd_buffer.c   | 36 ++++++++++++++++++++++++++++----
>>   3 files changed, 52 insertions(+), 4 deletions(-)
>>
>> diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
>> index 9da42fe03e..5dbdb3d996 100644
>> --- a/src/amd/vulkan/radv_cmd_buffer.c
>> +++ b/src/amd/vulkan/radv_cmd_buffer.c
>> @@ -2596,6 +2596,11 @@ VkResult radv_EndCommandBuffer(
>>                  si_emit_cache_flush(cmd_buffer);
>>          }
>>   
>> +       /* Make sure CP DMA is idle at the end of IBs because the kernel
>> +        * doesn't wait for it.
>> +        */
>> +       si_cp_dma_wait_for_idle(cmd_buffer);
>> +
>>          vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
>>   
>>          if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs))
>> @@ -4242,6 +4247,11 @@ radv_barrier(struct radv_cmd_buffer *cmd_buffer,
>>                                               0);
>>          }
>>   
>> +       /* Make sure CP DMA is idle because the driver might have performed a
>> +        * DMA operation for copying or filling buffers/images.
>> +        */
>> +       si_cp_dma_wait_for_idle(cmd_buffer);
>> +
>>          cmd_buffer->state.flush_bits |= dst_flush_bits;
>>   }
>>   
>> @@ -4292,6 +4302,11 @@ static void write_event(struct radv_cmd_buffer *cmd_buffer,
>>                  VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT |
>>                  VK_PIPELINE_STAGE_VERTEX_INPUT_BIT;
>>   
>> +       /* Make sure CP DMA is idle because the driver might have performed a
>> +        * DMA operation for copying or filling buffers/images.
>> +        */
>> +       si_cp_dma_wait_for_idle(cmd_buffer);
>> +
>>          /* TODO: Emit EOS events for syncing PS/CS stages. */
>>   
>>          if (!(stageMask & ~top_of_pipe_flags)) {
>> diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
>> index 4e4b3a6037..2400de49a2 100644
>> --- a/src/amd/vulkan/radv_private.h
>> +++ b/src/amd/vulkan/radv_private.h
>> @@ -979,6 +979,9 @@ struct radv_cmd_state {
>>          uint32_t last_num_instances;
>>          uint32_t last_first_instance;
>>          uint32_t last_vertex_offset;
>> +
>> +       /* Whether CP DMA is busy/idle. */
>> +       bool dma_is_busy;
>>   };
>>   
>>   struct radv_cmd_pool {
>> @@ -1091,6 +1094,8 @@ void si_cp_dma_prefetch(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
>>                           unsigned size);
>>   void si_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
>>                              uint64_t size, unsigned value);
>> +void si_cp_dma_wait_for_idle(struct radv_cmd_buffer *cmd_buffer);
>> +
>>   void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer);
>>   bool
>>   radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer,
>> diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c
>> index 454fd8c39c..6d566a918d 100644
>> --- a/src/amd/vulkan/si_cmd_buffer.c
>> +++ b/src/amd/vulkan/si_cmd_buffer.c
>> @@ -1040,7 +1040,6 @@ static void si_emit_cp_dma(struct radv_cmd_buffer *cmd_buffer,
>>          struct radeon_cmdbuf *cs = cmd_buffer->cs;
>>          uint32_t header = 0, command = 0;
>>   
>> -       assert(size);
>>          assert(size <= cp_dma_max_byte_count(cmd_buffer));
>>   
>>          radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 9);
>> @@ -1099,9 +1098,14 @@ static void si_emit_cp_dma(struct radv_cmd_buffer *cmd_buffer,
>>           * indices. If we wanted to execute CP DMA in PFP, this packet
>>           * should precede it.
>>           */
>> -       if ((flags & CP_DMA_SYNC) && cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
>> -               radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
>> -               radeon_emit(cs, 0);
>> +       if (flags & CP_DMA_SYNC) {
>> +               if (cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
>> +                       radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
>> +                       radeon_emit(cs, 0);
>> +               }
>> +
>> +               /* CP will see the sync flag and wait for all DMAs to complete. */
>> +               cmd_buffer->state.dma_is_busy = false;
>>          }
>>   
>>          if (unlikely(cmd_buffer->device->trace_bo))
>> @@ -1165,6 +1169,8 @@ void si_cp_dma_buffer_copy(struct radv_cmd_buffer *cmd_buffer,
>>          uint64_t main_src_va, main_dest_va;
>>          uint64_t skipped_size = 0, realign_size = 0;
>>   
>> +       /* Assume that we are not going to sync after the last DMA operation. */
>> +       cmd_buffer->state.dma_is_busy = true;
>>   
>>          if (cmd_buffer->device->physical_device->rad_info.family <= CHIP_CARRIZO ||
>>              cmd_buffer->device->physical_device->rad_info.family == CHIP_STONEY) {
>> @@ -1228,6 +1234,9 @@ void si_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
>>   
>>          assert(va % 4 == 0 && size % 4 == 0);
>>   
>> +       /* Assume that we are not going to sync after the last DMA operation. */
>> +       cmd_buffer->state.dma_is_busy = true;
>> +
>>          while (size) {
>>                  unsigned byte_count = MIN2(size, cp_dma_max_byte_count(cmd_buffer));
>>                  unsigned dma_flags = CP_DMA_CLEAR;
>> @@ -1243,6 +1252,25 @@ void si_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
>>          }
>>   }
>>   
>> +void si_cp_dma_wait_for_idle(struct radv_cmd_buffer *cmd_buffer)
>> +{
>> +       if (cmd_buffer->device->physical_device->rad_info.chip_class < CIK)
>> +               return;
>> +
>> +       if (!cmd_buffer->state.dma_is_busy)
>> +               return;
>> +
>> +       /* Issue a dummy DMA that copies zero bytes.
>> +        *
>> +        * The DMA engine will see that there's no work to do and skip this
>> +        * DMA request, however, the CP will see the sync flag and still wait
>> +        * for all DMAs to complete.
>> +        */
>> +       si_emit_cp_dma(cmd_buffer, 0, 0, 0, CP_DMA_SYNC);
>> +
>> +       cmd_buffer->state.dma_is_busy = false;
>> +}
>> +
>>   /* For MSAA sample positions. */
>>   #define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y)  \
>>          (((s0x) & 0xf) | (((unsigned)(s0y) & 0xf) << 4) |                  \
> 
> Hi Samuel,
> 
> I'm trying to get this to apply to 18.1 and having a heck of a time, git can't
> figure out how to apply this because of 8339ba827bf3f18463824206347665a45989b99e
> (radv: optimize vkCmd{Set,Reset}Event() a little bit), which doesn't seem
> suitable for stable and has other dependencies. I'm not confident in my ability
> to rebase this, if you'd still like this in 18.1, can you provide a patch that
> applies to the staging/18.1 branch in the main gitlab repo?

Hi Dylan,

Backport for 18.1 sent to mesa-stable. Thanks!

> 
> Thanks,
> Dylan
> 


More information about the mesa-stable mailing list