[Mesa-dev] [PATCH 3/3] radeonsi: use DMA for clears with unaligned size

Sat Mar 25 13:30:03 UTC 2017

For the series:

Reviewed-by: Marek Olšák <marek.olsak at amd.com>

Marek

On Sat, Mar 25, 2017 at 1:06 AM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
> From: Nicolai Hähnle <nicolai.haehnle at amd.com>
>
> Only a small tail needs to be uploaded manually.
>
> This is only partly a performance measure (apps are expected to use
> aligned access). Mostly it is preparation for sparse buffers, which the
> old code would incorrectly have attempted to map directly.
> ---
>  src/gallium/drivers/radeonsi/si_cp_dma.c | 46 +++++++++++++++++++-------------
>  1 file changed, 27 insertions(+), 19 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
> index 0cf7b3b..812fcbc 100644
> --- a/src/gallium/drivers/radeonsi/si_cp_dma.c
> +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
> @@ -178,87 +178,95 @@ static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst
>
>  static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
>                             uint64_t offset, uint64_t size, unsigned value,
>                             enum r600_coherency coher)
>  {
>         struct si_context *sctx = (struct si_context*)ctx;
>         struct radeon_winsys *ws = sctx->b.ws;
>         struct r600_resource *rdst = r600_resource(dst);
>         unsigned tc_l2_flag = get_tc_l2_flag(sctx, coher);
>         unsigned flush_flags = get_flush_flags(sctx, coher);
> +       uint64_t dma_clear_size;
>         bool is_first = true;
>
>         if (!size)
>                 return;
>
> +       dma_clear_size = size & ~3llu;
> +
>         /* Mark the buffer range of destination as valid (initialized),
>          * so that transfer_map knows it should wait for the GPU when mapping
>          * that range. */
>         util_range_add(&rdst->valid_buffer_range, offset,
> -                      offset + size);
> -
> -       /* Fallback for unaligned clears. */
> -       if (size % 4 != 0) {
> -               uint8_t *map = r600_buffer_map_sync_with_rings(&sctx->b, rdst,
> -                                                              PIPE_TRANSFER_WRITE);
> -               map += offset;
> -               for (uint64_t i = 0; i < size; i++) {
> -                       unsigned byte_within_dword = (offset + i) % 4;
> -                       *map++ = (value >> (byte_within_dword * 8)) & 0xff;
> -               }
> -               return;
> -       }
> +                      offset + dma_clear_size);
>
>         /* dma_clear_buffer can use clear_buffer on failure. Make sure that
>          * doesn't happen. We don't want an infinite recursion: */
>         if (sctx->b.dma.cs &&
>             (offset % 4 == 0) &&
>             /* CP DMA is very slow. Always use SDMA for big clears. This
>              * alone improves DeusEx:MD performance by 70%. */
>             (size > 128 * 1024 ||
>              /* Buffers not used by the GFX IB yet will be cleared by SDMA.
>               * This happens to move most buffer clears to SDMA, including
>               * DCC and CMASK clears, because pipe->clear clears them before
>               * si_emit_framebuffer_state (in a draw call) adds them.
>               * For example, DeusEx:MD has 21 buffer clears per frame and all
>               * of them are moved to SDMA thanks to this. */
>              !ws->cs_is_buffer_referenced(sctx->b.gfx.cs, rdst->buf,
>                                           RADEON_USAGE_READWRITE))) {
> -               sctx->b.dma_clear_buffer(ctx, dst, offset, size, value);
> -       } else {
> +               sctx->b.dma_clear_buffer(ctx, dst, offset, dma_clear_size, value);
> +
> +               offset += dma_clear_size;
> +               size -= dma_clear_size;
> +       } else if (dma_clear_size >= 4) {
>                 uint64_t va = rdst->gpu_address + offset;
>
> +               offset += dma_clear_size;
> +               size -= dma_clear_size;
> +
>                 /* Flush the caches. */
>                 sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
>                                  SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
>
> -               while (size) {
> -                       unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
> +               while (dma_clear_size) {
> +                       unsigned byte_count = MIN2(dma_clear_size, CP_DMA_MAX_BYTE_COUNT);
>                         unsigned dma_flags = tc_l2_flag  | CP_DMA_CLEAR;
>
> -                       si_cp_dma_prepare(sctx, dst, NULL, byte_count, size, 0,
> +                       si_cp_dma_prepare(sctx, dst, NULL, byte_count, dma_clear_size, 0,
>                                           &is_first, &dma_flags);
>
>                         /* Emit the clear packet. */
>                         si_emit_cp_dma(sctx, va, value, byte_count, dma_flags, coher);
>
> -                       size -= byte_count;
> +                       dma_clear_size -= byte_count;
>                         va += byte_count;
>                 }
>
>                 if (tc_l2_flag)
>                         rdst->TC_L2_dirty = true;
>
>                 /* If it's not a framebuffer fast clear... */
>                 if (coher == R600_COHERENCY_SHADER)
>                         sctx->b.num_cp_dma_calls++;
>         }
> +
> +       if (size) {
> +               /* Handle non-dword alignment.
> +                *
> +                * This function is called for embedded texture metadata clears,
> +                * but those should always be properly aligned. */
> +               assert(dst->target == PIPE_BUFFER);
> +               assert(size < 4);
> +
> +               pipe_buffer_write(ctx, dst, offset, size, &value);
> +       }
>  }
>
>  /**
>   * Realign the CP DMA engine. This must be done after a copy with an unaligned
>   * size.
>   *
>   * \param size  Remaining size to the CP DMA alignment.
>   */
>  static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size,
>                                      unsigned user_flags, bool *is_first)
> --
> 2.9.3
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev