[Mesa-dev] [PATCH 3/3] radeonsi: use DMA for clears with unaligned size
Marek Olšák
maraeo at gmail.com
Sat Mar 25 13:30:03 UTC 2017
For the series:
Reviewed-by: Marek Olšák <marek.olsak at amd.com>
Marek
On Sat, Mar 25, 2017 at 1:06 AM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
> From: Nicolai Hähnle <nicolai.haehnle at amd.com>
>
> Only a small tail needs to be uploaded manually.
>
> This is only partly a performance measure (apps are expected to use
> aligned access). Mostly it is preparation for sparse buffers, which the
> old code would incorrectly have attempted to map directly.
> ---
> src/gallium/drivers/radeonsi/si_cp_dma.c | 46 +++++++++++++++++++-------------
> 1 file changed, 27 insertions(+), 19 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
> index 0cf7b3b..812fcbc 100644
> --- a/src/gallium/drivers/radeonsi/si_cp_dma.c
> +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
> @@ -178,87 +178,95 @@ static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst
>
> static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
> uint64_t offset, uint64_t size, unsigned value,
> enum r600_coherency coher)
> {
> struct si_context *sctx = (struct si_context*)ctx;
> struct radeon_winsys *ws = sctx->b.ws;
> struct r600_resource *rdst = r600_resource(dst);
> unsigned tc_l2_flag = get_tc_l2_flag(sctx, coher);
> unsigned flush_flags = get_flush_flags(sctx, coher);
> + uint64_t dma_clear_size;
> bool is_first = true;
>
> if (!size)
> return;
>
> + dma_clear_size = size & ~3llu;
> +
> /* Mark the buffer range of destination as valid (initialized),
> * so that transfer_map knows it should wait for the GPU when mapping
> * that range. */
> util_range_add(&rdst->valid_buffer_range, offset,
> - offset + size);
> -
> - /* Fallback for unaligned clears. */
> - if (size % 4 != 0) {
> - uint8_t *map = r600_buffer_map_sync_with_rings(&sctx->b, rdst,
> - PIPE_TRANSFER_WRITE);
> - map += offset;
> - for (uint64_t i = 0; i < size; i++) {
> - unsigned byte_within_dword = (offset + i) % 4;
> - *map++ = (value >> (byte_within_dword * 8)) & 0xff;
> - }
> - return;
> - }
> + offset + dma_clear_size);
>
> /* dma_clear_buffer can use clear_buffer on failure. Make sure that
> * doesn't happen. We don't want an infinite recursion: */
> if (sctx->b.dma.cs &&
> (offset % 4 == 0) &&
> /* CP DMA is very slow. Always use SDMA for big clears. This
> * alone improves DeusEx:MD performance by 70%. */
> (size > 128 * 1024 ||
> /* Buffers not used by the GFX IB yet will be cleared by SDMA.
> * This happens to move most buffer clears to SDMA, including
> * DCC and CMASK clears, because pipe->clear clears them before
> * si_emit_framebuffer_state (in a draw call) adds them.
> * For example, DeusEx:MD has 21 buffer clears per frame and all
> * of them are moved to SDMA thanks to this. */
> !ws->cs_is_buffer_referenced(sctx->b.gfx.cs, rdst->buf,
> RADEON_USAGE_READWRITE))) {
> - sctx->b.dma_clear_buffer(ctx, dst, offset, size, value);
> - } else {
> + sctx->b.dma_clear_buffer(ctx, dst, offset, dma_clear_size, value);
> +
> + offset += dma_clear_size;
> + size -= dma_clear_size;
> + } else if (dma_clear_size >= 4) {
> uint64_t va = rdst->gpu_address + offset;
>
> + offset += dma_clear_size;
> + size -= dma_clear_size;
> +
> /* Flush the caches. */
> sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
> SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
>
> - while (size) {
> - unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
> + while (dma_clear_size) {
> + unsigned byte_count = MIN2(dma_clear_size, CP_DMA_MAX_BYTE_COUNT);
> unsigned dma_flags = tc_l2_flag | CP_DMA_CLEAR;
>
> - si_cp_dma_prepare(sctx, dst, NULL, byte_count, size, 0,
> + si_cp_dma_prepare(sctx, dst, NULL, byte_count, dma_clear_size, 0,
> &is_first, &dma_flags);
>
> /* Emit the clear packet. */
> si_emit_cp_dma(sctx, va, value, byte_count, dma_flags, coher);
>
> - size -= byte_count;
> + dma_clear_size -= byte_count;
> va += byte_count;
> }
>
> if (tc_l2_flag)
> rdst->TC_L2_dirty = true;
>
> /* If it's not a framebuffer fast clear... */
> if (coher == R600_COHERENCY_SHADER)
> sctx->b.num_cp_dma_calls++;
> }
> +
> + if (size) {
> + /* Handle non-dword alignment.
> + *
> + * This function is called for embedded texture metadata clears,
> + * but those should always be properly aligned. */
> + assert(dst->target == PIPE_BUFFER);
> + assert(size < 4);
> +
> + pipe_buffer_write(ctx, dst, offset, size, &value);
> + }
> }
>
> /**
> * Realign the CP DMA engine. This must be done after a copy with an unaligned
> * size.
> *
> * \param size Remaining size to the CP DMA alignment.
> */
> static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size,
> unsigned user_flags, bool *is_first)
> --
> 2.9.3
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
More information about the mesa-dev mailing list