[Mesa-dev] [PATCH 3/3] radeonsi: use DMA for clears with unaligned size
Nicolai Hähnle
nhaehnle at gmail.com
Sat Mar 25 00:06:32 UTC 2017
From: Nicolai Hähnle <nicolai.haehnle at amd.com>
Only a small tail needs to be uploaded manually.
This is only partly a performance measure (apps are expected to use
aligned access). Mostly it is preparation for sparse buffers, which the
old code would have incorrectly attempted to map directly.
---
src/gallium/drivers/radeonsi/si_cp_dma.c | 46 +++++++++++++++++++-------------
1 file changed, 27 insertions(+), 19 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index 0cf7b3b..812fcbc 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -178,87 +178,95 @@ static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst
static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
uint64_t offset, uint64_t size, unsigned value,
enum r600_coherency coher)
{
struct si_context *sctx = (struct si_context*)ctx;
struct radeon_winsys *ws = sctx->b.ws;
struct r600_resource *rdst = r600_resource(dst);
unsigned tc_l2_flag = get_tc_l2_flag(sctx, coher);
unsigned flush_flags = get_flush_flags(sctx, coher);
+ uint64_t dma_clear_size;
bool is_first = true;
if (!size)
return;
+ dma_clear_size = size & ~3llu;
+
/* Mark the buffer range of destination as valid (initialized),
* so that transfer_map knows it should wait for the GPU when mapping
* that range. */
util_range_add(&rdst->valid_buffer_range, offset,
- offset + size);
-
- /* Fallback for unaligned clears. */
- if (size % 4 != 0) {
- uint8_t *map = r600_buffer_map_sync_with_rings(&sctx->b, rdst,
- PIPE_TRANSFER_WRITE);
- map += offset;
- for (uint64_t i = 0; i < size; i++) {
- unsigned byte_within_dword = (offset + i) % 4;
- *map++ = (value >> (byte_within_dword * 8)) & 0xff;
- }
- return;
- }
+ offset + dma_clear_size);
/* dma_clear_buffer can use clear_buffer on failure. Make sure that
* doesn't happen. We don't want an infinite recursion: */
if (sctx->b.dma.cs &&
(offset % 4 == 0) &&
/* CP DMA is very slow. Always use SDMA for big clears. This
* alone improves DeusEx:MD performance by 70%. */
(size > 128 * 1024 ||
/* Buffers not used by the GFX IB yet will be cleared by SDMA.
* This happens to move most buffer clears to SDMA, including
* DCC and CMASK clears, because pipe->clear clears them before
* si_emit_framebuffer_state (in a draw call) adds them.
* For example, DeusEx:MD has 21 buffer clears per frame and all
* of them are moved to SDMA thanks to this. */
!ws->cs_is_buffer_referenced(sctx->b.gfx.cs, rdst->buf,
RADEON_USAGE_READWRITE))) {
- sctx->b.dma_clear_buffer(ctx, dst, offset, size, value);
- } else {
+ sctx->b.dma_clear_buffer(ctx, dst, offset, dma_clear_size, value);
+
+ offset += dma_clear_size;
+ size -= dma_clear_size;
+ } else if (dma_clear_size >= 4) {
uint64_t va = rdst->gpu_address + offset;
+ offset += dma_clear_size;
+ size -= dma_clear_size;
+
/* Flush the caches. */
sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
- while (size) {
- unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
+ while (dma_clear_size) {
+ unsigned byte_count = MIN2(dma_clear_size, CP_DMA_MAX_BYTE_COUNT);
unsigned dma_flags = tc_l2_flag | CP_DMA_CLEAR;
- si_cp_dma_prepare(sctx, dst, NULL, byte_count, size, 0,
+ si_cp_dma_prepare(sctx, dst, NULL, byte_count, dma_clear_size, 0,
&is_first, &dma_flags);
/* Emit the clear packet. */
si_emit_cp_dma(sctx, va, value, byte_count, dma_flags, coher);
- size -= byte_count;
+ dma_clear_size -= byte_count;
va += byte_count;
}
if (tc_l2_flag)
rdst->TC_L2_dirty = true;
/* If it's not a framebuffer fast clear... */
if (coher == R600_COHERENCY_SHADER)
sctx->b.num_cp_dma_calls++;
}
+
+ if (size) {
+ /* Handle non-dword alignment.
+ *
+ * This function is called for embedded texture metadata clears,
+ * but those should always be properly aligned. */
+ assert(dst->target == PIPE_BUFFER);
+ assert(size < 4);
+
+ pipe_buffer_write(ctx, dst, offset, size, &value);
+ }
}
/**
* Realign the CP DMA engine. This must be done after a copy with an unaligned
* size.
*
* \param size Remaining size to the CP DMA alignment.
*/
static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size,
unsigned user_flags, bool *is_first)
--
2.9.3
More information about the mesa-dev
mailing list