[Mesa-dev] [PATCH 18/18] radeonsi: split si_clear_buffer to remove enum si_method
Marek Olšák
maraeo at gmail.com
Sat Aug 4 07:54:57 UTC 2018
From: Marek Olšák <marek.olsak at amd.com>
---
src/gallium/drivers/radeonsi/si_clear.c | 6 +-
src/gallium/drivers/radeonsi/si_cp_dma.c | 88 ++++++++++---------
src/gallium/drivers/radeonsi/si_pipe.c | 2 +-
src/gallium/drivers/radeonsi/si_pipe.h | 11 ++-
.../drivers/radeonsi/si_test_clearbuffer.c | 4 +-
src/gallium/drivers/radeonsi/si_test_dma.c | 2 +-
6 files changed, 60 insertions(+), 53 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_clear.c b/src/gallium/drivers/radeonsi/si_clear.c
index 654ff0ace78..4e07de81bac 100644
--- a/src/gallium/drivers/radeonsi/si_clear.c
+++ b/src/gallium/drivers/radeonsi/si_clear.c
@@ -249,21 +249,21 @@ void vi_dcc_clear_level(struct si_context *sctx,
* would be more efficient than separate per-layer clear operations.
*/
assert(tex->buffer.b.b.nr_storage_samples <= 2 || num_layers == 1);
dcc_offset += tex->surface.u.legacy.level[level].dcc_offset;
clear_size = tex->surface.u.legacy.level[level].dcc_fast_clear_size *
num_layers;
}
si_clear_buffer(sctx, dcc_buffer, dcc_offset, clear_size,
- clear_value, SI_COHERENCY_CB_META, SI_METHOD_BEST);
+ clear_value, SI_COHERENCY_CB_META);
}
/* Set the same micro tile mode as the destination of the last MSAA resolve.
* This allows hitting the MSAA resolve fast path, which requires that both
* src and dst micro tile modes match.
*/
static void si_set_optimal_micro_tile_mode(struct si_screen *sscreen,
struct si_texture *tex)
{
if (tex->buffer.b.is_shared ||
@@ -482,21 +482,21 @@ static void si_do_fast_color_clear(struct si_context *sctx,
continue;
/* DCC fast clear with MSAA should clear CMASK to 0xC. */
if (tex->buffer.b.b.nr_samples >= 2 && tex->cmask_buffer) {
/* TODO: This doesn't work with MSAA. */
if (eliminate_needed)
continue;
si_clear_buffer(sctx, &tex->cmask_buffer->b.b,
tex->cmask_offset, tex->surface.cmask_size,
- 0xCCCCCCCC, SI_COHERENCY_CB_META, SI_METHOD_BEST);
+ 0xCCCCCCCC, SI_COHERENCY_CB_META);
need_decompress_pass = true;
}
vi_dcc_clear_level(sctx, tex, 0, reset_value);
if (eliminate_needed)
need_decompress_pass = true;
tex->separate_dcc_dirty = true;
} else {
@@ -513,21 +513,21 @@ static void si_do_fast_color_clear(struct si_context *sctx,
continue;
/* ensure CMASK is enabled */
si_alloc_separate_cmask(sctx->screen, tex);
if (!tex->cmask_buffer)
continue;
/* Do the fast clear. */
si_clear_buffer(sctx, &tex->cmask_buffer->b.b,
tex->cmask_offset, tex->surface.cmask_size, 0,
- SI_COHERENCY_CB_META, SI_METHOD_BEST);
+ SI_COHERENCY_CB_META);
need_decompress_pass = true;
}
if (need_decompress_pass &&
!(tex->dirty_level_mask & (1 << level))) {
tex->dirty_level_mask |= 1 << level;
p_atomic_inc(&sctx->screen->compressed_colortex_counter);
}
/* We can change the micro tile mode before a full clear. */
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index 86eb3529d9b..bae592a4f7d 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -217,95 +217,103 @@ static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst
*/
if (!(user_flags & SI_CPDMA_SKIP_SYNC_AFTER) &&
byte_count == remaining_size) {
*packet_flags |= CP_DMA_SYNC;
if (coher == SI_COHERENCY_SHADER)
*packet_flags |= CP_DMA_PFP_SYNC_ME;
}
}
+void si_cp_dma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
+ uint64_t offset, uint64_t size, unsigned value,
+ enum si_coherency coher,
+ enum si_cache_policy cache_policy)
+{
+ struct r600_resource *rdst = r600_resource(dst);
+ uint64_t va = rdst->gpu_address + offset;
+ bool is_first = true;
+
+ assert(size && size % 4 == 0);
+
+ /* Mark the buffer range of destination as valid (initialized),
+ * so that transfer_map knows it should wait for the GPU when mapping
+ * that range. */
+ util_range_add(&rdst->valid_buffer_range, offset, offset + size);
+
+ /* Flush the caches. */
+ sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+ SI_CONTEXT_CS_PARTIAL_FLUSH |
+ get_flush_flags(sctx, coher, cache_policy);
+
+ while (size) {
+ unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx));
+ unsigned dma_flags = CP_DMA_CLEAR;
+
+ si_cp_dma_prepare(sctx, dst, NULL, byte_count, size, 0, coher,
+ &is_first, &dma_flags);
+
+ /* Emit the clear packet. */
+ si_emit_cp_dma(sctx, va, value, byte_count, dma_flags, cache_policy);
+
+ size -= byte_count;
+ va += byte_count;
+ }
+
+ if (cache_policy != L2_BYPASS)
+ rdst->TC_L2_dirty = true;
+
+ /* If it's not a framebuffer fast clear... */
+ if (coher == SI_COHERENCY_SHADER)
+ sctx->num_cp_dma_calls++;
+}
+
void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
uint64_t offset, uint64_t size, unsigned value,
- enum si_coherency coher, enum si_method xfer)
+ enum si_coherency coher)
{
struct radeon_winsys *ws = sctx->ws;
struct r600_resource *rdst = r600_resource(dst);
enum si_cache_policy cache_policy = get_cache_policy(sctx, coher);
- unsigned flush_flags = get_flush_flags(sctx, coher, cache_policy);
uint64_t dma_clear_size;
- bool is_first = true;
if (!size)
return;
dma_clear_size = size & ~3ull;
- /* Mark the buffer range of destination as valid (initialized),
- * so that transfer_map knows it should wait for the GPU when mapping
- * that range. */
- util_range_add(&rdst->valid_buffer_range, offset,
- offset + dma_clear_size);
-
/* dma_clear_buffer can use clear_buffer on failure. Make sure that
* doesn't happen. We don't want an infinite recursion: */
if (sctx->dma_cs &&
!(dst->flags & PIPE_RESOURCE_FLAG_SPARSE) &&
(offset % 4 == 0) &&
/* CP DMA is very slow. Always use SDMA for big clears. This
* alone improves DeusEx:MD performance by 70%. */
(size > CP_DMA_CLEAR_PERF_THRESHOLD ||
/* Buffers not used by the GFX IB yet will be cleared by SDMA.
* This happens to move most buffer clears to SDMA, including
* DCC and CMASK clears, because pipe->clear clears them before
* si_emit_framebuffer_state (in a draw call) adds them.
* For example, DeusEx:MD has 21 buffer clears per frame and all
* of them are moved to SDMA thanks to this. */
!ws->cs_is_buffer_referenced(sctx->gfx_cs, rdst->buf,
- RADEON_USAGE_READWRITE)) &&
- /* bypass sdma transfer with param xfer */
- (xfer != SI_METHOD_CP_DMA)) {
+ RADEON_USAGE_READWRITE))) {
sctx->dma_clear_buffer(sctx, dst, offset, dma_clear_size, value);
offset += dma_clear_size;
size -= dma_clear_size;
} else if (dma_clear_size >= 4) {
- uint64_t va = rdst->gpu_address + offset;
+ si_cp_dma_clear_buffer(sctx, dst, offset, dma_clear_size, value,
+ coher, cache_policy);
offset += dma_clear_size;
size -= dma_clear_size;
-
- /* Flush the caches. */
- sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
- SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
-
- while (dma_clear_size) {
- unsigned byte_count = MIN2(dma_clear_size, cp_dma_max_byte_count(sctx));
- unsigned dma_flags = CP_DMA_CLEAR;
-
- si_cp_dma_prepare(sctx, dst, NULL, byte_count, dma_clear_size, 0,
- coher, &is_first, &dma_flags);
-
- /* Emit the clear packet. */
- si_emit_cp_dma(sctx, va, value, byte_count, dma_flags,
- cache_policy);
-
- dma_clear_size -= byte_count;
- va += byte_count;
- }
-
- if (cache_policy != L2_BYPASS)
- rdst->TC_L2_dirty = true;
-
- /* If it's not a framebuffer fast clear... */
- if (coher == SI_COHERENCY_SHADER)
- sctx->num_cp_dma_calls++;
}
if (size) {
/* Handle non-dword alignment.
*
* This function is called for embedded texture metadata clears,
* but those should always be properly aligned. */
assert(dst->target == PIPE_BUFFER);
assert(size < 4);
@@ -363,21 +371,21 @@ static void si_pipe_clear_buffer(struct pipe_context *ctx,
break;
case 2:
dword_value = *(uint16_t*)clear_value_ptr;
dword_value |= dword_value << 16;
break;
default:
dword_value = *(uint32_t*)clear_value_ptr;
}
si_clear_buffer(sctx, dst, offset, size, dword_value,
- SI_COHERENCY_SHADER, SI_METHOD_BEST);
+ SI_COHERENCY_SHADER);
}
/**
* Realign the CP DMA engine. This must be done after a copy with an unaligned
* size.
*
* \param size Remaining size to the CP DMA alignment.
*/
static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size,
unsigned user_flags, enum si_coherency coher,
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index c1983b86661..3ca53dfed7c 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -539,21 +539,21 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
si_set_rw_buffer(sctx, SI_VS_CONST_CLIP_PLANES,
&sctx->null_const_buf);
si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE,
&sctx->null_const_buf);
si_set_rw_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS,
&sctx->null_const_buf);
/* Clear the NULL constant buffer, because loads should return zeros. */
si_clear_buffer(sctx, sctx->null_const_buf.buffer, 0,
sctx->null_const_buf.buffer->width0, 0,
- SI_COHERENCY_SHADER, SI_METHOD_BEST);
+ SI_COHERENCY_SHADER);
}
uint64_t max_threads_per_block;
screen->get_compute_param(screen, PIPE_SHADER_IR_TGSI,
PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK,
&max_threads_per_block);
/* The maximum number of scratch waves. Scratch space isn't divided
* evenly between CUs. The number is only a function of the number of CUs.
* We can decrease the constant to decrease the scratch buffer size.
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 0b398018c4a..acdc0d11f2c 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1109,29 +1109,28 @@ enum si_cache_policy {
L2_BYPASS,
L2_LRU, /* same as SLC=0 */
};
enum si_coherency {
SI_COHERENCY_NONE, /* no cache flushes needed */
SI_COHERENCY_SHADER,
SI_COHERENCY_CB_META,
};
-enum si_method {
- SI_METHOD_CP_DMA,
- SI_METHOD_BEST,
-};
-
void si_cp_dma_wait_for_idle(struct si_context *sctx);
+void si_cp_dma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
+ uint64_t offset, uint64_t size, unsigned value,
+ enum si_coherency coher,
+ enum si_cache_policy cache_policy);
void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
uint64_t offset, uint64_t size, unsigned value,
- enum si_coherency coher, enum si_method xfer);
+ enum si_coherency coher);
void si_copy_buffer(struct si_context *sctx,
struct pipe_resource *dst, struct pipe_resource *src,
uint64_t dst_offset, uint64_t src_offset, unsigned size,
unsigned user_flags);
void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf,
uint64_t offset, unsigned size);
void cik_emit_prefetch_L2(struct si_context *sctx, bool vertex_stage_only);
void si_init_cp_dma_functions(struct si_context *sctx);
/* si_debug.c */
diff --git a/src/gallium/drivers/radeonsi/si_test_clearbuffer.c b/src/gallium/drivers/radeonsi/si_test_clearbuffer.c
index c0696da26db..e863381fd15 100644
--- a/src/gallium/drivers/radeonsi/si_test_clearbuffer.c
+++ b/src/gallium/drivers/radeonsi/si_test_clearbuffer.c
@@ -41,22 +41,22 @@ measure_clearbuf_time(struct pipe_context *ctx,
struct si_context *sctx = (struct si_context*)ctx;
struct pipe_screen *screen = ctx->screen;
buf = pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, memory_size);
query_te = ctx->create_query(ctx, PIPE_QUERY_TIME_ELAPSED, 0);
ctx->begin_query(ctx, query_te);
/* operation */
- si_clear_buffer(sctx, buf, 0, memory_size, 0x00,
- SI_COHERENCY_SHADER, SI_METHOD_CP_DMA);
+ si_cp_dma_clear_buffer(sctx, buf, 0, memory_size, 0x00,
+ SI_COHERENCY_SHADER, L2_LRU);
ctx->end_query(ctx, query_te);
ctx->get_query_result(ctx, query_te, true, &qresult);
/* Cleanup. */
ctx->destroy_query(ctx, query_te);
pipe_resource_reference(&buf, NULL);
/* Report Results */
return qresult.u64;
}
diff --git a/src/gallium/drivers/radeonsi/si_test_dma.c b/src/gallium/drivers/radeonsi/si_test_dma.c
index f125769d1cf..c81ec75dde2 100644
--- a/src/gallium/drivers/radeonsi/si_test_dma.c
+++ b/src/gallium/drivers/radeonsi/si_test_dma.c
@@ -301,21 +301,21 @@ void si_test_dma(struct si_screen *sscreen)
array_mode_to_string(sscreen, &sdst->surface),
tsrc.width0, tsrc.height0, tsrc.array_size,
array_mode_to_string(sscreen, &ssrc->surface), bpp);
fflush(stdout);
/* set src pixels */
set_random_pixels(ctx, src, &src_cpu);
/* clear dst pixels */
si_clear_buffer(sctx, dst, 0, sdst->surface.surf_size, 0,
- SI_COHERENCY_SHADER, SI_METHOD_BEST);
+ SI_COHERENCY_SHADER);
memset(dst_cpu.ptr, 0, dst_cpu.layer_stride * tdst.array_size);
/* preparation */
max_width = MIN2(tsrc.width0, tdst.width0);
max_height = MIN2(tsrc.height0, tdst.height0);
max_depth = MIN2(tsrc.array_size, tdst.array_size);
num = do_partial_copies ? num_partial_copies : 1;
for (j = 0; j < num; j++) {
int width, height, depth;
--
2.17.1
More information about the mesa-dev mailing list