[Mesa-dev] [PATCH 4/4] radeonsi: emulate REWIND using INDIRECT_BUFFER for primitive culling on CI
Marek Olšák
maraeo at gmail.com
Thu Feb 14 06:31:05 UTC 2019
From: Marek Olšák <marek.olsak at amd.com>
This increases "Paraview - Many Spheres" performance from 37.91 to 43 fps
on Hawaii.
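The REWIND packet doesn't exist on <= CIK. The previous emulation padded the IB
with REWIND_EMULATION_NOPS (4096 dwords, a guessed minimum for Hawaii) after the
WAIT_REG_MEM, to push the draw packets - whose count dword is only written by the
compute shader, possibly after the CP has already prefetched them - out of the
prefetch window. Chaining into a new IB chunk with INDIRECT_BUFFER discards the
CP prefetch cache at a well-defined point instead, so the guessed NOP padding
can be dropped.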
---
src/gallium/drivers/r300/r300_blit.c | 2 +-
src/gallium/drivers/r300/r300_render.c | 2 +-
src/gallium/drivers/r600/r600_hw_context.c | 2 +-
src/gallium/drivers/r600/r600_pipe_common.c | 2 +-
src/gallium/drivers/radeon/radeon_winsys.h | 6 +++++-
.../radeonsi/si_compute_prim_discard.c | 18 ++++++++---------
src/gallium/drivers/radeonsi/si_dma_cs.c | 2 +-
src/gallium/drivers/radeonsi/si_gfx_cs.c | 2 +-
src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 20 ++++++++++++-------
src/gallium/winsys/radeon/drm/radeon_drm_cs.c | 3 ++-
10 files changed, 35 insertions(+), 24 deletions(-)
diff --git a/src/gallium/drivers/r300/r300_blit.c b/src/gallium/drivers/r300/r300_blit.c
index 01fccfbe7ed..4ac2589a9a7 100644
--- a/src/gallium/drivers/r300/r300_blit.c
+++ b/src/gallium/drivers/r300/r300_blit.c
@@ -375,21 +375,21 @@ static void r300_clear(struct pipe_context* pipe,
* procedure. */
/* Calculate zmask_clear and hiz_clear atom sizes. */
unsigned dwords =
r300->gpu_flush.size +
(r300->zmask_clear.dirty ? r300->zmask_clear.size : 0) +
(r300->hiz_clear.dirty ? r300->hiz_clear.size : 0) +
(r300->cmask_clear.dirty ? r300->cmask_clear.size : 0) +
r300_get_num_cs_end_dwords(r300);
/* Reserve CS space. */
- if (!r300->rws->cs_check_space(r300->cs, dwords)) {
+ if (!r300->rws->cs_check_space(r300->cs, dwords, false)) {
r300_flush(&r300->context, PIPE_FLUSH_ASYNC, NULL);
}
/* Emit clear packets. */
r300_emit_gpu_flush(r300, r300->gpu_flush.size, r300->gpu_flush.state);
r300->gpu_flush.dirty = FALSE;
if (r300->zmask_clear.dirty) {
r300_emit_zmask_clear(r300, r300->zmask_clear.size,
r300->zmask_clear.state);
diff --git a/src/gallium/drivers/r300/r300_render.c b/src/gallium/drivers/r300/r300_render.c
index 211d35d0607..ed129e1a306 100644
--- a/src/gallium/drivers/r300/r300_render.c
+++ b/src/gallium/drivers/r300/r300_render.c
@@ -208,21 +208,21 @@ static boolean r300_reserve_cs_dwords(struct r300_context *r300,
if (emit_vertex_arrays)
cs_dwords += 55; /* emit_vertex_arrays */
if (emit_vertex_arrays_swtcl)
cs_dwords += 7; /* emit_vertex_arrays_swtcl */
cs_dwords += r300_get_num_cs_end_dwords(r300);
/* Reserve requested CS space. */
- if (!r300->rws->cs_check_space(r300->cs, cs_dwords)) {
+ if (!r300->rws->cs_check_space(r300->cs, cs_dwords, false)) {
r300_flush(&r300->context, PIPE_FLUSH_ASYNC, NULL);
flushed = TRUE;
}
return flushed;
}
/**
* Validate buffers and emit dirty state.
* \param r300 The context.
diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c
index a2f5f637b20..abf5d03e4f9 100644
--- a/src/gallium/drivers/r600/r600_hw_context.c
+++ b/src/gallium/drivers/r600/r600_hw_context.c
@@ -77,21 +77,21 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
num_dw += 3;
}
/* Count in framebuffer cache flushes at the end of CS. */
num_dw += R600_MAX_FLUSH_CS_DWORDS;
/* The fence at the end of CS. */
num_dw += 10;
/* Flush if there's not enough space. */
- if (!ctx->b.ws->cs_check_space(ctx->b.gfx.cs, num_dw)) {
+ if (!ctx->b.ws->cs_check_space(ctx->b.gfx.cs, num_dw, false)) {
ctx->b.gfx.flush(ctx, PIPE_FLUSH_ASYNC, NULL);
}
}
void r600_flush_emit(struct r600_context *rctx)
{
struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
unsigned cp_coher_cntl = 0;
unsigned wait_until = 0;
diff --git a/src/gallium/drivers/r600/r600_pipe_common.c b/src/gallium/drivers/r600/r600_pipe_common.c
index 3c00ad691ac..d4d4511df3e 100644
--- a/src/gallium/drivers/r600/r600_pipe_common.c
+++ b/src/gallium/drivers/r600/r600_pipe_common.c
@@ -283,21 +283,21 @@ void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw,
* IBs using too little memory are limited by the IB submission overhead.
* IBs using too much memory are limited by the kernel/TTM overhead.
* Too long IBs create CPU-GPU pipeline bubbles and add latency.
*
* This heuristic makes sure that DMA requests are executed
* very soon after the call is made and lowers memory usage.
* It improves texture upload performance by keeping the DMA
* engine busy while uploads are being submitted.
*/
num_dw++; /* for emit_wait_idle below */
- if (!ctx->ws->cs_check_space(ctx->dma.cs, num_dw) ||
+ if (!ctx->ws->cs_check_space(ctx->dma.cs, num_dw, false) ||
ctx->dma.cs->used_vram + ctx->dma.cs->used_gart > 64 * 1024 * 1024 ||
!radeon_cs_memory_below_limit(ctx->screen, ctx->dma.cs, vram, gtt)) {
ctx->dma.flush(ctx, PIPE_FLUSH_ASYNC, NULL);
assert((num_dw + ctx->dma.cs->current.cdw) <= ctx->dma.cs->current.max_dw);
}
/* Wait for idle if either buffer has been used in the IB before to
* prevent read-after-write hazards.
*/
if ((dst &&
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index c04c014bd2f..814fb9068dc 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -563,22 +563,26 @@ struct radeon_winsys {
* \param cs A command stream to validate.
*/
bool (*cs_validate)(struct radeon_cmdbuf *cs);
/**
* Check whether the given number of dwords is available in the IB.
* Optionally chain a new chunk of the IB if necessary and supported.
*
* \param cs A command stream.
* \param dw Number of CS dwords requested by the caller.
+ * \param force_chaining Chain the IB into a new buffer now to discard
+ * the CP prefetch cache.
+ * \return true if there is enough space
*/
- bool (*cs_check_space)(struct radeon_cmdbuf *cs, unsigned dw);
+ bool (*cs_check_space)(struct radeon_cmdbuf *cs, unsigned dw,
+ bool force_chaining);
/**
* Return the buffer list.
*
* This is the buffer list as passed to the kernel, i.e. it only contains
* the parent buffers of sub-allocated buffers.
*
* \param cs Command stream
* \param list Returned buffer list. Set to NULL to query the count only.
* \return The buffer count.
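[Note: to illustrate the two calling conventions the new parameter enables, here
is a minimal sketch; the winsys struct is reduced to this one hook and the helper
names are stand-ins, not Mesa API.]

  #include <stdbool.h>

  struct radeon_cmdbuf;

  struct radeon_winsys {
     bool (*cs_check_space)(struct radeon_cmdbuf *cs, unsigned dw,
                            bool force_chaining);
  };

  static void reserve_or_flush(struct radeon_winsys *ws,
                               struct radeon_cmdbuf *cs, unsigned dw)
  {
     /* Normal use: ask for space; the caller flushes the CS on failure. */
     if (!ws->cs_check_space(cs, dw, false)) {
        /* flush the CS here */
     }
  }

  static void discard_cp_prefetch(struct radeon_winsys *ws,
                                  struct radeon_cmdbuf *cs)
  {
     /* CI REWIND emulation: request no dwords, but force a chain so that
      * packets after this point are fetched from a new IB. */
     ws->cs_check_space(cs, 0, true);
  }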
diff --git a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
index afb7424aa41..34282391bf1 100644
--- a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
+++ b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
@@ -178,21 +178,20 @@
#define SPLIT_PRIMS_DRAW_LEVEL PRIMS_PER_BATCH
/* Derived values. */
#define WAVES_PER_TG DIV_ROUND_UP(THREADGROUP_SIZE, 64)
#define SPLIT_PRIMS_PACKET_LEVEL (VERTEX_COUNTER_GDS_MODE == 2 ? \
SPLIT_PRIMS_PACKET_LEVEL_VALUE : \
UINT_MAX & ~(THREADGROUP_SIZE - 1))
#define REWIND_SIGNAL_BIT 0x80000000
/* For emulating the rewind packet on CI. */
-#define REWIND_EMULATION_NOPS 4096 /* minimum for Hawaii, guessed */
#define FORCE_REWIND_EMULATION 0
void si_initialize_prim_discard_tunables(struct si_context *sctx)
{
sctx->prim_discard_vertex_count_threshold = UINT_MAX; /* disable */
if (sctx->chip_class == SI || /* SI support is not implemented */
!sctx->screen->info.has_gds_ordered_append ||
sctx->screen->debug_flags & DBG(NO_PD) ||
/* If aux_context == NULL, we are initializing aux_context right now. */
@@ -1045,46 +1044,45 @@ si_prepare_prim_discard_or_split_draw(struct si_context *sctx,
if (out_indexbuf_size > sctx->index_ring_size_per_ib) {
if (SI_PRIM_DISCARD_DEBUG)
puts("PD failed: draw call too big, can't be split");
return SI_PRIM_DISCARD_DISABLED;
}
unsigned num_subdraws = DIV_ROUND_UP(num_prims, SPLIT_PRIMS_PACKET_LEVEL);
unsigned need_compute_dw = 11 /* shader */ + 34 /* first draw */ +
24 * (num_subdraws - 1) + /* subdraws */
20; /* leave some space at the end */
- unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx) +
- num_subdraws * 6; /* DRAW(6) */
+ unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx);
if (sctx->chip_class <= CIK || FORCE_REWIND_EMULATION)
- need_gfx_dw += num_subdraws * (9 + REWIND_EMULATION_NOPS);
+ need_gfx_dw += 9; /* NOP(2) + WAIT_REG_MEM(7), then chain */
else
- need_gfx_dw += num_subdraws * 2; /* use REWIND */
+ need_gfx_dw += num_subdraws * 8; /* use REWIND(2) + DRAW(6) */
if (ring_full ||
(VERTEX_COUNTER_GDS_MODE == 1 && sctx->compute_gds_offset + 8 > GDS_SIZE_UNORDERED) ||
- !sctx->ws->cs_check_space(gfx_cs, need_gfx_dw)) {
+ !sctx->ws->cs_check_space(gfx_cs, need_gfx_dw, false)) {
/* If the current IB is empty but the size is too small, add a NOP
* packet to force a flush and get a bigger IB.
*/
if (!radeon_emitted(gfx_cs, sctx->initial_gfx_cs_size) &&
gfx_cs->current.cdw + need_gfx_dw > gfx_cs->current.max_dw) {
radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
radeon_emit(gfx_cs, 0);
}
si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
}
/* The compute IB is always chained, but we need to call cs_check_space to add more space. */
struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
- bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw);
+ bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw, false);
assert(compute_has_space);
assert(si_check_ring_space(sctx, out_indexbuf_size));
return SI_PRIM_DISCARD_ENABLED;
}
void si_compute_signal_gfx(struct si_context *sctx)
{
struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
unsigned writeback_L2_flags = 0;
@@ -1415,22 +1413,24 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
if (sctx->chip_class <= CIK || FORCE_REWIND_EMULATION) {
radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
radeon_emit(gfx_cs, 0);
si_cp_wait_mem(sctx, gfx_cs,
sctx->compute_rewind_va |
(uint64_t)sctx->screen->info.address32_hi << 32,
REWIND_SIGNAL_BIT, REWIND_SIGNAL_BIT,
WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_PFP);
- radeon_emit(gfx_cs, PKT3(PKT3_NOP, REWIND_EMULATION_NOPS - 2, 0));
- gfx_cs->current.cdw += REWIND_EMULATION_NOPS - 1;
+ /* Use INDIRECT_BUFFER to chain to a different buffer
+ * to discard the CP prefetch cache.
+ */
+ sctx->ws->cs_check_space(gfx_cs, 0, true);
} else {
radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0));
radeon_emit(gfx_cs, 0);
}
}
sctx->compute_num_prims_in_batch += num_subdraw_prims;
uint32_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4;
uint64_t index_va = out_indexbuf_va + start_prim * 12;
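[Note: to make the dword accounting in si_prepare_prim_discard_or_split_draw
easier to follow, here is a standalone restatement as a sketch; the packet sizes
come from the patch's own comments and the helper name is made up.]

  /* Sketch: gfx IB dword budget per split draw, restated from the hunk above. */
  static unsigned gfx_dwords_needed(unsigned num_subdraws,
                                    bool rewind_emulation,
                                    unsigned min_cs_dwords)
  {
     unsigned dw = min_cs_dwords;

     if (rewind_emulation) {
        /* NOP(2) + WAIT_REG_MEM(7) = 9 dwords, then the IB chains;
         * presumably the draw packets land in the freshly chained chunk,
         * so they are not counted against this one. */
        dw += 9;
     } else {
        /* REWIND(2) + DRAW(6) = 8 dwords per subdraw, all in this IB. */
        dw += num_subdraws * 8;
     }
     return dw;
  }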
diff --git a/src/gallium/drivers/radeonsi/si_dma_cs.c b/src/gallium/drivers/radeonsi/si_dma_cs.c
index 2aafc1f09a0..f04bc2e28da 100644
--- a/src/gallium/drivers/radeonsi/si_dma_cs.c
+++ b/src/gallium/drivers/radeonsi/si_dma_cs.c
@@ -155,21 +155,21 @@ void si_need_dma_space(struct si_context *ctx, unsigned num_dw,
* IBs using too little memory are limited by the IB submission overhead.
* IBs using too much memory are limited by the kernel/TTM overhead.
* Too long IBs create CPU-GPU pipeline bubbles and add latency.
*
* This heuristic makes sure that DMA requests are executed
* very soon after the call is made and lowers memory usage.
* It improves texture upload performance by keeping the DMA
* engine busy while uploads are being submitted.
*/
num_dw++; /* for emit_wait_idle below */
- if (!ws->cs_check_space(ctx->dma_cs, num_dw) ||
+ if (!ws->cs_check_space(ctx->dma_cs, num_dw, false) ||
ctx->dma_cs->used_vram + ctx->dma_cs->used_gart > 64 * 1024 * 1024 ||
!radeon_cs_memory_below_limit(ctx->screen, ctx->dma_cs, vram, gtt)) {
si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
assert((num_dw + ctx->dma_cs->current.cdw) <= ctx->dma_cs->current.max_dw);
}
/* Wait for idle if either buffer has been used in the IB before to
* prevent read-after-write hazards.
*/
if ((dst &&
diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
index 9dbe932c66f..779d8106839 100644
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
@@ -49,21 +49,21 @@ void si_need_gfx_cs_space(struct si_context *ctx)
ctx->vram, ctx->gtt))) {
ctx->gtt = 0;
ctx->vram = 0;
si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
return;
}
ctx->gtt = 0;
ctx->vram = 0;
unsigned need_dwords = si_get_minimum_num_gfx_cs_dwords(ctx);
- if (!ctx->ws->cs_check_space(cs, need_dwords))
+ if (!ctx->ws->cs_check_space(cs, need_dwords, false))
si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
}
void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
struct pipe_fence_handle **fence)
{
struct radeon_cmdbuf *cs = ctx->gfx_cs;
struct radeon_winsys *ws = ctx->ws;
unsigned wait_flags = 0;
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index 392f69e5fef..5424398c350 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -1067,47 +1067,53 @@ amdgpu_cs_add_parallel_compute_ib(struct radeon_cmdbuf *ib,
}
}
return &cs->compute_ib.base;
}
static bool amdgpu_cs_validate(struct radeon_cmdbuf *rcs)
{
return true;
}
-static bool amdgpu_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw)
+static bool amdgpu_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw,
+ bool force_chaining)
{
struct amdgpu_ib *ib = amdgpu_ib(rcs);
struct amdgpu_cs *cs = amdgpu_cs_from_ib(ib);
unsigned requested_size = rcs->prev_dw + rcs->current.cdw + dw;
unsigned cs_epilog_dw = amdgpu_cs_epilog_dws(cs);
unsigned need_byte_size = (dw + cs_epilog_dw) * 4;
uint64_t va;
uint32_t *new_ptr_ib_size;
assert(rcs->current.cdw <= rcs->current.max_dw);
/* 125% of the size for IB epilog. */
unsigned safe_byte_size = need_byte_size + need_byte_size / 4;
ib->max_check_space_size = MAX2(ib->max_check_space_size,
safe_byte_size);
- if (requested_size > amdgpu_ib_max_submit_dwords(ib->ib_type))
- return false;
+ /* If force_chaining is true, we can't return. We have to chain. */
+ if (!force_chaining) {
+ if (requested_size > amdgpu_ib_max_submit_dwords(ib->ib_type))
+ return false;
- ib->max_ib_size = MAX2(ib->max_ib_size, requested_size);
+ ib->max_ib_size = MAX2(ib->max_ib_size, requested_size);
- if (rcs->current.max_dw - rcs->current.cdw >= dw)
- return true;
+ if (rcs->current.max_dw - rcs->current.cdw >= dw)
+ return true;
+ }
- if (!amdgpu_cs_has_chaining(cs))
+ if (!amdgpu_cs_has_chaining(cs)) {
+ assert(!force_chaining);
return false;
+ }
/* Allocate a new chunk */
if (rcs->num_prev >= rcs->max_prev) {
unsigned new_max_prev = MAX2(1, 2 * rcs->max_prev);
struct radeon_cmdbuf_chunk *new_prev;
new_prev = REALLOC(rcs->prev,
sizeof(*new_prev) * rcs->max_prev,
sizeof(*new_prev) * new_max_prev);
if (!new_prev)
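[Note: the control flow of amdgpu_cs_check_space after this change, reduced to a
sketch; the boolean predicates stand in for the real computations above.]

  #include <assert.h>
  #include <stdbool.h>

  static bool check_space_sketch(bool force_chaining, bool over_ib_limit,
                                 bool have_space, bool has_chaining)
  {
     if (!force_chaining) {
        if (over_ib_limit)
           return false;         /* too big to submit; caller must flush */
        if (have_space)
           return true;          /* fast path: nothing to do */
     }
     /* Out of space, or the caller demands a chain right now. */
     if (!has_chaining) {
        assert(!force_chaining); /* forced chaining requires support */
        return false;
     }
     /* allocate a new chunk and chain into it with INDIRECT_BUFFER */
     return true;
  }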
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index 2288c320975..3da4d3ce238 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -417,21 +417,22 @@ static bool radeon_drm_cs_validate(struct radeon_cmdbuf *rcs)
assert(cs->base.current.cdw == 0);
if (cs->base.current.cdw != 0) {
fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
}
}
}
return status;
}
-static bool radeon_drm_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw)
+static bool radeon_drm_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw,
+ bool force_chaining)
{
assert(rcs->current.cdw <= rcs->current.max_dw);
return rcs->current.max_dw - rcs->current.cdw >= dw;
}
static unsigned radeon_drm_cs_get_buffer_list(struct radeon_cmdbuf *rcs,
struct radeon_bo_list_item *list)
{
struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
int i;
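[Note: the radeon winsys never chains IBs, so a forced chain cannot be honored
here and the new parameter is silently ignored. If the CI emulation path is
reachable with the radeon kernel driver, something like the following - a
suggestion, not part of the patch - would document the constraint:]

  static bool radeon_drm_cs_check_space(struct radeon_cmdbuf *rcs,
                                        unsigned dw, bool force_chaining)
  {
     assert(rcs->current.cdw <= rcs->current.max_dw);
     /* No chaining support here, so callers must never force it. */
     assert(!force_chaining);
     return rcs->current.max_dw - rcs->current.cdw >= dw;
  }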
--
2.17.1