[Mesa-dev] [PATCH 09/12] gallium/radeon: prevent SDMA stalls by detecting RAW hazards in need_dma_space
Marek Olšák
maraeo at gmail.com
Mon Jan 2 22:54:14 UTC 2017
From: Marek Olšák <marek.olsak at amd.com>
Call r600_dma_emit_wait_idle only when there is a possibility of
a read-after-write hazard. Buffers not yet used by the SDMA IB don't
have to wait.
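
For clarity, the core of the change is the check added to
r600_need_dma_space() — a sketch lifted from the r600_pipe_common.c hunk
below, with the num_dw accounting and buffer-list additions omitted:

    /* Wait for idle only if a packet already in this SDMA IB has read or
     * written dst, or has written src; otherwise no hazard is possible
     * and the wait can be skipped. */
    if ((dst &&
         ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, dst->buf,
                                          RADEON_USAGE_READWRITE)) ||
        (src &&
         ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, src->buf,
                                          RADEON_USAGE_WRITE)))
            r600_dma_emit_wait_idle(ctx);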
---
src/gallium/drivers/r600/evergreen_hw_context.c | 1 -
src/gallium/drivers/r600/evergreen_state.c | 1 -
src/gallium/drivers/r600/r600_hw_context.c | 1 -
src/gallium/drivers/r600/r600_state.c | 1 -
src/gallium/drivers/radeon/r600_pipe_common.c | 48 ++++++++++++++-----------
src/gallium/drivers/radeon/r600_pipe_common.h | 1 -
src/gallium/drivers/radeonsi/cik_sdma.c | 8 -----
src/gallium/drivers/radeonsi/si_dma.c | 2 --
8 files changed, 27 insertions(+), 36 deletions(-)
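
On the caller side, every SDMA copy/clear path changes the same way: the
unconditional r600_dma_emit_wait_idle() after the packet loop is dropped,
because r600_need_dma_space() now emits the wait itself when a hazard is
possible. Illustrative before/after for one such caller (num_dw, rdst and
rsrc stand in for the caller's real values):

    /* before: always wait for idle after emitting the copy packets */
    r600_need_dma_space(&rctx->b, num_dw, rdst, rsrc);
    /* ... emit DMA copy packets ... */
    r600_dma_emit_wait_idle(&rctx->b);

    /* after: r600_need_dma_space() already emitted the wait-idle NOP if
     * rdst or rsrc was referenced by the IB, so nothing follows the loop */
    r600_need_dma_space(&rctx->b, num_dw, rdst, rsrc);
    /* ... emit DMA copy packets ... */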
diff --git a/src/gallium/drivers/r600/evergreen_hw_context.c b/src/gallium/drivers/r600/evergreen_hw_context.c
index 06f0348..5352dc0 100644
--- a/src/gallium/drivers/r600/evergreen_hw_context.c
+++ b/src/gallium/drivers/r600/evergreen_hw_context.c
@@ -70,21 +70,20 @@ void evergreen_dma_copy_buffer(struct r600_context *rctx,
RADEON_PRIO_SDMA_BUFFER);
radeon_emit(cs, DMA_PACKET(DMA_PACKET_COPY, sub_cmd, csize));
radeon_emit(cs, dst_offset & 0xffffffff);
radeon_emit(cs, src_offset & 0xffffffff);
radeon_emit(cs, (dst_offset >> 32UL) & 0xff);
radeon_emit(cs, (src_offset >> 32UL) & 0xff);
dst_offset += csize << shift;
src_offset += csize << shift;
size -= csize;
}
- r600_dma_emit_wait_idle(&rctx->b);
}
/* The max number of bytes to copy per packet. */
#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8)
void evergreen_cp_dma_clear_buffer(struct r600_context *rctx,
struct pipe_resource *dst, uint64_t offset,
unsigned size, uint32_t clear_value,
enum r600_coherency coher)
{
diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index 015ff02..c5dd9f7 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -3446,21 +3446,20 @@ static void evergreen_dma_copy_tile(struct r600_context *rctx,
radeon_emit(cs, (pitch_tile_max << 0) | ((height - 1) << 16));
radeon_emit(cs, (slice_tile_max << 0));
radeon_emit(cs, (x << 0) | (z << 18));
radeon_emit(cs, (y << 0) | (tile_split << 21) | (nbanks << 25) | (non_disp_tiling << 28));
radeon_emit(cs, addr & 0xfffffffc);
radeon_emit(cs, (addr >> 32UL) & 0xff);
copy_height -= cheight;
addr += cheight * pitch;
y += cheight;
}
- r600_dma_emit_wait_idle(&rctx->b);
}
static void evergreen_dma_copy(struct pipe_context *ctx,
struct pipe_resource *dst,
unsigned dst_level,
unsigned dstx, unsigned dsty, unsigned dstz,
struct pipe_resource *src,
unsigned src_level,
const struct pipe_box *src_box)
{
diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c
index bc6217a..4663d99 100644
--- a/src/gallium/drivers/r600/r600_hw_context.c
+++ b/src/gallium/drivers/r600/r600_hw_context.c
@@ -548,12 +548,11 @@ void r600_dma_copy_buffer(struct r600_context *rctx,
RADEON_PRIO_SDMA_BUFFER);
radeon_emit(cs, DMA_PACKET(DMA_PACKET_COPY, 0, 0, csize));
radeon_emit(cs, dst_offset & 0xfffffffc);
radeon_emit(cs, src_offset & 0xfffffffc);
radeon_emit(cs, (dst_offset >> 32UL) & 0xff);
radeon_emit(cs, (src_offset >> 32UL) & 0xff);
dst_offset += csize << 2;
src_offset += csize << 2;
size -= csize;
}
- r600_dma_emit_wait_idle(&rctx->b);
}
diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
index ba97490..006bb62 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -2897,21 +2897,20 @@ static boolean r600_dma_copy_tile(struct r600_context *rctx,
(lbpp << 24) | ((height - 1) << 10) |
pitch_tile_max);
radeon_emit(cs, (slice_tile_max << 12) | (z << 0));
radeon_emit(cs, (x << 3) | (y << 17));
radeon_emit(cs, addr & 0xfffffffc);
radeon_emit(cs, (addr >> 32UL) & 0xff);
copy_height -= cheight;
addr += cheight * pitch;
y += cheight;
}
- r600_dma_emit_wait_idle(&rctx->b);
return TRUE;
}
static void r600_dma_copy(struct pipe_context *ctx,
struct pipe_resource *dst,
unsigned dst_level,
unsigned dstx, unsigned dsty, unsigned dstz,
struct pipe_resource *src,
unsigned src_level,
const struct pipe_box *src_box)
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 6b7bbaf..4d8bb74 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -217,20 +217,35 @@ void r600_draw_rectangle(struct blitter_context *blitter,
memcpy(vb+12, attrib->f, sizeof(float)*4);
memcpy(vb+20, attrib->f, sizeof(float)*4);
}
/* draw */
util_draw_vertex_buffer(&rctx->b, NULL, buf, blitter->vb_slot, offset,
R600_PRIM_RECTANGLE_LIST, 3, 2);
pipe_resource_reference(&buf, NULL);
}
+static void r600_dma_emit_wait_idle(struct r600_common_context *rctx)
+{
+ struct radeon_winsys_cs *cs = rctx->dma.cs;
+
+ /* NOP waits for idle on Evergreen and later. */
+ if (rctx->chip_class >= CIK)
+ radeon_emit(cs, 0x00000000); /* NOP */
+ else if (rctx->chip_class >= EVERGREEN)
+ radeon_emit(cs, 0xf0000000); /* NOP */
+ else {
+ /* TODO: R600-R700 should use the FENCE packet.
+ * CS checker support is required. */
+ }
+}
+
void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw,
struct r600_resource *dst, struct r600_resource *src)
{
uint64_t vram = ctx->dma.cs->used_vram;
uint64_t gtt = ctx->dma.cs->used_gart;
if (dst) {
vram += dst->vram_usage;
gtt += dst->gart_usage;
}
@@ -254,66 +269,57 @@ void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw,
*
* IBs using too little memory are limited by the IB submission overhead.
* IBs using too much memory are limited by the kernel/TTM overhead.
* Too long IBs create CPU-GPU pipeline bubbles and add latency.
*
* This heuristic makes sure that DMA requests are executed
* very soon after the call is made and lowers memory usage.
* It improves texture upload performance by keeping the DMA
* engine busy while uploads are being submitted.
*/
+ num_dw++; /* for emit_wait_idle below */
if (!ctx->ws->cs_check_space(ctx->dma.cs, num_dw) ||
ctx->dma.cs->used_vram + ctx->dma.cs->used_gart > 64 * 1024 * 1024 ||
!radeon_cs_memory_below_limit(ctx->screen, ctx->dma.cs, vram, gtt)) {
ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
assert((num_dw + ctx->dma.cs->current.cdw) <= ctx->dma.cs->current.max_dw);
}
+ /* Wait for idle if either buffer has been used in the IB before to
+ * prevent read-after-write hazards.
+ */
+ if ((dst &&
+ ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, dst->buf,
+ RADEON_USAGE_READWRITE)) ||
+ (src &&
+ ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, src->buf,
+ RADEON_USAGE_WRITE)))
+ r600_dma_emit_wait_idle(ctx);
+
/* If GPUVM is not supported, the CS checker needs 2 entries
* in the buffer list per packet, which has to be done manually.
*/
if (ctx->screen->info.has_virtual_memory) {
if (dst)
radeon_add_to_buffer_list(ctx, &ctx->dma, dst,
RADEON_USAGE_WRITE,
RADEON_PRIO_SDMA_BUFFER);
if (src)
radeon_add_to_buffer_list(ctx, &ctx->dma, src,
RADEON_USAGE_READ,
RADEON_PRIO_SDMA_BUFFER);
}
/* this function is called before all DMA calls, so increment this. */
ctx->num_dma_calls++;
}
-/* This is required to prevent read-after-write hazards. */
-void r600_dma_emit_wait_idle(struct r600_common_context *rctx)
-{
- struct radeon_winsys_cs *cs = rctx->dma.cs;
-
- r600_need_dma_space(rctx, 1, NULL, NULL);
-
- if (!radeon_emitted(cs, 0)) /* empty queue */
- return;
-
- /* NOP waits for idle on Evergreen and later. */
- if (rctx->chip_class >= CIK)
- radeon_emit(cs, 0x00000000); /* NOP */
- else if (rctx->chip_class >= EVERGREEN)
- radeon_emit(cs, 0xf0000000); /* NOP */
- else {
- /* TODO: R600-R700 should use the FENCE packet.
- * CS checker support is required. */
- }
-}
-
static void r600_memory_barrier(struct pipe_context *ctx, unsigned flags)
{
}
void r600_preflush_suspend_features(struct r600_common_context *ctx)
{
/* suspend queries */
if (!LIST_IS_EMPTY(&ctx->active_queries))
r600_suspend_queries(ctx);
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index 917059c..74f86dc 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -725,21 +725,20 @@ bool r600_can_dump_shader(struct r600_common_screen *rscreen,
unsigned processor);
bool r600_extra_shader_checks(struct r600_common_screen *rscreen,
unsigned processor);
void r600_screen_clear_buffer(struct r600_common_screen *rscreen, struct pipe_resource *dst,
uint64_t offset, uint64_t size, unsigned value);
struct pipe_resource *r600_resource_create_common(struct pipe_screen *screen,
const struct pipe_resource *templ);
const char *r600_get_llvm_processor_name(enum radeon_family family);
void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw,
struct r600_resource *dst, struct r600_resource *src);
-void r600_dma_emit_wait_idle(struct r600_common_context *rctx);
void radeon_save_cs(struct radeon_winsys *ws, struct radeon_winsys_cs *cs,
struct radeon_saved_cs *saved);
void radeon_clear_saved_cs(struct radeon_saved_cs *saved);
bool r600_check_device_reset(struct r600_common_context *rctx);
/* r600_gpu_load.c */
void r600_gpu_load_kill_thread(struct r600_common_screen *rscreen);
uint64_t r600_gpu_load_begin(struct r600_common_screen *rscreen);
unsigned r600_gpu_load_end(struct r600_common_screen *rscreen, uint64_t begin);
diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c
index 648b1ca..bee35cd 100644
--- a/src/gallium/drivers/radeonsi/cik_sdma.c
+++ b/src/gallium/drivers/radeonsi/cik_sdma.c
@@ -60,21 +60,20 @@ static void cik_sdma_copy_buffer(struct si_context *ctx,
radeon_emit(cs, csize);
radeon_emit(cs, 0); /* src/dst endian swap */
radeon_emit(cs, src_offset);
radeon_emit(cs, src_offset >> 32);
radeon_emit(cs, dst_offset);
radeon_emit(cs, dst_offset >> 32);
dst_offset += csize;
src_offset += csize;
size -= csize;
}
- r600_dma_emit_wait_idle(&ctx->b);
}
static void cik_sdma_clear_buffer(struct pipe_context *ctx,
struct pipe_resource *dst,
uint64_t offset,
uint64_t size,
unsigned clear_value)
{
struct si_context *sctx = (struct si_context *)ctx;
struct radeon_winsys_cs *cs = sctx->b.dma.cs;
@@ -101,21 +100,20 @@ static void cik_sdma_clear_buffer(struct pipe_context *ctx,
csize = MIN2(size, CIK_SDMA_COPY_MAX_SIZE);
radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_PACKET_CONSTANT_FILL, 0,
0x8000 /* dword copy */));
radeon_emit(cs, offset);
radeon_emit(cs, offset >> 32);
radeon_emit(cs, clear_value);
radeon_emit(cs, csize);
offset += csize;
size -= csize;
}
- r600_dma_emit_wait_idle(&sctx->b);
}
static unsigned minify_as_blocks(unsigned width, unsigned level, unsigned blk_w)
{
width = u_minify(width, level);
return DIV_ROUND_UP(width, blk_w);
}
static unsigned encode_tile_info(struct si_context *sctx,
struct r600_texture *tex, unsigned level,
@@ -244,22 +242,20 @@ static bool cik_sdma_copy_texture(struct si_context *sctx,
radeon_emit(cs, dstx | (dsty << 16));
radeon_emit(cs, dstz | ((dst_pitch - 1) << 16));
radeon_emit(cs, dst_slice_pitch - 1);
if (sctx->b.chip_class == CIK) {
radeon_emit(cs, copy_width | (copy_height << 16));
radeon_emit(cs, copy_depth);
} else {
radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16));
radeon_emit(cs, (copy_depth - 1));
}
-
- r600_dma_emit_wait_idle(&sctx->b);
return true;
}
/* Tiled <-> linear sub-window copy. */
if ((src_mode >= RADEON_SURF_MODE_1D) != (dst_mode >= RADEON_SURF_MODE_1D)) {
struct r600_texture *tiled = src_mode >= RADEON_SURF_MODE_1D ? rsrc : rdst;
struct r600_texture *linear = tiled == rsrc ? rdst : rsrc;
unsigned tiled_level = tiled == rsrc ? src_level : dst_level;
unsigned linear_level = linear == rsrc ? src_level : dst_level;
unsigned tiled_x = tiled == rsrc ? srcx : dstx;
@@ -410,22 +406,20 @@ static bool cik_sdma_copy_texture(struct si_context *sctx,
radeon_emit(cs, linear_x | (linear_y << 16));
radeon_emit(cs, linear_z | ((linear_pitch - 1) << 16));
radeon_emit(cs, linear_slice_pitch - 1);
if (sctx->b.chip_class == CIK) {
radeon_emit(cs, copy_width_aligned | (copy_height << 16));
radeon_emit(cs, copy_depth);
} else {
radeon_emit(cs, (copy_width_aligned - 1) | ((copy_height - 1) << 16));
radeon_emit(cs, (copy_depth - 1));
}
-
- r600_dma_emit_wait_idle(&sctx->b);
return true;
}
}
/* Tiled -> Tiled sub-window copy. */
if (dst_mode >= RADEON_SURF_MODE_1D &&
src_mode >= RADEON_SURF_MODE_1D &&
/* check if these fit into the bitfields */
src_address % 256 == 0 &&
dst_address % 256 == 0 &&
@@ -508,22 +502,20 @@ static bool cik_sdma_copy_texture(struct si_context *sctx,
radeon_emit(cs, encode_tile_info(sctx, rdst, dst_level, false));
if (sctx->b.chip_class == CIK) {
radeon_emit(cs, copy_width_aligned |
(copy_height_aligned << 16));
radeon_emit(cs, copy_depth);
} else {
radeon_emit(cs, (copy_width_aligned - 8) |
((copy_height_aligned - 8) << 16));
radeon_emit(cs, (copy_depth - 1));
}
-
- r600_dma_emit_wait_idle(&sctx->b);
return true;
}
}
return false;
}
static void cik_sdma_copy(struct pipe_context *ctx,
struct pipe_resource *dst,
unsigned dst_level,
diff --git a/src/gallium/drivers/radeonsi/si_dma.c b/src/gallium/drivers/radeonsi/si_dma.c
index 8d186c3..1009bb2 100644
--- a/src/gallium/drivers/radeonsi/si_dma.c
+++ b/src/gallium/drivers/radeonsi/si_dma.c
@@ -69,21 +69,20 @@ static void si_dma_copy_buffer(struct si_context *ctx,
csize = size < max_csize ? size : max_csize;
radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd, csize));
radeon_emit(cs, dst_offset);
radeon_emit(cs, src_offset);
radeon_emit(cs, (dst_offset >> 32UL) & 0xff);
radeon_emit(cs, (src_offset >> 32UL) & 0xff);
dst_offset += csize << shift;
src_offset += csize << shift;
size -= csize;
}
- r600_dma_emit_wait_idle(&ctx->b);
}
static void si_dma_copy_tile(struct si_context *ctx,
struct pipe_resource *dst,
unsigned dst_level,
unsigned dst_x,
unsigned dst_y,
unsigned dst_z,
struct pipe_resource *src,
unsigned src_level,
@@ -170,21 +169,20 @@ static void si_dma_copy_tile(struct si_context *ctx,
radeon_emit(cs, (pitch_tile_max << 0) | ((height - 1) << 16));
radeon_emit(cs, (slice_tile_max << 0) | (pipe_config << 26));
radeon_emit(cs, (tiled_x << 0) | (tiled_z << 18));
radeon_emit(cs, (tiled_y << 0) | (tile_split << 21) | (nbanks << 25) | (mt << 27));
radeon_emit(cs, addr & 0xfffffffc);
radeon_emit(cs, (addr >> 32UL) & 0xff);
copy_height -= cheight;
addr += cheight * pitch;
tiled_y += cheight;
}
- r600_dma_emit_wait_idle(&ctx->b);
}
static void si_dma_copy(struct pipe_context *ctx,
struct pipe_resource *dst,
unsigned dst_level,
unsigned dstx, unsigned dsty, unsigned dstz,
struct pipe_resource *src,
unsigned src_level,
const struct pipe_box *src_box)
{
--
2.7.4