[Mesa-dev] [PATCH 09/12] gallium/radeon: prevent SDMA stalls by detecting RAW hazards in need_dma_space

Marek Olšák maraeo at gmail.com
Mon Jan 2 22:54:14 UTC 2017


From: Marek Olšák <marek.olsak at amd.com>

Call r600_dma_emit_wait_idle from r600_need_dma_space, and only when
there is a possibility of a read-after-write hazard, instead of
unconditionally after every DMA operation. Buffers not yet used by the
SDMA IB don't have to wait.
---
 src/gallium/drivers/r600/evergreen_hw_context.c |  1 -
 src/gallium/drivers/r600/evergreen_state.c      |  1 -
 src/gallium/drivers/r600/r600_hw_context.c      |  1 -
 src/gallium/drivers/r600/r600_state.c           |  1 -
 src/gallium/drivers/radeon/r600_pipe_common.c   | 48 ++++++++++++++-----------
 src/gallium/drivers/radeon/r600_pipe_common.h   |  1 -
 src/gallium/drivers/radeonsi/cik_sdma.c         |  8 -----
 src/gallium/drivers/radeonsi/si_dma.c           |  2 --
 8 files changed, 27 insertions(+), 36 deletions(-)

diff --git a/src/gallium/drivers/r600/evergreen_hw_context.c b/src/gallium/drivers/r600/evergreen_hw_context.c
index 06f0348..5352dc0 100644
--- a/src/gallium/drivers/r600/evergreen_hw_context.c
+++ b/src/gallium/drivers/r600/evergreen_hw_context.c
@@ -70,21 +70,20 @@ void evergreen_dma_copy_buffer(struct r600_context *rctx,
 				      RADEON_PRIO_SDMA_BUFFER);
 		radeon_emit(cs, DMA_PACKET(DMA_PACKET_COPY, sub_cmd, csize));
 		radeon_emit(cs, dst_offset & 0xffffffff);
 		radeon_emit(cs, src_offset & 0xffffffff);
 		radeon_emit(cs, (dst_offset >> 32UL) & 0xff);
 		radeon_emit(cs, (src_offset >> 32UL) & 0xff);
 		dst_offset += csize << shift;
 		src_offset += csize << shift;
 		size -= csize;
 	}
-	r600_dma_emit_wait_idle(&rctx->b);
 }
 
 /* The max number of bytes to copy per packet. */
 #define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8)
 
 void evergreen_cp_dma_clear_buffer(struct r600_context *rctx,
 				   struct pipe_resource *dst, uint64_t offset,
 				   unsigned size, uint32_t clear_value,
 				   enum r600_coherency coher)
 {
diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index 015ff02..c5dd9f7 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -3446,21 +3446,20 @@ static void evergreen_dma_copy_tile(struct r600_context *rctx,
 		radeon_emit(cs, (pitch_tile_max << 0) | ((height - 1) << 16));
 		radeon_emit(cs, (slice_tile_max << 0));
 		radeon_emit(cs, (x << 0) | (z << 18));
 		radeon_emit(cs, (y << 0) | (tile_split << 21) | (nbanks << 25) | (non_disp_tiling << 28));
 		radeon_emit(cs, addr & 0xfffffffc);
 		radeon_emit(cs, (addr >> 32UL) & 0xff);
 		copy_height -= cheight;
 		addr += cheight * pitch;
 		y += cheight;
 	}
-	r600_dma_emit_wait_idle(&rctx->b);
 }
 
 static void evergreen_dma_copy(struct pipe_context *ctx,
 			       struct pipe_resource *dst,
 			       unsigned dst_level,
 			       unsigned dstx, unsigned dsty, unsigned dstz,
 			       struct pipe_resource *src,
 			       unsigned src_level,
 			       const struct pipe_box *src_box)
 {
diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c
index bc6217a..4663d99 100644
--- a/src/gallium/drivers/r600/r600_hw_context.c
+++ b/src/gallium/drivers/r600/r600_hw_context.c
@@ -548,12 +548,11 @@ void r600_dma_copy_buffer(struct r600_context *rctx,
 				      RADEON_PRIO_SDMA_BUFFER);
 		radeon_emit(cs, DMA_PACKET(DMA_PACKET_COPY, 0, 0, csize));
 		radeon_emit(cs, dst_offset & 0xfffffffc);
 		radeon_emit(cs, src_offset & 0xfffffffc);
 		radeon_emit(cs, (dst_offset >> 32UL) & 0xff);
 		radeon_emit(cs, (src_offset >> 32UL) & 0xff);
 		dst_offset += csize << 2;
 		src_offset += csize << 2;
 		size -= csize;
 	}
-	r600_dma_emit_wait_idle(&rctx->b);
 }
diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
index ba97490..006bb62 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -2897,21 +2897,20 @@ static boolean r600_dma_copy_tile(struct r600_context *rctx,
 				(lbpp << 24) | ((height - 1) << 10) |
 				pitch_tile_max);
 		radeon_emit(cs, (slice_tile_max << 12) | (z << 0));
 		radeon_emit(cs, (x << 3) | (y << 17));
 		radeon_emit(cs, addr & 0xfffffffc);
 		radeon_emit(cs, (addr >> 32UL) & 0xff);
 		copy_height -= cheight;
 		addr += cheight * pitch;
 		y += cheight;
 	}
-	r600_dma_emit_wait_idle(&rctx->b);
 	return TRUE;
 }
 
 static void r600_dma_copy(struct pipe_context *ctx,
 			  struct pipe_resource *dst,
 			  unsigned dst_level,
 			  unsigned dstx, unsigned dsty, unsigned dstz,
 			  struct pipe_resource *src,
 			  unsigned src_level,
 			  const struct pipe_box *src_box)
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 6b7bbaf..4d8bb74 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -217,20 +217,35 @@ void r600_draw_rectangle(struct blitter_context *blitter,
 		memcpy(vb+12, attrib->f, sizeof(float)*4);
 		memcpy(vb+20, attrib->f, sizeof(float)*4);
 	}
 
 	/* draw */
 	util_draw_vertex_buffer(&rctx->b, NULL, buf, blitter->vb_slot, offset,
 				R600_PRIM_RECTANGLE_LIST, 3, 2);
 	pipe_resource_reference(&buf, NULL);
 }
 
+static void r600_dma_emit_wait_idle(struct r600_common_context *rctx)
+{
+	struct radeon_winsys_cs *cs = rctx->dma.cs;
+
+	/* NOP waits for idle on Evergreen and later. */
+	if (rctx->chip_class >= CIK)
+		radeon_emit(cs, 0x00000000); /* NOP */
+	else if (rctx->chip_class >= EVERGREEN)
+		radeon_emit(cs, 0xf0000000); /* NOP */
+	else {
+		/* TODO: R600-R700 should use the FENCE packet.
+		 * CS checker support is required. */
+	}
+}
+
 void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw,
                          struct r600_resource *dst, struct r600_resource *src)
 {
 	uint64_t vram = ctx->dma.cs->used_vram;
 	uint64_t gtt = ctx->dma.cs->used_gart;
 
 	if (dst) {
 		vram += dst->vram_usage;
 		gtt += dst->gart_usage;
 	}
@@ -254,66 +269,57 @@ void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw,
 	 *
 	 * IBs using too little memory are limited by the IB submission overhead.
 	 * IBs using too much memory are limited by the kernel/TTM overhead.
 	 * Too long IBs create CPU-GPU pipeline bubbles and add latency.
 	 *
 	 * This heuristic makes sure that DMA requests are executed
 	 * very soon after the call is made and lowers memory usage.
 	 * It improves texture upload performance by keeping the DMA
 	 * engine busy while uploads are being submitted.
 	 */
+	num_dw++; /* for emit_wait_idle below */
 	if (!ctx->ws->cs_check_space(ctx->dma.cs, num_dw) ||
 	    ctx->dma.cs->used_vram + ctx->dma.cs->used_gart > 64 * 1024 * 1024 ||
 	    !radeon_cs_memory_below_limit(ctx->screen, ctx->dma.cs, vram, gtt)) {
 		ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 		assert((num_dw + ctx->dma.cs->current.cdw) <= ctx->dma.cs->current.max_dw);
 	}
 
+	/* Wait for idle if either buffer has been used in the IB before to
+	 * prevent read-after-write hazards.
+	 */
+	if ((dst &&
+	     ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, dst->buf,
+					      RADEON_USAGE_READWRITE)) ||
+	    (src &&
+	     ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, src->buf,
+					      RADEON_USAGE_WRITE)))
+		r600_dma_emit_wait_idle(ctx);
+
 	/* If GPUVM is not supported, the CS checker needs 2 entries
 	 * in the buffer list per packet, which has to be done manually.
 	 */
 	if (ctx->screen->info.has_virtual_memory) {
 		if (dst)
 			radeon_add_to_buffer_list(ctx, &ctx->dma, dst,
 						  RADEON_USAGE_WRITE,
 						  RADEON_PRIO_SDMA_BUFFER);
 		if (src)
 			radeon_add_to_buffer_list(ctx, &ctx->dma, src,
 						  RADEON_USAGE_READ,
 						  RADEON_PRIO_SDMA_BUFFER);
 	}
 
 	/* this function is called before all DMA calls, so increment this. */
 	ctx->num_dma_calls++;
 }
 
-/* This is required to prevent read-after-write hazards. */
-void r600_dma_emit_wait_idle(struct r600_common_context *rctx)
-{
-	struct radeon_winsys_cs *cs = rctx->dma.cs;
-
-	r600_need_dma_space(rctx, 1, NULL, NULL);
-
-	if (!radeon_emitted(cs, 0)) /* empty queue */
-		return;
-
-	/* NOP waits for idle on Evergreen and later. */
-	if (rctx->chip_class >= CIK)
-		radeon_emit(cs, 0x00000000); /* NOP */
-	else if (rctx->chip_class >= EVERGREEN)
-		radeon_emit(cs, 0xf0000000); /* NOP */
-	else {
-		/* TODO: R600-R700 should use the FENCE packet.
-		 * CS checker support is required. */
-	}
-}
-
 static void r600_memory_barrier(struct pipe_context *ctx, unsigned flags)
 {
 }
 
 void r600_preflush_suspend_features(struct r600_common_context *ctx)
 {
 	/* suspend queries */
 	if (!LIST_IS_EMPTY(&ctx->active_queries))
 		r600_suspend_queries(ctx);
 
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index 917059c..74f86dc 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -725,21 +725,20 @@ bool r600_can_dump_shader(struct r600_common_screen *rscreen,
 			  unsigned processor);
 bool r600_extra_shader_checks(struct r600_common_screen *rscreen,
 			      unsigned processor);
 void r600_screen_clear_buffer(struct r600_common_screen *rscreen, struct pipe_resource *dst,
 			      uint64_t offset, uint64_t size, unsigned value);
 struct pipe_resource *r600_resource_create_common(struct pipe_screen *screen,
 						  const struct pipe_resource *templ);
 const char *r600_get_llvm_processor_name(enum radeon_family family);
 void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw,
 			 struct r600_resource *dst, struct r600_resource *src);
-void r600_dma_emit_wait_idle(struct r600_common_context *rctx);
 void radeon_save_cs(struct radeon_winsys *ws, struct radeon_winsys_cs *cs,
 		    struct radeon_saved_cs *saved);
 void radeon_clear_saved_cs(struct radeon_saved_cs *saved);
 bool r600_check_device_reset(struct r600_common_context *rctx);
 
 /* r600_gpu_load.c */
 void r600_gpu_load_kill_thread(struct r600_common_screen *rscreen);
 uint64_t r600_gpu_load_begin(struct r600_common_screen *rscreen);
 unsigned r600_gpu_load_end(struct r600_common_screen *rscreen, uint64_t begin);
 
diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c
index 648b1ca..bee35cd 100644
--- a/src/gallium/drivers/radeonsi/cik_sdma.c
+++ b/src/gallium/drivers/radeonsi/cik_sdma.c
@@ -60,21 +60,20 @@ static void cik_sdma_copy_buffer(struct si_context *ctx,
 		radeon_emit(cs, csize);
 		radeon_emit(cs, 0); /* src/dst endian swap */
 		radeon_emit(cs, src_offset);
 		radeon_emit(cs, src_offset >> 32);
 		radeon_emit(cs, dst_offset);
 		radeon_emit(cs, dst_offset >> 32);
 		dst_offset += csize;
 		src_offset += csize;
 		size -= csize;
 	}
-	r600_dma_emit_wait_idle(&ctx->b);
 }
 
 static void cik_sdma_clear_buffer(struct pipe_context *ctx,
 				  struct pipe_resource *dst,
 				  uint64_t offset,
 				  uint64_t size,
 				  unsigned clear_value)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct radeon_winsys_cs *cs = sctx->b.dma.cs;
@@ -101,21 +100,20 @@ static void cik_sdma_clear_buffer(struct pipe_context *ctx,
 		csize = MIN2(size, CIK_SDMA_COPY_MAX_SIZE);
 		radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_PACKET_CONSTANT_FILL, 0,
 						0x8000 /* dword copy */));
 		radeon_emit(cs, offset);
 		radeon_emit(cs, offset >> 32);
 		radeon_emit(cs, clear_value);
 		radeon_emit(cs, csize);
 		offset += csize;
 		size -= csize;
 	}
-	r600_dma_emit_wait_idle(&sctx->b);
 }
 
 static unsigned minify_as_blocks(unsigned width, unsigned level, unsigned blk_w)
 {
 	width = u_minify(width, level);
 	return DIV_ROUND_UP(width, blk_w);
 }
 
 static unsigned encode_tile_info(struct si_context *sctx,
 				 struct r600_texture *tex, unsigned level,
@@ -244,22 +242,20 @@ static bool cik_sdma_copy_texture(struct si_context *sctx,
 		radeon_emit(cs, dstx | (dsty << 16));
 		radeon_emit(cs, dstz | ((dst_pitch - 1) << 16));
 		radeon_emit(cs, dst_slice_pitch - 1);
 		if (sctx->b.chip_class == CIK) {
 			radeon_emit(cs, copy_width | (copy_height << 16));
 			radeon_emit(cs, copy_depth);
 		} else {
 			radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16));
 			radeon_emit(cs, (copy_depth - 1));
 		}
-
-		r600_dma_emit_wait_idle(&sctx->b);
 		return true;
 	}
 
 	/* Tiled <-> linear sub-window copy. */
 	if ((src_mode >= RADEON_SURF_MODE_1D) != (dst_mode >= RADEON_SURF_MODE_1D)) {
 		struct r600_texture *tiled = src_mode >= RADEON_SURF_MODE_1D ? rsrc : rdst;
 		struct r600_texture *linear = tiled == rsrc ? rdst : rsrc;
 		unsigned tiled_level =	tiled	== rsrc ? src_level : dst_level;
 		unsigned linear_level =	linear	== rsrc ? src_level : dst_level;
 		unsigned tiled_x =	tiled	== rsrc ? srcx : dstx;
@@ -410,22 +406,20 @@ static bool cik_sdma_copy_texture(struct si_context *sctx,
 			radeon_emit(cs, linear_x | (linear_y << 16));
 			radeon_emit(cs, linear_z | ((linear_pitch - 1) << 16));
 			radeon_emit(cs, linear_slice_pitch - 1);
 			if (sctx->b.chip_class == CIK) {
 				radeon_emit(cs, copy_width_aligned | (copy_height << 16));
 				radeon_emit(cs, copy_depth);
 			} else {
 				radeon_emit(cs, (copy_width_aligned - 1) | ((copy_height - 1) << 16));
 				radeon_emit(cs, (copy_depth - 1));
 			}
-
-			r600_dma_emit_wait_idle(&sctx->b);
 			return true;
 		}
 	}
 
 	/* Tiled -> Tiled sub-window copy. */
 	if (dst_mode >= RADEON_SURF_MODE_1D &&
 	    src_mode >= RADEON_SURF_MODE_1D &&
 	    /* check if these fit into the bitfields */
 	    src_address % 256 == 0 &&
 	    dst_address % 256 == 0 &&
@@ -508,22 +502,20 @@ static bool cik_sdma_copy_texture(struct si_context *sctx,
 			radeon_emit(cs, encode_tile_info(sctx, rdst, dst_level, false));
 			if (sctx->b.chip_class == CIK) {
 				radeon_emit(cs, copy_width_aligned |
 						(copy_height_aligned << 16));
 				radeon_emit(cs, copy_depth);
 			} else {
 				radeon_emit(cs, (copy_width_aligned - 8) |
 						((copy_height_aligned - 8) << 16));
 				radeon_emit(cs, (copy_depth - 1));
 			}
-
-			r600_dma_emit_wait_idle(&sctx->b);
 			return true;
 		}
 	}
 
 	return false;
 }
 
 static void cik_sdma_copy(struct pipe_context *ctx,
 			  struct pipe_resource *dst,
 			  unsigned dst_level,
diff --git a/src/gallium/drivers/radeonsi/si_dma.c b/src/gallium/drivers/radeonsi/si_dma.c
index 8d186c3..1009bb2 100644
--- a/src/gallium/drivers/radeonsi/si_dma.c
+++ b/src/gallium/drivers/radeonsi/si_dma.c
@@ -69,21 +69,20 @@ static void si_dma_copy_buffer(struct si_context *ctx,
 		csize = size < max_csize ? size : max_csize;
 		radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd, csize));
 		radeon_emit(cs, dst_offset);
 		radeon_emit(cs, src_offset);
 		radeon_emit(cs, (dst_offset >> 32UL) & 0xff);
 		radeon_emit(cs, (src_offset >> 32UL) & 0xff);
 		dst_offset += csize << shift;
 		src_offset += csize << shift;
 		size -= csize;
 	}
-	r600_dma_emit_wait_idle(&ctx->b);
 }
 
 static void si_dma_copy_tile(struct si_context *ctx,
 			     struct pipe_resource *dst,
 			     unsigned dst_level,
 			     unsigned dst_x,
 			     unsigned dst_y,
 			     unsigned dst_z,
 			     struct pipe_resource *src,
 			     unsigned src_level,
@@ -170,21 +169,20 @@ static void si_dma_copy_tile(struct si_context *ctx,
 		radeon_emit(cs, (pitch_tile_max << 0) | ((height - 1) << 16));
 		radeon_emit(cs, (slice_tile_max << 0) | (pipe_config << 26));
 		radeon_emit(cs, (tiled_x << 0) | (tiled_z << 18));
 		radeon_emit(cs, (tiled_y << 0) | (tile_split << 21) | (nbanks << 25) | (mt << 27));
 		radeon_emit(cs, addr & 0xfffffffc);
 		radeon_emit(cs, (addr >> 32UL) & 0xff);
 		copy_height -= cheight;
 		addr += cheight * pitch;
 		tiled_y += cheight;
 	}
-	r600_dma_emit_wait_idle(&ctx->b);
 }
 
 static void si_dma_copy(struct pipe_context *ctx,
 			struct pipe_resource *dst,
 			unsigned dst_level,
 			unsigned dstx, unsigned dsty, unsigned dstz,
 			struct pipe_resource *src,
 			unsigned src_level,
 			const struct pipe_box *src_box)
 {
-- 
2.7.4



More information about the mesa-dev mailing list