[Mesa-dev] [PATCH 10/15] radeonsi: merge SI and CI dma_clear_buffer and remove the callback

Wed Aug 29 20:13:06 UTC 2018

From: Marek Olšák <marek.olsak at amd.com>

Also use assertions for the requirements that offset and size are multiples
of 4; previously such calls silently fell back to sctx->b.clear_buffer.
---
 src/gallium/drivers/radeon/radeon_video.c     |  3 +-
 src/gallium/drivers/radeonsi/cik_sdma.c       | 41 -------------
 src/gallium/drivers/radeonsi/si_cp_dma.c      |  2 +-
 src/gallium/drivers/radeonsi/si_dma.c         | 40 ------------
 src/gallium/drivers/radeonsi/si_dma_cs.c      | 61 ++++++++++++++++++-
 src/gallium/drivers/radeonsi/si_pipe.c        |  2 +-
 src/gallium/drivers/radeonsi/si_pipe.h        |  5 +-
 .../drivers/radeonsi/si_test_dma_perf.c       |  2 +-
 8 files changed, 66 insertions(+), 90 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_video.c b/src/gallium/drivers/radeon/radeon_video.c
index 749f30c2306..a39ce4cc73e 100644
--- a/src/gallium/drivers/radeon/radeon_video.c
+++ b/src/gallium/drivers/radeon/radeon_video.c
@@ -113,22 +113,21 @@ error:
 	si_vid_destroy_buffer(new_buf);
 	*new_buf = old_buf;
 	return false;
 }
 
 /* clear the buffer with zeros */
 void si_vid_clear_buffer(struct pipe_context *context, struct rvid_buffer* buffer)
 {
 	struct si_context *sctx = (struct si_context*)context;
 
-	sctx->dma_clear_buffer(sctx, &buffer->res->b.b, 0,
-			       buffer->res->buf->size, 0);
+	si_sdma_clear_buffer(sctx, &buffer->res->b.b, 0, buffer->res->buf->size, 0);
 	context->flush(context, NULL, 0);
 }
 
 /**
  * join surfaces into the same buffer with identical tiling params
  * sumup their sizes and replace the backend buffers with a single bo
  */
 void si_vid_join_surfaces(struct si_context *sctx,
 			  struct pb_buffer** buffers[VL_NUM_COMPONENTS],
 			  struct radeon_surf *surfaces[VL_NUM_COMPONENTS])
diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c
index 595f8d49a80..1c2fd0f7b1c 100644
--- a/src/gallium/drivers/radeonsi/cik_sdma.c
+++ b/src/gallium/drivers/radeonsi/cik_sdma.c
@@ -60,60 +60,20 @@ static void cik_sdma_copy_buffer(struct si_context *ctx,
 		radeon_emit(cs, src_offset);
 		radeon_emit(cs, src_offset >> 32);
 		radeon_emit(cs, dst_offset);
 		radeon_emit(cs, dst_offset >> 32);
 		dst_offset += csize;
 		src_offset += csize;
 		size -= csize;
 	}
 }
 
-static void cik_sdma_clear_buffer(struct si_context *sctx,
-				  struct pipe_resource *dst,
-				  uint64_t offset,
-				  uint64_t size,
-				  unsigned clear_value)
-{
-	struct radeon_cmdbuf *cs = sctx->dma_cs;
-	unsigned i, ncopy, csize;
-	struct r600_resource *rdst = r600_resource(dst);
-
-	if (!cs || offset % 4 != 0 || size % 4 != 0 ||
-	    dst->flags & PIPE_RESOURCE_FLAG_SPARSE) {
-		sctx->b.clear_buffer(&sctx->b, dst, offset, size, &clear_value, 4);
-		return;
-	}
-
-	/* Mark the buffer range of destination as valid (initialized),
-	 * so that transfer_map knows it should wait for the GPU when mapping
-	 * that range. */
-	util_range_add(&rdst->valid_buffer_range, offset, offset + size);
-
-	offset += rdst->gpu_address;
-
-	/* the same maximum size as for copying */
-	ncopy = DIV_ROUND_UP(size, CIK_SDMA_COPY_MAX_SIZE);
-	si_need_dma_space(sctx, ncopy * 5, rdst, NULL);
-
-	for (i = 0; i < ncopy; i++) {
-		csize = MIN2(size, CIK_SDMA_COPY_MAX_SIZE);
-		radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_PACKET_CONSTANT_FILL, 0,
-						0x8000 /* dword copy */));
-		radeon_emit(cs, offset);
-		radeon_emit(cs, offset >> 32);
-		radeon_emit(cs, clear_value);
-		radeon_emit(cs, sctx->chip_class >= GFX9 ? csize - 1 : csize);
-		offset += csize;
-		size -= csize;
-	}
-}
-
 static unsigned minify_as_blocks(unsigned width, unsigned level, unsigned blk_w)
 {
 	width = u_minify(width, level);
 	return DIV_ROUND_UP(width, blk_w);
 }
 
 static unsigned encode_tile_info(struct si_context *sctx,
 				 struct si_texture *tex, unsigned level,
 				 bool set_bpp)
 {
@@ -547,12 +507,11 @@ static void cik_sdma_copy(struct pipe_context *ctx,
 		return;
 
 fallback:
 	si_resource_copy_region(ctx, dst, dst_level, dstx, dsty, dstz,
 				src, src_level, src_box);
 }
 
 void cik_init_sdma_functions(struct si_context *sctx)
 {
 	sctx->dma_copy = cik_sdma_copy;
-	sctx->dma_clear_buffer = cik_sdma_clear_buffer;
 }
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index 486ae75c77f..598d5ecf0dc 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -294,21 +294,21 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
 	     * alone improves DeusEx:MD performance by 70%. */
 	    (size > CP_DMA_CLEAR_PERF_THRESHOLD ||
 	     /* Buffers not used by the GFX IB yet will be cleared by SDMA.
 	      * This happens to move most buffer clears to SDMA, including
 	      * DCC and CMASK clears, because pipe->clear clears them before
 	      * si_emit_framebuffer_state (in a draw call) adds them.
 	      * For example, DeusEx:MD has 21 buffer clears per frame and all
 	      * of them are moved to SDMA thanks to this. */
 	     !ws->cs_is_buffer_referenced(sctx->gfx_cs, rdst->buf,
 				          RADEON_USAGE_READWRITE))) {
-		sctx->dma_clear_buffer(sctx, dst, offset, dma_clear_size, value);
+		si_sdma_clear_buffer(sctx, dst, offset, dma_clear_size, value);
 
 		offset += dma_clear_size;
 		size -= dma_clear_size;
 	} else if (dma_clear_size >= 4) {
 		si_cp_dma_clear_buffer(sctx, dst, offset, dma_clear_size, value,
 				       coher, cache_policy);
 
 		offset += dma_clear_size;
 		size -= dma_clear_size;
 	}
diff --git a/src/gallium/drivers/radeonsi/si_dma.c b/src/gallium/drivers/radeonsi/si_dma.c
index da5bd47b5dd..046d8445ce3 100644
--- a/src/gallium/drivers/radeonsi/si_dma.c
+++ b/src/gallium/drivers/radeonsi/si_dma.c
@@ -70,59 +70,20 @@ static void si_dma_copy_buffer(struct si_context *ctx,
 		radeon_emit(cs, dst_offset);
 		radeon_emit(cs, src_offset);
 		radeon_emit(cs, (dst_offset >> 32UL) & 0xff);
 		radeon_emit(cs, (src_offset >> 32UL) & 0xff);
 		dst_offset += count;
 		src_offset += count;
 		size -= count;
 	}
 }
 
-static void si_dma_clear_buffer(struct si_context *sctx,
-				struct pipe_resource *dst,
-				uint64_t offset,
-				uint64_t size,
-				unsigned clear_value)
-{
-	struct radeon_cmdbuf *cs = sctx->dma_cs;
-	unsigned i, ncopy, csize;
-	struct r600_resource *rdst = r600_resource(dst);
-
-	if (!cs || offset % 4 != 0 || size % 4 != 0 ||
-	    dst->flags & PIPE_RESOURCE_FLAG_SPARSE) {
-		sctx->b.clear_buffer(&sctx->b, dst, offset, size, &clear_value, 4);
-		return;
-	}
-
-	/* Mark the buffer range of destination as valid (initialized),
-	 * so that transfer_map knows it should wait for the GPU when mapping
-	 * that range. */
-	util_range_add(&rdst->valid_buffer_range, offset, offset + size);
-
-	offset += rdst->gpu_address;
-
-	/* the same maximum size as for copying */
-	ncopy = DIV_ROUND_UP(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
-	si_need_dma_space(sctx, ncopy * 4, rdst, NULL);
-
-	for (i = 0; i < ncopy; i++) {
-		csize = MIN2(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
-		radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_CONSTANT_FILL, 0,
-					      csize / 4));
-		radeon_emit(cs, offset);
-		radeon_emit(cs, clear_value);
-		radeon_emit(cs, (offset >> 32) << 16);
-		offset += csize;
-		size -= csize;
-	}
-}
-
 static void si_dma_copy_tile(struct si_context *ctx,
 			     struct pipe_resource *dst,
 			     unsigned dst_level,
 			     unsigned dst_x,
 			     unsigned dst_y,
 			     unsigned dst_z,
 			     struct pipe_resource *src,
 			     unsigned src_level,
 			     unsigned src_x,
 			     unsigned src_y,
@@ -318,12 +279,11 @@ static void si_dma_copy(struct pipe_context *ctx,
 	return;
 
 fallback:
 	si_resource_copy_region(ctx, dst, dst_level, dstx, dsty, dstz,
 				src, src_level, src_box);
 }
 
 void si_init_dma_functions(struct si_context *sctx)
 {
 	sctx->dma_copy = si_dma_copy;
-	sctx->dma_clear_buffer = si_dma_clear_buffer;
 }
diff --git a/src/gallium/drivers/radeonsi/si_dma_cs.c b/src/gallium/drivers/radeonsi/si_dma_cs.c
index 7db9570af3c..ffa2f5ae69b 100644
--- a/src/gallium/drivers/radeonsi/si_dma_cs.c
+++ b/src/gallium/drivers/radeonsi/si_dma_cs.c
@@ -57,20 +57,79 @@ void si_dma_emit_timestamp(struct si_context *sctx, struct r600_resource *dst,
 	si_need_dma_space(sctx, 4, dst, NULL);
 	si_dma_emit_wait_idle(sctx);
 
 	radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_TIMESTAMP,
 					SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP,
 					0));
 	radeon_emit(cs, va);
 	radeon_emit(cs, va >> 32);
 }
 
+void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
+			  uint64_t offset, uint64_t size, unsigned clear_value)
+{
+	struct radeon_cmdbuf *cs = sctx->dma_cs;
+	unsigned i, ncopy, csize;
+	struct r600_resource *rdst = r600_resource(dst);
+
+	assert(offset % 4 == 0);
+	assert(size);
+	assert(size % 4 == 0);
+
+	if (!cs || dst->flags & PIPE_RESOURCE_FLAG_SPARSE) {
+		sctx->b.clear_buffer(&sctx->b, dst, offset, size, &clear_value, 4);
+		return;
+	}
+
+	/* Mark the buffer range of destination as valid (initialized),
+	 * so that transfer_map knows it should wait for the GPU when mapping
+	 * that range. */
+	util_range_add(&rdst->valid_buffer_range, offset, offset + size);
+
+	offset += rdst->gpu_address;
+
+	if (sctx->chip_class == SI) {
+		/* the same maximum size as for copying */
+		ncopy = DIV_ROUND_UP(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
+		si_need_dma_space(sctx, ncopy * 4, rdst, NULL);
+
+		for (i = 0; i < ncopy; i++) {
+			csize = MIN2(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
+			radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_CONSTANT_FILL, 0,
+						      csize / 4));
+			radeon_emit(cs, offset);
+			radeon_emit(cs, clear_value);
+			radeon_emit(cs, (offset >> 32) << 16);
+			offset += csize;
+			size -= csize;
+		}
+		return;
+	}
+
+	/* The following code is for CI, VI, Vega/Raven, etc. */
+	/* the same maximum size as for copying */
+	ncopy = DIV_ROUND_UP(size, CIK_SDMA_COPY_MAX_SIZE);
+	si_need_dma_space(sctx, ncopy * 5, rdst, NULL);
+
+	for (i = 0; i < ncopy; i++) {
+		csize = MIN2(size, CIK_SDMA_COPY_MAX_SIZE);
+		radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_PACKET_CONSTANT_FILL, 0,
+						0x8000 /* dword copy */));
+		radeon_emit(cs, offset);
+		radeon_emit(cs, offset >> 32);
+		radeon_emit(cs, clear_value);
+		radeon_emit(cs, sctx->chip_class >= GFX9 ? csize - 1 : csize);
+		offset += csize;
+		size -= csize;
+	}
+}
+
 void si_need_dma_space(struct si_context *ctx, unsigned num_dw,
 		       struct r600_resource *dst, struct r600_resource *src)
 {
 	uint64_t vram = ctx->dma_cs->used_vram;
 	uint64_t gtt = ctx->dma_cs->used_gart;
 
 	if (dst) {
 		vram += dst->vram_usage;
 		gtt += dst->gart_usage;
 	}
@@ -163,14 +222,14 @@ void si_flush_dma_cs(struct si_context *ctx, unsigned flags,
 		si_clear_saved_cs(&saved);
 	}
 }
 
 void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst,
 			    uint64_t offset, uint64_t size, unsigned value)
 {
 	struct si_context *ctx = (struct si_context*)sscreen->aux_context;
 
 	mtx_lock(&sscreen->aux_context_lock);
-	ctx->dma_clear_buffer(ctx, dst, offset, size, value);
+	si_sdma_clear_buffer(ctx, dst, offset, size, value);
 	sscreen->aux_context->flush(sscreen->aux_context, NULL, 0);
 	mtx_unlock(&sscreen->aux_context_lock);
 }
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index c259c260550..bce7b3f550e 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -728,21 +728,21 @@ static void si_test_vmfault(struct si_screen *sscreen)
 	}
 
 	r600_resource(buf)->gpu_address = 0; /* cause a VM fault */
 
 	if (sscreen->debug_flags & DBG(TEST_VMFAULT_CP)) {
 		si_copy_buffer(sctx, buf, buf, 0, 4, 4, 0, -1);
 		ctx->flush(ctx, NULL, 0);
 		puts("VM fault test: CP - done.");
 	}
 	if (sscreen->debug_flags & DBG(TEST_VMFAULT_SDMA)) {
-		sctx->dma_clear_buffer(sctx, buf, 0, 4, 0);
+		si_sdma_clear_buffer(sctx, buf, 0, 4, 0);
 		ctx->flush(ctx, NULL, 0);
 		puts("VM fault test: SDMA - done.");
 	}
 	if (sscreen->debug_flags & DBG(TEST_VMFAULT_SHADER)) {
 		util_test_constant_buffer(ctx, buf);
 		puts("VM fault test: Shader - done.");
 	}
 	exit(0);
 }
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index ef4f06f41d5..a6f09b65f74 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1022,23 +1022,20 @@ struct si_context {
 
 	/* Copy one resource to another using async DMA. */
 	void (*dma_copy)(struct pipe_context *ctx,
 			 struct pipe_resource *dst,
 			 unsigned dst_level,
 			 unsigned dst_x, unsigned dst_y, unsigned dst_z,
 			 struct pipe_resource *src,
 			 unsigned src_level,
 			 const struct pipe_box *src_box);
 
-	void (*dma_clear_buffer)(struct si_context *sctx, struct pipe_resource *dst,
-				 uint64_t offset, uint64_t size, unsigned value);
-
 	struct si_tracked_regs			tracked_regs;
 };
 
 /* cik_sdma.c */
 void cik_init_sdma_functions(struct si_context *sctx);
 
 /* si_blit.c */
 enum si_blitter_op /* bitmask */
 {
 	SI_SAVE_TEXTURES      = 1,
@@ -1152,20 +1149,22 @@ void si_init_debug_functions(struct si_context *sctx);
 void si_check_vm_faults(struct si_context *sctx,
 			struct radeon_saved_cs *saved, enum ring_type ring);
 bool si_replace_shader(unsigned num, struct ac_shader_binary *binary);
 
 /* si_dma.c */
 void si_init_dma_functions(struct si_context *sctx);
 
 /* si_dma_cs.c */
 void si_dma_emit_timestamp(struct si_context *sctx, struct r600_resource *dst,
 			   uint64_t offset);
+void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
+			  uint64_t offset, uint64_t size, unsigned clear_value);
 void si_need_dma_space(struct si_context *ctx, unsigned num_dw,
 		       struct r600_resource *dst, struct r600_resource *src);
 void si_flush_dma_cs(struct si_context *ctx, unsigned flags,
 		     struct pipe_fence_handle **fence);
 void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst,
 			    uint64_t offset, uint64_t size, unsigned value);
 
 /* si_fence.c */
 void si_gfx_write_event_eop(struct si_context *ctx,
 			    unsigned event, unsigned event_flags,
diff --git a/src/gallium/drivers/radeonsi/si_test_dma_perf.c b/src/gallium/drivers/radeonsi/si_test_dma_perf.c
index be2ad079e1a..f097a642999 100644
--- a/src/gallium/drivers/radeonsi/si_test_dma_perf.c
+++ b/src/gallium/drivers/radeonsi/si_test_dma_perf.c
@@ -184,21 +184,21 @@ void si_test_dma_perf(struct si_screen *sscreen)
 							si_cp_dma_clear_buffer(sctx, dst, 0, size, clear_value,
 									       SI_COHERENCY_NONE, cache_policy);
 						}
 					} else if (test_sdma) {
 						/* SDMA */
 						if (is_copy) {
 							struct pipe_box box;
 							u_box_1d(0, size, &box);
 							sctx->dma_copy(ctx, dst, 0, 0, 0, 0, src, 0, &box);
 						} else {
-							sctx->dma_clear_buffer(sctx, dst, 0, size, clear_value);
+							si_sdma_clear_buffer(sctx, dst, 0, size, clear_value);
 						}
 					} else {
 						/* Compute */
 						/* The memory accesses are coalesced, meaning that the 1st instruction writes
 						 * the 1st contiguous block of data for the whole wave, the 2nd instruction
 						 * writes the 2nd contiguous block of data, etc.
 						 */
 						unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4);
 						unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread;
 						unsigned dwords_per_wave = cs_dwords_per_thread * 64;
-- 
2.17.1