[Mesa-dev] [PATCH 18/18] radeonsi: split si_clear_buffer to remove enum si_method

Sat Aug 4 07:54:57 UTC 2018

From: Marek Olšák <marek.olsak at amd.com>

---
 src/gallium/drivers/radeonsi/si_clear.c       |  6 +-
 src/gallium/drivers/radeonsi/si_cp_dma.c      | 88 ++++++++++---------
 src/gallium/drivers/radeonsi/si_pipe.c        |  2 +-
 src/gallium/drivers/radeonsi/si_pipe.h        | 11 ++-
 .../drivers/radeonsi/si_test_clearbuffer.c    |  4 +-
 src/gallium/drivers/radeonsi/si_test_dma.c    |  2 +-
 6 files changed, 60 insertions(+), 53 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_clear.c b/src/gallium/drivers/radeonsi/si_clear.c
index 654ff0ace78..4e07de81bac 100644
--- a/src/gallium/drivers/radeonsi/si_clear.c
+++ b/src/gallium/drivers/radeonsi/si_clear.c
@@ -249,21 +249,21 @@ void vi_dcc_clear_level(struct si_context *sctx,
 		 * would be more efficient than separate per-layer clear operations.
 		 */
 		assert(tex->buffer.b.b.nr_storage_samples <= 2 || num_layers == 1);
 
 		dcc_offset += tex->surface.u.legacy.level[level].dcc_offset;
 		clear_size = tex->surface.u.legacy.level[level].dcc_fast_clear_size *
 			     num_layers;
 	}
 
 	si_clear_buffer(sctx, dcc_buffer, dcc_offset, clear_size,
-			clear_value, SI_COHERENCY_CB_META, SI_METHOD_BEST);
+			clear_value, SI_COHERENCY_CB_META);
 }
 
 /* Set the same micro tile mode as the destination of the last MSAA resolve.
  * This allows hitting the MSAA resolve fast path, which requires that both
  * src and dst micro tile modes match.
  */
 static void si_set_optimal_micro_tile_mode(struct si_screen *sscreen,
 					   struct si_texture *tex)
 {
 	if (tex->buffer.b.is_shared ||
@@ -482,21 +482,21 @@ static void si_do_fast_color_clear(struct si_context *sctx,
 				continue;
 
 			/* DCC fast clear with MSAA should clear CMASK to 0xC. */
 			if (tex->buffer.b.b.nr_samples >= 2 && tex->cmask_buffer) {
 				/* TODO: This doesn't work with MSAA. */
 				if (eliminate_needed)
 					continue;
 
 				si_clear_buffer(sctx, &tex->cmask_buffer->b.b,
 						tex->cmask_offset, tex->surface.cmask_size,
-						0xCCCCCCCC, SI_COHERENCY_CB_META, SI_METHOD_BEST);
+						0xCCCCCCCC, SI_COHERENCY_CB_META);
 				need_decompress_pass = true;
 			}
 
 			vi_dcc_clear_level(sctx, tex, 0, reset_value);
 
 			if (eliminate_needed)
 				need_decompress_pass = true;
 
 			tex->separate_dcc_dirty = true;
 		} else {
@@ -513,21 +513,21 @@ static void si_do_fast_color_clear(struct si_context *sctx,
 				continue;
 
 			/* ensure CMASK is enabled */
 			si_alloc_separate_cmask(sctx->screen, tex);
 			if (!tex->cmask_buffer)
 				continue;
 
 			/* Do the fast clear. */
 			si_clear_buffer(sctx, &tex->cmask_buffer->b.b,
 					tex->cmask_offset, tex->surface.cmask_size, 0,
-					SI_COHERENCY_CB_META, SI_METHOD_BEST);
+					SI_COHERENCY_CB_META);
 			need_decompress_pass = true;
 		}
 
 		if (need_decompress_pass &&
 		    !(tex->dirty_level_mask & (1 << level))) {
 			tex->dirty_level_mask |= 1 << level;
 			p_atomic_inc(&sctx->screen->compressed_colortex_counter);
 		}
 
 		/* We can change the micro tile mode before a full clear. */
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index 86eb3529d9b..bae592a4f7d 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -217,95 +217,103 @@ static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst
 	 */
 	if (!(user_flags & SI_CPDMA_SKIP_SYNC_AFTER) &&
 	    byte_count == remaining_size) {
 		*packet_flags |= CP_DMA_SYNC;
 
 		if (coher == SI_COHERENCY_SHADER)
 			*packet_flags |= CP_DMA_PFP_SYNC_ME;
 	}
 }
 
+void si_cp_dma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
+			    uint64_t offset, uint64_t size, unsigned value,
+			    enum si_coherency coher,
+			    enum si_cache_policy cache_policy)
+{
+	struct r600_resource *rdst = r600_resource(dst);
+	uint64_t va = rdst->gpu_address + offset;
+	bool is_first = true;
+
+	assert(size && size % 4 == 0); /* size must be non-zero and a multiple of 4 */
+
+	/* Mark [offset, offset + size) of the destination as valid (initialized),
+	 * so that transfer_map knows it should wait for the GPU when mapping
+	 * that range. */
+	util_range_add(&rdst->valid_buffer_range, offset, offset + size);
+
+	/* Queue PS/CS partial flushes plus whatever get_flush_flags() asks for. */
+	sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+		       SI_CONTEXT_CS_PARTIAL_FLUSH |
+		       get_flush_flags(sctx, coher, cache_policy);
+
+	while (size) { /* one packet per chunk of at most cp_dma_max_byte_count() */
+		unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx));
+		unsigned dma_flags = CP_DMA_CLEAR;
+
+		si_cp_dma_prepare(sctx, dst, NULL, byte_count, size, 0, coher,
+				  &is_first, &dma_flags);
+
+		/* Emit the clear packet for this chunk, then advance past it. */
+		si_emit_cp_dma(sctx, va, value, byte_count, dma_flags, cache_policy);
+
+		size -= byte_count;
+		va += byte_count;
+	}
+
+	if (cache_policy != L2_BYPASS) /* NOTE(review): presumably L2 now holds the data */
+		rdst->TC_L2_dirty = true;
+
+	/* Count only shader-coherent clears, i.e. not framebuffer fast clears. */
+	if (coher == SI_COHERENCY_SHADER)
+		sctx->num_cp_dma_calls++;
+}
+
 void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
 		     uint64_t offset, uint64_t size, unsigned value,
-		     enum si_coherency coher, enum si_method xfer)
+		     enum si_coherency coher)
 {
 	struct radeon_winsys *ws = sctx->ws;
 	struct r600_resource *rdst = r600_resource(dst);
 	enum si_cache_policy cache_policy = get_cache_policy(sctx, coher);
-	unsigned flush_flags = get_flush_flags(sctx, coher, cache_policy);
 	uint64_t dma_clear_size;
-	bool is_first = true;
 
 	if (!size)
 		return;
 
 	dma_clear_size = size & ~3ull;
 
-	/* Mark the buffer range of destination as valid (initialized),
-	 * so that transfer_map knows it should wait for the GPU when mapping
-	 * that range. */
-	util_range_add(&rdst->valid_buffer_range, offset,
-		       offset + dma_clear_size);
-
 	/* dma_clear_buffer can use clear_buffer on failure. Make sure that
 	 * doesn't happen. We don't want an infinite recursion: */
 	if (sctx->dma_cs &&
 	    !(dst->flags & PIPE_RESOURCE_FLAG_SPARSE) &&
 	    (offset % 4 == 0) &&
 	    /* CP DMA is very slow. Always use SDMA for big clears. This
 	     * alone improves DeusEx:MD performance by 70%. */
 	    (size > CP_DMA_CLEAR_PERF_THRESHOLD ||
 	     /* Buffers not used by the GFX IB yet will be cleared by SDMA.
 	      * This happens to move most buffer clears to SDMA, including
 	      * DCC and CMASK clears, because pipe->clear clears them before
 	      * si_emit_framebuffer_state (in a draw call) adds them.
 	      * For example, DeusEx:MD has 21 buffer clears per frame and all
 	      * of them are moved to SDMA thanks to this. */
 	     !ws->cs_is_buffer_referenced(sctx->gfx_cs, rdst->buf,
-				          RADEON_USAGE_READWRITE)) &&
-	    /* bypass sdma transfer with param xfer */
-	    (xfer != SI_METHOD_CP_DMA)) {
+				          RADEON_USAGE_READWRITE))) {
 		sctx->dma_clear_buffer(sctx, dst, offset, dma_clear_size, value);
 
 		offset += dma_clear_size;
 		size -= dma_clear_size;
 	} else if (dma_clear_size >= 4) {
-		uint64_t va = rdst->gpu_address + offset;
+		si_cp_dma_clear_buffer(sctx, dst, offset, dma_clear_size, value,
+				       coher, cache_policy);
 
 		offset += dma_clear_size;
 		size -= dma_clear_size;
-
-		/* Flush the caches. */
-		sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
-			       SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
-
-		while (dma_clear_size) {
-			unsigned byte_count = MIN2(dma_clear_size, cp_dma_max_byte_count(sctx));
-			unsigned dma_flags = CP_DMA_CLEAR;
-
-			si_cp_dma_prepare(sctx, dst, NULL, byte_count, dma_clear_size, 0,
-					  coher, &is_first, &dma_flags);
-
-			/* Emit the clear packet. */
-			si_emit_cp_dma(sctx, va, value, byte_count, dma_flags,
-				       cache_policy);
-
-			dma_clear_size -= byte_count;
-			va += byte_count;
-		}
-
-		if (cache_policy != L2_BYPASS)
-			rdst->TC_L2_dirty = true;
-
-		/* If it's not a framebuffer fast clear... */
-		if (coher == SI_COHERENCY_SHADER)
-			sctx->num_cp_dma_calls++;
 	}
 
 	if (size) {
 		/* Handle non-dword alignment.
 		 *
 		 * This function is called for embedded texture metadata clears,
 		 * but those should always be properly aligned. */
 		assert(dst->target == PIPE_BUFFER);
 		assert(size < 4);
 
@@ -363,21 +371,21 @@ static void si_pipe_clear_buffer(struct pipe_context *ctx,
 		break;
 	case 2:
 		dword_value = *(uint16_t*)clear_value_ptr;
 		dword_value |= dword_value << 16;
 		break;
 	default:
 		dword_value = *(uint32_t*)clear_value_ptr;
 	}
 
 	si_clear_buffer(sctx, dst, offset, size, dword_value,
-			SI_COHERENCY_SHADER, SI_METHOD_BEST);
+			SI_COHERENCY_SHADER);
 }
 
 /**
  * Realign the CP DMA engine. This must be done after a copy with an unaligned
  * size.
  *
  * \param size  Remaining size to the CP DMA alignment.
  */
 static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size,
 				     unsigned user_flags, enum si_coherency coher,
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index c1983b86661..3ca53dfed7c 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -539,21 +539,21 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
 		si_set_rw_buffer(sctx, SI_VS_CONST_CLIP_PLANES,
 				 &sctx->null_const_buf);
 		si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE,
 				 &sctx->null_const_buf);
 		si_set_rw_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS,
 				 &sctx->null_const_buf);
 
 		/* Clear the NULL constant buffer, because loads should return zeros. */
 		si_clear_buffer(sctx, sctx->null_const_buf.buffer, 0,
 				sctx->null_const_buf.buffer->width0, 0,
-				SI_COHERENCY_SHADER, SI_METHOD_BEST);
+				SI_COHERENCY_SHADER);
 	}
 
 	uint64_t max_threads_per_block;
 	screen->get_compute_param(screen, PIPE_SHADER_IR_TGSI,
 				  PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK,
 				  &max_threads_per_block);
 
 	/* The maximum number of scratch waves. Scratch space isn't divided
 	 * evenly between CUs. The number is only a function of the number of CUs.
 	 * We can decrease the constant to decrease the scratch buffer size.
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 0b398018c4a..acdc0d11f2c 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1109,29 +1109,28 @@ enum si_cache_policy {
 	L2_BYPASS,
 	L2_LRU,    /* same as SLC=0 */
 };
 
 enum si_coherency {
 	SI_COHERENCY_NONE, /* no cache flushes needed */
 	SI_COHERENCY_SHADER,
 	SI_COHERENCY_CB_META,
 };
 
-enum si_method {
-	SI_METHOD_CP_DMA,
-	SI_METHOD_BEST,
-};
-
 void si_cp_dma_wait_for_idle(struct si_context *sctx);
+void si_cp_dma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
+			    uint64_t offset, uint64_t size, unsigned value,
+			    enum si_coherency coher,
+			    enum si_cache_policy cache_policy);
 void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
 		     uint64_t offset, uint64_t size, unsigned value,
-		     enum si_coherency coher, enum si_method xfer);
+		     enum si_coherency coher);
 void si_copy_buffer(struct si_context *sctx,
 		    struct pipe_resource *dst, struct pipe_resource *src,
 		    uint64_t dst_offset, uint64_t src_offset, unsigned size,
 		    unsigned user_flags);
 void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf,
 			      uint64_t offset, unsigned size);
 void cik_emit_prefetch_L2(struct si_context *sctx, bool vertex_stage_only);
 void si_init_cp_dma_functions(struct si_context *sctx);
 
 /* si_debug.c */
diff --git a/src/gallium/drivers/radeonsi/si_test_clearbuffer.c b/src/gallium/drivers/radeonsi/si_test_clearbuffer.c
index c0696da26db..e863381fd15 100644
--- a/src/gallium/drivers/radeonsi/si_test_clearbuffer.c
+++ b/src/gallium/drivers/radeonsi/si_test_clearbuffer.c
@@ -41,22 +41,22 @@ measure_clearbuf_time(struct pipe_context *ctx,
 
 	struct si_context *sctx = (struct si_context*)ctx;
 	struct pipe_screen *screen = ctx->screen;
 
 	buf = pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, memory_size);
 
 	query_te = ctx->create_query(ctx, PIPE_QUERY_TIME_ELAPSED, 0);
 
 	ctx->begin_query(ctx, query_te);
 	/* operation  */
-	si_clear_buffer(sctx, buf, 0, memory_size, 0x00,
-			SI_COHERENCY_SHADER, SI_METHOD_CP_DMA);
+	si_cp_dma_clear_buffer(sctx, buf, 0, memory_size, 0x00,
+			       SI_COHERENCY_SHADER, L2_LRU);
 	ctx->end_query(ctx, query_te);
 	ctx->get_query_result(ctx, query_te, true, &qresult);
 
 	/* Cleanup. */
 	ctx->destroy_query(ctx, query_te);
 	pipe_resource_reference(&buf, NULL);
 
 	/* Report Results */
 	return qresult.u64;
 }
diff --git a/src/gallium/drivers/radeonsi/si_test_dma.c b/src/gallium/drivers/radeonsi/si_test_dma.c
index f125769d1cf..c81ec75dde2 100644
--- a/src/gallium/drivers/radeonsi/si_test_dma.c
+++ b/src/gallium/drivers/radeonsi/si_test_dma.c
@@ -301,21 +301,21 @@ void si_test_dma(struct si_screen *sscreen)
 		       array_mode_to_string(sscreen, &sdst->surface),
 		       tsrc.width0, tsrc.height0, tsrc.array_size,
 		       array_mode_to_string(sscreen, &ssrc->surface), bpp);
 		fflush(stdout);
 
 		/* set src pixels */
 		set_random_pixels(ctx, src, &src_cpu);
 
 		/* clear dst pixels */
 		si_clear_buffer(sctx, dst, 0, sdst->surface.surf_size, 0,
-		                SI_COHERENCY_SHADER, SI_METHOD_BEST);
+		                SI_COHERENCY_SHADER);
 		memset(dst_cpu.ptr, 0, dst_cpu.layer_stride * tdst.array_size);
 
 		/* preparation */
 		max_width = MIN2(tsrc.width0, tdst.width0);
 		max_height = MIN2(tsrc.height0, tdst.height0);
 		max_depth = MIN2(tsrc.array_size, tdst.array_size);
 
 		num = do_partial_copies ? num_partial_copies : 1;
 		for (j = 0; j < num; j++) {
 			int width, height, depth;
-- 
2.17.1