[Mesa-dev] [PATCH 09/13] radeonsi: use TC L2 for CP DMA operations with shader resources on CIK

Marek Olšák maraeo at gmail.com
Mon Jan 5 12:20:59 PST 2015


From: Marek Olšák <marek.olsak at amd.com>

On CIK, CP DMA operations on shader resources now go through TC L2, so that
TC L2 doesn't need to be flushed for them.

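The only problem is with index buffers, whose fetches don't use TC L2.
As a simple solution, a buffer written by CP DMA through TC L2 is marked with
a new TC_L2_dirty flag, and TC L2 is flushed before a draw call that uses
such a buffer as its index buffer.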
---
 src/gallium/drivers/radeon/r600_pipe_common.h | 12 ++++++++++
 src/gallium/drivers/radeonsi/si_descriptors.c | 32 ++++++++++++++++++---------
 src/gallium/drivers/radeonsi/si_state_draw.c  |  5 +++++
 3 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index a9416b6..60b8fae 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -138,6 +138,18 @@ struct r600_resource {
 	 * the unsynchronized map flag and expect the driver to figure it out.
          */
 	struct util_range		valid_buffer_range;
+
+	/* For buffers only. This indicates that a write operation has been
+	 * performed by TC L2, but the cache hasn't been flushed.
+	 * Any hw block which doesn't use or bypasses TC L2 should check this
+	 * flag and flush the cache before using the buffer.
+	 *
+	 * For example, TC L2 must be flushed if a buffer which has been
+	 * modified by a shader store instruction is about to be used as
+	 * an index buffer. The reason is that VGT DMA index fetching doesn't
+	 * use TC L2.
+	 */
+	bool				TC_L2_dirty;
 };
 
 struct r600_transfer {
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index e2da476..454e12c 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -1073,7 +1073,7 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
 			    bool is_framebuffer)
 {
 	struct si_context *sctx = (struct si_context*)ctx;
-	unsigned flush_flags;
+	unsigned flush_flags, tc_l2_flag;
 
 	if (!size)
 		return;
@@ -1098,19 +1098,22 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
 	uint64_t va = r600_resource(dst)->gpu_address + offset;
 
 	/* Flush the caches where the resource is bound. */
-	if (is_framebuffer)
+	if (is_framebuffer) {
 		flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
-	else
+		tc_l2_flag = 0;
+	} else {
 		flush_flags = SI_CONTEXT_INV_TC_L1 |
-			      SI_CONTEXT_INV_TC_L2 |
+			      (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
 			      SI_CONTEXT_INV_KCACHE;
+		tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
+	}
 
 	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
 			 flush_flags;
 
 	while (size) {
 		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
-		unsigned dma_flags = 0;
+		unsigned dma_flags = tc_l2_flag;
 
 		si_need_cs_space(sctx, 7 + (sctx->b.flags ? sctx->cache_flush.num_dw : 0),
 				 FALSE);
@@ -1141,6 +1144,9 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
 	/* Flush the caches again in case the 3D engine has been prefetching
 	 * the resource. */
 	sctx->b.flags |= flush_flags;
+
+	if (tc_l2_flag)
+		r600_resource(dst)->TC_L2_dirty = true;
 }
 
 void si_copy_buffer(struct si_context *sctx,
@@ -1148,7 +1154,7 @@ void si_copy_buffer(struct si_context *sctx,
 		    uint64_t dst_offset, uint64_t src_offset, unsigned size,
 		    bool is_framebuffer)
 {
-	unsigned flush_flags;
+	unsigned flush_flags, tc_l2_flag;
 
 	if (!size)
 		return;
@@ -1163,18 +1169,21 @@ void si_copy_buffer(struct si_context *sctx,
 	src_offset += r600_resource(src)->gpu_address;
 
 	/* Flush the caches where the resource is bound. */
-	if (is_framebuffer)
+	if (is_framebuffer) {
 		flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
-	else
+		tc_l2_flag = 0;
+	} else {
 		flush_flags = SI_CONTEXT_INV_TC_L1 |
-			      SI_CONTEXT_INV_TC_L2 |
+			      (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
 			      SI_CONTEXT_INV_KCACHE;
+		tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
+	}
 
 	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
 			 flush_flags;
 
 	while (size) {
-		unsigned sync_flags = 0;
+		unsigned sync_flags = tc_l2_flag;
 		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
 
 		si_need_cs_space(sctx, 7 + (sctx->b.flags ? sctx->cache_flush.num_dw : 0), FALSE);
@@ -1206,6 +1215,9 @@ void si_copy_buffer(struct si_context *sctx,
 	/* Flush the caches again in case the 3D engine has been prefetching
 	 * the resource. */
 	sctx->b.flags |= flush_flags;
+
+	if (tc_l2_flag)
+		r600_resource(dst)->TC_L2_dirty = true;
 }
 
 /* INIT/DEINIT */
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index ccc44d5..e6916c1 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -551,6 +551,11 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 		}
 	}
 
+	if (info->indexed && r600_resource(ib.buffer)->TC_L2_dirty) {
+		sctx->b.flags |= SI_CONTEXT_INV_TC_L2;
+		r600_resource(ib.buffer)->TC_L2_dirty = false;
+	}
+
 	/* Check flush flags. */
 	if (sctx->b.flags)
 		sctx->atoms.s.cache_flush->dirty = true;
-- 
2.1.0



More information about the mesa-dev mailing list