[Mesa-dev] [PATCH v3 1/2] radeonsi: do not do two full flushes on every compute dispatch

Mon Apr 18 23:39:38 UTC 2016

v2: Add more CS_PARTIAL_FLUSH events.

Essentially every place with waits on finishing for pixel shaders
also has a write after read hazard with compute shaders.

Invalidating L2 waits implicitly on pixel and compute shaders,
so, we don't need a CS_PARTIAL_FLUSH for switching FBO.

v3: Add CS_PARTIAL_FLUSH events even if we already have INV_GLOBAL_L2.

According to Marek the INV_GLOBAL_L2 events don't wait for compute
shaders to finish, so wait for them explicitly.

Signed-off-by: Bas Nieuwenhuizen <bas at basnieuwenhuizen.nl>
---
 src/gallium/drivers/radeonsi/si_compute.c     | 17 ++---------------
 src/gallium/drivers/radeonsi/si_cp_dma.c      |  6 ++++--
 src/gallium/drivers/radeonsi/si_descriptors.c |  3 ++-
 src/gallium/drivers/radeonsi/si_hw_context.c  |  1 +
 src/gallium/drivers/radeonsi/si_state.c       | 12 ++++++++----
 5 files changed, 17 insertions(+), 22 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 10b88b3..6803334 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -439,13 +439,8 @@ static void si_launch_grid(
 	if (!sctx->cs_shader_state.initialized)
 		si_initialize_compute(sctx);
 
-	sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
-			 SI_CONTEXT_INV_GLOBAL_L2 |
-			 SI_CONTEXT_INV_ICACHE |
-			 SI_CONTEXT_INV_SMEM_L1 |
-			 SI_CONTEXT_FLUSH_WITH_INV_L2 |
-			 SI_CONTEXT_FLAG_COMPUTE;
-	si_emit_cache_flush(sctx, NULL);
+	if (sctx->b.flags)
+		si_emit_cache_flush(sctx, NULL);
 
 	if (!si_switch_compute_shader(sctx, program, &program->shader, info->pc))
 		return;
@@ -478,14 +473,6 @@ static void si_launch_grid(
 		si_setup_tgsi_grid(sctx, info);
 
 	si_emit_dispatch_packets(sctx, info);
-
-	sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
-			 SI_CONTEXT_INV_VMEM_L1 |
-			 SI_CONTEXT_INV_GLOBAL_L2 |
-			 SI_CONTEXT_INV_ICACHE |
-			 SI_CONTEXT_INV_SMEM_L1 |
-			 SI_CONTEXT_FLAG_COMPUTE;
-	si_emit_cache_flush(sctx, NULL);
 }
 
 
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index 001ddd4..38e0ee6 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -190,7 +190,8 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
 	uint64_t va = r600_resource(dst)->gpu_address + offset;
 
 	/* Flush the caches. */
-	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
+	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+	                 SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
 
 	while (size) {
 		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
@@ -296,7 +297,8 @@ void si_copy_buffer(struct si_context *sctx,
 	}
 
 	/* Flush the caches. */
-	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
+	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+	                 SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
 
 	/* This is the main part doing the copying. Src is always aligned. */
 	main_dst_offset = dst_offset + skipped_size;
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 5b65fae..98ad3a7 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -940,7 +940,8 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
 	 * start writing to the targets.
 	 */
 	if (num_targets)
-		sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+		sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+		                 SI_CONTEXT_CS_PARTIAL_FLUSH;
 
 	/* Streamout buffers must be bound in 2 places:
 	 * 1) in VGT by setting the VGT_STRMOUT registers
diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
index 9862f07..b179092e 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -84,6 +84,7 @@ void si_context_gfx_flush(void *context, unsigned flags,
 	ctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
 			SI_CONTEXT_INV_VMEM_L1 |
 			SI_CONTEXT_INV_GLOBAL_L2 |
+			SI_CONTEXT_CS_PARTIAL_FLUSH |
 			/* this is probably not needed anymore */
 			SI_CONTEXT_PS_PARTIAL_FLUSH;
 	si_emit_cache_flush(ctx, NULL);
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index af9ffdd..305a70b 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -2436,7 +2436,8 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
 	 */
 	sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
 			 SI_CONTEXT_INV_GLOBAL_L2 |
-			 SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
+			 SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
+			 SI_CONTEXT_CS_PARTIAL_FLUSH;
 
 	/* Take the maximum of the old and new count. If the new count is lower,
 	 * dirtying is needed to disable the unbound colorbuffers.
@@ -3458,7 +3459,8 @@ static void si_texture_barrier(struct pipe_context *ctx)
 
 	sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
 			 SI_CONTEXT_INV_GLOBAL_L2 |
-			 SI_CONTEXT_FLUSH_AND_INV_CB;
+			 SI_CONTEXT_FLUSH_AND_INV_CB |
+			 SI_CONTEXT_CS_PARTIAL_FLUSH;
 }
 
 static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
@@ -3467,7 +3469,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
 
 	/* Subsequent commands must wait for all shader invocations to
 	 * complete. */
-	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+	                 SI_CONTEXT_CS_PARTIAL_FLUSH;
 
 	if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
 		sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 |
@@ -3477,7 +3480,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
 		     PIPE_BARRIER_SHADER_BUFFER |
 		     PIPE_BARRIER_TEXTURE |
 		     PIPE_BARRIER_IMAGE |
-		     PIPE_BARRIER_STREAMOUT_BUFFER)) {
+		     PIPE_BARRIER_STREAMOUT_BUFFER |
+		     PIPE_BARRIER_GLOBAL_BUFFER)) {
 		/* As far as I can tell, L1 contents are written back to L2
 		 * automatically at end of shader, but the contents of other
 		 * L1 caches might still be stale. */
-- 
2.8.0