[Mesa-dev] [PATCH 16/17] radeonsi: don't emit partial flushes at the end of IBs (v3) (revisited)

Wed Apr 4 01:59:21 UTC 2018

From: Marek Olšák <marek.olsak at amd.com>

Now draw calls from multiple IBs can be executed in parallel.

v2: do emit partial flushes on SI
v3: invalidate all shader caches at the beginning of IBs

If we artificially limit the number of draw calls per IB to 5, we'll get
a lot more IBs, leading to a lot more partial flushes. Let's see how
the removal of partial flushes changes GPU utilization in that scenario:

With partial flushes (time busy):
    CP: 99%
    SPI: 86%
    CB: 73%

Without partial flushes (time busy):
    CP: 99%
    SPI: 93%
    CB: 81%
---
 src/gallium/drivers/radeonsi/si_hw_context.c | 39 ++++++++++++++++++----------
 1 file changed, 26 insertions(+), 13 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
index 61c8d7067a1..b32b841a628 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -99,27 +99,31 @@ void si_context_gfx_flush(void *context, unsigned flags,
 
 	if (!LIST_IS_EMPTY(&ctx->b.active_queries))
 		si_suspend_queries(&ctx->b);
 
 	ctx->streamout.suspended = false;
 	if (ctx->streamout.begin_emitted) {
 		si_emit_streamout_end(ctx);
 		ctx->streamout.suspended = true;
 	}
 
-	ctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
-			SI_CONTEXT_PS_PARTIAL_FLUSH;
-
-	/* DRM 3.1.0 doesn't flush TC for VI correctly. */
-	if (ctx->b.chip_class == VI && ctx->b.screen->info.drm_minor <= 1)
-		ctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2 |
-				SI_CONTEXT_INV_VMEM_L1;
+	if (ctx->b.chip_class == VI && ctx->b.screen->info.drm_minor <= 1) {
+		/* DRM 3.1.0 doesn't flush TC for VI correctly. */
+		ctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+				SI_CONTEXT_CS_PARTIAL_FLUSH |
+				SI_CONTEXT_INV_GLOBAL_L2;
+	} else if (ctx->b.chip_class == SI) {
+		/* The kernel doesn't wait for idle before flushing and
+		 * invalidating TC L2. */
+		ctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+				SI_CONTEXT_CS_PARTIAL_FLUSH;
+	}
 
 	si_emit_cache_flush(ctx);
 
 	if (ctx->current_saved_cs) {
 		si_trace_emit(ctx);
 		si_log_hw_flush(ctx);
 
 		/* Save the IB for debug contexts. */
 		si_save_cs(ws, cs, &ctx->current_saved_cs->gfx, true);
 		ctx->current_saved_cs->flushed = true;
@@ -180,26 +184,35 @@ static void si_begin_cs_debug(struct si_context *ctx)
 
 	radeon_add_to_buffer_list(&ctx->b, &ctx->b.gfx, ctx->current_saved_cs->trace_buf,
 			      RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE);
 }
 
 void si_begin_new_cs(struct si_context *ctx)
 {
 	if (ctx->is_debug)
 		si_begin_cs_debug(ctx);
 
-	/* Flush read caches at the beginning of CS not flushed by the kernel. */
-	if (ctx->b.chip_class >= CIK)
-		ctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 |
-				SI_CONTEXT_INV_ICACHE;
-
-	ctx->b.flags |= SI_CONTEXT_START_PIPELINE_STATS;
+	/* Always invalidate caches at the beginning of IBs, because external
+	 * users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our
+	 * buffers.
+	 *
+	 * Note that the cache flush done by the kernel at the end of GFX IBs
+	 * isn't useful here, because that flush can finish after the following
+	 * IB starts drawing.
+	 *
+	 * TODO: Do we also need to invalidate CB & DB caches?
+	 */
+	ctx->b.flags |= SI_CONTEXT_INV_ICACHE |
+			SI_CONTEXT_INV_SMEM_L1 |
+			SI_CONTEXT_INV_VMEM_L1 |
+			SI_CONTEXT_INV_GLOBAL_L2 |
+			SI_CONTEXT_START_PIPELINE_STATS;
 
 	/* set all valid group as dirty so they get reemited on
 	 * next draw command
 	 */
 	si_pm4_reset_emitted(ctx);
 
 	/* The CS initialization should be emitted before everything else. */
 	si_pm4_emit(ctx, ctx->init_config);
 	if (ctx->init_config_gs_rings)
 		si_pm4_emit(ctx, ctx->init_config_gs_rings);
-- 
2.15.1