[Mesa-dev] [PATCH 09/15] radeonsi/gfx9: don't flush L2 metadata for CB if not needed

Marek Olšák maraeo at gmail.com
Mon Aug 21 21:54:08 UTC 2017


From: Marek Olšák <marek.olsak at amd.com>

---
 src/gallium/drivers/radeonsi/si_blit.c       |  8 +++++---
 src/gallium/drivers/radeonsi/si_pipe.h       | 23 +++++++++++++++++++----
 src/gallium/drivers/radeonsi/si_state.c      | 19 +++++++++++++++----
 src/gallium/drivers/radeonsi/si_state_draw.c | 11 +++++------
 4 files changed, 44 insertions(+), 17 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index ae7f809..3228933 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -399,21 +399,22 @@ si_decompress_depth(struct si_context *sctx,
 			if (inplace_planes & PIPE_MASK_Z)
 				tex->dirty_level_mask = 0;
 			if (inplace_planes & PIPE_MASK_S)
 				tex->stencil_dirty_level_mask = 0;
 		}
 	}
 	/* set_framebuffer_state takes care of coherency for single-sample.
 	 * The DB->CB copy uses CB for the final writes.
 	 */
 	if (copy_planes && tex->resource.b.b.nr_samples > 1)
-		si_make_CB_shader_coherent(sctx, tex->resource.b.b.nr_samples);
+		si_make_CB_shader_coherent(sctx, tex->resource.b.b.nr_samples,
+					   false);
 }
 
 static void
 si_decompress_sampler_depth_textures(struct si_context *sctx,
 				     struct si_textures_info *textures)
 {
 	unsigned i;
 	unsigned mask = textures->needs_depth_decompress_mask;
 
 	while (mask) {
@@ -504,21 +505,22 @@ static void si_blit_decompress_color(struct pipe_context *ctx,
 		}
 
 		/* The texture will always be dirty if some layers aren't flushed.
 		 * I don't think this case occurs often though. */
 		if (first_layer == 0 && last_layer >= max_layer) {
 			rtex->dirty_level_mask &= ~(1 << level);
 		}
 	}
 
 	sctx->decompression_enabled = false;
-	si_make_CB_shader_coherent(sctx, rtex->resource.b.b.nr_samples);
+	si_make_CB_shader_coherent(sctx, rtex->resource.b.b.nr_samples,
+				   vi_dcc_enabled(rtex, first_level));
 }
 
 static void
 si_decompress_color_texture(struct si_context *sctx, struct r600_texture *tex,
 			    unsigned first_level, unsigned last_level)
 {
 	/* CMASK or DCC can be discarded and we can still end up here. */
 	if (!tex->cmask.size && !tex->fmask.size && !tex->dcc_offset)
 		return;
 
@@ -1193,21 +1195,21 @@ static void si_do_CB_resolve(struct si_context *sctx,
 
 	si_blitter_begin(&sctx->b.b, SI_COLOR_RESOLVE |
 			 (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND));
 	util_blitter_custom_resolve_color(sctx->blitter, dst, dst_level, dst_z,
 					  info->src.resource, info->src.box.z,
 					  ~0, sctx->custom_blend_resolve,
 					  format);
 	si_blitter_end(&sctx->b.b);
 
 	/* Flush caches for possible texturing. */
-	si_make_CB_shader_coherent(sctx, 1);
+	si_make_CB_shader_coherent(sctx, 1, false);
 }
 
 static bool do_hardware_msaa_resolve(struct pipe_context *ctx,
 				     const struct pipe_blit_info *info)
 {
 	struct si_context *sctx = (struct si_context*)ctx;
 	struct r600_texture *src = (struct r600_texture*)info->src.resource;
 	struct r600_texture *dst = (struct r600_texture*)info->dst.resource;
 	MAYBE_UNUSED struct r600_texture *rtmp;
 	unsigned dst_width = u_minify(info->dst.resource->width0, info->dst.level);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 671c488..3e59e21 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -50,21 +50,24 @@
 #define SI_CONTEXT_INV_ICACHE		(R600_CONTEXT_PRIVATE_FLAG << 0)
 /* SMEM L1, other names: KCACHE, constant cache, DCACHE, data cache */
 #define SI_CONTEXT_INV_SMEM_L1		(R600_CONTEXT_PRIVATE_FLAG << 1)
 /* VMEM L1 can optionally be bypassed (GLC=1). Other names: TC L1 */
 #define SI_CONTEXT_INV_VMEM_L1		(R600_CONTEXT_PRIVATE_FLAG << 2)
 /* Used by everything except CB/DB, can be bypassed (SLC=1). Other names: TC L2 */
 #define SI_CONTEXT_INV_GLOBAL_L2	(R600_CONTEXT_PRIVATE_FLAG << 3)
 /* Write dirty L2 lines back to memory (shader and CP DMA stores), but don't
  * invalidate L2. SI-CIK can't do it, so they will do complete invalidation. */
 #define SI_CONTEXT_WRITEBACK_GLOBAL_L2	(R600_CONTEXT_PRIVATE_FLAG << 4)
-/* gaps */
+/* Writeback & invalidate the L2 metadata cache. It can only be coupled with
+ * a CB or DB flush. */
+#define SI_CONTEXT_INV_L2_METADATA	(R600_CONTEXT_PRIVATE_FLAG << 5)
+/* gap */
 /* Framebuffer caches. */
 #define SI_CONTEXT_FLUSH_AND_INV_DB	(R600_CONTEXT_PRIVATE_FLAG << 7)
 #define SI_CONTEXT_FLUSH_AND_INV_CB	(R600_CONTEXT_PRIVATE_FLAG << 8)
 /* Engine synchronization. */
 #define SI_CONTEXT_VS_PARTIAL_FLUSH	(R600_CONTEXT_PRIVATE_FLAG << 9)
 #define SI_CONTEXT_PS_PARTIAL_FLUSH	(R600_CONTEXT_PRIVATE_FLAG << 10)
 #define SI_CONTEXT_CS_PARTIAL_FLUSH	(R600_CONTEXT_PRIVATE_FLAG << 11)
 #define SI_CONTEXT_VGT_FLUSH		(R600_CONTEXT_PRIVATE_FLAG << 12)
 #define SI_CONTEXT_VGT_STREAMOUT_SYNC	(R600_CONTEXT_PRIVATE_FLAG << 13)
 
@@ -190,20 +193,21 @@ struct si_framebuffer {
 	unsigned			spi_shader_col_format_blend;
 	unsigned			spi_shader_col_format_blend_alpha;
 	ubyte				nr_samples:5; /* at most 16xAA */
 	ubyte				log_samples:3; /* at most 4 = 16xAA */
 	ubyte				compressed_cb_mask;
 	ubyte				color_is_int8;
 	ubyte				color_is_int10;
 	ubyte				dirty_cbufs;
 	bool				dirty_zsbuf;
 	bool				any_dst_linear;
+	bool				CB_has_shader_readable_metadata;
 };
 
 struct si_clip_state {
 	struct r600_atom		atom;
 	struct pipe_clip_state		state;
 	bool				any_nonzeros;
 };
 
 struct si_sample_locs {
 	struct r600_atom	atom;
@@ -588,28 +592,39 @@ si_optimal_tcc_alignment(struct si_context *sctx, unsigned upload_size)
 	 * the whole thing will fit into a cache line if we align it to its size.
 	 * The idea is that multiple small uploads can share a cache line.
 	 * If the upload size is greater, align it to the cache line size.
 	 */
 	alignment = util_next_power_of_two(upload_size);
 	tcc_cache_line_size = sctx->screen->b.info.tcc_cache_line_size;
 	return MIN2(alignment, tcc_cache_line_size);
 }
 
 static inline void
-si_make_CB_shader_coherent(struct si_context *sctx, unsigned num_samples)
+si_make_CB_shader_coherent(struct si_context *sctx, unsigned num_samples,
+			   bool shaders_read_metadata)
 {
 	sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_CB |
 			 SI_CONTEXT_INV_VMEM_L1;
 
-	/* Single-sample color is coherent with shaders on GFX9. */
-	if (sctx->b.chip_class <= VI || num_samples >= 2)
+	if (sctx->b.chip_class >= GFX9) {
+		/* Single-sample color is coherent with shaders on GFX9, but
+		 * L2 metadata must be flushed if shaders read metadata
+		 * (DCC, CMASK).
+		 */
+		if (num_samples >= 2)
+			sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2;
+		else if (shaders_read_metadata)
+			sctx->b.flags |= SI_CONTEXT_INV_L2_METADATA;
+	} else {
+		/* SI-CI-VI */
 		sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2;
+	}
 }
 
 static inline void
 si_make_DB_shader_coherent(struct si_context *sctx, unsigned num_samples,
 			   bool include_stencil)
 {
 	sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_DB |
 			 SI_CONTEXT_INV_VMEM_L1;
 
 	/* Single-sample depth (not stencil) is coherent with shaders on GFX9. */
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index d116c07..e5d8d21 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -2566,21 +2566,22 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
 	 *
 	 * When MSAA is enabled, CB and TC caches are flushed on demand
 	 * (after FMASK decompression). Shader write -> FB read transitions
 	 * cannot happen for MSAA textures, because MSAA shader images are
 	 * not supported.
 	 *
 	 * Only flush and wait for CB if there is actually a bound color buffer.
 	 */
 	if (sctx->framebuffer.nr_samples <= 1 &&
 	    sctx->framebuffer.state.nr_cbufs)
-		si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples);
+		si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples,
+					   sctx->framebuffer.CB_has_shader_readable_metadata);
 
 	sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
 
 	/* u_blitter doesn't invoke depth decompression when it does multiple
 	 * blits in a row, but the only case when it matters for DB is when
 	 * doing generate_mipmap. So here we flush DB manually between
 	 * individual generate_mipmap blits.
 	 * Note that lower mipmap levels aren't compressed.
 	 */
 	if (sctx->generate_mipmap_for_depth)
@@ -2601,20 +2602,21 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
 	sctx->framebuffer.spi_shader_col_format_alpha = 0;
 	sctx->framebuffer.spi_shader_col_format_blend = 0;
 	sctx->framebuffer.spi_shader_col_format_blend_alpha = 0;
 	sctx->framebuffer.color_is_int8 = 0;
 	sctx->framebuffer.color_is_int10 = 0;
 
 	sctx->framebuffer.compressed_cb_mask = 0;
 	sctx->framebuffer.nr_samples = util_framebuffer_get_num_samples(state);
 	sctx->framebuffer.log_samples = util_logbase2(sctx->framebuffer.nr_samples);
 	sctx->framebuffer.any_dst_linear = false;
+	sctx->framebuffer.CB_has_shader_readable_metadata = false;
 
 	for (i = 0; i < state->nr_cbufs; i++) {
 		if (!state->cbufs[i])
 			continue;
 
 		surf = (struct r600_surface*)state->cbufs[i];
 		rtex = (struct r600_texture*)surf->base.texture;
 
 		if (!surf->color_initialized) {
 			si_initialize_color_surface(sctx, surf);
@@ -2635,20 +2637,23 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
 		if (surf->color_is_int10)
 			sctx->framebuffer.color_is_int10 |= 1 << i;
 
 		if (rtex->fmask.size) {
 			sctx->framebuffer.compressed_cb_mask |= 1 << i;
 		}
 
 		if (rtex->surface.is_linear)
 			sctx->framebuffer.any_dst_linear = true;
 
+		if (vi_dcc_enabled(rtex, surf->base.u.tex.level))
+			sctx->framebuffer.CB_has_shader_readable_metadata = true;
+
 		r600_context_add_resource_size(ctx, surf->base.texture);
 
 		p_atomic_inc(&rtex->framebuffers_bound);
 
 		if (rtex->dcc_gather_statistics) {
 			/* Dirty tracking must be enabled for DCC usage analysis. */
 			sctx->framebuffer.compressed_cb_mask |= 1 << i;
 			vi_separate_dcc_start_query(ctx, rtex);
 		}
 	}
@@ -4015,21 +4020,22 @@ static void si_set_tess_state(struct pipe_context *ctx,
 
 static void si_texture_barrier(struct pipe_context *ctx, unsigned flags)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 
 	si_update_fb_dirtiness_after_rendering(sctx);
 
 	/* Multisample surfaces are flushed in si_decompress_textures. */
 	if (sctx->framebuffer.nr_samples <= 1 &&
 	    sctx->framebuffer.state.nr_cbufs)
-		si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples);
+		si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples,
+					   sctx->framebuffer.CB_has_shader_readable_metadata);
 }
 
 /* This only ensures coherency for shader image/buffer stores. */
 static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 
 	/* Subsequent commands must wait for all shader invocations to
 	 * complete. */
 	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
@@ -4060,23 +4066,28 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
 	}
 
 	/* MSAA color, any depth and any stencil are flushed in
 	 * si_decompress_textures when needed.
 	 */
 	if (flags & PIPE_BARRIER_FRAMEBUFFER &&
 	    sctx->framebuffer.nr_samples <= 1 &&
 	    sctx->framebuffer.state.nr_cbufs) {
 		sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
 
-		/* Single-sample color is coherent with TC on GFX9. */
-		if (sctx->screen->b.chip_class <= VI)
+		if (sctx->b.chip_class >= GFX9) {
+			/* Single-sample color is coherent with TC on GFX9. */
+			if (sctx->framebuffer.CB_has_shader_readable_metadata)
+				sctx->b.flags |= SI_CONTEXT_INV_L2_METADATA;
+		} else {
+			/* SI-CI-VI */
 			sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
+		}
 	}
 
 	/* Indirect buffers use TC L2 on GFX9, but not older hw. */
 	if (sctx->screen->b.chip_class <= VI &&
 	    flags & PIPE_BARRIER_INDIRECT_BUFFER)
 		sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
 }
 
 static void *si_create_blend_custom(struct si_context *sctx, unsigned mode)
 {
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 2796427..b981676 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -980,27 +980,26 @@ void si_emit_cache_flush(struct si_context *sctx)
 		 * All operations that invalidate L2 also seem to invalidate
 		 * metadata. Volatile (VOL) and WC flushes are not listed here.
 		 *
 		 * TC    | TC_WB         = writeback & invalidate L2 & L1
 		 * TC    | TC_WB | TC_NC = writeback & invalidate L2 for MTYPE == NC
 		 *         TC_WB | TC_NC = writeback L2 for MTYPE == NC
 		 * TC            | TC_NC = invalidate L2 for MTYPE == NC
 		 * TC    | TC_MD         = writeback & invalidate L2 metadata (DCC, etc.)
 		 * TCL1                  = invalidate L1
 		 */
+		tc_flags = 0;
 
-		/* When flushing CB or DB, L2 metadata should always be invali-
-		 * dated before texturing. Invalidating L2 data is not needed
-		 * in some cases.
-		 */
-		tc_flags = EVENT_TC_ACTION_ENA |
-			   EVENT_TC_MD_ACTION_ENA;
+		if (rctx->flags & SI_CONTEXT_INV_L2_METADATA) {
+			tc_flags = EVENT_TC_ACTION_ENA |
+				   EVENT_TC_MD_ACTION_ENA;
+		}
 
 		/* Ideally flush TC together with CB/DB. */
 		if (rctx->flags & SI_CONTEXT_INV_GLOBAL_L2) {
 			/* Writeback and invalidate everything in L2 & L1. */
 			tc_flags = EVENT_TC_ACTION_ENA |
 				   EVENT_TC_WB_ACTION_ENA;
 
 			/* Clear the flags. */
 			rctx->flags &= ~(SI_CONTEXT_INV_GLOBAL_L2 |
 					 SI_CONTEXT_WRITEBACK_GLOBAL_L2 |
-- 
2.7.4



More information about the mesa-dev mailing list