[Mesa-dev] [PATCH 08/15] radeonsi/gfx9: don't flush TC L2 between rendering and texturing if not needed

Marek Olšák maraeo at gmail.com
Mon Aug 21 21:54:07 UTC 2017


From: Marek Olšák <marek.olsak at amd.com>

---
 src/gallium/drivers/radeonsi/si_blit.c  | 29 ++++++++++++-----------------
 src/gallium/drivers/radeonsi/si_pipe.h  | 23 +++++++++++++++++++++++
 src/gallium/drivers/radeonsi/si_state.c | 29 ++++++++++++-----------------
 3 files changed, 47 insertions(+), 34 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index caa4c3c..ae7f809 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -377,43 +377,43 @@ si_decompress_depth(struct si_context *sctx,
 		if (!tex->tc_compatible_htile) {
 			si_blit_decompress_zs_in_place(
 						sctx, tex,
 						levels_z, levels_s,
 						first_layer, last_layer);
 		}
 
 		/* Only in-place decompression needs to flush DB caches, or
 		 * when we don't decompress but TC-compatible planes are dirty.
 		 */
-		sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_DB |
-				 SI_CONTEXT_INV_GLOBAL_L2 |
-				 SI_CONTEXT_INV_VMEM_L1;
+		si_make_DB_shader_coherent(sctx, tex->resource.b.b.nr_samples,
+					   inplace_planes & PIPE_MASK_S);
 
 		/* If we flush DB caches for TC-compatible depth, the dirty
 		 * state becomes 0 for the whole mipmap tree and all planes.
 		 * (there is nothing else to flush)
 		 */
 		if (tex->tc_compatible_htile) {
-			if (r600_can_sample_zs(tex, false))
+			/* Only clear the mask that we are flushing, because
+			 * si_make_DB_shader_coherent() can treat depth and
+			 * stencil differently.
+			 */
+			if (inplace_planes & PIPE_MASK_Z)
 				tex->dirty_level_mask = 0;
-			if (r600_can_sample_zs(tex, true))
+			if (inplace_planes & PIPE_MASK_S)
 				tex->stencil_dirty_level_mask = 0;
 		}
 	}
 	/* set_framebuffer_state takes care of coherency for single-sample.
 	 * The DB->CB copy uses CB for the final writes.
 	 */
-	if (copy_planes && tex->resource.b.b.nr_samples > 1) {
-		sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
-				 SI_CONTEXT_INV_GLOBAL_L2 |
-				 SI_CONTEXT_FLUSH_AND_INV_CB;
-	}
+	if (copy_planes && tex->resource.b.b.nr_samples > 1)
+		si_make_CB_shader_coherent(sctx, tex->resource.b.b.nr_samples);
 }
 
 static void
 si_decompress_sampler_depth_textures(struct si_context *sctx,
 				     struct si_textures_info *textures)
 {
 	unsigned i;
 	unsigned mask = textures->needs_depth_decompress_mask;
 
 	while (mask) {
@@ -504,24 +504,21 @@ static void si_blit_decompress_color(struct pipe_context *ctx,
 		}
 
 		/* The texture will always be dirty if some layers aren't flushed.
 		 * I don't think this case occurs often though. */
 		if (first_layer == 0 && last_layer >= max_layer) {
 			rtex->dirty_level_mask &= ~(1 << level);
 		}
 	}
 
 	sctx->decompression_enabled = false;
-
-	sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_CB |
-			 SI_CONTEXT_INV_GLOBAL_L2 |
-			 SI_CONTEXT_INV_VMEM_L1;
+	si_make_CB_shader_coherent(sctx, rtex->resource.b.b.nr_samples);
 }
 
 static void
 si_decompress_color_texture(struct si_context *sctx, struct r600_texture *tex,
 			    unsigned first_level, unsigned last_level)
 {
 	/* CMASK or DCC can be discarded and we can still end up here. */
 	if (!tex->cmask.size && !tex->fmask.size && !tex->dcc_offset)
 		return;
 
@@ -1196,23 +1193,21 @@ static void si_do_CB_resolve(struct si_context *sctx,
 
 	si_blitter_begin(&sctx->b.b, SI_COLOR_RESOLVE |
 			 (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND));
 	util_blitter_custom_resolve_color(sctx->blitter, dst, dst_level, dst_z,
 					  info->src.resource, info->src.box.z,
 					  ~0, sctx->custom_blend_resolve,
 					  format);
 	si_blitter_end(&sctx->b.b);
 
 	/* Flush caches for possible texturing. */
-	sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_CB |
-			 SI_CONTEXT_INV_GLOBAL_L2 |
-			 SI_CONTEXT_INV_VMEM_L1;
+	si_make_CB_shader_coherent(sctx, 1);
 }
 
 static bool do_hardware_msaa_resolve(struct pipe_context *ctx,
 				     const struct pipe_blit_info *info)
 {
 	struct si_context *sctx = (struct si_context*)ctx;
 	struct r600_texture *src = (struct r600_texture*)info->src.resource;
 	struct r600_texture *dst = (struct r600_texture*)info->dst.resource;
 	MAYBE_UNUSED struct r600_texture *rtmp;
 	unsigned dst_width = u_minify(info->dst.resource->width0, info->dst.level);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index bb5e189..671c488 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -587,11 +587,34 @@ si_optimal_tcc_alignment(struct si_context *sctx, unsigned upload_size)
 	/* If the upload size is less than the cache line size (e.g. 16, 32),
 	 * the whole thing will fit into a cache line if we align it to its size.
 	 * The idea is that multiple small uploads can share a cache line.
 	 * If the upload size is greater, align it to the cache line size.
 	 */
 	alignment = util_next_power_of_two(upload_size);
 	tcc_cache_line_size = sctx->screen->b.info.tcc_cache_line_size;
 	return MIN2(alignment, tcc_cache_line_size);
 }
 
+static inline void
+si_make_CB_shader_coherent(struct si_context *sctx, unsigned num_samples)
+{
+	sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_CB |
+			 SI_CONTEXT_INV_VMEM_L1;
+	/* Single-sample color is coherent with shaders on GFX9, so request
+	 * INV_GLOBAL_L2 only if chip_class <= VI or num_samples >= 2. */
+	if (sctx->b.chip_class <= VI || num_samples >= 2)
+		sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2;
+}
+
+static inline void
+si_make_DB_shader_coherent(struct si_context *sctx, unsigned num_samples,
+			   bool include_stencil)
+{
+	sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_DB |
+			 SI_CONTEXT_INV_VMEM_L1;
+	/* Single-sample depth (not stencil) is coherent with shaders on GFX9,
+	 * so request INV_GLOBAL_L2 only on <= VI, for MSAA, or with stencil. */
+	if (sctx->b.chip_class <= VI || num_samples >= 2 || include_stencil)
+		sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2;
+}
+
 #endif
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 8010df6..d116c07 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -2565,38 +2565,33 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
 	 * DB caches are flushed on demand (using si_decompress_textures).
 	 *
 	 * When MSAA is enabled, CB and TC caches are flushed on demand
 	 * (after FMASK decompression). Shader write -> FB read transitions
 	 * cannot happen for MSAA textures, because MSAA shader images are
 	 * not supported.
 	 *
 	 * Only flush and wait for CB if there is actually a bound color buffer.
 	 */
 	if (sctx->framebuffer.nr_samples <= 1 &&
-	    sctx->framebuffer.state.nr_cbufs) {
-		sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
-				 SI_CONTEXT_INV_GLOBAL_L2 |
-				 SI_CONTEXT_FLUSH_AND_INV_CB;
-	}
+	    sctx->framebuffer.state.nr_cbufs)
+		si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples);
+
 	sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
 
 	/* u_blitter doesn't invoke depth decompression when it does multiple
 	 * blits in a row, but the only case when it matters for DB is when
 	 * doing generate_mipmap. So here we flush DB manually between
 	 * individual generate_mipmap blits.
 	 * Note that lower mipmap levels aren't compressed.
 	 */
-	if (sctx->generate_mipmap_for_depth) {
-		sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
-				 SI_CONTEXT_INV_GLOBAL_L2 |
-				 SI_CONTEXT_FLUSH_AND_INV_DB;
-	}
+	if (sctx->generate_mipmap_for_depth)
+		si_make_DB_shader_coherent(sctx, 1, false);
 
 	/* Take the maximum of the old and new count. If the new count is lower,
 	 * dirtying is needed to disable the unbound colorbuffers.
 	 */
 	sctx->framebuffer.dirty_cbufs |=
 		(1 << MAX2(sctx->framebuffer.state.nr_cbufs, state->nr_cbufs)) - 1;
 	sctx->framebuffer.dirty_zsbuf |= sctx->framebuffer.state.zsbuf != state->zsbuf;
 
 	si_dec_framebuffer_counters(&sctx->framebuffer.state);
 	util_copy_framebuffer_state(&sctx->framebuffer.state, state);
@@ -4019,25 +4014,22 @@ static void si_set_tess_state(struct pipe_context *ctx,
 }
 
 static void si_texture_barrier(struct pipe_context *ctx, unsigned flags)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 
 	si_update_fb_dirtiness_after_rendering(sctx);
 
 	/* Multisample surfaces are flushed in si_decompress_textures. */
 	if (sctx->framebuffer.nr_samples <= 1 &&
-	    sctx->framebuffer.state.nr_cbufs) {
-		sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
-				 SI_CONTEXT_INV_GLOBAL_L2 |
-				 SI_CONTEXT_FLUSH_AND_INV_CB;
-	}
+	    sctx->framebuffer.state.nr_cbufs)
+		si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples);
 }
 
 /* This only ensures coherency for shader image/buffer stores. */
 static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 
 	/* Subsequent commands must wait for all shader invocations to
 	 * complete. */
 	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
@@ -4066,22 +4058,25 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
 		if (sctx->screen->b.chip_class <= CIK)
 			sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
 	}
 
 	/* MSAA color, any depth and any stencil are flushed in
 	 * si_decompress_textures when needed.
 	 */
 	if (flags & PIPE_BARRIER_FRAMEBUFFER &&
 	    sctx->framebuffer.nr_samples <= 1 &&
 	    sctx->framebuffer.state.nr_cbufs) {
-		sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_CB |
-				 SI_CONTEXT_WRITEBACK_GLOBAL_L2;
+		sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
+
+		/* Single-sample color is coherent with TC on GFX9. */
+		if (sctx->screen->b.chip_class <= VI)
+			sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
 	}
 
 	/* Indirect buffers use TC L2 on GFX9, but not older hw. */
 	if (sctx->screen->b.chip_class <= VI &&
 	    flags & PIPE_BARRIER_INDIRECT_BUFFER)
 		sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
 }
 
 static void *si_create_blend_custom(struct si_context *sctx, unsigned mode)
 {
-- 
2.7.4



More information about the mesa-dev mailing list