[Mesa-dev] [PATCH 2/9] radeonsi: flush DB caches only when transitioning from DB to texturing

Marek Olšák maraeo at gmail.com
Fri Jun 16 12:57:58 UTC 2017


From: Marek Olšák <marek.olsak at amd.com>

Reuse the mechanism of si_decompress_textures, but for TC-compatible
depth/stencil textures, instead of doing the actual decompression,
just flag the DB cache flush there.

This removes a lot of unnecessary DB cache flushes.
---
 src/gallium/drivers/radeonsi/si_blit.c        | 36 +++++++++++++++++++++------
 src/gallium/drivers/radeonsi/si_descriptors.c | 17 +++++++------
 src/gallium/drivers/radeonsi/si_pipe.h        |  1 +
 src/gallium/drivers/radeonsi/si_state.c       | 21 ++++++++++++----
 src/gallium/drivers/radeonsi/si_state_draw.c  |  6 ++---
 5 files changed, 56 insertions(+), 25 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index 9c38ae9..f0abfdc 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -337,24 +337,20 @@ si_decompress_depth(struct si_context *sctx,
 		levels_s = level_mask & tex->stencil_dirty_level_mask;
 
 		if (levels_s) {
 			if (r600_can_sample_zs(tex, true))
 				inplace_planes |= PIPE_MASK_S;
 			else
 				copy_planes |= PIPE_MASK_S;
 		}
 	}
 
-	assert(!tex->tc_compatible_htile || levels_z == 0);
-	assert(!tex->tc_compatible_htile || levels_s == 0 ||
-	       !r600_can_sample_zs(tex, true));
-
 	/* We may have to allocate the flushed texture here when called from
 	 * si_decompress_subresource.
 	 */
 	if (copy_planes &&
 	    (tex->flushed_depth_texture ||
 	     r600_init_flushed_depth_texture(&sctx->b.b, &tex->resource.b.b, NULL))) {
 		struct r600_texture *dst = tex->flushed_depth_texture;
 		unsigned fully_copied_levels;
 		unsigned levels = 0;
 
@@ -377,24 +373,44 @@ si_decompress_depth(struct si_context *sctx,
 			first_layer, last_layer,
 			0, u_max_sample(&tex->resource.b.b));
 
 		if (copy_planes & PIPE_MASK_Z)
 			tex->dirty_level_mask &= ~fully_copied_levels;
 		if (copy_planes & PIPE_MASK_S)
 			tex->stencil_dirty_level_mask &= ~fully_copied_levels;
 	}
 
 	if (inplace_planes) {
-		si_blit_decompress_zs_in_place(
-			sctx, tex,
-			levels_z, levels_s,
-			first_layer, last_layer);
+		if (!tex->tc_compatible_htile) {
+			si_blit_decompress_zs_in_place(
+						sctx, tex,
+						levels_z, levels_s,
+						first_layer, last_layer);
+		}
+
+		/* Only in-place decompression needs to flush DB caches, or
+		 * when we don't decompress but TC-compatible planes are dirty.
+		 */
+		sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_DB |
+				 SI_CONTEXT_INV_GLOBAL_L2 |
+				 SI_CONTEXT_INV_VMEM_L1;
+
+		/* If we flush DB caches for TC-compatible depth, the dirty
+		 * state becomes 0 for the whole mipmap tree and all planes.
+		 * (there is nothing else to flush)
+		 */
+		if (tex->tc_compatible_htile) {
+			if (r600_can_sample_zs(tex, false))
+				tex->dirty_level_mask = 0;
+			if (r600_can_sample_zs(tex, true))
+				tex->stencil_dirty_level_mask = 0;
+		}
 	}
 }
 
 static void
 si_decompress_sampler_depth_textures(struct si_context *sctx,
 				     struct si_textures_info *textures)
 {
 	unsigned i;
 	unsigned mask = textures->needs_depth_decompress_mask;
 
@@ -1343,25 +1359,29 @@ static boolean si_generate_mipmap(struct pipe_context *ctx,
 	vi_disable_dcc_if_incompatible_format(&sctx->b, tex, base_level,
 					      format);
 	si_decompress_subresource(ctx, tex, PIPE_MASK_RGBAZS,
 				  base_level, first_layer, last_layer);
 
 	/* Clear dirty_level_mask for the levels that will be overwritten. */
 	assert(base_level < last_level);
 	rtex->dirty_level_mask &= ~u_bit_consecutive(base_level + 1,
 						     last_level - base_level);
 
+	sctx->generate_mipmap_for_depth = rtex->is_depth;
+
 	si_blitter_begin(ctx, SI_BLIT | SI_DISABLE_RENDER_COND);
 	util_blitter_generate_mipmap(sctx->blitter, tex, format,
 				     base_level, last_level,
 				     first_layer, last_layer);
 	si_blitter_end(ctx);
+
+	sctx->generate_mipmap_for_depth = false;
 	return true;
 }
 
 static void si_flush_resource(struct pipe_context *ctx,
 			      struct pipe_resource *res)
 {
 	struct r600_texture *rtex = (struct r600_texture*)res;
 
 	assert(res->target != PIPE_BUFFER);
 	assert(!rtex->dcc_separate_buffer || rtex->dcc_gather_statistics);
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 375bcae..ee2e80a 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -577,26 +577,28 @@ static void si_set_sampler_view(struct si_context *sctx,
 	sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader);
 }
 
 static bool color_needs_decompression(struct r600_texture *rtex)
 {
 	return rtex->fmask.size ||
 	       (rtex->dirty_level_mask &&
 		(rtex->cmask.size || rtex->dcc_offset));
 }
 
-static bool depth_needs_decompression(struct r600_texture *rtex,
-				      struct si_sampler_view *sview)
+static bool depth_needs_decompression(struct r600_texture *rtex)
 {
-	return rtex->db_compatible &&
-	       (!rtex->tc_compatible_htile ||
-		!r600_can_sample_zs(rtex, sview->is_stencil_sampler));
+	/* If the depth/stencil texture is TC-compatible, no decompression
+	 * will be done. The decompression function will only flush DB caches
+	 * to make it coherent with shaders. That's necessary because the driver
+	 * doesn't flush DB caches in any other case.
+	 */
+	return rtex->db_compatible;
 }
 
 static void si_update_shader_needs_decompress_mask(struct si_context *sctx,
 						   unsigned shader)
 {
 	struct si_textures_info *samplers = &sctx->samplers[shader];
 	unsigned shader_bit = 1 << shader;
 
 	if (samplers->needs_depth_decompress_mask ||
 	    samplers->needs_color_decompress_mask ||
@@ -626,23 +628,22 @@ static void si_set_sampler_views(struct pipe_context *ctx,
 			samplers->needs_color_decompress_mask &= ~(1u << slot);
 			si_set_sampler_view(sctx, shader, slot, NULL, false);
 			continue;
 		}
 
 		si_set_sampler_view(sctx, shader, slot, views[i], false);
 
 		if (views[i]->texture && views[i]->texture->target != PIPE_BUFFER) {
 			struct r600_texture *rtex =
 				(struct r600_texture*)views[i]->texture;
-			struct si_sampler_view *rview = (struct si_sampler_view *)views[i];
 
-			if (depth_needs_decompression(rtex, rview)) {
+			if (depth_needs_decompression(rtex)) {
 				samplers->needs_depth_decompress_mask |= 1u << slot;
 			} else {
 				samplers->needs_depth_decompress_mask &= ~(1u << slot);
 			}
 			if (color_needs_decompression(rtex)) {
 				samplers->needs_color_decompress_mask |= 1u << slot;
 			} else {
 				samplers->needs_color_decompress_mask &= ~(1u << slot);
 			}
 
@@ -2373,21 +2374,21 @@ static void si_make_texture_handle_resident(struct pipe_context *ctx,
 
 	tex_handle = (struct si_texture_handle *)entry->data;
 	sview = (struct si_sampler_view *)tex_handle->view;
 
 	if (resident) {
 		if (sview->base.texture->target != PIPE_BUFFER) {
 			struct r600_texture *rtex =
 				(struct r600_texture *)sview->base.texture;
 
 			tex_handle->needs_depth_decompress =
-				depth_needs_decompression(rtex, sview);
+				depth_needs_decompression(rtex);
 			tex_handle->needs_color_decompress =
 				color_needs_decompression(rtex);
 
 			if (rtex->dcc_offset &&
 			    p_atomic_read(&rtex->framebuffers_bound))
 				sctx->need_check_render_feedback = true;
 		} else {
 			si_invalidate_bindless_buf_desc(sctx, tex_handle->desc,
 							sview->base.texture,
 							sview->base.u.buf.offset);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 427ac1c..e734595 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -357,20 +357,21 @@ struct si_context {
 	unsigned		dbcb_copy_sample;
 	bool			dbcb_depth_copy_enabled:1;
 	bool			dbcb_stencil_copy_enabled:1;
 	bool			db_flush_depth_inplace:1;
 	bool			db_flush_stencil_inplace:1;
 	bool			db_depth_clear:1;
 	bool			db_depth_disable_expclear:1;
 	bool			db_stencil_clear:1;
 	bool			db_stencil_disable_expclear:1;
 	bool			occlusion_queries_disabled:1;
+	bool			generate_mipmap_for_depth:1;
 
 	/* Emitted draw state. */
 	bool			gs_tri_strip_adj_fix:1;
 	int			last_index_size;
 	int			last_base_vertex;
 	int			last_start_instance;
 	int			last_drawid;
 	int			last_sh_base_reg;
 	int			last_primitive_restart_en;
 	int			last_restart_index;
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 1cd1f91..44e5f1c 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -2518,29 +2518,40 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
 			if (!r600_texture_disable_dcc(&sctx->b, rtex))
 				sctx->b.decompress_dcc(ctx, rtex);
 
 		surf->dcc_incompatible = false;
 	}
 
 	/* Only flush TC when changing the framebuffer state, because
 	 * the only client not using TC that can change textures is
 	 * the framebuffer.
 	 *
-	 * Flush all CB and DB caches here because all buffers can be used
-	 * for write by both TC (with shader image stores) and CB/DB.
+	 * Wait for compute shaders because of possible transitions:
+	 * - FB write -> shader read
+	 * - shader write -> FB read
+	 *
+	 * DB caches are flushed on demand (using si_decompress_textures).
 	 */
 	sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
 			 SI_CONTEXT_INV_GLOBAL_L2 |
 			 SI_CONTEXT_FLUSH_AND_INV_CB |
-			 SI_CONTEXT_FLUSH_AND_INV_DB |
 			 SI_CONTEXT_CS_PARTIAL_FLUSH;
 
+	/* u_blitter doesn't invoke depth decompression when it does multiple
+	 * blits in a row, but the only case when it matters for DB is when
+	 * doing generate_mipmap. So here we flush DB manually between
+	 * individual generate_mipmap blits.
+	 * Note that lower mipmap levels aren't compressed.
+	 */
+	if (sctx->generate_mipmap_for_depth)
+		sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_DB;
+
 	/* Take the maximum of the old and new count. If the new count is lower,
 	 * dirtying is needed to disable the unbound colorbuffers.
 	 */
 	sctx->framebuffer.dirty_cbufs |=
 		(1 << MAX2(sctx->framebuffer.state.nr_cbufs, state->nr_cbufs)) - 1;
 	sctx->framebuffer.dirty_zsbuf |= sctx->framebuffer.state.zsbuf != state->zsbuf;
 
 	si_dec_framebuffer_counters(&sctx->framebuffer.state);
 	util_copy_framebuffer_state(&sctx->framebuffer.state, state);
 
@@ -3977,23 +3988,23 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
 	}
 
 	if (flags & PIPE_BARRIER_INDEX_BUFFER) {
 		/* Indices are read through TC L2 since VI.
 		 * L1 isn't used.
 		 */
 		if (sctx->screen->b.chip_class <= CIK)
 			sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
 	}
 
+	/* Depth and stencil are flushed in si_decompress_textures when needed. */
 	if (flags & PIPE_BARRIER_FRAMEBUFFER)
-		sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_CB |
-				 SI_CONTEXT_FLUSH_AND_INV_DB;
+		sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
 
 	if (flags & (PIPE_BARRIER_FRAMEBUFFER |
 		     PIPE_BARRIER_INDIRECT_BUFFER))
 		sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
 }
 
 static void *si_create_blend_custom(struct si_context *sctx, unsigned mode)
 {
 	struct pipe_blend_state blend;
 
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index d039e01..d13c8b7 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -1395,25 +1395,23 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 	    r600_get_strmout_en(&sctx->b)) {
 		sctx->b.flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC;
 	}
 
 	if (sctx->framebuffer.do_update_surf_dirtiness) {
 		/* Set the depth buffer as dirty. */
 		if (sctx->framebuffer.state.zsbuf) {
 			struct pipe_surface *surf = sctx->framebuffer.state.zsbuf;
 			struct r600_texture *rtex = (struct r600_texture *)surf->texture;
 
-			if (!rtex->tc_compatible_htile)
-				rtex->dirty_level_mask |= 1 << surf->u.tex.level;
+			rtex->dirty_level_mask |= 1 << surf->u.tex.level;
 
-			if (rtex->surface.flags & RADEON_SURF_SBUFFER &&
-			    (!rtex->tc_compatible_htile || !rtex->can_sample_s))
+			if (rtex->surface.flags & RADEON_SURF_SBUFFER)
 				rtex->stencil_dirty_level_mask |= 1 << surf->u.tex.level;
 		}
 		if (sctx->framebuffer.compressed_cb_mask) {
 			struct pipe_surface *surf;
 			struct r600_texture *rtex;
 			unsigned mask = sctx->framebuffer.compressed_cb_mask;
 
 			do {
 				unsigned i = u_bit_scan(&mask);
 				surf = sctx->framebuffer.state.cbufs[i];
-- 
2.7.4



More information about the mesa-dev mailing list