[Mesa-dev] [PATCH 1/8] gallium/radeon: add a heuristic enabling DCC for scanout surfaces (v2)

Wed Jun 29 14:20:02 UTC 2016

From: Marek Olšák <marek.olsak at amd.com>

DCC for displayable surfaces is allocated in a separate buffer and is
enabled or disabled based on PS invocations from 2 frames ago (to let
queries go idle) and the number of slow clears from the current frame.

At least an equivalent of 5 fullscreen draws or slow clears must be done
to enable DCC. (PS invocations / (width * height) + num_slow_clears >= 5)

Pipeline statistic queries are always active if a color buffer that can
have separate DCC is bound, even if separate DCC is disabled. That means
the window color buffer is always monitored and DCC is enabled only when
the situation is right.

The tracking of per-texture queries in r600_common_context is quite ugly,
but I don't see a better way.

The first fast clear always enables DCC. DCC decompression can disable it.
A later fast clear can enable it again. Enable/disable typically happens
only once per frame.

The impact is expected to be negligible because games usually don't have
a high level of overdraw. DCC usually activates when too much blending
is happening (smoke rendering) or when testing glClear performance and
CMASK isn't supported (Stoney).

v2: rename stuff, add assertions
---
 src/gallium/drivers/radeon/r600_pipe_common.c |  15 ++
 src/gallium/drivers/radeon/r600_pipe_common.h |  45 ++++-
 src/gallium/drivers/radeon/r600_texture.c     | 247 ++++++++++++++++++++++++++
 src/gallium/drivers/radeonsi/si_blit.c        |  15 +-
 src/gallium/drivers/radeonsi/si_state.c       |  15 ++
 src/gallium/drivers/radeonsi/si_state_draw.c  |   5 +-
 6 files changed, 338 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 06388d5..1a892ab 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -472,6 +472,21 @@ bool r600_common_context_init(struct r600_common_context *rctx,
 
 void r600_common_context_cleanup(struct r600_common_context *rctx)
 {
+	unsigned i,j;
+
+	/* Release DCC stats. */
+	for (i = 0; i < ARRAY_SIZE(rctx->dcc_stats); i++) {
+		assert(!rctx->dcc_stats[i].query_active);
+
+		for (j = 0; j < ARRAY_SIZE(rctx->dcc_stats[i].ps_stats); j++)
+			if (rctx->dcc_stats[i].ps_stats[j])
+				rctx->b.destroy_query(&rctx->b,
+						      rctx->dcc_stats[i].ps_stats[j]);
+
+		pipe_resource_reference((struct pipe_resource**)
+					&rctx->dcc_stats[i].tex, NULL);
+	}
+
 	if (rctx->gfx.cs)
 		rctx->ws->cs_destroy(rctx->gfx.cs);
 	if (rctx->dma.cs)
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index b411b23..ca8ec54 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -268,10 +268,30 @@ struct r600_texture {
 	/* Whether the texture is a displayable back buffer and needs DCC
 	 * decompression, which is expensive. Therefore, it's enabled only
 	 * if statistics suggest that it will pay off and it's allocated
-	 * separately. Limited to target == 2D and last_level == 0. If enabled,
-	 * dcc_offset contains the absolute GPUVM address, not the relative one.
+	 * separately. It can't be bound as a sampler by apps. Limited to
+	 * target == 2D and last_level == 0. If enabled, dcc_offset contains
+	 * the absolute GPUVM address, not the relative one.
 	 */
 	struct r600_resource		*dcc_separate_buffer;
+	/* When DCC is temporarily disabled, the separate buffer is here. */
+	struct r600_resource		*last_dcc_separate_buffer;
+	/* We need to track DCC dirtiness, because st/dri usually calls
+	 * flush_resource twice per frame (not a bug) and we don't wanna
+	 * decompress DCC twice. Also, the dirty tracking must be done even
+	 * if DCC isn't used, because it's required by the DCC usage analysis
+	 * for a possible future enablement.
+	 */
+	bool				separate_dcc_dirty;
+	/* Statistics gathering for the DCC enablement heuristic. */
+	bool				dcc_gather_statistics;
+	/* Estimate of how much this color buffer is written to in units of
+	 * full-screen draws: ps_invocations / (width * height)
+	 * Shader kills, late Z, and blending with trivial discards make it
+	 * inaccurate (we need to count CB updates, not PS invocations).
+	 */
+	unsigned			ps_draw_ratio;
+	/* The number of clears since the last DCC usage analysis. */
+	unsigned			num_slow_clears;
 
 	/* Counter that should be non-zero if the texture is bound to a
 	 * framebuffer. Implemented in radeonsi only.
@@ -544,6 +564,21 @@ struct r600_common_context {
 	float				sample_locations_8x[8][2];
 	float				sample_locations_16x[16][2];
 
+	/* Statistics gathering for the DCC enablement heuristic. It can't be
+	 * in r600_texture because r600_texture can be shared by multiple
+	 * contexts. This is for back buffers only. We shouldn't get too many
+	 * of those.
+	 */
+	struct {
+		struct r600_texture		*tex;
+		/* Query queue: 0 = usually active, 1 = waiting, 2 = readback. */
+		struct pipe_query		*ps_stats[3];
+		/* If all slots are used and another slot is needed,
+		 * the least recently used slot is evicted based on this. */
+		int64_t				last_use_timestamp;
+		bool				query_active;
+	} dcc_stats[2];
+
 	/* The list of all texture buffer objects in this context.
 	 * This list is walked when a buffer is invalidated/reallocated and
 	 * the GPU addresses are updated. */
@@ -703,6 +738,12 @@ struct pipe_surface *r600_create_surface_custom(struct pipe_context *pipe,
 						const struct pipe_surface *templ,
 						unsigned width, unsigned height);
 unsigned r600_translate_colorswap(enum pipe_format format, bool do_endian_swap);
+void vi_separate_dcc_start_query(struct pipe_context *ctx,
+				 struct r600_texture *tex);
+void vi_separate_dcc_stop_query(struct pipe_context *ctx,
+				struct r600_texture *tex);
+void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx,
+					     struct r600_texture *tex);
 void vi_dcc_clear_level(struct r600_common_context *rctx,
 			struct r600_texture *rtex,
 			unsigned level, unsigned clear_value);
diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c
index c45c5f2..5ed94c2 100644
--- a/src/gallium/drivers/radeon/r600_texture.c
+++ b/src/gallium/drivers/radeon/r600_texture.c
@@ -26,9 +26,11 @@
  */
 #include "r600_pipe_common.h"
 #include "r600_cs.h"
+#include "r600_query.h"
 #include "util/u_format.h"
 #include "util/u_memory.h"
 #include "util/u_pack_color.h"
+#include "os/os_time.h"
 #include <errno.h>
 #include <inttypes.h>
 
@@ -567,6 +569,7 @@ static void r600_texture_destroy(struct pipe_screen *screen,
 	}
 	pb_reference(&resource->buf, NULL);
 	r600_resource_reference(&rtex->dcc_separate_buffer, NULL);
+	r600_resource_reference(&rtex->last_dcc_separate_buffer, NULL);
 	FREE(rtex);
 }
 
@@ -1018,6 +1021,13 @@ r600_texture_create_object(struct pipe_screen *screen,
 	/* Applies to GCN. */
 	rtex->last_msaa_resolve_target_micro_mode = rtex->surface.micro_tile_mode;
 
+	/* We want to optimistically enable separate DCC on the first clear
+	 * for apps that don't call SwapBuffers or call it too late, so set
+	 * a very high number at the beginning. If the PS/draw ratio is too
+	 * low, the first DCC decompression will disable DCC.
+	 */
+	rtex->ps_draw_ratio = 100;
+
 	if (rtex->is_depth) {
 		if (!(base->flags & (R600_RESOURCE_FLAG_TRANSFER |
 				     R600_RESOURCE_FLAG_FLUSHED_DEPTH)) &&
@@ -1705,6 +1715,226 @@ unsigned r600_translate_colorswap(enum pipe_format format, bool do_endian_swap)
 	return ~0U;
 }
 
+/* PIPELINE_STAT-BASED DCC ENABLEMENT FOR DISPLAYABLE SURFACES */
+
+/**
+ * Return the per-context slot where DCC statistics queries for the texture live.
+ */
+static unsigned vi_get_context_dcc_stats_index(struct r600_common_context *rctx,
+					       struct r600_texture *tex)
+{
+	int i, empty_slot = -1;
+
+	for (i = 0; i < ARRAY_SIZE(rctx->dcc_stats); i++) {
+		/* Return if found. */
+		if (rctx->dcc_stats[i].tex == tex) {
+			rctx->dcc_stats[i].last_use_timestamp = os_time_get();
+			return i;
+		}
+
+		/* Record the first seen empty slot. */
+		if (empty_slot == -1 && !rctx->dcc_stats[i].tex)
+			empty_slot = i;
+	}
+
+	/* Not found. Remove the oldest member to make space in the array. */
+	if (empty_slot == -1) {
+		int oldest_slot = 0;
+
+		/* Find the oldest slot. */
+		for (i = 1; i < ARRAY_SIZE(rctx->dcc_stats); i++)
+			if (rctx->dcc_stats[oldest_slot].last_use_timestamp >
+			    rctx->dcc_stats[i].last_use_timestamp)
+				oldest_slot = i;
+
+		/* Clean up the oldest slot. */
+		if (rctx->dcc_stats[oldest_slot].query_active)
+			vi_separate_dcc_stop_query(&rctx->b,
+						   rctx->dcc_stats[oldest_slot].tex);
+
+		for (i = 0; i < ARRAY_SIZE(rctx->dcc_stats[oldest_slot].ps_stats); i++)
+			if (rctx->dcc_stats[oldest_slot].ps_stats[i]) {
+				rctx->b.destroy_query(&rctx->b,
+						      rctx->dcc_stats[oldest_slot].ps_stats[i]);
+				rctx->dcc_stats[oldest_slot].ps_stats[i] = NULL;
+			}
+
+		pipe_resource_reference((struct pipe_resource**)
+					&rctx->dcc_stats[oldest_slot].tex, NULL);
+		empty_slot = oldest_slot;
+	}
+
+	/* Add the texture to the new slot. */
+	pipe_resource_reference((struct pipe_resource**)&rctx->dcc_stats[empty_slot].tex,
+				&tex->resource.b.b);
+	rctx->dcc_stats[empty_slot].last_use_timestamp = os_time_get();
+	return empty_slot;
+}
+
+static struct pipe_query *
+vi_create_resuming_pipestats_query(struct pipe_context *ctx)
+{
+	struct r600_query_hw *query = (struct r600_query_hw*)
+		ctx->create_query(ctx, PIPE_QUERY_PIPELINE_STATISTICS, 0);
+
+	query->flags |= R600_QUERY_HW_FLAG_BEGIN_RESUMES;
+	return (struct pipe_query*)query;
+}
+
+/**
+ * Called when binding a color buffer.
+ */
+void vi_separate_dcc_start_query(struct pipe_context *ctx,
+				 struct r600_texture *tex)
+{
+	struct r600_common_context *rctx = (struct r600_common_context*)ctx;
+	unsigned i = vi_get_context_dcc_stats_index(rctx, tex);
+
+	assert(!rctx->dcc_stats[i].query_active);
+
+	if (!rctx->dcc_stats[i].ps_stats[0])
+		rctx->dcc_stats[i].ps_stats[0] = vi_create_resuming_pipestats_query(ctx);
+
+	/* begin or resume the query */
+	ctx->begin_query(ctx, rctx->dcc_stats[i].ps_stats[0]);
+	rctx->dcc_stats[i].query_active = true;
+}
+
+/**
+ * Called when unbinding a color buffer.
+ */
+void vi_separate_dcc_stop_query(struct pipe_context *ctx,
+				struct r600_texture *tex)
+{
+	struct r600_common_context *rctx = (struct r600_common_context*)ctx;
+	unsigned i = vi_get_context_dcc_stats_index(rctx, tex);
+
+	assert(rctx->dcc_stats[i].query_active);
+	assert(rctx->dcc_stats[i].ps_stats[0]);
+
+	/* pause or end the query */
+	ctx->end_query(ctx, rctx->dcc_stats[i].ps_stats[0]);
+	rctx->dcc_stats[i].query_active = false;
+}
+
+static bool vi_should_enable_separate_dcc(struct r600_texture *tex)
+{
+	/* The minimum number of fullscreen draws per frame that is required
+	 * to enable DCC. */
+	return tex->ps_draw_ratio + tex->num_slow_clears >= 5;
+}
+
+/* Called by fast clear. */
+static void vi_separate_dcc_try_enable(struct r600_common_context *rctx,
+				       struct r600_texture *tex)
+{
+	/* The intent is to use this with shared displayable back buffers,
+	 * but it's not strictly limited only to them.
+	 */
+	if (!tex->resource.is_shared ||
+	    !(tex->resource.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) ||
+	    tex->resource.b.b.target != PIPE_TEXTURE_2D ||
+	    tex->surface.last_level > 0 ||
+	    !tex->surface.dcc_size)
+		return;
+
+	if (tex->dcc_offset)
+		return; /* already enabled */
+
+	if (!vi_should_enable_separate_dcc(tex))
+		return; /* stats show that DCC decompression is too expensive */
+
+	assert(tex->surface.level[0].dcc_enabled);
+	assert(!tex->dcc_separate_buffer);
+
+	r600_texture_discard_cmask(rctx->screen, tex);
+
+	/* Get a DCC buffer. */
+	if (tex->last_dcc_separate_buffer) {
+		assert(tex->dcc_gather_statistics);
+		assert(!tex->dcc_separate_buffer);
+		tex->dcc_separate_buffer = tex->last_dcc_separate_buffer;
+		tex->last_dcc_separate_buffer = NULL;
+	} else {
+		tex->dcc_separate_buffer = (struct r600_resource*)
+			r600_aligned_buffer_create(rctx->b.screen, 0,
+						   PIPE_USAGE_DEFAULT,
+						   tex->surface.dcc_size,
+						   tex->surface.dcc_alignment);
+		if (!tex->dcc_separate_buffer)
+			return;
+
+		/* Enabling for the first time, so start the query. */
+		tex->dcc_gather_statistics = true;
+		vi_separate_dcc_start_query(&rctx->b, tex);
+	}
+
+	/* dcc_offset is the absolute GPUVM address. */
+	tex->dcc_offset = tex->dcc_separate_buffer->gpu_address;
+
+	/* no need to flag anything since this is called by fast clear that
+	 * flags framebuffer state
+	 */
+}
+
+/**
+ * Called by pipe_context::flush_resource, the place where DCC decompression
+ * takes place.
+ */
+void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx,
+					     struct r600_texture *tex)
+{
+	struct r600_common_context *rctx = (struct r600_common_context*)ctx;
+	unsigned i = vi_get_context_dcc_stats_index(rctx, tex);
+	bool query_active = rctx->dcc_stats[i].query_active;
+	bool disable = false;
+
+	if (rctx->dcc_stats[i].ps_stats[2]) {
+		union pipe_query_result result;
+
+		/* Read the results. */
+		ctx->get_query_result(ctx, rctx->dcc_stats[i].ps_stats[2],
+				      true, &result);
+		ctx->destroy_query(ctx, rctx->dcc_stats[i].ps_stats[2]);
+		rctx->dcc_stats[i].ps_stats[2] = NULL;
+
+		/* Compute the approximate number of fullscreen draws. */
+		tex->ps_draw_ratio =
+			result.pipeline_statistics.ps_invocations /
+			(tex->resource.b.b.width0 * tex->resource.b.b.height0);
+
+		disable = tex->dcc_separate_buffer &&
+			  !vi_should_enable_separate_dcc(tex);
+	}
+
+	tex->num_slow_clears = 0;
+
+	/* stop the statistics query for ps_stats[0] */
+	if (query_active)
+		vi_separate_dcc_stop_query(ctx, tex);
+
+	/* Move the queries in the queue by one. */
+	rctx->dcc_stats[i].ps_stats[2] = rctx->dcc_stats[i].ps_stats[1];
+	rctx->dcc_stats[i].ps_stats[1] = rctx->dcc_stats[i].ps_stats[0];
+	rctx->dcc_stats[i].ps_stats[0] = NULL;
+
+	/* create and start a new query as ps_stats[0] */
+	if (query_active)
+		vi_separate_dcc_start_query(ctx, tex);
+
+	if (disable) {
+		assert(!tex->last_dcc_separate_buffer);
+		tex->last_dcc_separate_buffer = tex->dcc_separate_buffer;
+		tex->dcc_separate_buffer = NULL;
+		tex->dcc_offset = 0;
+		/* no need to flag anything since this is called after
+		 * decompression that re-sets framebuffer state
+		 */
+	}
+}
+
+/* FAST COLOR CLEAR */
+
 static void evergreen_set_clear_color(struct r600_texture *rtex,
 				      enum pipe_format surface_format,
 				      const union pipe_color_union *color)
@@ -1966,6 +2196,22 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
 			continue;
 		}
 
+		/* Fast clear is the most appropriate place to enable DCC for
+		 * displayable surfaces.
+		 */
+		if (rctx->chip_class >= VI) {
+			vi_separate_dcc_try_enable(rctx, tex);
+
+			/* Stoney can't do a CMASK-based clear, so all clears are
+			 * considered to be hypothetically slow clears, which
+			 * is weighed when determining to enable separate DCC.
+			 */
+			if (tex->dcc_gather_statistics &&
+			    rctx->family == CHIP_STONEY)
+				tex->num_slow_clears++;
+		}
+
+		/* Try to clear DCC first, otherwise try CMASK. */
 		if (tex->dcc_offset && tex->surface.level[0].dcc_enabled) {
 			uint32_t reset_value;
 			bool clear_words_needed;
@@ -1982,6 +2228,7 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
 
 			if (clear_words_needed)
 				tex->dirty_level_mask |= 1 << fb->cbufs[i]->u.tex.level;
+			tex->separate_dcc_dirty = true;
 		} else {
 			/* Stoney/RB+ doesn't work with CMASK fast clear. */
 			if (rctx->family == CHIP_STONEY)
diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index 46daeac..a48c5bc 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -1055,10 +1055,23 @@ static void si_flush_resource(struct pipe_context *ctx,
 	struct r600_texture *rtex = (struct r600_texture*)res;
 
 	assert(res->target != PIPE_BUFFER);
+	assert(!rtex->dcc_separate_buffer || rtex->dcc_gather_statistics);
+
+	/* st/dri calls flush twice per frame (not a bug), this prevents double
+	 * decompression. */
+	if (rtex->dcc_separate_buffer && !rtex->separate_dcc_dirty)
+		return;
 
 	if (!rtex->is_depth && (rtex->cmask.size || rtex->dcc_offset)) {
 		si_blit_decompress_color(ctx, rtex, 0, res->last_level,
-					 0, util_max_layer(res, 0), false);
+					 0, util_max_layer(res, 0),
+					 rtex->dcc_separate_buffer != NULL);
+	}
+
+	/* Always do the analysis even if DCC is disabled at the moment. */
+	if (rtex->dcc_gather_statistics && rtex->separate_dcc_dirty) {
+		rtex->separate_dcc_dirty = false;
+		vi_separate_dcc_process_and_reset_stats(ctx, rtex);
 	}
 }
 
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 1cecb46..3718acf 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -2233,6 +2233,15 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
 	unsigned old_nr_samples = sctx->framebuffer.nr_samples;
 	int i;
 
+	for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
+		if (!sctx->framebuffer.state.cbufs[i])
+			continue;
+
+		rtex = (struct r600_texture*)sctx->framebuffer.state.cbufs[i]->texture;
+		if (rtex->dcc_gather_statistics)
+			vi_separate_dcc_stop_query(ctx, rtex);
+	}
+
 	/* Only flush TC when changing the framebuffer state, because
 	 * the only client not using TC that can change textures is
 	 * the framebuffer.
@@ -2304,6 +2313,12 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
 		r600_context_add_resource_size(ctx, surf->base.texture);
 
 		p_atomic_inc(&rtex->framebuffers_bound);
+
+		if (rtex->dcc_gather_statistics) {
+			/* Dirty tracking must be enabled for DCC usage analysis. */
+			sctx->framebuffer.compressed_cb_mask |= 1 << i;
+			vi_separate_dcc_start_query(ctx, rtex);
+		}
 	}
 	/* Set the second SPI format for possible dual-src blending. */
 	if (i == 1 && surf) {
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 5f866d5..def75ad 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -1039,7 +1039,10 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 			surf = sctx->framebuffer.state.cbufs[i];
 			rtex = (struct r600_texture*)surf->texture;
 
-			rtex->dirty_level_mask |= 1 << surf->u.tex.level;
+			if (rtex->fmask.size)
+				rtex->dirty_level_mask |= 1 << surf->u.tex.level;
+			if (rtex->dcc_gather_statistics)
+				rtex->separate_dcc_dirty = true;
 		} while (mask);
 	}
 
-- 
2.7.4