[Mesa-dev] [PATCH 5/6] gallium/radeon: add a heuristic dynamically enabling DCC for scanout surfaces

Fri Jun 24 11:09:13 UTC 2016

On 22.06.2016 20:29, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
>
> DCC for displayable surfaces is allocated in a separate buffer and is
> enabled or disabled based on PS invocations from 2 frames ago (to let
> queries go idle) and the number of slow clears from the current frame.
>
> At least an equivalent of 5 fullscreen draws or slow clears must be done
> to enable DCC. (PS invocations / (width * height) + num_slow_clears >= 5)
>
> Pipeline statistic queries are always active if a color buffer that can
> have separate DCC is bound, even if separate DCC is disabled. That means
> the window color buffer is always monitored and DCC is enabled only when
> the situation is right.
>
> The tracking of per-texture queries in r600_common_context is quite ugly,
> but I don't see a better way.
>
> The first fast clear always enables DCC. DCC decompression can disable it.
> A later fast clear can enable it again. Enable/disable typically happens
> only once per frame.
>
> The impact is expected to be negligible because games usually don't have
> a high level of overdraw. DCC usually activates when too much blending
> is happening (smoke rendering) or when testing glClear performance and
> CMASK isn't supported (Stoney).

Nice stuff. One corner case to think of: what happens when DCC is 
enabled for a texture that is currently bound? Needs the same treatment 
as when DCC is disabled, right?

More comments below...

> ---
>   src/gallium/drivers/radeon/r600_pipe_common.c |  15 ++
>   src/gallium/drivers/radeon/r600_pipe_common.h |  40 +++++
>   src/gallium/drivers/radeon/r600_texture.c     | 239 ++++++++++++++++++++++++++
>   src/gallium/drivers/radeonsi/si_blit.c        |  14 +-
>   src/gallium/drivers/radeonsi/si_state.c       |  15 ++
>   src/gallium/drivers/radeonsi/si_state_draw.c  |   5 +-
>   6 files changed, 326 insertions(+), 2 deletions(-)
>
> diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
> index 5d4a679..66afcfa 100644
> --- a/src/gallium/drivers/radeon/r600_pipe_common.c
> +++ b/src/gallium/drivers/radeon/r600_pipe_common.c
> @@ -397,6 +397,21 @@ bool r600_common_context_init(struct r600_common_context *rctx,
>
>   void r600_common_context_cleanup(struct r600_common_context *rctx)
>   {
> +	unsigned i,j;
> +
> +	/* Release DCC stats. */
> +	for (i = 0; i < ARRAY_SIZE(rctx->dcc_stats); i++) {
> +		assert(!rctx->dcc_stats[i].query_active);
> +
> +		for (j = 0; j < ARRAY_SIZE(rctx->dcc_stats[i].ps_stats); j++)
> +			if (rctx->dcc_stats[i].ps_stats[j])
> +				rctx->b.destroy_query(&rctx->b,
> +						      rctx->dcc_stats[i].ps_stats[j]);
> +
> +		pipe_resource_reference((struct pipe_resource**)
> +					&rctx->dcc_stats[i].tex, NULL);
> +	}
> +
>   	if (rctx->gfx.cs)
>   		rctx->ws->cs_destroy(rctx->gfx.cs);
>   	if (rctx->dma.cs)
> diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
> index 92cba13..cdec907 100644
> --- a/src/gallium/drivers/radeon/r600_pipe_common.h
> +++ b/src/gallium/drivers/radeon/r600_pipe_common.h
> @@ -272,6 +272,25 @@ struct r600_texture {
>   	 * dcc_offset contains the absolute GPUVM address, not the relative one.
>   	 */
>   	struct r600_resource		*dcc_separate_buffer;
> +	/* When DCC is temporarily disabled, the separate buffer is here. */
> +	struct r600_resource		*last_dcc_separate_buffer;
> +	/* We need to track DCC dirtiness, because st/dri usually calls
> +	 * flush_resource twice per frame (not a bug) and we don't wanna
> +	 * decompress DCC twice. Also, the dirty tracking must be done even
> +	 * if DCC isn't used, because it's required by the DCC usage analysis
> +	 * for a possible future enablement.
> +	 */
> +	bool				separate_dcc_dirty;
> +	/* Statistics gathering for the DCC enablement heuristic. */
> +	bool				dcc_gather_statistics;
> +	/* Estimate of how much this color buffer is written to in units of
> +	 * full-screen draws: ps_invocations / (width * height)
> +	 * Shader kills, late Z, and blending with trivial discards make it
> +	 * inaccurate (we need to count CB updates, not PS invocations).
> +	 */
> +	unsigned			ps_draw_ratio;
> +	/* The number of clears since the last DCC usage analysis. */
> +	unsigned			num_slow_clears;
>
>   	/* Counter that should be non-zero if the texture is bound to a
>   	 * framebuffer. Implemented in radeonsi only.
> @@ -536,6 +555,21 @@ struct r600_common_context {
>   	float				sample_locations_8x[8][2];
>   	float				sample_locations_16x[16][2];
>
> +	/* Statistics gathering for the DCC enablement heuristic. It can't be
> +	 * in r600_texture because r600_texture can be shared by multiple
> +	 * contexts. This is for back buffers only. We shouldn't get too many
> +	 * of those.
> +	 */
> +	struct {
> +		struct r600_texture		*tex;
> +		/* Query queue: 0 = usually active, 1 = waiting, 2 = readback. */
> +		struct pipe_query		*ps_stats[3];
> +		/* If all slots are used and another slot is needed,
> +		 * the least recently used slot is evicted based on this. */
> +		int64_t				last_use_timestamp;
> +		bool				query_active;
> +	} dcc_stats[2];
> +
>   	/* The list of all texture buffer objects in this context.
>   	 * This list is walked when a buffer is invalidated/reallocated and
>   	 * the GPU addresses are updated. */
> @@ -688,6 +722,12 @@ struct pipe_surface *r600_create_surface_custom(struct pipe_context *pipe,
>   						const struct pipe_surface *templ,
>   						unsigned width, unsigned height);
>   unsigned r600_translate_colorswap(enum pipe_format format, bool do_endian_swap);
> +void vi_separate_dcc_start_query(struct pipe_context *ctx,
> +				 struct r600_texture *tex);
> +void vi_separate_dcc_stop_query(struct pipe_context *ctx,
> +				struct r600_texture *tex);
> +void vi_separate_dcc_analyze_stats(struct pipe_context *ctx,
> +				   struct r600_texture *tex);
>   void vi_dcc_clear_level(struct r600_common_context *rctx,
>   			struct r600_texture *rtex,
>   			unsigned level, unsigned clear_value);
> diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c
> index 23be5ed..7295ab6 100644
> --- a/src/gallium/drivers/radeon/r600_texture.c
> +++ b/src/gallium/drivers/radeon/r600_texture.c
> @@ -26,9 +26,11 @@
>    */
>   #include "r600_pipe_common.h"
>   #include "r600_cs.h"
> +#include "r600_query.h"
>   #include "util/u_format.h"
>   #include "util/u_memory.h"
>   #include "util/u_pack_color.h"
> +#include "os/os_time.h"
>   #include <errno.h>
>   #include <inttypes.h>
>
> @@ -567,6 +569,7 @@ static void r600_texture_destroy(struct pipe_screen *screen,
>   	}
>   	pb_reference(&resource->buf, NULL);
>   	r600_resource_reference(&rtex->dcc_separate_buffer, NULL);
> +	r600_resource_reference(&rtex->last_dcc_separate_buffer, NULL);
>   	FREE(rtex);
>   }
>
> @@ -1017,6 +1020,7 @@ r600_texture_create_object(struct pipe_screen *screen,
>   	rtex->non_disp_tiling = rtex->is_depth && rtex->surface.level[0].mode >= RADEON_SURF_MODE_1D;
>   	/* Applies to GCN. */
>   	rtex->last_msaa_resolve_target_micro_mode = rtex->surface.micro_tile_mode;
> +	rtex->ps_draw_ratio = 100; /* start with a sufficiently high number */
>
>   	if (rtex->is_depth) {
>   		if (!(base->flags & (R600_RESOURCE_FLAG_TRANSFER |
> @@ -1705,6 +1709,224 @@ unsigned r600_translate_colorswap(enum pipe_format format, bool do_endian_swap)
>   	return ~0U;
>   }
>
> +/* PIPELINE_STAT-BASED DCC ENABLEMENT FOR DISPLAYABLE SURFACES */
> +
> +/**
> + * Return the per-context slot where DCC statistics queries for the texture live.
> + */
> +static unsigned vi_get_context_dcc_stats_index(struct r600_common_context *rctx,
> +					       struct r600_texture *tex)
> +{
> +	int i, empty_slot = -1;
> +
> +	for (i = 0; i < ARRAY_SIZE(rctx->dcc_stats); i++) {
> +		/* Return if found. */
> +		if (rctx->dcc_stats[i].tex == tex) {
> +			rctx->dcc_stats[i].last_use_timestamp = os_time_get();
> +			return i;
> +		}
> +
> +		/* Record the first seen empty slot. */
> +		if (empty_slot == -1 && !rctx->dcc_stats[i].tex)
> +			empty_slot = i;
> +	}
> +
> +	/* Not found. Remove the oldest member to make space in the array. */
> +	if (empty_slot == -1) {
> +		int oldest_slot = 0;
> +
> +		/* Find the oldest slot. */
> +		for (i = 1; i < ARRAY_SIZE(rctx->dcc_stats); i++)
> +			if (rctx->dcc_stats[oldest_slot].last_use_timestamp >
> +			    rctx->dcc_stats[i].last_use_timestamp)
> +				oldest_slot = i;
> +
> +		/* Clean up the oldest slot. */
> +		if (rctx->dcc_stats[oldest_slot].query_active)
> +			vi_separate_dcc_stop_query(&rctx->b,
> +						   rctx->dcc_stats[oldest_slot].tex);
> +
> +		for (i = 0; i < ARRAY_SIZE(rctx->dcc_stats[oldest_slot].ps_stats); i++)
> +			if (rctx->dcc_stats[oldest_slot].ps_stats[i]) {
> +				rctx->b.destroy_query(&rctx->b,
> +						      rctx->dcc_stats[oldest_slot].ps_stats[i]);
> +				rctx->dcc_stats[oldest_slot].ps_stats[i] = NULL;
> +			}
> +
> +		pipe_resource_reference((struct pipe_resource**)
> +					&rctx->dcc_stats[oldest_slot].tex, NULL);
> +		empty_slot = oldest_slot;
> +	}
> +
> +	/* Add the texture to the new slot. */
> +	pipe_resource_reference((struct pipe_resource**)&rctx->dcc_stats[empty_slot].tex,
> +				&tex->resource.b.b);
> +	rctx->dcc_stats[empty_slot].last_use_timestamp = os_time_get();
> +	return empty_slot;
> +}
> +
> +static struct pipe_query *
> +vi_create_resuming_pipestats_query(struct pipe_context *ctx)
> +{
> +	struct r600_query_hw *query = (struct r600_query_hw*)
> +		ctx->create_query(ctx, PIPE_QUERY_PIPELINE_STATISTICS, 0);
> +
> +	query->flags |= R600_QUERY_HW_FLAG_BEGIN_RESUMES;
> +	return (struct pipe_query*)query;
> +}
> +
> +/**
> + * Called when binding a color buffer.
> + */
> +void vi_separate_dcc_start_query(struct pipe_context *ctx,
> +				 struct r600_texture *tex)
> +{
> +	struct r600_common_context *rctx = (struct r600_common_context*)ctx;
> +	unsigned i = vi_get_context_dcc_stats_index(rctx, tex);
> +
> +	assert(!rctx->dcc_stats[i].query_active);
> +
> +	if (!rctx->dcc_stats[i].ps_stats[0])
> +		rctx->dcc_stats[i].ps_stats[0] = vi_create_resuming_pipestats_query(ctx);
> +
> +	/* begin or resume the query */
> +	ctx->begin_query(ctx, rctx->dcc_stats[i].ps_stats[0]);
> +	rctx->dcc_stats[i].query_active = true;
> +}
> +
> +/**
> + * Called when unbinding a color buffer.
> + */
> +void vi_separate_dcc_stop_query(struct pipe_context *ctx,
> +				struct r600_texture *tex)
> +{
> +	struct r600_common_context *rctx = (struct r600_common_context*)ctx;
> +	unsigned i = vi_get_context_dcc_stats_index(rctx, tex);
> +
> +	assert(rctx->dcc_stats[i].query_active);
> +	assert(rctx->dcc_stats[i].ps_stats[0]);
> +
> +	/* pause or end the query */
> +	ctx->end_query(ctx, rctx->dcc_stats[i].ps_stats[0]);
> +	rctx->dcc_stats[i].query_active = false;
> +}
> +
> +static bool vi_can_enable_separate_dcc(struct r600_texture *tex)
> +{
> +	/* The minimum number of fullscreen draws per frame that is required
> +	 * to enable DCC. */
> +	return tex->ps_draw_ratio + tex->num_slow_clears >= 5;
> +}

Rename this function to vi_should_enable_separate_dcc or similar.

> +
> +/* Called by fast clear. */
> +static void vi_separate_dcc_try_enable(struct r600_common_context *rctx,
> +				       struct r600_texture *tex)
> +{
> +	/* The intent is to use this with shared displayable back buffers,
> +	 * but it's not strictly limited only to them.
> +	 */
> +	if (!tex->resource.is_shared ||
> +	    !(tex->resource.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) ||
> +	    tex->resource.b.b.target != PIPE_TEXTURE_2D ||
> +	    tex->surface.last_level > 0 ||
> +	    !tex->surface.dcc_size)
> +		return;
> +
> +	if (tex->dcc_offset)
> +		return; /* already enabled */
> +
> +	if (!vi_can_enable_separate_dcc(tex))
> +		return; /* stats show that DCC decompression is too expensive */
> +
> +	assert(tex->surface.level[0].dcc_enabled);
> +	assert(!tex->dcc_separate_buffer);
> +
> +	r600_texture_discard_cmask(rctx->screen, tex);
> +
> +	/* Get a DCC buffer. */
> +	if (tex->last_dcc_separate_buffer) {
> +		tex->dcc_separate_buffer = tex->last_dcc_separate_buffer;
> +		tex->last_dcc_separate_buffer = NULL;
> +	} else {
> +		tex->dcc_separate_buffer = (struct r600_resource*)
> +			r600_aligned_buffer_create(rctx->b.screen, 0,
> +						   PIPE_USAGE_DEFAULT,
> +						   tex->surface.dcc_size,
> +						   tex->surface.dcc_alignment);
> +		if (!tex->dcc_separate_buffer)
> +			return;
> +
> +		/* Enabling for the first time, so start the query. */
> +		tex->dcc_gather_statistics = true;
> +		vi_separate_dcc_start_query(&rctx->b, tex);

I think it would be cleaner to put the statistics enablement above the
vi_can_enable_separate_dcc/vi_should_enable_separate_dcc check, since
then the code doesn't rely on a magic initialization of ps_draw_ratio.
Generally it would separate the statistics gathering more cleanly from
the enable/disable.

> +	}
> +
> +	/* dcc_offset is the absolute GPUVM address. */
> +	tex->dcc_offset = tex->dcc_separate_buffer->gpu_address;
> +
> +	/* no need to flag anything since this is called by fast clear that
> +	 * flags framebuffer state
> +	 */
> +}
> +
> +/**
> + * Called by pipe_context::flush_resource, the place where DCC decompression
> + * takes place.
> + */
> +void vi_separate_dcc_analyze_stats(struct pipe_context *ctx,
> +				   struct r600_texture *tex)

Bike-shedding: I'd prefer a name like _gather_stats or _collect_stats.

> +{
> +	struct r600_common_context *rctx = (struct r600_common_context*)ctx;
> +	unsigned i = vi_get_context_dcc_stats_index(rctx, tex);
> +	bool query_active = rctx->dcc_stats[i].query_active;
> +	bool disable = false;
> +
> +	if (rctx->dcc_stats[i].ps_stats[2]) {
> +		union pipe_query_result result;
> +
> +		/* Read the results. */
> +		ctx->get_query_result(ctx, rctx->dcc_stats[i].ps_stats[2],
> +				      true, &result);

What if this stalls? (get_query_result is called with "true" as its
third argument here.)

> +		ctx->destroy_query(ctx, rctx->dcc_stats[i].ps_stats[2]);
> +		rctx->dcc_stats[i].ps_stats[2] = NULL;
> +
> +		/* Compute the approximate number of fullscreen draws. */
> +		tex->ps_draw_ratio =
> +			result.pipeline_statistics.ps_invocations /
> +			(tex->resource.b.b.width0 * tex->resource.b.b.height0);
> +
> +		disable = tex->dcc_separate_buffer &&
> +			  !vi_can_enable_separate_dcc(tex);
> +	}
> +
> +	tex->num_slow_clears = 0;
> +
> +	/* stop the statistics query for ps_stats[0] */
> +	if (query_active)
> +		vi_separate_dcc_stop_query(ctx, tex);
> +
> +	/* Move the queries in the queue by one. */
> +	rctx->dcc_stats[i].ps_stats[2] = rctx->dcc_stats[i].ps_stats[1];
> +	rctx->dcc_stats[i].ps_stats[1] = rctx->dcc_stats[i].ps_stats[0];
> +	rctx->dcc_stats[i].ps_stats[0] = NULL;
> +
> +	/* create and start a new query as ps_stats[0] */
> +	if (query_active)
> +		vi_separate_dcc_start_query(ctx, tex);
> +
> +	if (disable) {
> +		assert(!tex->last_dcc_separate_buffer);
> +		tex->last_dcc_separate_buffer = tex->dcc_separate_buffer;
> +		tex->dcc_separate_buffer = NULL;
> +		tex->dcc_offset = 0;
> +		/* no need to flag anything since this is called after
> +		 * decompression that re-sets framebuffer state
> +		 */
> +	}

DCC disabling logic shouldn't be in a function called *_analyze_stats. 
Also, I find it clearer to reset tex->num_slow_clears outside, in 
flush_resource.

Nicolai

> +}
> +
> +/* FAST COLOR CLEAR */
> +
>   static void evergreen_set_clear_color(struct r600_texture *rtex,
>   				      enum pipe_format surface_format,
>   				      const union pipe_color_union *color)
> @@ -1966,6 +2188,22 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
>   			continue;
>   		}
>
> +		/* Fast clear is the most appropriate place to enable DCC for
> +		 * displayable surfaces.
> +		 */
> +		if (rctx->chip_class >= VI) {
> +			vi_separate_dcc_try_enable(rctx, tex);
> +
> +			/* Stoney can't do a CMASK-based clear, so all clears are
> +			 * considered to be hypothetically slow clears, which
> +			 * is weighed when determining to enable separate DCC.
> +			 */
> +			if (tex->dcc_gather_statistics &&
> +			    rctx->family == CHIP_STONEY)
> +				tex->num_slow_clears++;
> +		}
> +
> +		/* Try to clear DCC first, otherwise try CMASK. */
>   		if (tex->dcc_offset && tex->surface.level[0].dcc_enabled) {
>   			uint32_t reset_value;
>   			bool clear_words_needed;
> @@ -1982,6 +2220,7 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
>
>   			if (clear_words_needed)
>   				tex->dirty_level_mask |= 1 << fb->cbufs[i]->u.tex.level;
> +			tex->separate_dcc_dirty = true;
>   		} else {
>   			/* Stoney/RB+ doesn't work with CMASK fast clear. */
>   			if (rctx->family == CHIP_STONEY)
> diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
> index 73a72e0..ece9d42 100644
> --- a/src/gallium/drivers/radeonsi/si_blit.c
> +++ b/src/gallium/drivers/radeonsi/si_blit.c
> @@ -1056,9 +1056,21 @@ static void si_flush_resource(struct pipe_context *ctx,
>
>   	assert(res->target != PIPE_BUFFER);
>
> +	/* st/dri calls flush twice per frame (not a bug), this prevents double
> +	 * decompression. */
> +	if (rtex->dcc_separate_buffer && !rtex->separate_dcc_dirty)
> +		return;
> +
>   	if (!rtex->is_depth && (rtex->cmask.size || rtex->dcc_offset)) {
>   		si_blit_decompress_color(ctx, rtex, 0, res->last_level,
> -					 0, util_max_layer(res, 0), false);
> +					 0, util_max_layer(res, 0),
> +					 rtex->dcc_separate_buffer != NULL);
> +	}
> +
> +	/* Always do the analysis even if DCC is disabled at the moment. */
> +	if (rtex->dcc_gather_statistics && rtex->separate_dcc_dirty) {
> +		rtex->separate_dcc_dirty = false;
> +		vi_separate_dcc_analyze_stats(ctx, rtex);
>   	}
>   }
>
> diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
> index 492a670..d0c3bde 100644
> --- a/src/gallium/drivers/radeonsi/si_state.c
> +++ b/src/gallium/drivers/radeonsi/si_state.c
> @@ -2242,6 +2242,15 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
>   	unsigned old_nr_samples = sctx->framebuffer.nr_samples;
>   	int i;
>
> +	for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
> +		if (!sctx->framebuffer.state.cbufs[i])
> +			continue;
> +
> +		rtex = (struct r600_texture*)sctx->framebuffer.state.cbufs[i]->texture;
> +		if (rtex->dcc_gather_statistics)
> +			vi_separate_dcc_stop_query(ctx, rtex);
> +	}
> +
>   	/* Only flush TC when changing the framebuffer state, because
>   	 * the only client not using TC that can change textures is
>   	 * the framebuffer.
> @@ -2308,6 +2317,12 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
>   		r600_context_add_resource_size(ctx, surf->base.texture);
>
>   		p_atomic_inc(&rtex->framebuffers_bound);
> +
> +		if (rtex->dcc_gather_statistics) {
> +			/* Dirty tracking must be enabled for DCC usage analysis. */
> +			sctx->framebuffer.compressed_cb_mask |= 1 << i;
> +			vi_separate_dcc_start_query(ctx, rtex);
> +		}
>   	}
>   	/* Set the second SPI format for possible dual-src blending. */
>   	if (i == 1 && surf) {
> diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
> index 641becf..d980d7b 100644
> --- a/src/gallium/drivers/radeonsi/si_state_draw.c
> +++ b/src/gallium/drivers/radeonsi/si_state_draw.c
> @@ -981,7 +981,10 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
>   			surf = sctx->framebuffer.state.cbufs[i];
>   			rtex = (struct r600_texture*)surf->texture;
>
> -			rtex->dirty_level_mask |= 1 << surf->u.tex.level;
> +			if (rtex->fmask.size)
> +				rtex->dirty_level_mask |= 1 << surf->u.tex.level;
> +			if (rtex->dcc_gather_statistics)
> +				rtex->separate_dcc_dirty = true;
>   		} while (mask);
>   	}
>
>