[Mesa-dev] [PATCH 5/6] gallium/radeon: add a heuristic dynamically enabling DCC for scanout surfaces
Nicolai Hähnle
nhaehnle at gmail.com
Fri Jun 24 11:09:13 UTC 2016
On 22.06.2016 20:29, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
>
> DCC for displayable surfaces is allocated in a separate buffer and is
> enabled or disabled based on PS invocations from 2 frames ago (to let
> queries go idle) and the number of slow clears from the current frame.
>
> At least an equivalent of 5 fullscreen draws or slow clears must be done
> to enable DCC. (PS invocations / (width * height) + num_slow_clears >= 5)
>
> Pipeline statistic queries are always active if a color buffer that can
> have separate DCC is bound, even if separate DCC is disabled. That means
> the window color buffer is always monitored and DCC is enabled only when
> the situation is right.
>
> The tracking of per-texture queries in r600_common_context is quite ugly,
> but I don't see a better way.
>
> The first fast clear always enables DCC. DCC decompression can disable it.
> A later fast clear can enable it again. Enable/disable typically happens
> only once per frame.
>
> The impact is expected to be negligible because games usually don't have
> a high level of overdraw. DCC usually activates when too much blending
> is happening (smoke rendering) or when testing glClear performance and
> CMASK isn't supported (Stoney).
Nice stuff. One corner case to think of: what happens when DCC is
enabled for a texture that is currently bound? Needs the same treatment
as when DCC is disabled, right?
More comments below...
> ---
> src/gallium/drivers/radeon/r600_pipe_common.c | 15 ++
> src/gallium/drivers/radeon/r600_pipe_common.h | 40 +++++
> src/gallium/drivers/radeon/r600_texture.c | 239 ++++++++++++++++++++++++++
> src/gallium/drivers/radeonsi/si_blit.c | 14 +-
> src/gallium/drivers/radeonsi/si_state.c | 15 ++
> src/gallium/drivers/radeonsi/si_state_draw.c | 5 +-
> 6 files changed, 326 insertions(+), 2 deletions(-)
>
> diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
> index 5d4a679..66afcfa 100644
> --- a/src/gallium/drivers/radeon/r600_pipe_common.c
> +++ b/src/gallium/drivers/radeon/r600_pipe_common.c
> @@ -397,6 +397,21 @@ bool r600_common_context_init(struct r600_common_context *rctx,
>
> void r600_common_context_cleanup(struct r600_common_context *rctx)
> {
> + unsigned i,j;
> +
> + /* Release DCC stats. */
> + for (i = 0; i < ARRAY_SIZE(rctx->dcc_stats); i++) {
> + assert(!rctx->dcc_stats[i].query_active);
> +
> + for (j = 0; j < ARRAY_SIZE(rctx->dcc_stats[i].ps_stats); j++)
> + if (rctx->dcc_stats[i].ps_stats[j])
> + rctx->b.destroy_query(&rctx->b,
> + rctx->dcc_stats[i].ps_stats[j]);
> +
> + pipe_resource_reference((struct pipe_resource**)
> + &rctx->dcc_stats[i].tex, NULL);
> + }
> +
> if (rctx->gfx.cs)
> rctx->ws->cs_destroy(rctx->gfx.cs);
> if (rctx->dma.cs)
> diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
> index 92cba13..cdec907 100644
> --- a/src/gallium/drivers/radeon/r600_pipe_common.h
> +++ b/src/gallium/drivers/radeon/r600_pipe_common.h
> @@ -272,6 +272,25 @@ struct r600_texture {
> * dcc_offset contains the absolute GPUVM address, not the relative one.
> */
> struct r600_resource *dcc_separate_buffer;
> + /* When DCC is temporarily disabled, the separate buffer is here. */
> + struct r600_resource *last_dcc_separate_buffer;
> + /* We need to track DCC dirtiness, because st/dri usually calls
> + * flush_resource twice per frame (not a bug) and we don't wanna
> + * decompress DCC twice. Also, the dirty tracking must be done even
> + * if DCC isn't used, because it's required by the DCC usage analysis
> + * for a possible future enablement.
> + */
> + bool separate_dcc_dirty;
> + /* Statistics gathering for the DCC enablement heuristic. */
> + bool dcc_gather_statistics;
> + /* Estimate of how much this color buffer is written to in units of
> + * full-screen draws: ps_invocations / (width * height)
> + * Shader kills, late Z, and blending with trivial discards make it
> + * inaccurate (we need to count CB updates, not PS invocations).
> + */
> + unsigned ps_draw_ratio;
> + /* The number of clears since the last DCC usage analysis. */
> + unsigned num_slow_clears;
>
> /* Counter that should be non-zero if the texture is bound to a
> * framebuffer. Implemented in radeonsi only.
> @@ -536,6 +555,21 @@ struct r600_common_context {
> float sample_locations_8x[8][2];
> float sample_locations_16x[16][2];
>
> + /* Statistics gathering for the DCC enablement heuristic. It can't be
> + * in r600_texture because r600_texture can be shared by multiple
> + * contexts. This is for back buffers only. We shouldn't get too many
> + * of those.
> + */
> + struct {
> + struct r600_texture *tex;
> + /* Query queue: 0 = usually active, 1 = waiting, 2 = readback. */
> + struct pipe_query *ps_stats[3];
> + /* If all slots are used and another slot is needed,
> + * the least recently used slot is evicted based on this. */
> + int64_t last_use_timestamp;
> + bool query_active;
> + } dcc_stats[2];
> +
> /* The list of all texture buffer objects in this context.
> * This list is walked when a buffer is invalidated/reallocated and
> * the GPU addresses are updated. */
> @@ -688,6 +722,12 @@ struct pipe_surface *r600_create_surface_custom(struct pipe_context *pipe,
> const struct pipe_surface *templ,
> unsigned width, unsigned height);
> unsigned r600_translate_colorswap(enum pipe_format format, bool do_endian_swap);
> +void vi_separate_dcc_start_query(struct pipe_context *ctx,
> + struct r600_texture *tex);
> +void vi_separate_dcc_stop_query(struct pipe_context *ctx,
> + struct r600_texture *tex);
> +void vi_separate_dcc_analyze_stats(struct pipe_context *ctx,
> + struct r600_texture *tex);
> void vi_dcc_clear_level(struct r600_common_context *rctx,
> struct r600_texture *rtex,
> unsigned level, unsigned clear_value);
> diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c
> index 23be5ed..7295ab6 100644
> --- a/src/gallium/drivers/radeon/r600_texture.c
> +++ b/src/gallium/drivers/radeon/r600_texture.c
> @@ -26,9 +26,11 @@
> */
> #include "r600_pipe_common.h"
> #include "r600_cs.h"
> +#include "r600_query.h"
> #include "util/u_format.h"
> #include "util/u_memory.h"
> #include "util/u_pack_color.h"
> +#include "os/os_time.h"
> #include <errno.h>
> #include <inttypes.h>
>
> @@ -567,6 +569,7 @@ static void r600_texture_destroy(struct pipe_screen *screen,
> }
> pb_reference(&resource->buf, NULL);
> r600_resource_reference(&rtex->dcc_separate_buffer, NULL);
> + r600_resource_reference(&rtex->last_dcc_separate_buffer, NULL);
> FREE(rtex);
> }
>
> @@ -1017,6 +1020,7 @@ r600_texture_create_object(struct pipe_screen *screen,
> rtex->non_disp_tiling = rtex->is_depth && rtex->surface.level[0].mode >= RADEON_SURF_MODE_1D;
> /* Applies to GCN. */
> rtex->last_msaa_resolve_target_micro_mode = rtex->surface.micro_tile_mode;
> + rtex->ps_draw_ratio = 100; /* start with a sufficiently high number */
>
> if (rtex->is_depth) {
> if (!(base->flags & (R600_RESOURCE_FLAG_TRANSFER |
> @@ -1705,6 +1709,224 @@ unsigned r600_translate_colorswap(enum pipe_format format, bool do_endian_swap)
> return ~0U;
> }
>
> +/* PIPELINE_STAT-BASED DCC ENABLEMENT FOR DISPLAYABLE SURFACES */
> +
> +/**
> + * Return the per-context slot where DCC statistics queries for the texture live.
> + */
> +static unsigned vi_get_context_dcc_stats_index(struct r600_common_context *rctx,
> + struct r600_texture *tex)
> +{
> + int i, empty_slot = -1;
> +
> + for (i = 0; i < ARRAY_SIZE(rctx->dcc_stats); i++) {
> + /* Return if found. */
> + if (rctx->dcc_stats[i].tex == tex) {
> + rctx->dcc_stats[i].last_use_timestamp = os_time_get();
> + return i;
> + }
> +
> + /* Record the first seen empty slot. */
> + if (empty_slot == -1 && !rctx->dcc_stats[i].tex)
> + empty_slot = i;
> + }
> +
> + /* Not found. Remove the oldest member to make space in the array. */
> + if (empty_slot == -1) {
> + int oldest_slot = 0;
> +
> + /* Find the oldest slot. */
> + for (i = 1; i < ARRAY_SIZE(rctx->dcc_stats); i++)
> + if (rctx->dcc_stats[oldest_slot].last_use_timestamp >
> + rctx->dcc_stats[i].last_use_timestamp)
> + oldest_slot = i;
> +
> + /* Clean up the oldest slot. */
> + if (rctx->dcc_stats[oldest_slot].query_active)
> + vi_separate_dcc_stop_query(&rctx->b,
> + rctx->dcc_stats[oldest_slot].tex);
> +
> + for (i = 0; i < ARRAY_SIZE(rctx->dcc_stats[oldest_slot].ps_stats); i++)
> + if (rctx->dcc_stats[oldest_slot].ps_stats[i]) {
> + rctx->b.destroy_query(&rctx->b,
> + rctx->dcc_stats[oldest_slot].ps_stats[i]);
> + rctx->dcc_stats[oldest_slot].ps_stats[i] = NULL;
> + }
> +
> + pipe_resource_reference((struct pipe_resource**)
> + &rctx->dcc_stats[oldest_slot].tex, NULL);
> + empty_slot = oldest_slot;
> + }
> +
> + /* Add the texture to the new slot. */
> + pipe_resource_reference((struct pipe_resource**)&rctx->dcc_stats[empty_slot].tex,
> + &tex->resource.b.b);
> + rctx->dcc_stats[empty_slot].last_use_timestamp = os_time_get();
> + return empty_slot;
> +}
> +
> +static struct pipe_query *
> +vi_create_resuming_pipestats_query(struct pipe_context *ctx)
> +{
> + struct r600_query_hw *query = (struct r600_query_hw*)
> + ctx->create_query(ctx, PIPE_QUERY_PIPELINE_STATISTICS, 0);
> +
> + query->flags |= R600_QUERY_HW_FLAG_BEGIN_RESUMES;
> + return (struct pipe_query*)query;
> +}
> +
> +/**
> + * Called when binding a color buffer.
> + */
> +void vi_separate_dcc_start_query(struct pipe_context *ctx,
> + struct r600_texture *tex)
> +{
> + struct r600_common_context *rctx = (struct r600_common_context*)ctx;
> + unsigned i = vi_get_context_dcc_stats_index(rctx, tex);
> +
> + assert(!rctx->dcc_stats[i].query_active);
> +
> + if (!rctx->dcc_stats[i].ps_stats[0])
> + rctx->dcc_stats[i].ps_stats[0] = vi_create_resuming_pipestats_query(ctx);
> +
> + /* begin or resume the query */
> + ctx->begin_query(ctx, rctx->dcc_stats[i].ps_stats[0]);
> + rctx->dcc_stats[i].query_active = true;
> +}
> +
> +/**
> + * Called when unbinding a color buffer.
> + */
> +void vi_separate_dcc_stop_query(struct pipe_context *ctx,
> + struct r600_texture *tex)
> +{
> + struct r600_common_context *rctx = (struct r600_common_context*)ctx;
> + unsigned i = vi_get_context_dcc_stats_index(rctx, tex);
> +
> + assert(rctx->dcc_stats[i].query_active);
> + assert(rctx->dcc_stats[i].ps_stats[0]);
> +
> + /* pause or end the query */
> + ctx->end_query(ctx, rctx->dcc_stats[i].ps_stats[0]);
> + rctx->dcc_stats[i].query_active = false;
> +}
> +
> +static bool vi_can_enable_separate_dcc(struct r600_texture *tex)
> +{
> + /* The minimum number of fullscreen draws per frame that is required
> + * to enable DCC. */
> + return tex->ps_draw_ratio + tex->num_slow_clears >= 5;
> +}
Rename this function to vi_should_enable_separate_dcc or similar.
> +
> +/* Called by fast clear. */
> +static void vi_separate_dcc_try_enable(struct r600_common_context *rctx,
> + struct r600_texture *tex)
> +{
> + /* The intent is to use this with shared displayable back buffers,
> + * but it's not strictly limited only to them.
> + */
> + if (!tex->resource.is_shared ||
> + !(tex->resource.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) ||
> + tex->resource.b.b.target != PIPE_TEXTURE_2D ||
> + tex->surface.last_level > 0 ||
> + !tex->surface.dcc_size)
> + return;
> +
> + if (tex->dcc_offset)
> + return; /* already enabled */
> +
> + if (!vi_can_enable_separate_dcc(tex))
> + return; /* stats show that DCC decompression is too expensive */
> +
> + assert(tex->surface.level[0].dcc_enabled);
> + assert(!tex->dcc_separate_buffer);
> +
> + r600_texture_discard_cmask(rctx->screen, tex);
> +
> + /* Get a DCC buffer. */
> + if (tex->last_dcc_separate_buffer) {
> + tex->dcc_separate_buffer = tex->last_dcc_separate_buffer;
> + tex->last_dcc_separate_buffer = NULL;
> + } else {
> + tex->dcc_separate_buffer = (struct r600_resource*)
> + r600_aligned_buffer_create(rctx->b.screen, 0,
> + PIPE_USAGE_DEFAULT,
> + tex->surface.dcc_size,
> + tex->surface.dcc_alignment);
> + if (!tex->dcc_separate_buffer)
> + return;
> +
> + /* Enabling for the first time, so start the query. */
> + tex->dcc_gather_statistics = true;
> + vi_separate_dcc_start_query(&rctx->b, tex);
I think it would be cleaner to put the statistics enablement above the
vi_can_enable_separate_dcc/vi_should_enable_separate_dcc check, since
then the code doesn't rely on a magic initialization of ps_draw_ratio.
Generally it would separate the statistics gathering more cleanly from
the enable/disable.
> + }
> +
> + /* dcc_offset is the absolute GPUVM address. */
> + tex->dcc_offset = tex->dcc_separate_buffer->gpu_address;
> +
> + /* no need to flag anything since this is called by fast clear that
> + * flags framebuffer state
> + */
> +}
> +
> +/**
> + * Called by pipe_context::flush_resource, the place where DCC decompression
> + * takes place.
> + */
> +void vi_separate_dcc_analyze_stats(struct pipe_context *ctx,
> + struct r600_texture *tex)
Bike-shedding: I'd prefer a name like _gather_stats or _collect_stats.
> +{
> + struct r600_common_context *rctx = (struct r600_common_context*)ctx;
> + unsigned i = vi_get_context_dcc_stats_index(rctx, tex);
> + bool query_active = rctx->dcc_stats[i].query_active;
> + bool disable = false;
> +
> + if (rctx->dcc_stats[i].ps_stats[2]) {
> + union pipe_query_result result;
> +
> + /* Read the results. */
> + ctx->get_query_result(ctx, rctx->dcc_stats[i].ps_stats[2],
> + true, &result);
What if this stalls? The call passes wait = true, so it blocks until the
query result is available.
> + ctx->destroy_query(ctx, rctx->dcc_stats[i].ps_stats[2]);
> + rctx->dcc_stats[i].ps_stats[2] = NULL;
> +
> + /* Compute the approximate number of fullscreen draws. */
> + tex->ps_draw_ratio =
> + result.pipeline_statistics.ps_invocations /
> + (tex->resource.b.b.width0 * tex->resource.b.b.height0);
> +
> + disable = tex->dcc_separate_buffer &&
> + !vi_can_enable_separate_dcc(tex);
> + }
> +
> + tex->num_slow_clears = 0;
> +
> + /* stop the statistics query for ps_stats[0] */
> + if (query_active)
> + vi_separate_dcc_stop_query(ctx, tex);
> +
> + /* Move the queries in the queue by one. */
> + rctx->dcc_stats[i].ps_stats[2] = rctx->dcc_stats[i].ps_stats[1];
> + rctx->dcc_stats[i].ps_stats[1] = rctx->dcc_stats[i].ps_stats[0];
> + rctx->dcc_stats[i].ps_stats[0] = NULL;
> +
> + /* create and start a new query as ps_stats[0] */
> + if (query_active)
> + vi_separate_dcc_start_query(ctx, tex);
> +
> + if (disable) {
> + assert(!tex->last_dcc_separate_buffer);
> + tex->last_dcc_separate_buffer = tex->dcc_separate_buffer;
> + tex->dcc_separate_buffer = NULL;
> + tex->dcc_offset = 0;
> + /* no need to flag anything since this is called after
> + * decompression that re-sets framebuffer state
> + */
> + }
DCC disabling logic shouldn't be in a function called *_analyze_stats.
Also, I find it clearer to reset tex->num_slow_clears outside, in
flush_resource.
Nicolai
> +}
> +
> +/* FAST COLOR CLEAR */
> +
> static void evergreen_set_clear_color(struct r600_texture *rtex,
> enum pipe_format surface_format,
> const union pipe_color_union *color)
> @@ -1966,6 +2188,22 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
> continue;
> }
>
> + /* Fast clear is the most appropriate place to enable DCC for
> + * displayable surfaces.
> + */
> + if (rctx->chip_class >= VI) {
> + vi_separate_dcc_try_enable(rctx, tex);
> +
> + /* Stoney can't do a CMASK-based clear, so all clears are
> + * considered to be hypothetically slow clears, which
> + * is weighed when determining to enable separate DCC.
> + */
> + if (tex->dcc_gather_statistics &&
> + rctx->family == CHIP_STONEY)
> + tex->num_slow_clears++;
> + }
> +
> + /* Try to clear DCC first, otherwise try CMASK. */
> if (tex->dcc_offset && tex->surface.level[0].dcc_enabled) {
> uint32_t reset_value;
> bool clear_words_needed;
> @@ -1982,6 +2220,7 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
>
> if (clear_words_needed)
> tex->dirty_level_mask |= 1 << fb->cbufs[i]->u.tex.level;
> + tex->separate_dcc_dirty = true;
> } else {
> /* Stoney/RB+ doesn't work with CMASK fast clear. */
> if (rctx->family == CHIP_STONEY)
> diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
> index 73a72e0..ece9d42 100644
> --- a/src/gallium/drivers/radeonsi/si_blit.c
> +++ b/src/gallium/drivers/radeonsi/si_blit.c
> @@ -1056,9 +1056,21 @@ static void si_flush_resource(struct pipe_context *ctx,
>
> assert(res->target != PIPE_BUFFER);
>
> + /* st/dri calls flush twice per frame (not a bug), this prevents double
> + * decompression. */
> + if (rtex->dcc_separate_buffer && !rtex->separate_dcc_dirty)
> + return;
> +
> if (!rtex->is_depth && (rtex->cmask.size || rtex->dcc_offset)) {
> si_blit_decompress_color(ctx, rtex, 0, res->last_level,
> - 0, util_max_layer(res, 0), false);
> + 0, util_max_layer(res, 0),
> + rtex->dcc_separate_buffer != NULL);
> + }
> +
> + /* Always do the analysis even if DCC is disabled at the moment. */
> + if (rtex->dcc_gather_statistics && rtex->separate_dcc_dirty) {
> + rtex->separate_dcc_dirty = false;
> + vi_separate_dcc_analyze_stats(ctx, rtex);
> }
> }
>
> diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
> index 492a670..d0c3bde 100644
> --- a/src/gallium/drivers/radeonsi/si_state.c
> +++ b/src/gallium/drivers/radeonsi/si_state.c
> @@ -2242,6 +2242,15 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
> unsigned old_nr_samples = sctx->framebuffer.nr_samples;
> int i;
>
> + for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
> + if (!sctx->framebuffer.state.cbufs[i])
> + continue;
> +
> + rtex = (struct r600_texture*)sctx->framebuffer.state.cbufs[i]->texture;
> + if (rtex->dcc_gather_statistics)
> + vi_separate_dcc_stop_query(ctx, rtex);
> + }
> +
> /* Only flush TC when changing the framebuffer state, because
> * the only client not using TC that can change textures is
> * the framebuffer.
> @@ -2308,6 +2317,12 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
> r600_context_add_resource_size(ctx, surf->base.texture);
>
> p_atomic_inc(&rtex->framebuffers_bound);
> +
> + if (rtex->dcc_gather_statistics) {
> + /* Dirty tracking must be enabled for DCC usage analysis. */
> + sctx->framebuffer.compressed_cb_mask |= 1 << i;
> + vi_separate_dcc_start_query(ctx, rtex);
> + }
> }
> /* Set the second SPI format for possible dual-src blending. */
> if (i == 1 && surf) {
> diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
> index 641becf..d980d7b 100644
> --- a/src/gallium/drivers/radeonsi/si_state_draw.c
> +++ b/src/gallium/drivers/radeonsi/si_state_draw.c
> @@ -981,7 +981,10 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
> surf = sctx->framebuffer.state.cbufs[i];
> rtex = (struct r600_texture*)surf->texture;
>
> - rtex->dirty_level_mask |= 1 << surf->u.tex.level;
> + if (rtex->fmask.size)
> + rtex->dirty_level_mask |= 1 << surf->u.tex.level;
> + if (rtex->dcc_gather_statistics)
> + rtex->separate_dcc_dirty = true;
> } while (mask);
> }
>
>
More information about the mesa-dev mailing list