[Mesa-dev] [PATCH 09/15] radeonsi/gfx9: don't flush L2 metadata for CB if not needed
Marek Olšák
maraeo at gmail.com
Tue Aug 22 11:07:46 UTC 2017
On Tue, Aug 22, 2017 at 10:19 AM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
> On 21.08.2017 23:54, Marek Olšák wrote:
>>
>> From: Marek Olšák <marek.olsak at amd.com>
>>
>> ---
>> src/gallium/drivers/radeonsi/si_blit.c | 8 +++++---
>> src/gallium/drivers/radeonsi/si_pipe.h | 23 +++++++++++++++++++----
>> src/gallium/drivers/radeonsi/si_state.c | 19 +++++++++++++++----
>> src/gallium/drivers/radeonsi/si_state_draw.c | 11 +++++------
>> 4 files changed, 44 insertions(+), 17 deletions(-)
>>
>> diff --git a/src/gallium/drivers/radeonsi/si_blit.c
>> b/src/gallium/drivers/radeonsi/si_blit.c
>> index ae7f809..3228933 100644
>> --- a/src/gallium/drivers/radeonsi/si_blit.c
>> +++ b/src/gallium/drivers/radeonsi/si_blit.c
>> @@ -399,21 +399,22 @@ si_decompress_depth(struct si_context *sctx,
>> if (inplace_planes & PIPE_MASK_Z)
>> tex->dirty_level_mask = 0;
>> if (inplace_planes & PIPE_MASK_S)
>> tex->stencil_dirty_level_mask = 0;
>> }
>> }
>> /* set_framebuffer_state takes care of coherency for
>> single-sample.
>> * The DB->CB copy uses CB for the final writes.
>> */
>> if (copy_planes && tex->resource.b.b.nr_samples > 1)
>> - si_make_CB_shader_coherent(sctx,
>> tex->resource.b.b.nr_samples);
>> + si_make_CB_shader_coherent(sctx,
>> tex->resource.b.b.nr_samples,
>> + false);
>> }
>> static void
>> si_decompress_sampler_depth_textures(struct si_context *sctx,
>> struct si_textures_info *textures)
>> {
>> unsigned i;
>> unsigned mask = textures->needs_depth_decompress_mask;
>> while (mask) {
>> @@ -504,21 +505,22 @@ static void si_blit_decompress_color(struct
>> pipe_context *ctx,
>> }
>> /* The texture will always be dirty if some layers aren't
>> flushed.
>> * I don't think this case occurs often though. */
>> if (first_layer == 0 && last_layer >= max_layer) {
>> rtex->dirty_level_mask &= ~(1 << level);
>> }
>> }
>> sctx->decompression_enabled = false;
>> - si_make_CB_shader_coherent(sctx, rtex->resource.b.b.nr_samples);
>> + si_make_CB_shader_coherent(sctx, rtex->resource.b.b.nr_samples,
>> + vi_dcc_enabled(rtex, first_level));
>> }
>> static void
>> si_decompress_color_texture(struct si_context *sctx, struct r600_texture
>> *tex,
>> unsigned first_level, unsigned last_level)
>> {
>> /* CMASK or DCC can be discarded and we can still end up here. */
>> if (!tex->cmask.size && !tex->fmask.size && !tex->dcc_offset)
>> return;
>> @@ -1193,21 +1195,21 @@ static void si_do_CB_resolve(struct si_context
>> *sctx,
>> si_blitter_begin(&sctx->b.b, SI_COLOR_RESOLVE |
>> (info->render_condition_enable ? 0 :
>> SI_DISABLE_RENDER_COND));
>> util_blitter_custom_resolve_color(sctx->blitter, dst, dst_level,
>> dst_z,
>> info->src.resource,
>> info->src.box.z,
>> ~0, sctx->custom_blend_resolve,
>> format);
>> si_blitter_end(&sctx->b.b);
>> /* Flush caches for possible texturing. */
>> - si_make_CB_shader_coherent(sctx, 1);
>> + si_make_CB_shader_coherent(sctx, 1, false);
>> }
>> static bool do_hardware_msaa_resolve(struct pipe_context *ctx,
>> const struct pipe_blit_info *info)
>> {
>> struct si_context *sctx = (struct si_context*)ctx;
>> struct r600_texture *src = (struct
>> r600_texture*)info->src.resource;
>> struct r600_texture *dst = (struct
>> r600_texture*)info->dst.resource;
>> MAYBE_UNUSED struct r600_texture *rtmp;
>> unsigned dst_width = u_minify(info->dst.resource->width0,
>> info->dst.level);
>> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h
>> b/src/gallium/drivers/radeonsi/si_pipe.h
>> index 671c488..3e59e21 100644
>> --- a/src/gallium/drivers/radeonsi/si_pipe.h
>> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
>> @@ -50,21 +50,24 @@
>> #define SI_CONTEXT_INV_ICACHE (R600_CONTEXT_PRIVATE_FLAG << 0)
>> /* SMEM L1, other names: KCACHE, constant cache, DCACHE, data cache */
>> #define SI_CONTEXT_INV_SMEM_L1 (R600_CONTEXT_PRIVATE_FLAG
>> << 1)
>> /* VMEM L1 can optionally be bypassed (GLC=1). Other names: TC L1 */
>> #define SI_CONTEXT_INV_VMEM_L1 (R600_CONTEXT_PRIVATE_FLAG
>> << 2)
>> /* Used by everything except CB/DB, can be bypassed (SLC=1). Other
>> names: TC L2 */
>> #define SI_CONTEXT_INV_GLOBAL_L2 (R600_CONTEXT_PRIVATE_FLAG << 3)
>> /* Write dirty L2 lines back to memory (shader and CP DMA stores), but
>> don't
>> * invalidate L2. SI-CIK can't do it, so they will do complete
>> invalidation. */
>> #define SI_CONTEXT_WRITEBACK_GLOBAL_L2 (R600_CONTEXT_PRIVATE_FLAG
>> << 4)
>> -/* gaps */
>> +/* Writeback & invalidate the L2 metadata cache. It can only be coupled
>> with
>> + * a CB or DB flush. */
>> +#define SI_CONTEXT_INV_L2_METADATA (R600_CONTEXT_PRIVATE_FLAG << 5)
>> +/* gap */
>> /* Framebuffer caches. */
>> #define SI_CONTEXT_FLUSH_AND_INV_DB (R600_CONTEXT_PRIVATE_FLAG << 7)
>> #define SI_CONTEXT_FLUSH_AND_INV_CB (R600_CONTEXT_PRIVATE_FLAG << 8)
>> /* Engine synchronization. */
>> #define SI_CONTEXT_VS_PARTIAL_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 9)
>> #define SI_CONTEXT_PS_PARTIAL_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 10)
>> #define SI_CONTEXT_CS_PARTIAL_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 11)
>> #define SI_CONTEXT_VGT_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 12)
>> #define SI_CONTEXT_VGT_STREAMOUT_SYNC (R600_CONTEXT_PRIVATE_FLAG << 13)
>> @@ -190,20 +193,21 @@ struct si_framebuffer {
>> unsigned spi_shader_col_format_blend;
>> unsigned spi_shader_col_format_blend_alpha;
>> ubyte nr_samples:5; /* at most 16xAA */
>> ubyte log_samples:3; /* at most 4 =
>> 16xAA */
>> ubyte compressed_cb_mask;
>> ubyte color_is_int8;
>> ubyte color_is_int10;
>> ubyte dirty_cbufs;
>> bool dirty_zsbuf;
>> bool any_dst_linear;
>> + bool CB_has_shader_readable_metadata;
>> };
>> struct si_clip_state {
>> struct r600_atom atom;
>> struct pipe_clip_state state;
>> bool any_nonzeros;
>> };
>> struct si_sample_locs {
>> struct r600_atom atom;
>> @@ -588,28 +592,39 @@ si_optimal_tcc_alignment(struct si_context *sctx,
>> unsigned upload_size)
>> * the whole thing will fit into a cache line if we align it to
>> its size.
>> * The idea is that multiple small uploads can share a cache line.
>> * If the upload size is greater, align it to the cache line size.
>> */
>> alignment = util_next_power_of_two(upload_size);
>> tcc_cache_line_size = sctx->screen->b.info.tcc_cache_line_size;
>> return MIN2(alignment, tcc_cache_line_size);
>> }
>> static inline void
>> -si_make_CB_shader_coherent(struct si_context *sctx, unsigned num_samples)
>> +si_make_CB_shader_coherent(struct si_context *sctx, unsigned num_samples,
>> + bool shaders_read_metadata)
>> {
>> sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_CB |
>> SI_CONTEXT_INV_VMEM_L1;
>> - /* Single-sample color is coherent with shaders on GFX9. */
>> - if (sctx->b.chip_class <= VI || num_samples >= 2)
>> + if (sctx->b.chip_class >= GFX9) {
>> + /* Single-sample color is coherent with shaders on GFX9,
>> but
>> + * L2 metadata must be flushed if shaders read metadata.
>> + * (DCC, CMASK).
>> + */
>> + if (num_samples >= 2)
>> + sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2;
>> + else if (shaders_read_metadata)
>> + sctx->b.flags |= SI_CONTEXT_INV_L2_METADATA;
>> + } else {
>> + /* SI-CI-VI */
>> sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2;
>> + }
>> }
>> static inline void
>> si_make_DB_shader_coherent(struct si_context *sctx, unsigned
>> num_samples,
>> bool include_stencil)
>> {
>> sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_DB |
>> SI_CONTEXT_INV_VMEM_L1;
>> /* Single-sample depth (not stencil) is coherent with shaders on
>> GFX9. */
>> diff --git a/src/gallium/drivers/radeonsi/si_state.c
>> b/src/gallium/drivers/radeonsi/si_state.c
>> index d116c07..e5d8d21 100644
>> --- a/src/gallium/drivers/radeonsi/si_state.c
>> +++ b/src/gallium/drivers/radeonsi/si_state.c
>> @@ -2566,21 +2566,22 @@ static void si_set_framebuffer_state(struct
>> pipe_context *ctx,
>> *
>> * When MSAA is enabled, CB and TC caches are flushed on demand
>> * (after FMASK decompression). Shader write -> FB read
>> transitions
>> * cannot happen for MSAA textures, because MSAA shader images are
>> * not supported.
>> *
>> * Only flush and wait for CB if there is actually a bound color
>> buffer.
>> */
>> if (sctx->framebuffer.nr_samples <= 1 &&
>> sctx->framebuffer.state.nr_cbufs)
>> - si_make_CB_shader_coherent(sctx,
>> sctx->framebuffer.nr_samples);
>> + si_make_CB_shader_coherent(sctx,
>> sctx->framebuffer.nr_samples,
>> +
>> sctx->framebuffer.CB_has_shader_readable_metadata);
>> sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
>> /* u_blitter doesn't invoke depth decompression when it does
>> multiple
>> * blits in a row, but the only case when it matters for DB is
>> when
>> * doing generate_mipmap. So here we flush DB manually between
>> * individual generate_mipmap blits.
>> * Note that lower mipmap levels aren't compressed.
>> */
>> if (sctx->generate_mipmap_for_depth)
>> @@ -2601,20 +2602,21 @@ static void si_set_framebuffer_state(struct
>> pipe_context *ctx,
>> sctx->framebuffer.spi_shader_col_format_alpha = 0;
>> sctx->framebuffer.spi_shader_col_format_blend = 0;
>> sctx->framebuffer.spi_shader_col_format_blend_alpha = 0;
>> sctx->framebuffer.color_is_int8 = 0;
>> sctx->framebuffer.color_is_int10 = 0;
>> sctx->framebuffer.compressed_cb_mask = 0;
>> sctx->framebuffer.nr_samples =
>> util_framebuffer_get_num_samples(state);
>> sctx->framebuffer.log_samples =
>> util_logbase2(sctx->framebuffer.nr_samples);
>> sctx->framebuffer.any_dst_linear = false;
>> + sctx->framebuffer.CB_has_shader_readable_metadata = false;
>> for (i = 0; i < state->nr_cbufs; i++) {
>> if (!state->cbufs[i])
>> continue;
>> surf = (struct r600_surface*)state->cbufs[i];
>> rtex = (struct r600_texture*)surf->base.texture;
>> if (!surf->color_initialized) {
>> si_initialize_color_surface(sctx, surf);
>> @@ -2635,20 +2637,23 @@ static void si_set_framebuffer_state(struct
>> pipe_context *ctx,
>> if (surf->color_is_int10)
>> sctx->framebuffer.color_is_int10 |= 1 << i;
>> if (rtex->fmask.size) {
>> sctx->framebuffer.compressed_cb_mask |= 1 << i;
>> }
>> if (rtex->surface.is_linear)
>> sctx->framebuffer.any_dst_linear = true;
>> + if (vi_dcc_enabled(rtex, surf->base.u.tex.level))
>> + sctx->framebuffer.CB_has_shader_readable_metadata
>> = true;
>> +
>> r600_context_add_resource_size(ctx, surf->base.texture);
>> p_atomic_inc(&rtex->framebuffers_bound);
>> if (rtex->dcc_gather_statistics) {
>> /* Dirty tracking must be enabled for DCC usage
>> analysis. */
>> sctx->framebuffer.compressed_cb_mask |= 1 << i;
>> vi_separate_dcc_start_query(ctx, rtex);
>> }
>> }
>> @@ -4015,21 +4020,22 @@ static void si_set_tess_state(struct pipe_context
>> *ctx,
>> static void si_texture_barrier(struct pipe_context *ctx, unsigned
>> flags)
>> {
>> struct si_context *sctx = (struct si_context *)ctx;
>> si_update_fb_dirtiness_after_rendering(sctx);
>> /* Multisample surfaces are flushed in si_decompress_textures. */
>> if (sctx->framebuffer.nr_samples <= 1 &&
>> sctx->framebuffer.state.nr_cbufs)
>> - si_make_CB_shader_coherent(sctx,
>> sctx->framebuffer.nr_samples);
>> + si_make_CB_shader_coherent(sctx,
>> sctx->framebuffer.nr_samples,
>> +
>> sctx->framebuffer.CB_has_shader_readable_metadata);
>> }
>> /* This only ensures coherency for shader image/buffer stores. */
>> static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
>> {
>> struct si_context *sctx = (struct si_context *)ctx;
>> /* Subsequent commands must wait for all shader invocations to
>> * complete. */
>> sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
>> @@ -4060,23 +4066,28 @@ static void si_memory_barrier(struct pipe_context
>> *ctx, unsigned flags)
>> }
>> /* MSAA color, any depth and any stencil are flushed in
>> * si_decompress_textures when needed.
>> */
>> if (flags & PIPE_BARRIER_FRAMEBUFFER &&
>> sctx->framebuffer.nr_samples <= 1 &&
>> sctx->framebuffer.state.nr_cbufs) {
>> sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
>> - /* Single-sample color is coherent with TC on GFX9. */
>> - if (sctx->screen->b.chip_class <= VI)
>> + if (sctx->b.chip_class >= GFX9) {
>> + /* Single-sample color is coherent with TC on
>> GFX9. */
>> + if
>> (sctx->framebuffer.CB_has_shader_readable_metadata)
>> + sctx->b.flags |=
>> SI_CONTEXT_INV_L2_METADATA;
>
>
> MemoryBarrier is about making *shader writes* visible to other parts of the
> pipeline.
>
> So I think:
>
> - the comment above is misleading
>
> - there should be no need to invalidate the metadata, since DCC must already
> have been disabled or at least decompressed for the relevant texture, and
> the shader doesn't modify the metadata anyway.
You are right. I've replaced that locally with:

    if (sctx->b.chip_class <= VI)
        sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;

and no comment.

Marek
More information about the mesa-dev mailing list