[Mesa-dev] [PATCH 09/15] radeonsi/gfx9: don't flush L2 metadata for CB if not needed

Tue Aug 22 11:37:23 UTC 2017

On 22.08.2017 13:07, Marek Olšák wrote:
> On Tue, Aug 22, 2017 at 10:19 AM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
>> On 21.08.2017 23:54, Marek Olšák wrote:
>>>
>>> From: Marek Olšák <marek.olsak at amd.com>
>>>
>>> ---
>>>    src/gallium/drivers/radeonsi/si_blit.c       |  8 +++++---
>>>    src/gallium/drivers/radeonsi/si_pipe.h       | 23 +++++++++++++++++++----
>>>    src/gallium/drivers/radeonsi/si_state.c      | 19 +++++++++++++++----
>>>    src/gallium/drivers/radeonsi/si_state_draw.c | 11 +++++------
>>>    4 files changed, 44 insertions(+), 17 deletions(-)
>>>
>>> diff --git a/src/gallium/drivers/radeonsi/si_blit.c
>>> b/src/gallium/drivers/radeonsi/si_blit.c
>>> index ae7f809..3228933 100644
>>> --- a/src/gallium/drivers/radeonsi/si_blit.c
>>> +++ b/src/gallium/drivers/radeonsi/si_blit.c
>>> @@ -399,21 +399,22 @@ si_decompress_depth(struct si_context *sctx,
>>>                          if (inplace_planes & PIPE_MASK_Z)
>>>                                  tex->dirty_level_mask = 0;
>>>                          if (inplace_planes & PIPE_MASK_S)
>>>                                  tex->stencil_dirty_level_mask = 0;
>>>                  }
>>>          }
>>>          /* set_framebuffer_state takes care of coherency for
>>> single-sample.
>>>           * The DB->CB copy uses CB for the final writes.
>>>           */
>>>          if (copy_planes && tex->resource.b.b.nr_samples > 1)
>>> -               si_make_CB_shader_coherent(sctx,
>>> tex->resource.b.b.nr_samples);
>>> +               si_make_CB_shader_coherent(sctx,
>>> tex->resource.b.b.nr_samples,
>>> +                                          false);
>>>    }
>>>      static void
>>>    si_decompress_sampler_depth_textures(struct si_context *sctx,
>>>                                       struct si_textures_info *textures)
>>>    {
>>>          unsigned i;
>>>          unsigned mask = textures->needs_depth_decompress_mask;
>>>          while (mask) {
>>> @@ -504,21 +505,22 @@ static void si_blit_decompress_color(struct
>>> pipe_context *ctx,
>>>                  }
>>>                  /* The texture will always be dirty if some layers aren't
>>> flushed.
>>>                   * I don't think this case occurs often though. */
>>>                  if (first_layer == 0 && last_layer >= max_layer) {
>>>                          rtex->dirty_level_mask &= ~(1 << level);
>>>                  }
>>>          }
>>>          sctx->decompression_enabled = false;
>>> -       si_make_CB_shader_coherent(sctx, rtex->resource.b.b.nr_samples);
>>> +       si_make_CB_shader_coherent(sctx, rtex->resource.b.b.nr_samples,
>>> +                                  vi_dcc_enabled(rtex, first_level));
>>>    }
>>>      static void
>>>    si_decompress_color_texture(struct si_context *sctx, struct r600_texture
>>> *tex,
>>>                              unsigned first_level, unsigned last_level)
>>>    {
>>>          /* CMASK or DCC can be discarded and we can still end up here. */
>>>          if (!tex->cmask.size && !tex->fmask.size && !tex->dcc_offset)
>>>                  return;
>>> @@ -1193,21 +1195,21 @@ static void si_do_CB_resolve(struct si_context *sctx,
>>>          si_blitter_begin(&sctx->b.b, SI_COLOR_RESOLVE |
>>>                           (info->render_condition_enable ? 0 :
>>> SI_DISABLE_RENDER_COND));
>>>          util_blitter_custom_resolve_color(sctx->blitter, dst, dst_level,
>>> dst_z,
>>>                                            info->src.resource,
>>> info->src.box.z,
>>>                                            ~0, sctx->custom_blend_resolve,
>>>                                            format);
>>>          si_blitter_end(&sctx->b.b);
>>>          /* Flush caches for possible texturing. */
>>> -       si_make_CB_shader_coherent(sctx, 1);
>>> +       si_make_CB_shader_coherent(sctx, 1, false);
>>>    }
>>>      static bool do_hardware_msaa_resolve(struct pipe_context *ctx,
>>>                                       const struct pipe_blit_info *info)
>>>    {
>>>          struct si_context *sctx = (struct si_context*)ctx;
>>>          struct r600_texture *src = (struct
>>> r600_texture*)info->src.resource;
>>>          struct r600_texture *dst = (struct
>>> r600_texture*)info->dst.resource;
>>>          MAYBE_UNUSED struct r600_texture *rtmp;
>>>          unsigned dst_width = u_minify(info->dst.resource->width0,
>>> info->dst.level);
>>> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h
>>> b/src/gallium/drivers/radeonsi/si_pipe.h
>>> index 671c488..3e59e21 100644
>>> --- a/src/gallium/drivers/radeonsi/si_pipe.h
>>> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
>>> @@ -50,21 +50,24 @@
>>>    #define SI_CONTEXT_INV_ICACHE         (R600_CONTEXT_PRIVATE_FLAG << 0)
>>>    /* SMEM L1, other names: KCACHE, constant cache, DCACHE, data cache */
>>>    #define SI_CONTEXT_INV_SMEM_L1                (R600_CONTEXT_PRIVATE_FLAG
>>> << 1)
>>>    /* VMEM L1 can optionally be bypassed (GLC=1). Other names: TC L1 */
>>>    #define SI_CONTEXT_INV_VMEM_L1                (R600_CONTEXT_PRIVATE_FLAG
>>> << 2)
>>>    /* Used by everything except CB/DB, can be bypassed (SLC=1). Other
>>> names: TC L2 */
>>>    #define SI_CONTEXT_INV_GLOBAL_L2      (R600_CONTEXT_PRIVATE_FLAG << 3)
>>>    /* Write dirty L2 lines back to memory (shader and CP DMA stores), but
>>> don't
>>>     * invalidate L2. SI-CIK can't do it, so they will do complete
>>> invalidation. */
>>>    #define SI_CONTEXT_WRITEBACK_GLOBAL_L2        (R600_CONTEXT_PRIVATE_FLAG
>>> << 4)
>>> -/* gaps */
>>> +/* Writeback & invalidate the L2 metadata cache. It can only be coupled
>>> with
>>> + * a CB or DB flush. */
>>> +#define SI_CONTEXT_INV_L2_METADATA     (R600_CONTEXT_PRIVATE_FLAG << 5)
>>> +/* gap */
>>>    /* Framebuffer caches. */
>>>    #define SI_CONTEXT_FLUSH_AND_INV_DB   (R600_CONTEXT_PRIVATE_FLAG << 7)
>>>    #define SI_CONTEXT_FLUSH_AND_INV_CB   (R600_CONTEXT_PRIVATE_FLAG << 8)
>>>    /* Engine synchronization. */
>>>    #define SI_CONTEXT_VS_PARTIAL_FLUSH   (R600_CONTEXT_PRIVATE_FLAG << 9)
>>>    #define SI_CONTEXT_PS_PARTIAL_FLUSH   (R600_CONTEXT_PRIVATE_FLAG << 10)
>>>    #define SI_CONTEXT_CS_PARTIAL_FLUSH   (R600_CONTEXT_PRIVATE_FLAG << 11)
>>>    #define SI_CONTEXT_VGT_FLUSH          (R600_CONTEXT_PRIVATE_FLAG << 12)
>>>    #define SI_CONTEXT_VGT_STREAMOUT_SYNC (R600_CONTEXT_PRIVATE_FLAG << 13)
>>> @@ -190,20 +193,21 @@ struct si_framebuffer {
>>>          unsigned                        spi_shader_col_format_blend;
>>>          unsigned                        spi_shader_col_format_blend_alpha;
>>>          ubyte                           nr_samples:5; /* at most 16xAA */
>>>          ubyte                           log_samples:3; /* at most 4 =
>>> 16xAA */
>>>          ubyte                           compressed_cb_mask;
>>>          ubyte                           color_is_int8;
>>>          ubyte                           color_is_int10;
>>>          ubyte                           dirty_cbufs;
>>>          bool                            dirty_zsbuf;
>>>          bool                            any_dst_linear;
>>> +       bool                            CB_has_shader_readable_metadata;
>>>    };
>>>      struct si_clip_state {
>>>          struct r600_atom                atom;
>>>          struct pipe_clip_state          state;
>>>          bool                            any_nonzeros;
>>>    };
>>>      struct si_sample_locs {
>>>          struct r600_atom        atom;
>>> @@ -588,28 +592,39 @@ si_optimal_tcc_alignment(struct si_context *sctx,
>>> unsigned upload_size)
>>>           * the whole thing will fit into a cache line if we align it to
>>> its size.
>>>           * The idea is that multiple small uploads can share a cache line.
>>>           * If the upload size is greater, align it to the cache line size.
>>>           */
>>>          alignment = util_next_power_of_two(upload_size);
>>>          tcc_cache_line_size = sctx->screen->b.info.tcc_cache_line_size;
>>>          return MIN2(alignment, tcc_cache_line_size);
>>>    }
>>>      static inline void
>>> -si_make_CB_shader_coherent(struct si_context *sctx, unsigned num_samples)
>>> +si_make_CB_shader_coherent(struct si_context *sctx, unsigned num_samples,
>>> +                          bool shaders_read_metadata)
>>>    {
>>>          sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_CB |
>>>                           SI_CONTEXT_INV_VMEM_L1;
>>> -       /* Single-sample color is coherent with shaders on GFX9. */
>>> -       if (sctx->b.chip_class <= VI || num_samples >= 2)
>>> +       if (sctx->b.chip_class >= GFX9) {
>>> +               /* Single-sample color is coherent with shaders on GFX9,
>>> but
>>> +                * L2 metadata must be flushed if shaders read metadata.
>>> +                * (DCC, CMASK).
>>> +                */
>>> +               if (num_samples >= 2)
>>> +                       sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2;
>>> +               else if (shaders_read_metadata)
>>> +                       sctx->b.flags |= SI_CONTEXT_INV_L2_METADATA;
>>> +       } else {
>>> +               /* SI-CI-VI */
>>>                  sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2;
>>> +       }
>>>    }
>>>      static inline void
>>>    si_make_DB_shader_coherent(struct si_context *sctx, unsigned
>>> num_samples,
>>>                             bool include_stencil)
>>>    {
>>>          sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_DB |
>>>                           SI_CONTEXT_INV_VMEM_L1;
>>>          /* Single-sample depth (not stencil) is coherent with shaders on
>>> GFX9. */
>>> diff --git a/src/gallium/drivers/radeonsi/si_state.c
>>> b/src/gallium/drivers/radeonsi/si_state.c
>>> index d116c07..e5d8d21 100644
>>> --- a/src/gallium/drivers/radeonsi/si_state.c
>>> +++ b/src/gallium/drivers/radeonsi/si_state.c
>>> @@ -2566,21 +2566,22 @@ static void si_set_framebuffer_state(struct
>>> pipe_context *ctx,
>>>           *
>>>           * When MSAA is enabled, CB and TC caches are flushed on demand
>>>           * (after FMASK decompression). Shader write -> FB read
>>> transitions
>>>           * cannot happen for MSAA textures, because MSAA shader images are
>>>           * not supported.
>>>           *
>>>           * Only flush and wait for CB if there is actually a bound color
>>> buffer.
>>>           */
>>>          if (sctx->framebuffer.nr_samples <= 1 &&
>>>              sctx->framebuffer.state.nr_cbufs)
>>> -               si_make_CB_shader_coherent(sctx,
>>> sctx->framebuffer.nr_samples);
>>> +               si_make_CB_shader_coherent(sctx,
>>> sctx->framebuffer.nr_samples,
>>> +
>>> sctx->framebuffer.CB_has_shader_readable_metadata);
>>>          sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
>>>          /* u_blitter doesn't invoke depth decompression when it does
>>> multiple
>>>           * blits in a row, but the only case when it matters for DB is
>>> when
>>>           * doing generate_mipmap. So here we flush DB manually between
>>>           * individual generate_mipmap blits.
>>>           * Note that lower mipmap levels aren't compressed.
>>>           */
>>>          if (sctx->generate_mipmap_for_depth)
>>> @@ -2601,20 +2602,21 @@ static void si_set_framebuffer_state(struct
>>> pipe_context *ctx,
>>>          sctx->framebuffer.spi_shader_col_format_alpha = 0;
>>>          sctx->framebuffer.spi_shader_col_format_blend = 0;
>>>          sctx->framebuffer.spi_shader_col_format_blend_alpha = 0;
>>>          sctx->framebuffer.color_is_int8 = 0;
>>>          sctx->framebuffer.color_is_int10 = 0;
>>>          sctx->framebuffer.compressed_cb_mask = 0;
>>>          sctx->framebuffer.nr_samples =
>>> util_framebuffer_get_num_samples(state);
>>>          sctx->framebuffer.log_samples =
>>> util_logbase2(sctx->framebuffer.nr_samples);
>>>          sctx->framebuffer.any_dst_linear = false;
>>> +       sctx->framebuffer.CB_has_shader_readable_metadata = false;
>>>          for (i = 0; i < state->nr_cbufs; i++) {
>>>                  if (!state->cbufs[i])
>>>                          continue;
>>>                  surf = (struct r600_surface*)state->cbufs[i];
>>>                  rtex = (struct r600_texture*)surf->base.texture;
>>>                  if (!surf->color_initialized) {
>>>                          si_initialize_color_surface(sctx, surf);
>>> @@ -2635,20 +2637,23 @@ static void si_set_framebuffer_state(struct
>>> pipe_context *ctx,
>>>                  if (surf->color_is_int10)
>>>                          sctx->framebuffer.color_is_int10 |= 1 << i;
>>>                  if (rtex->fmask.size) {
>>>                          sctx->framebuffer.compressed_cb_mask |= 1 << i;
>>>                  }
>>>                  if (rtex->surface.is_linear)
>>>                          sctx->framebuffer.any_dst_linear = true;
>>> +               if (vi_dcc_enabled(rtex, surf->base.u.tex.level))
>>> +                       sctx->framebuffer.CB_has_shader_readable_metadata = true;
>>> +
>>>                  r600_context_add_resource_size(ctx, surf->base.texture);
>>>                  p_atomic_inc(&rtex->framebuffers_bound);
>>>                  if (rtex->dcc_gather_statistics) {
>>>                          /* Dirty tracking must be enabled for DCC usage
>>> analysis. */
>>>                          sctx->framebuffer.compressed_cb_mask |= 1 << i;
>>>                          vi_separate_dcc_start_query(ctx, rtex);
>>>                  }
>>>          }
>>> @@ -4015,21 +4020,22 @@ static void si_set_tess_state(struct pipe_context
>>> *ctx,
>>>      static void si_texture_barrier(struct pipe_context *ctx, unsigned
>>> flags)
>>>    {
>>>          struct si_context *sctx = (struct si_context *)ctx;
>>>          si_update_fb_dirtiness_after_rendering(sctx);
>>>          /* Multisample surfaces are flushed in si_decompress_textures. */
>>>          if (sctx->framebuffer.nr_samples <= 1 &&
>>>              sctx->framebuffer.state.nr_cbufs)
>>> -               si_make_CB_shader_coherent(sctx,
>>> sctx->framebuffer.nr_samples);
>>> +               si_make_CB_shader_coherent(sctx,
>>> sctx->framebuffer.nr_samples,
>>> +
>>> sctx->framebuffer.CB_has_shader_readable_metadata);
>>>    }
>>>      /* This only ensures coherency for shader image/buffer stores. */
>>>    static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
>>>    {
>>>          struct si_context *sctx = (struct si_context *)ctx;
>>>          /* Subsequent commands must wait for all shader invocations to
>>>           * complete. */
>>>          sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
>>> @@ -4060,23 +4066,28 @@ static void si_memory_barrier(struct pipe_context
>>> *ctx, unsigned flags)
>>>          }
>>>          /* MSAA color, any depth and any stencil are flushed in
>>>           * si_decompress_textures when needed.
>>>           */
>>>          if (flags & PIPE_BARRIER_FRAMEBUFFER &&
>>>              sctx->framebuffer.nr_samples <= 1 &&
>>>              sctx->framebuffer.state.nr_cbufs) {
>>>                  sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
>>> -               /* Single-sample color is coherent with TC on GFX9. */
>>> -               if (sctx->screen->b.chip_class <= VI)
>>> +               if (sctx->b.chip_class >= GFX9) {
>>> +                       /* Single-sample color is coherent with TC on GFX9. */
>>> +                       if (sctx->framebuffer.CB_has_shader_readable_metadata)
>>> +                               sctx->b.flags |= SI_CONTEXT_INV_L2_METADATA;
>>
>>
>> MemoryBarrier is about making *shader writes* visible to other parts of the
>> pipeline.
>>
>> So I think:
>>
>> - the comment above ("Single-sample color is coherent with TC on GFX9.")
>>   is misleading
>>
>> - there should be no need to invalidate the metadata, since DCC must already
>> have been disabled or at least decompressed for the relevant texture, and
>> the shader doesn't modify the metadata anyway.
> 
> You are right. I've replaced that locally with:
> 
>          if (sctx->b.chip_class <= VI)
>              sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;

Looks good, thanks!

> 
> and no comment.
> 
> Marek
> 

-- 
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.