[Mesa-dev] [PATCH 6/7] radeonsi: use optimal packet order when doing a pipeline sync

Fri Aug 4 14:38:37 UTC 2017

There is an ugly bug here: in the pipeline-sync path, prefetches are skipped
because si_emit_all_states clears all dirty bits before
si_cache_flush_and_prefetch runs. Expect a v2...

Marek

On Fri, Aug 4, 2017 at 12:05 PM, Marek Olšák <maraeo at gmail.com> wrote:
> From: Marek Olšák <marek.olsak at amd.com>
>
> Process new SET packets in parallel with previous draw calls.
>
> This decreases [CP busy / SPI busy] by a very tiny amount (verified with
> GRBM perf counters), and probably increases FPS by a very tiny amount
> for apps that do pipeline syncs often.
> ---
>  src/gallium/drivers/radeonsi/si_state_draw.c | 54 ++++++++++++++++++++++++----
>  1 file changed, 48 insertions(+), 6 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
> index ae48115..06a18c1 100644
> --- a/src/gallium/drivers/radeonsi/si_state_draw.c
> +++ b/src/gallium/drivers/radeonsi/si_state_draw.c
> @@ -1173,30 +1173,31 @@ static bool si_cache_flush_and_prefetch(struct si_context *sctx)
>          */
>         if (!si_upload_graphics_shader_descriptors(sctx))
>                 return false;
>
>         if (sctx->prefetch_L2)
>                 cik_emit_prefetch_L2(sctx);
>
>         return true;
>  }
>
> -static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info)
> +static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info,
> +                              unsigned skip_atom_mask)
>  {
>         /* Emit state atoms. */
> -       unsigned mask = sctx->dirty_atoms;
> +       unsigned mask = sctx->dirty_atoms & ~skip_atom_mask;
>         while (mask) {
>                 struct r600_atom *atom = sctx->atoms.array[u_bit_scan(&mask)];
>
>                 atom->emit(&sctx->b, atom);
>         }
> -       sctx->dirty_atoms = 0;
> +       sctx->dirty_atoms &= skip_atom_mask;
>
>         /* Emit states. */
>         mask = sctx->dirty_states;
>         while (mask) {
>                 unsigned i = u_bit_scan(&mask);
>                 struct si_pm4_state *state = sctx->queued.array[i];
>
>                 if (!state || sctx->emitted.array[i] == state)
>                         continue;
>
> @@ -1384,23 +1385,64 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
>          */
>         if (!si_upload_vertex_buffer_descriptors(sctx))
>                 return;
>
>         /* GFX9 scissor bug workaround. There is also a more efficient but
>          * more involved alternative workaround. */
>         if (sctx->b.chip_class == GFX9 &&
>             si_is_atom_dirty(sctx, &sctx->b.scissors.atom))
>                 sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
>
> -       if (!si_cache_flush_and_prefetch(sctx))
> -               return;
> -       si_emit_all_states(sctx, info);
> +       /* Use an optimal packet order based on whether we need to sync the pipeline. */
> +       if (unlikely(sctx->b.flags & (SI_CONTEXT_FLUSH_AND_INV_CB |
> +                                     SI_CONTEXT_FLUSH_AND_INV_DB |
> +                                     SI_CONTEXT_PS_PARTIAL_FLUSH |
> +                                     SI_CONTEXT_CS_PARTIAL_FLUSH))) {
> +               /* If we have to wait for idle, set all states first, so that all
> +                * SET packets are processed in parallel with previous draw calls.
> +                * Sequence:
> +                * - process SET packets except SET_SH packets for shader pointers
> +                * - flush caches and wait for previous draw calls
> +                * - start CE dumps (might already be ongoing if there is no CE-DE barrier)
> +                * - start prefetches
> +                * - process SET_SH packets for shader pointers
> +                * - wait for CE dumps
> +                * - draw
> +                */
> +               struct r600_atom *shader_pointers = &sctx->shader_userdata.atom;
> +
> +               /* Emit all states except shader pointers. */
> +               si_emit_all_states(sctx, info, 1 << shader_pointers->id);
> +
> +               if (!si_cache_flush_and_prefetch(sctx))
> +                       return;
> +
> +               /* Set shader pointers last. */
> +               if (si_is_atom_dirty(sctx, shader_pointers)) {
> +                       shader_pointers->emit(&sctx->b, NULL);
> +                       sctx->dirty_atoms = 0;
> +               }
> +       } else {
> +               /* If we don't wait for idle, do CE dumps and start prefetches
> +                * first, so that they are being done in parallel with all SET
> +                * packets. Sequence:
> +                * - flush caches
> +                * - start CE dumps (might already be ongoing if CE is ahead)
> +                * - start prefetches
> +                * - process SET packets
> +                * - wait for CE dumps
> +                * - draw
> +                */
> +               if (!si_cache_flush_and_prefetch(sctx))
> +                       return;
> +               si_emit_all_states(sctx, info, 0);
> +       }
>
>         si_ce_pre_draw_synchronization(sctx);
>         si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
>         si_ce_post_draw_synchronization(sctx);
>
>         if (sctx->trace_buf)
>                 si_trace_emit(sctx);
>
>         /* Workaround for a VGT hang when streamout is enabled.
>          * It must be done after drawing. */
> --
> 2.7.4
>