[Mesa-dev] [PATCH 15/17] radeonsi: always prefetch later shaders after the draw packet

Wed Apr 4 12:59:47 UTC 2018

On Wed, Apr 4, 2018, 6:07 AM Samuel Pitoiset <samuel.pitoiset at gmail.com>
wrote:

>
>
> On 04/04/2018 03:59 AM, Marek Olšák wrote:
> > From: Marek Olšák <marek.olsak at amd.com>
> >
> > so that the draw is started as soon as possible.
> > ---
> >   src/gallium/drivers/radeonsi/si_cp_dma.c     | 68
> ++++++++++++++++++----------
> >   src/gallium/drivers/radeonsi/si_pipe.h       |  2 +-
> >   src/gallium/drivers/radeonsi/si_state_draw.c | 11 ++++-
> >   src/util/bitscan.h                           |  8 ++++
> >   4 files changed, 61 insertions(+), 28 deletions(-)
> >
> > diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c
> b/src/gallium/drivers/radeonsi/si_cp_dma.c
> > index 15bd305a350..ea2c7cf7198 100644
> > --- a/src/gallium/drivers/radeonsi/si_cp_dma.c
> > +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
> > @@ -514,80 +514,98 @@ static void cik_prefetch_shader_async(struct
> si_context *sctx,
> >   static void cik_prefetch_VBO_descriptors(struct si_context *sctx)
> >   {
> >       if (!sctx->vertex_elements)
> >               return;
> >
> >       cik_prefetch_TC_L2_async(sctx, &sctx->vb_descriptors_buffer->b.b,
> >                                sctx->vb_descriptors_offset,
> >
> sctx->vertex_elements->desc_list_byte_size);
> >   }
> >
> > -void cik_emit_prefetch_L2(struct si_context *sctx)
> > +/**
> > + * Prefetch shaders and VBO descriptors.
> > + *
> > + * \param first_two  Whether only the first 2 items should be
> prefetched,
> > + *                   which are usually the API VS and VBO descriptors.
> > + */
> > +void cik_emit_prefetch_L2(struct si_context *sctx, bool first_two)
> >   {
> > +     unsigned mask;
> > +
> > +     assert(sctx->prefetch_L2_mask);
> > +
> > +     if (first_two) {
> > +             mask = 1 << u_bit_scan16(&sctx->prefetch_L2_mask);
> > +
> > +             if (sctx->prefetch_L2_mask)
> > +                     mask |= 1 << u_bit_scan16(&sctx->prefetch_L2_mask);
>
> Where do you reset the prefetch L2 mask? It looks to me like you
> are going to prefetch VS/VBOs twice in the fast draw path.
>

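u_bit_scan16 clears the returned bit in sctx->prefetch_L2_mask, so the
first two items are not prefetched again by the call after the draw.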

Marek

> +     } else {
> > +             mask = sctx->prefetch_L2_mask;
> > +             sctx->prefetch_L2_mask = 0;
> > +     }
> > +
> >       /* Prefetch shaders and VBO descriptors to TC L2. */
> >       if (sctx->b.chip_class >= GFX9) {
> >               /* Choose the right spot for the VBO prefetch. */
> >               if (sctx->tes_shader.cso) {
> > -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_HS)
> > +                     if (mask & SI_PREFETCH_HS)
> >                               cik_prefetch_shader_async(sctx,
> sctx->queued.named.hs);
> > -                     if (sctx->prefetch_L2_mask &
> SI_PREFETCH_VBO_DESCRIPTORS)
> > +                     if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
> >                               cik_prefetch_VBO_descriptors(sctx);
> > -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_GS)
> > +                     if (mask & SI_PREFETCH_GS)
> >                               cik_prefetch_shader_async(sctx, sctx->
> queued.named.gs);
> > -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_VS)
> > +                     if (mask & SI_PREFETCH_VS)
> >                               cik_prefetch_shader_async(sctx,
> sctx->queued.named.vs);
> >               } else if (sctx->gs_shader.cso) {
> > -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_GS)
> > +                     if (mask & SI_PREFETCH_GS)
> >                               cik_prefetch_shader_async(sctx, sctx->
> queued.named.gs);
> > -                     if (sctx->prefetch_L2_mask &
> SI_PREFETCH_VBO_DESCRIPTORS)
> > +                     if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
> >                               cik_prefetch_VBO_descriptors(sctx);
> > -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_VS)
> > +                     if (mask & SI_PREFETCH_VS)
> >                               cik_prefetch_shader_async(sctx,
> sctx->queued.named.vs);
> >               } else {
> > -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_VS)
> > +                     if (mask & SI_PREFETCH_VS)
> >                               cik_prefetch_shader_async(sctx,
> sctx->queued.named.vs);
> > -                     if (sctx->prefetch_L2_mask &
> SI_PREFETCH_VBO_DESCRIPTORS)
> > +                     if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
> >                               cik_prefetch_VBO_descriptors(sctx);
> >               }
> >       } else {
> >               /* SI-CI-VI */
> >               /* Choose the right spot for the VBO prefetch. */
> >               if (sctx->tes_shader.cso) {
> > -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_LS)
> > +                     if (mask & SI_PREFETCH_LS)
> >                               cik_prefetch_shader_async(sctx, sctx->
> queued.named.ls);
> > -                     if (sctx->prefetch_L2_mask &
> SI_PREFETCH_VBO_DESCRIPTORS)
> > +                     if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
> >                               cik_prefetch_VBO_descriptors(sctx);
> > -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_HS)
> > +                     if (mask & SI_PREFETCH_HS)
> >                               cik_prefetch_shader_async(sctx,
> sctx->queued.named.hs);
> > -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_ES)
> > +                     if (mask & SI_PREFETCH_ES)
> >                               cik_prefetch_shader_async(sctx, sctx->
> queued.named.es);
> > -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_GS)
> > +                     if (mask & SI_PREFETCH_GS)
> >                               cik_prefetch_shader_async(sctx, sctx->
> queued.named.gs);
> > -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_VS)
> > +                     if (mask & SI_PREFETCH_VS)
> >                               cik_prefetch_shader_async(sctx,
> sctx->queued.named.vs);
> >               } else if (sctx->gs_shader.cso) {
> > -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_ES)
> > +                     if (mask & SI_PREFETCH_ES)
> >                               cik_prefetch_shader_async(sctx, sctx->
> queued.named.es);
> > -                     if (sctx->prefetch_L2_mask &
> SI_PREFETCH_VBO_DESCRIPTORS)
> > +                     if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
> >                               cik_prefetch_VBO_descriptors(sctx);
> > -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_GS)
> > +                     if (mask & SI_PREFETCH_GS)
> >                               cik_prefetch_shader_async(sctx, sctx->
> queued.named.gs);
> > -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_VS)
> > +                     if (mask & SI_PREFETCH_VS)
> >                               cik_prefetch_shader_async(sctx,
> sctx->queued.named.vs);
> >               } else {
> > -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_VS)
> > +                     if (mask & SI_PREFETCH_VS)
> >                               cik_prefetch_shader_async(sctx,
> sctx->queued.named.vs);
> > -                     if (sctx->prefetch_L2_mask &
> SI_PREFETCH_VBO_DESCRIPTORS)
> > +                     if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
> >                               cik_prefetch_VBO_descriptors(sctx);
> >               }
> >       }
> >
> > -     if (sctx->prefetch_L2_mask & SI_PREFETCH_PS)
> > +     if (mask & SI_PREFETCH_PS)
> >               cik_prefetch_shader_async(sctx, sctx->queued.named.ps);
> > -
> > -     sctx->prefetch_L2_mask = 0;
> >   }
> >
> >   void si_init_cp_dma_functions(struct si_context *sctx)
> >   {
> >       sctx->b.b.clear_buffer = si_pipe_clear_buffer;
> >   }
> > diff --git a/src/gallium/drivers/radeonsi/si_pipe.h
> b/src/gallium/drivers/radeonsi/si_pipe.h
> > index bb1aebdda42..62641fde5e3 100644
> > --- a/src/gallium/drivers/radeonsi/si_pipe.h
> > +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> > @@ -688,21 +688,21 @@ enum r600_coherency {
> >
> >   void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource
> *dst,
> >                    uint64_t offset, uint64_t size, unsigned value,
> >                    enum r600_coherency coher);
> >   void si_copy_buffer(struct si_context *sctx,
> >                   struct pipe_resource *dst, struct pipe_resource *src,
> >                   uint64_t dst_offset, uint64_t src_offset, unsigned
> size,
> >                   unsigned user_flags);
> >   void cik_prefetch_TC_L2_async(struct si_context *sctx, struct
> pipe_resource *buf,
> >                             uint64_t offset, unsigned size);
> > -void cik_emit_prefetch_L2(struct si_context *sctx);
> > +void cik_emit_prefetch_L2(struct si_context *sctx, bool first_two);
> >   void si_init_cp_dma_functions(struct si_context *sctx);
> >
> >   /* si_debug.c */
> >   void si_auto_log_cs(void *data, struct u_log_context *log);
> >   void si_log_hw_flush(struct si_context *sctx);
> >   void si_log_draw_state(struct si_context *sctx, struct u_log_context
> *log);
> >   void si_log_compute_state(struct si_context *sctx, struct
> u_log_context *log);
> >   void si_init_debug_functions(struct si_context *sctx);
> >   void si_check_vm_faults(struct r600_common_context *ctx,
> >                       struct radeon_saved_cs *saved, enum ring_type
> ring);
> > diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c
> b/src/gallium/drivers/radeonsi/si_state_draw.c
> > index 1e79ccca054..8446b1b50bc 100644
> > --- a/src/gallium/drivers/radeonsi/si_state_draw.c
> > +++ b/src/gallium/drivers/radeonsi/si_state_draw.c
> > @@ -1450,36 +1450,43 @@ void si_draw_vbo(struct pipe_context *ctx, const
> struct pipe_draw_info *info)
> >                       sctx->b.render_cond_atom.emit(&sctx->b, NULL);
> >               sctx->dirty_atoms = 0;
> >
> >               si_emit_draw_packets(sctx, info, indexbuf, index_size,
> index_offset);
> >               /* <-- CUs are busy here. */
> >
> >               /* Start prefetches after the draw has been started. Both
> will run
> >                * in parallel, but starting the draw first is more
> important.
> >                */
> >               if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask)
> > -                     cik_emit_prefetch_L2(sctx);
> > +                     cik_emit_prefetch_L2(sctx, false);
> >       } else {
> >               /* If we don't wait for idle, start prefetches first, then
> set
> >                * states, and draw at the end.
> >                */
> >               if (sctx->b.flags)
> >                       si_emit_cache_flush(sctx);
> >
> > +             /* Only prefetch the first 2 items, e.g. the API VS and VBO
> > +              * descriptors. */
> >               if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask)
> > -                     cik_emit_prefetch_L2(sctx);
> > +                     cik_emit_prefetch_L2(sctx, true);
> >
> >               if (!si_upload_graphics_shader_descriptors(sctx))
> >                       return;
> >
> >               si_emit_all_states(sctx, info, 0);
> >               si_emit_draw_packets(sctx, info, indexbuf, index_size,
> index_offset);
> > +
> > +             /* Prefetch the remaining shaders after the draw has been
> > +              * started. */
> > +             if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask)
> > +                     cik_emit_prefetch_L2(sctx, false);
> >       }
> >
> >       if (unlikely(sctx->current_saved_cs)) {
> >               si_trace_emit(sctx);
> >               si_log_draw_state(sctx, sctx->b.log);
> >       }
> >
> >       /* Workaround for a VGT hang when streamout is enabled.
> >        * It must be done after drawing. */
> >       if ((sctx->b.family == CHIP_HAWAII ||
> > diff --git a/src/util/bitscan.h b/src/util/bitscan.h
> > index 5cc75f0beba..78ff8e0cea1 100644
> > --- a/src/util/bitscan.h
> > +++ b/src/util/bitscan.h
> > @@ -89,20 +89,28 @@ ffsll(long long int val);
> >
> >
> >   /* Destructively loop over all of the bits in a mask as in:
> >    *
> >    * while (mymask) {
> >    *   int i = u_bit_scan(&mymask);
> >    *   ... process element i
> >    * }
> >    *
> >    */
> > +static inline int
> > +u_bit_scan16(uint16_t *mask)
> > +{
> > +   const int i = ffs(*mask) - 1;
> > +   *mask ^= (1u << i);
> > +   return i;
> > +}
> > +
> >   static inline int
> >   u_bit_scan(unsigned *mask)
> >   {
> >      const int i = ffs(*mask) - 1;
> >      *mask ^= (1u << i);
> >      return i;
> >   }
> >
> >   static inline int
> >   u_bit_scan64(uint64_t *mask)
> >
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20180404/da0a761e/attachment-0001.html>