<div dir="auto"><div><br><br><div class="gmail_quote"><div dir="ltr">On Wed, Apr 4, 2018, 6:07 AM Samuel Pitoiset <<a href="mailto:samuel.pitoiset@gmail.com">samuel.pitoiset@gmail.com</a>> wrote:<br></div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><br>
<br>
On 04/04/2018 03:59 AM, Marek Olšák wrote:<br>
> From: Marek Olšák <<a href="mailto:marek.olsak@amd.com" target="_blank" rel="noreferrer">marek.olsak@amd.com</a>><br>
><br>
> so that the draw is started as soon as possible.<br>
> ---<br>
>   src/gallium/drivers/radeonsi/si_cp_dma.c     | 68 ++++++++++++++++++----------<br>
>   src/gallium/drivers/radeonsi/si_pipe.h       |  2 +-<br>
>   src/gallium/drivers/radeonsi/si_state_draw.c | 11 ++++-<br>
>   src/util/bitscan.h                           |  8 ++++<br>
>   4 files changed, 61 insertions(+), 28 deletions(-)<br>
><br>
> diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c<br>
> index 15bd305a350..ea2c7cf7198 100644<br>
> --- a/src/gallium/drivers/radeonsi/si_cp_dma.c<br>
> +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c<br>
> @@ -514,80 +514,98 @@ static void cik_prefetch_shader_async(struct si_context *sctx,<br>
>   static void cik_prefetch_VBO_descriptors(struct si_context *sctx)<br>
>   {<br>
>       if (!sctx->vertex_elements)<br>
>               return;<br>
><br>
>       cik_prefetch_TC_L2_async(sctx, &sctx->vb_descriptors_buffer->b.b,<br>
>                                sctx->vb_descriptors_offset,<br>
>                                sctx->vertex_elements->desc_list_byte_size);<br>
>   }<br>
><br>
> -void cik_emit_prefetch_L2(struct si_context *sctx)<br>
> +/**<br>
> + * Prefetch shaders and VBO descriptors.<br>
> + *<br>
> + * \param first_two  Whether only the first 2 items should be prefetched,<br>
> + *                   which are usually the API VS and VBO descriptors.<br>
> + */<br>
> +void cik_emit_prefetch_L2(struct si_context *sctx, bool first_two)<br>
>   {<br>
> +     unsigned mask;<br>
> +<br>
> +     assert(sctx->prefetch_L2_mask);<br>
> +<br>
> +     if (first_two) {<br>
> +             mask = 1 << u_bit_scan16(&sctx->prefetch_L2_mask);<br>
> +<br>
> +             if (sctx->prefetch_L2_mask)<br>
> +                     mask |= 1 << u_bit_scan16(&sctx->prefetch_L2_mask);<br>
<br>
Where do you reset the prefetch L2 mask? It looks to me like you<br>
are going to prefetch VS/VBOs twice in the fast draw path.<br></blockquote></div></div><div dir="auto"><br></div><div dir="auto">u_bit_scan16 clears the returned bit.</div><div dir="auto"><br></div><div dir="auto">Marek</div><div dir="auto"><br></div><div dir="auto"><div class="gmail_quote"><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
> +     } else {<br>
> +             mask = sctx->prefetch_L2_mask;<br>
> +             sctx->prefetch_L2_mask = 0;<br>
> +     }<br>
> +<br>
>       /* Prefetch shaders and VBO descriptors to TC L2. */<br>
>       if (sctx->b.chip_class >= GFX9) {<br>
>               /* Choose the right spot for the VBO prefetch. */<br>
>               if (sctx->tes_shader.cso) {<br>
> -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_HS)<br>
> +                     if (mask & SI_PREFETCH_HS)<br>
>                               cik_prefetch_shader_async(sctx, sctx->queued.named.hs);<br>
> -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS)<br>
> +                     if (mask & SI_PREFETCH_VBO_DESCRIPTORS)<br>
>                               cik_prefetch_VBO_descriptors(sctx);<br>
> -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_GS)<br>
> +                     if (mask & SI_PREFETCH_GS)<br>
>                               cik_prefetch_shader_async(sctx, sctx->queued.named.gs);<br>
> -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_VS)<br>
> +                     if (mask & SI_PREFETCH_VS)<br>
>                               cik_prefetch_shader_async(sctx, sctx->queued.named.vs);<br>
>               } else if (sctx->gs_shader.cso) {<br>
> -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_GS)<br>
> +                     if (mask & SI_PREFETCH_GS)<br>
>                               cik_prefetch_shader_async(sctx, sctx->queued.named.gs);<br>
> -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS)<br>
> +                     if (mask & SI_PREFETCH_VBO_DESCRIPTORS)<br>
>                               cik_prefetch_VBO_descriptors(sctx);<br>
> -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_VS)<br>
> +                     if (mask & SI_PREFETCH_VS)<br>
>                               cik_prefetch_shader_async(sctx, sctx->queued.named.vs);<br>
>               } else {<br>
> -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_VS)<br>
> +                     if (mask & SI_PREFETCH_VS)<br>
>                               cik_prefetch_shader_async(sctx, sctx->queued.named.vs);<br>
> -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS)<br>
> +                     if (mask & SI_PREFETCH_VBO_DESCRIPTORS)<br>
>                               cik_prefetch_VBO_descriptors(sctx);<br>
>               }<br>
>       } else {<br>
>               /* SI-CI-VI */<br>
>               /* Choose the right spot for the VBO prefetch. */<br>
>               if (sctx->tes_shader.cso) {<br>
> -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_LS)<br>
> +                     if (mask & SI_PREFETCH_LS)<br>
>                               cik_prefetch_shader_async(sctx, sctx->queued.named.ls);<br>
> -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS)<br>
> +                     if (mask & SI_PREFETCH_VBO_DESCRIPTORS)<br>
>                               cik_prefetch_VBO_descriptors(sctx);<br>
> -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_HS)<br>
> +                     if (mask & SI_PREFETCH_HS)<br>
>                               cik_prefetch_shader_async(sctx, sctx->queued.named.hs);<br>
> -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_ES)<br>
> +                     if (mask & SI_PREFETCH_ES)<br>
>                               cik_prefetch_shader_async(sctx, sctx->queued.named.es);<br>
> -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_GS)<br>
> +                     if (mask & SI_PREFETCH_GS)<br>
>                               cik_prefetch_shader_async(sctx, sctx->queued.named.gs);<br>
> -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_VS)<br>
> +                     if (mask & SI_PREFETCH_VS)<br>
>                               cik_prefetch_shader_async(sctx, sctx->queued.named.vs);<br>
>               } else if (sctx->gs_shader.cso) {<br>
> -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_ES)<br>
> +                     if (mask & SI_PREFETCH_ES)<br>
>                               cik_prefetch_shader_async(sctx, sctx->queued.named.es);<br>
> -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS)<br>
> +                     if (mask & SI_PREFETCH_VBO_DESCRIPTORS)<br>
>                               cik_prefetch_VBO_descriptors(sctx);<br>
> -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_GS)<br>
> +                     if (mask & SI_PREFETCH_GS)<br>
>                               cik_prefetch_shader_async(sctx, sctx->queued.named.gs);<br>
> -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_VS)<br>
> +                     if (mask & SI_PREFETCH_VS)<br>
>                               cik_prefetch_shader_async(sctx, sctx->queued.named.vs);<br>
>               } else {<br>
> -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_VS)<br>
> +                     if (mask & SI_PREFETCH_VS)<br>
>                               cik_prefetch_shader_async(sctx, sctx->queued.named.vs);<br>
> -                     if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS)<br>
> +                     if (mask & SI_PREFETCH_VBO_DESCRIPTORS)<br>
>                               cik_prefetch_VBO_descriptors(sctx);<br>
>               }<br>
>       }<br>
><br>
> -     if (sctx->prefetch_L2_mask & SI_PREFETCH_PS)<br>
> +     if (mask & SI_PREFETCH_PS)<br>
>               cik_prefetch_shader_async(sctx, sctx->queued.named.ps);<br>
> -<br>
> -     sctx->prefetch_L2_mask = 0;<br>
>   }<br>
><br>
>   void si_init_cp_dma_functions(struct si_context *sctx)<br>
>   {<br>
>       sctx->b.b.clear_buffer = si_pipe_clear_buffer;<br>
>   }<br>
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h<br>
> index bb1aebdda42..62641fde5e3 100644<br>
> --- a/src/gallium/drivers/radeonsi/si_pipe.h<br>
> +++ b/src/gallium/drivers/radeonsi/si_pipe.h<br>
> @@ -688,21 +688,21 @@ enum r600_coherency {<br>
><br>
>   void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,<br>
>                    uint64_t offset, uint64_t size, unsigned value,<br>
>                    enum r600_coherency coher);<br>
>   void si_copy_buffer(struct si_context *sctx,<br>
>                   struct pipe_resource *dst, struct pipe_resource *src,<br>
>                   uint64_t dst_offset, uint64_t src_offset, unsigned size,<br>
>                   unsigned user_flags);<br>
>   void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf,<br>
>                             uint64_t offset, unsigned size);<br>
> -void cik_emit_prefetch_L2(struct si_context *sctx);<br>
> +void cik_emit_prefetch_L2(struct si_context *sctx, bool first_two);<br>
>   void si_init_cp_dma_functions(struct si_context *sctx);<br>
><br>
>   /* si_debug.c */<br>
>   void si_auto_log_cs(void *data, struct u_log_context *log);<br>
>   void si_log_hw_flush(struct si_context *sctx);<br>
>   void si_log_draw_state(struct si_context *sctx, struct u_log_context *log);<br>
>   void si_log_compute_state(struct si_context *sctx, struct u_log_context *log);<br>
>   void si_init_debug_functions(struct si_context *sctx);<br>
>   void si_check_vm_faults(struct r600_common_context *ctx,<br>
>                       struct radeon_saved_cs *saved, enum ring_type ring);<br>
> diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c<br>
> index 1e79ccca054..8446b1b50bc 100644<br>
> --- a/src/gallium/drivers/radeonsi/si_state_draw.c<br>
> +++ b/src/gallium/drivers/radeonsi/si_state_draw.c<br>
> @@ -1450,36 +1450,43 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)<br>
>                       sctx->b.render_cond_atom.emit(&sctx->b, NULL);<br>
>               sctx->dirty_atoms = 0;<br>
><br>
>               si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);<br>
>               /* <-- CUs are busy here. */<br>
><br>
>               /* Start prefetches after the draw has been started. Both will run<br>
>                * in parallel, but starting the draw first is more important.<br>
>                */<br>
>               if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask)<br>
> -                     cik_emit_prefetch_L2(sctx);<br>
> +                     cik_emit_prefetch_L2(sctx, false);<br>
>       } else {<br>
>               /* If we don't wait for idle, start prefetches first, then set<br>
>                * states, and draw at the end.<br>
>                */<br>
>               if (sctx->b.flags)<br>
>                       si_emit_cache_flush(sctx);<br>
><br>
> +             /* Only prefetch the first 2 items, e.g. the API VS and VBO<br>
> +              * descriptors. */<br>
>               if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask)<br>
> -                     cik_emit_prefetch_L2(sctx);<br>
> +                     cik_emit_prefetch_L2(sctx, true);<br>
><br>
>               if (!si_upload_graphics_shader_descriptors(sctx))<br>
>                       return;<br>
><br>
>               si_emit_all_states(sctx, info, 0);<br>
>               si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);<br>
> +<br>
> +             /* Prefetch the remaining shaders after the draw has been<br>
> +              * started. */<br>
> +             if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask)<br>
> +                     cik_emit_prefetch_L2(sctx, false);<br>
>       }<br>
><br>
>       if (unlikely(sctx->current_saved_cs)) {<br>
>               si_trace_emit(sctx);<br>
>               si_log_draw_state(sctx, sctx->b.log);<br>
>       }<br>
><br>
>       /* Workaround for a VGT hang when streamout is enabled.<br>
>        * It must be done after drawing. */<br>
>       if ((sctx->b.family == CHIP_HAWAII ||<br>
> diff --git a/src/util/bitscan.h b/src/util/bitscan.h<br>
> index 5cc75f0beba..78ff8e0cea1 100644<br>
> --- a/src/util/bitscan.h<br>
> +++ b/src/util/bitscan.h<br>
> @@ -89,20 +89,28 @@ ffsll(long long int val);<br>
><br>
><br>
>   /* Destructively loop over all of the bits in a mask as in:<br>
>    *<br>
>    * while (mymask) {<br>
>    *   int i = u_bit_scan(&mymask);<br>
>    *   ... process element i<br>
>    * }<br>
>    *<br>
>    */<br>
> +static inline int<br>
> +u_bit_scan16(uint16_t *mask)<br>
> +{<br>
> +   const int i = ffs(*mask) - 1;<br>
> +   *mask ^= (1u << i);<br>
> +   return i;<br>
> +}<br>
> +<br>
>   static inline int<br>
>   u_bit_scan(unsigned *mask)<br>
>   {<br>
>      const int i = ffs(*mask) - 1;<br>
>      *mask ^= (1u << i);<br>
>      return i;<br>
>   }<br>
><br>
>   static inline int<br>
>   u_bit_scan64(uint64_t *mask)<br>
><br>
</blockquote></div></div></div>