On Wed, Apr 4, 2018, 6:07 AM Samuel Pitoiset <samuel.pitoiset@gmail.com> wrote:
>
>
> On 04/04/2018 03:59 AM, Marek Olšák wrote:
> > From: Marek Olšák <marek.olsak@amd.com>
> >
> > so that the draw is started as soon as possible.
> > ---
> > src/gallium/drivers/radeonsi/si_cp_dma.c | 68 ++++++++++++++++++----------
> > src/gallium/drivers/radeonsi/si_pipe.h | 2 +-
> > src/gallium/drivers/radeonsi/si_state_draw.c | 11 ++++-
> > src/util/bitscan.h | 8 ++++
> > 4 files changed, 61 insertions(+), 28 deletions(-)
> >
> > diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
> > index 15bd305a350..ea2c7cf7198 100644
> > --- a/src/gallium/drivers/radeonsi/si_cp_dma.c
> > +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
> > @@ -514,80 +514,98 @@ static void cik_prefetch_shader_async(struct si_context *sctx,
> > static void cik_prefetch_VBO_descriptors(struct si_context *sctx)
> > {
> > if (!sctx->vertex_elements)
> > return;
> >
> > cik_prefetch_TC_L2_async(sctx, &sctx->vb_descriptors_buffer->b.b,
> > sctx->vb_descriptors_offset,
> > sctx->vertex_elements->desc_list_byte_size);
> > }
> >
> > -void cik_emit_prefetch_L2(struct si_context *sctx)
> > +/**
> > + * Prefetch shaders and VBO descriptors.
> > + *
> > + * \param first_two Whether only the first 2 items should be prefetched,
> > + * which are usually the API VS and VBO descriptors.
> > + */
> > +void cik_emit_prefetch_L2(struct si_context *sctx, bool first_two)
> > {
> > + unsigned mask;
> > +
> > + assert(sctx->prefetch_L2_mask);
> > +
> > + if (first_two) {
> > + mask = 1 << u_bit_scan16(&sctx->prefetch_L2_mask);
> > +
> > + if (sctx->prefetch_L2_mask)
> > + mask |= 1 << u_bit_scan16(&sctx->prefetch_L2_mask);
>
> Where do you reset the prefetch L2 mask? It looks to me like you
> are going to prefetch VS/VBOs twice in the fast draw path.

u_bit_scan16 clears the returned bit.

Marek
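
(Illustration only, not part of the patch: a minimal standalone sketch of
that destructive scan, reusing the u_bit_scan16 helper added below; the
example mask value is made up.)

    #include <stdint.h>
    #include <stdio.h>
    #include <strings.h>   /* ffs() */

    /* Same behavior as the u_bit_scan16 helper in the patch: return the
     * index of the lowest set bit and clear that bit in *mask. */
    static inline int
    u_bit_scan16(uint16_t *mask)
    {
       const int i = ffs(*mask) - 1;
       *mask ^= (1u << i);
       return i;
    }

    int main(void)
    {
       uint16_t mask = 0x5; /* arbitrary example: bits 0 and 2 set */

       /* The first call returns bit 0 and clears it from the mask. */
       int first = u_bit_scan16(&mask);
       printf("bit %d, mask now 0x%x\n", first, (unsigned)mask);  /* bit 0, mask now 0x4 */

       /* The second call returns bit 2 and leaves the mask empty, so a
        * later full pass over the same mask has nothing left to prefetch. */
       int second = u_bit_scan16(&mask);
       printf("bit %d, mask now 0x%x\n", second, (unsigned)mask); /* bit 2, mask now 0x0 */
       return 0;
    }

So after the two scans in the first_two path, those bits are already gone
from sctx->prefetch_L2_mask, and the second cik_emit_prefetch_L2() call
only prefetches the remaining items.
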
> > + } else {
> > + mask = sctx->prefetch_L2_mask;
> > + sctx->prefetch_L2_mask = 0;
> > + }
> > +
> > /* Prefetch shaders and VBO descriptors to TC L2. */
> > if (sctx->b.chip_class >= GFX9) {
> > /* Choose the right spot for the VBO prefetch. */
> > if (sctx->tes_shader.cso) {
> > - if (sctx->prefetch_L2_mask & SI_PREFETCH_HS)
> > + if (mask & SI_PREFETCH_HS)
> > cik_prefetch_shader_async(sctx, sctx->queued.named.hs);
> > - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS)
> > + if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
> > cik_prefetch_VBO_descriptors(sctx);
> > - if (sctx->prefetch_L2_mask & SI_PREFETCH_GS)
> > + if (mask & SI_PREFETCH_GS)
> > cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
> > - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS)
> > + if (mask & SI_PREFETCH_VS)
> > cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
> > } else if (sctx->gs_shader.cso) {
> > - if (sctx->prefetch_L2_mask & SI_PREFETCH_GS)
> > + if (mask & SI_PREFETCH_GS)
> > cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
> > - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS)
> > + if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
> > cik_prefetch_VBO_descriptors(sctx);
> > - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS)
> > + if (mask & SI_PREFETCH_VS)
> > cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
> > } else {
> > - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS)
> > + if (mask & SI_PREFETCH_VS)
> > cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
> > - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS)
> > + if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
> > cik_prefetch_VBO_descriptors(sctx);
> > }
> > } else {
> > /* SI-CI-VI */
> > /* Choose the right spot for the VBO prefetch. */
> > if (sctx->tes_shader.cso) {
> > - if (sctx->prefetch_L2_mask & SI_PREFETCH_LS)
> > + if (mask & SI_PREFETCH_LS)
> > cik_prefetch_shader_async(sctx, sctx->queued.named.ls);
> > - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS)
> > + if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
> > cik_prefetch_VBO_descriptors(sctx);
> > - if (sctx->prefetch_L2_mask & SI_PREFETCH_HS)
> > + if (mask & SI_PREFETCH_HS)
> > cik_prefetch_shader_async(sctx, sctx->queued.named.hs);
> > - if (sctx->prefetch_L2_mask & SI_PREFETCH_ES)
> > + if (mask & SI_PREFETCH_ES)
> > cik_prefetch_shader_async(sctx, sctx->queued.named.es);
> > - if (sctx->prefetch_L2_mask & SI_PREFETCH_GS)
> > + if (mask & SI_PREFETCH_GS)
> > cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
> > - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS)
> > + if (mask & SI_PREFETCH_VS)
> > cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
> > } else if (sctx->gs_shader.cso) {
> > - if (sctx->prefetch_L2_mask & SI_PREFETCH_ES)
> > + if (mask & SI_PREFETCH_ES)
> > cik_prefetch_shader_async(sctx, sctx->queued.named.es);
> > - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS)
> > + if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
> > cik_prefetch_VBO_descriptors(sctx);
> > - if (sctx->prefetch_L2_mask & SI_PREFETCH_GS)
> > + if (mask & SI_PREFETCH_GS)
> > cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
> > - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS)
> > + if (mask & SI_PREFETCH_VS)
> > cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
> > } else {
> > - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS)
> > + if (mask & SI_PREFETCH_VS)
> > cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
> > - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS)
> > + if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
> > cik_prefetch_VBO_descriptors(sctx);
> > }
> > }
> >
> > - if (sctx->prefetch_L2_mask & SI_PREFETCH_PS)
> > + if (mask & SI_PREFETCH_PS)
> > cik_prefetch_shader_async(sctx, sctx->queued.named.ps);
> > -
> > - sctx->prefetch_L2_mask = 0;
> > }
> >
> > void si_init_cp_dma_functions(struct si_context *sctx)
> > {
> > sctx->b.b.clear_buffer = si_pipe_clear_buffer;
> > }
> > diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
> > index bb1aebdda42..62641fde5e3 100644
> > --- a/src/gallium/drivers/radeonsi/si_pipe.h
> > +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> > @@ -688,21 +688,21 @@ enum r600_coherency {
> >
> > void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
> > uint64_t offset, uint64_t size, unsigned value,
> > enum r600_coherency coher);
> > void si_copy_buffer(struct si_context *sctx,
> > struct pipe_resource *dst, struct pipe_resource *src,
> > uint64_t dst_offset, uint64_t src_offset, unsigned size,
> > unsigned user_flags);
> > void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf,
> > uint64_t offset, unsigned size);
> > -void cik_emit_prefetch_L2(struct si_context *sctx);
> > +void cik_emit_prefetch_L2(struct si_context *sctx, bool first_two);
> > void si_init_cp_dma_functions(struct si_context *sctx);
> >
> > /* si_debug.c */
> > void si_auto_log_cs(void *data, struct u_log_context *log);
> > void si_log_hw_flush(struct si_context *sctx);
> > void si_log_draw_state(struct si_context *sctx, struct u_log_context *log);
> > void si_log_compute_state(struct si_context *sctx, struct u_log_context *log);
> > void si_init_debug_functions(struct si_context *sctx);
> > void si_check_vm_faults(struct r600_common_context *ctx,
> > struct radeon_saved_cs *saved, enum ring_type ring);
> > diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
> > index 1e79ccca054..8446b1b50bc 100644
> > --- a/src/gallium/drivers/radeonsi/si_state_draw.c
> > +++ b/src/gallium/drivers/radeonsi/si_state_draw.c
> > @@ -1450,36 +1450,43 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
> > sctx->b.render_cond_atom.emit(&sctx->b, NULL);
> > sctx->dirty_atoms = 0;
> >
> > si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
> > /* <-- CUs are busy here. */
> >
> > /* Start prefetches after the draw has been started. Both will run
> > * in parallel, but starting the draw first is more important.
> > */
> > if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask)
> > - cik_emit_prefetch_L2(sctx);
> > + cik_emit_prefetch_L2(sctx, false);
> > } else {
> > /* If we don't wait for idle, start prefetches first, then set
> > * states, and draw at the end.
> > */
> > if (sctx->b.flags)
> > si_emit_cache_flush(sctx);
> >
> > + /* Only prefetch the first 2 items, e.g. the API VS and VBO
> > + * descriptors. */
> > if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask)
> > - cik_emit_prefetch_L2(sctx);
> > + cik_emit_prefetch_L2(sctx, true);
> >
> > if (!si_upload_graphics_shader_descriptors(sctx))
> > return;
> >
> > si_emit_all_states(sctx, info, 0);
> > si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
> > +
> > + /* Prefetch the remaining shaders after the draw has been
> > + * started. */
> > + if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask)
> > + cik_emit_prefetch_L2(sctx, false);
> > }
> >
> > if (unlikely(sctx->current_saved_cs)) {
> > si_trace_emit(sctx);
> > si_log_draw_state(sctx, sctx->b.log);
> > }
> >
> > /* Workaround for a VGT hang when streamout is enabled.
> > * It must be done after drawing. */
> > if ((sctx->b.family == CHIP_HAWAII ||
> > diff --git a/src/util/bitscan.h b/src/util/bitscan.h
> > index 5cc75f0beba..78ff8e0cea1 100644
> > --- a/src/util/bitscan.h
> > +++ b/src/util/bitscan.h
> > @@ -89,20 +89,28 @@ ffsll(long long int val);
> >
> >
> > /* Destructively loop over all of the bits in a mask as in:
> > *
> > * while (mymask) {
> > * int i = u_bit_scan(&mymask);
> > * ... process element i
> > * }
> > *
> > */
> > +static inline int
> > +u_bit_scan16(uint16_t *mask)
> > +{
> > + const int i = ffs(*mask) - 1;
> > + *mask ^= (1u << i);
> > + return i;
> > +}
> > +
> > static inline int
> > u_bit_scan(unsigned *mask)
> > {
> > const int i = ffs(*mask) - 1;
> > *mask ^= (1u << i);
> > return i;
> > }
> >
> > static inline int
> > u_bit_scan64(uint64_t *mask)
> >