[Mesa-dev] [PATCH 15/17] radeonsi: always prefetch later shaders after the draw packet

Samuel Pitoiset samuel.pitoiset at gmail.com
Thu Apr 5 08:18:57 UTC 2018


Reviewed-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>

On 04/05/2018 02:33 AM, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
> 
> so that the draw is started as soon as possible.
> 
> v2: only prefetch the API VS and VBO descriptors
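
The ordering change is easiest to see from the two call sites in
si_draw_vbo(). A condensed sketch, using only names that appear in the
hunks below (state emission and flush details omitted):

	/* Fast path: no cache flush pending -> emit the draw first,
	 * then prefetch everything while the CUs are already busy. */
	si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
	if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask)
		cik_emit_prefetch_L2(sctx, false); /* all stages + VBO */

	/* Slow path: a flush stalls us anyway -> prefetch just the API VS
	 * and VBO descriptors so vertex work can start immediately, emit
	 * states and the draw, then prefetch the remaining shaders. */
	if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask)
		cik_emit_prefetch_L2(sctx, true);  /* API VS + VBO only */
	si_emit_all_states(sctx, info, 0);
	si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
	if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask)
		cik_emit_prefetch_L2(sctx, false); /* remaining stages */
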
> ---
>   src/gallium/drivers/radeonsi/si_cp_dma.c     | 89 +++++++++++++++++++++-------
>   src/gallium/drivers/radeonsi/si_pipe.h       |  2 +-
>   src/gallium/drivers/radeonsi/si_state_draw.c | 10 +++-
>   3 files changed, 75 insertions(+), 26 deletions(-)
> 
> diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
> index 15bd305a350..1e26774ffee 100644
> --- a/src/gallium/drivers/radeonsi/si_cp_dma.c
> +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
> @@ -514,80 +514,123 @@ static void cik_prefetch_shader_async(struct si_context *sctx,
>   static void cik_prefetch_VBO_descriptors(struct si_context *sctx)
>   {
>   	if (!sctx->vertex_elements)
>   		return;
>   
>   	cik_prefetch_TC_L2_async(sctx, &sctx->vb_descriptors_buffer->b.b,
>   				 sctx->vb_descriptors_offset,
>   				 sctx->vertex_elements->desc_list_byte_size);
>   }
>   
> -void cik_emit_prefetch_L2(struct si_context *sctx)
> +/**
> + * Prefetch shaders and VBO descriptors.
> + *
> + * \param vertex_stage_only  Whether only the API VS and VBO descriptors
> + *                           should be prefetched.
> + */
> +void cik_emit_prefetch_L2(struct si_context *sctx, bool vertex_stage_only)
>   {
> +	unsigned mask = sctx->prefetch_L2_mask;
> +	assert(mask);
> +
>   	/* Prefetch shaders and VBO descriptors to TC L2. */
>   	if (sctx->b.chip_class >= GFX9) {
>   		/* Choose the right spot for the VBO prefetch. */
>   		if (sctx->tes_shader.cso) {
> -			if (sctx->prefetch_L2_mask & SI_PREFETCH_HS)
> +			if (mask & SI_PREFETCH_HS)
>   				cik_prefetch_shader_async(sctx, sctx->queued.named.hs);
> -			if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS)
> +			if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
>   				cik_prefetch_VBO_descriptors(sctx);
> -			if (sctx->prefetch_L2_mask & SI_PREFETCH_GS)
> +			if (vertex_stage_only) {
> +				sctx->prefetch_L2_mask &= ~(SI_PREFETCH_HS |
> +							    SI_PREFETCH_VBO_DESCRIPTORS);
> +				return;
> +			}
> +
> +			if (mask & SI_PREFETCH_GS)
>   				cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
> -			if (sctx->prefetch_L2_mask & SI_PREFETCH_VS)
> +			if (mask & SI_PREFETCH_VS)
>   				cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
>   		} else if (sctx->gs_shader.cso) {
> -			if (sctx->prefetch_L2_mask & SI_PREFETCH_GS)
> +			if (mask & SI_PREFETCH_GS)
>   				cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
> -			if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS)
> +			if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
>   				cik_prefetch_VBO_descriptors(sctx);
> -			if (sctx->prefetch_L2_mask & SI_PREFETCH_VS)
> +			if (vertex_stage_only) {
> +				sctx->prefetch_L2_mask &= ~(SI_PREFETCH_GS |
> +							    SI_PREFETCH_VBO_DESCRIPTORS);
> +				return;
> +			}
> +
> +			if (mask & SI_PREFETCH_VS)
>   				cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
>   		} else {
> -			if (sctx->prefetch_L2_mask & SI_PREFETCH_VS)
> +			if (mask & SI_PREFETCH_VS)
>   				cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
> -			if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS)
> +			if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
>   				cik_prefetch_VBO_descriptors(sctx);
> +			if (vertex_stage_only) {
> +				sctx->prefetch_L2_mask &= ~(SI_PREFETCH_VS |
> +							    SI_PREFETCH_VBO_DESCRIPTORS);
> +				return;
> +			}
>   		}
>   	} else {
>   		/* SI-CI-VI */
>   		/* Choose the right spot for the VBO prefetch. */
>   		if (sctx->tes_shader.cso) {
> -			if (sctx->prefetch_L2_mask & SI_PREFETCH_LS)
> +			if (mask & SI_PREFETCH_LS)
>   				cik_prefetch_shader_async(sctx, sctx->queued.named.ls);
> -			if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS)
> +			if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
>   				cik_prefetch_VBO_descriptors(sctx);
> -			if (sctx->prefetch_L2_mask & SI_PREFETCH_HS)
> +			if (vertex_stage_only) {
> +				sctx->prefetch_L2_mask &= ~(SI_PREFETCH_LS |
> +							    SI_PREFETCH_VBO_DESCRIPTORS);
> +				return;
> +			}
> +
> +			if (mask & SI_PREFETCH_HS)
>   				cik_prefetch_shader_async(sctx, sctx->queued.named.hs);
> -			if (sctx->prefetch_L2_mask & SI_PREFETCH_ES)
> +			if (mask & SI_PREFETCH_ES)
>   				cik_prefetch_shader_async(sctx, sctx->queued.named.es);
> -			if (sctx->prefetch_L2_mask & SI_PREFETCH_GS)
> +			if (mask & SI_PREFETCH_GS)
>   				cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
> -			if (sctx->prefetch_L2_mask & SI_PREFETCH_VS)
> +			if (mask & SI_PREFETCH_VS)
>   				cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
>   		} else if (sctx->gs_shader.cso) {
> -			if (sctx->prefetch_L2_mask & SI_PREFETCH_ES)
> +			if (mask & SI_PREFETCH_ES)
>   				cik_prefetch_shader_async(sctx, sctx->queued.named.es);
> -			if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS)
> +			if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
>   				cik_prefetch_VBO_descriptors(sctx);
> -			if (sctx->prefetch_L2_mask & SI_PREFETCH_GS)
> +			if (vertex_stage_only) {
> +				sctx->prefetch_L2_mask &= ~(SI_PREFETCH_ES |
> +							    SI_PREFETCH_VBO_DESCRIPTORS);
> +				return;
> +			}
> +
> +			if (mask & SI_PREFETCH_GS)
>   				cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
> -			if (sctx->prefetch_L2_mask & SI_PREFETCH_VS)
> +			if (mask & SI_PREFETCH_VS)
>   				cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
>   		} else {
> -			if (sctx->prefetch_L2_mask & SI_PREFETCH_VS)
> +			if (mask & SI_PREFETCH_VS)
>   				cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
> -			if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS)
> +			if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
>   				cik_prefetch_VBO_descriptors(sctx);
> +			if (vertex_stage_only) {
> +				sctx->prefetch_L2_mask &= ~(SI_PREFETCH_VS |
> +							    SI_PREFETCH_VBO_DESCRIPTORS);
> +				return;
> +			}
>   		}
>   	}
>   
> -	if (sctx->prefetch_L2_mask & SI_PREFETCH_PS)
> +	if (mask & SI_PREFETCH_PS)
>   		cik_prefetch_shader_async(sctx, sctx->queued.named.ps);
>   
>   	sctx->prefetch_L2_mask = 0;
>   }
>   
>   void si_init_cp_dma_functions(struct si_context *sctx)
>   {
>   	sctx->b.b.clear_buffer = si_pipe_clear_buffer;
>   }
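
One detail worth highlighting in the function above: the mask is
snapshotted into a local before the walk, and the vertex_stage_only
early-outs clear only the bits they actually consumed, so the post-draw
call still sees the remaining stages. Condensed from the no-tess/no-GS
branch above:

	unsigned mask = sctx->prefetch_L2_mask;	/* snapshot */
	assert(mask);

	if (mask & SI_PREFETCH_VS)
		cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
	if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
		cik_prefetch_VBO_descriptors(sctx);
	if (vertex_stage_only) {
		/* Keep the other bits set for the post-draw call. */
		sctx->prefetch_L2_mask &= ~(SI_PREFETCH_VS |
					    SI_PREFETCH_VBO_DESCRIPTORS);
		return;
	}
	...
	sctx->prefetch_L2_mask = 0;	/* everything was prefetched */
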
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
> index bb1aebdda42..62641fde5e3 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.h
> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> @@ -688,21 +688,21 @@ enum r600_coherency {
>   
>   void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
>   		     uint64_t offset, uint64_t size, unsigned value,
>   		     enum r600_coherency coher);
>   void si_copy_buffer(struct si_context *sctx,
>   		    struct pipe_resource *dst, struct pipe_resource *src,
>   		    uint64_t dst_offset, uint64_t src_offset, unsigned size,
>   		    unsigned user_flags);
>   void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf,
>   			      uint64_t offset, unsigned size);
> -void cik_emit_prefetch_L2(struct si_context *sctx);
> +void cik_emit_prefetch_L2(struct si_context *sctx, bool vertex_stage_only);
>   void si_init_cp_dma_functions(struct si_context *sctx);
>   
>   /* si_debug.c */
>   void si_auto_log_cs(void *data, struct u_log_context *log);
>   void si_log_hw_flush(struct si_context *sctx);
>   void si_log_draw_state(struct si_context *sctx, struct u_log_context *log);
>   void si_log_compute_state(struct si_context *sctx, struct u_log_context *log);
>   void si_init_debug_functions(struct si_context *sctx);
>   void si_check_vm_faults(struct r600_common_context *ctx,
>   			struct radeon_saved_cs *saved, enum ring_type ring);
> diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
> index 1e79ccca054..ebaf7d65955 100644
> --- a/src/gallium/drivers/radeonsi/si_state_draw.c
> +++ b/src/gallium/drivers/radeonsi/si_state_draw.c
> @@ -1450,36 +1450,42 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
>   			sctx->b.render_cond_atom.emit(&sctx->b, NULL);
>   		sctx->dirty_atoms = 0;
>   
>   		si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
>   		/* <-- CUs are busy here. */
>   
>   		/* Start prefetches after the draw has been started. Both will run
>   		 * in parallel, but starting the draw first is more important.
>   		 */
>   		if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask)
> -			cik_emit_prefetch_L2(sctx);
> +			cik_emit_prefetch_L2(sctx, false);
>   	} else {
>   		/* If we don't wait for idle, start prefetches first, then set
>   		 * states, and draw at the end.
>   		 */
>   		if (sctx->b.flags)
>   			si_emit_cache_flush(sctx);
>   
> +		/* Only prefetch the API VS and VBO descriptors. */
>   		if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask)
> -			cik_emit_prefetch_L2(sctx);
> +			cik_emit_prefetch_L2(sctx, true);
>   
>   		if (!si_upload_graphics_shader_descriptors(sctx))
>   			return;
>   
>   		si_emit_all_states(sctx, info, 0);
>   		si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
> +
> +		/* Prefetch the remaining shaders after the draw has been
> +		 * started. */
> +		if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask)
> +			cik_emit_prefetch_L2(sctx, false);
>   	}
>   
>   	if (unlikely(sctx->current_saved_cs)) {
>   		si_trace_emit(sctx);
>   		si_log_draw_state(sctx, sctx->b.log);
>   	}
>   
>   	/* Workaround for a VGT hang when streamout is enabled.
>   	 * It must be done after drawing. */
>   	if ((sctx->b.family == CHIP_HAWAII ||
> 
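
For readers wondering why each branch clears a different bit in the
vertex_stage_only path: the prefetch bit that holds the API VS binary
depends on where the VS runs in the selected pipeline (on GFX9 the VS is
merged into the LS-HS or ES-GS shader). As I read the branches:

	chip class  tess  GS    API VS lives in
	GFX9        yes   -     HS  (merged LS-HS)
	GFX9        no    yes   GS  (merged ES-GS)
	GFX9        no    no    VS
	SI-CI-VI    yes   -     LS
	SI-CI-VI    no    yes   ES
	SI-CI-VI    no    no    VS

In every case the VBO descriptors are prefetched right after that stage,
which is why SI_PREFETCH_VBO_DESCRIPTORS is cleared together with it.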

