[Mesa-dev] [PATCH 11/11] radeonsi: use optimal packet order when doing a pipeline sync

Mon Aug 7 06:54:56 UTC 2017

On 07.08.2017 00:20, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
> 
> Process most new SET packets in parallel with previous draw calls, then
> flush caches and wait, start the draw, and do L2 prefetches last.
> 
> This decreases the [CP busy / SPI busy] ratio (verified with GRBM perf
> counters). In other words, the time window when shaders are idle (between
> the wait and the draw) is much shorter now.
> ---
>   src/gallium/drivers/radeonsi/si_state_draw.c | 117 +++++++++++++++++++--------
>   1 file changed, 83 insertions(+), 34 deletions(-)
> 
> diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
> index 9df5b7a..22b7f56 100644
> --- a/src/gallium/drivers/radeonsi/si_state_draw.c
> +++ b/src/gallium/drivers/radeonsi/si_state_draw.c
> @@ -1155,28 +1155,63 @@ void si_ce_pre_draw_synchronization(struct si_context *sctx)
>   void si_ce_post_draw_synchronization(struct si_context *sctx)
>   {
>   	if (sctx->ce_need_synchronization) {
>   		radeon_emit(sctx->b.gfx.cs, PKT3(PKT3_INCREMENT_DE_COUNTER, 0, 0));
>   		radeon_emit(sctx->b.gfx.cs, 0); /* unused */
>   
>   		sctx->ce_need_synchronization = false;
>   	}
>   }
>   
> +static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info,
> +			       unsigned skip_atom_mask)
> +{
> +	/* Emit state atoms. */
> +	unsigned mask = sctx->dirty_atoms & ~skip_atom_mask;
> +	while (mask) {
> +		struct r600_atom *atom = sctx->atoms.array[u_bit_scan(&mask)];
> +
> +		atom->emit(&sctx->b, atom);
> +	}
> +	sctx->dirty_atoms &= skip_atom_mask;
> +
> +	/* Emit states. */
> +	mask = sctx->dirty_states;
> +	while (mask) {
> +		unsigned i = u_bit_scan(&mask);
> +		struct si_pm4_state *state = sctx->queued.array[i];
> +
> +		if (!state || sctx->emitted.array[i] == state)
> +			continue;
> +
> +		si_pm4_emit(sctx, state);
> +		sctx->emitted.array[i] = state;
> +	}
> +	sctx->dirty_states = 0;
> +
> +	/* Emit draw states. */
> +	unsigned num_patches = 0;
> +
> +	si_emit_rasterizer_prim_state(sctx);
> +	if (sctx->tes_shader.cso)
> +		si_emit_derived_tess_state(sctx, info, &num_patches);
> +	si_emit_vs_state(sctx, info);
> +	si_emit_draw_registers(sctx, info, num_patches);
> +}
> +
>   void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
>   {
>   	struct si_context *sctx = (struct si_context *)ctx;
>   	struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
>   	struct pipe_resource *indexbuf = info->index.resource;
> -	unsigned mask, dirty_tex_counter;
> +	unsigned dirty_tex_counter;
>   	enum pipe_prim_type rast_prim;
> -	unsigned num_patches = 0;
>   	unsigned index_size = info->index_size;
>   	unsigned index_offset = info->indirect ? info->start * index_size : 0;
>   
>   	if (likely(!info->indirect)) {
>   		/* SI-CI treat instance_count==0 as instance_count==1. There is
>   		 * no workaround for indirect draws, but we can at least skip
>   		 * direct draws.
>   		 */
>   		if (unlikely(!info->instance_count))
>   			return;
> @@ -1244,23 +1279,20 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
>   
>   		if (gs_tri_strip_adj_fix != sctx->gs_tri_strip_adj_fix) {
>   			sctx->gs_tri_strip_adj_fix = gs_tri_strip_adj_fix;
>   			sctx->do_update_shaders = true;
>   		}
>   	}
>   
>   	if (sctx->do_update_shaders && !si_update_shaders(sctx))
>   		return;
>   
> -	if (!si_upload_graphics_shader_descriptors(sctx))
> -		return;
> -
>   	if (index_size) {
>   		/* Translate or upload, if needed. */
>   		/* 8-bit indices are supported on VI. */
>   		if (sctx->b.chip_class <= CIK && index_size == 1) {
>   			unsigned start, count, start_offset, size, offset;
>   			void *ptr;
>   
>   			si_get_draw_start_count(sctx, info, &start, &count);
>   			start_offset = start * 2;
>   			size = count * 2;
> @@ -1335,58 +1367,75 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
>   	 */
>   	if (!si_upload_vertex_buffer_descriptors(sctx))
>   		return;
>   
>   	/* GFX9 scissor bug workaround. There is also a more efficient but
>   	 * more involved alternative workaround. */
>   	if (sctx->b.chip_class == GFX9 &&
>   	    si_is_atom_dirty(sctx, &sctx->b.scissors.atom))
>   		sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
>   
> -	/* Flush caches before the first state atom, which does L2 prefetches. */
> -	if (sctx->b.flags)
> +	/* Use optimal packet order based on whether we need to sync the pipeline. */
> +	if (unlikely(sctx->b.flags & (SI_CONTEXT_FLUSH_AND_INV_CB |
> +				      SI_CONTEXT_FLUSH_AND_INV_DB |
> +				      SI_CONTEXT_PS_PARTIAL_FLUSH |
> +				      SI_CONTEXT_CS_PARTIAL_FLUSH))) {
> +		/* If we have to wait for idle, set all states first, so that all
> +		 * SET packets are processed in parallel with previous draw calls.
> +		 * Then upload descriptors, set shader pointers, and draw, and
> +		 * prefetch at the end. This ensures that the time the CUs
> +		 * are idle is very short. (there are only CE dumps and SET_SH

The CE dump part of the comment is not true (yet).

This is a nice improvement! With the comment fixed, patches 7-11:

Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>

> +		 * packets between the wait and the draw)
> +		 */
> +		struct r600_atom *shader_pointers = &sctx->shader_pointers.atom;
> +
> +		/* Emit all states except shader pointers. */
> +		si_emit_all_states(sctx, info, 1 << shader_pointers->id);
>   		si_emit_cache_flush(sctx);
>   
> -	if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask)
> -		cik_emit_prefetch_L2(sctx);
> +		/* <-- CUs are idle here. */
> +		if (!si_upload_graphics_shader_descriptors(sctx))
> +			return;
>   
> -	/* Emit state atoms. */
> -	mask = sctx->dirty_atoms;
> -	while (mask) {
> -		struct r600_atom *atom = sctx->atoms.array[u_bit_scan(&mask)];
> +		/* Set shader pointers after descriptors are uploaded. */
> +		if (si_is_atom_dirty(sctx, shader_pointers)) {
> +			shader_pointers->emit(&sctx->b, NULL);
> +			sctx->dirty_atoms = 0;
> +		}
>   
> -		atom->emit(&sctx->b, atom);
> -	}
> -	sctx->dirty_atoms = 0;
> +		si_ce_pre_draw_synchronization(sctx);
> +		si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
> +		/* <-- CUs are busy here. */
>   
> -	/* Emit states. */
> -	mask = sctx->dirty_states;
> -	while (mask) {
> -		unsigned i = u_bit_scan(&mask);
> -		struct si_pm4_state *state = sctx->queued.array[i];
> +		/* Start prefetches after the draw has been started. Both will run
> +		 * in parallel, but starting the draw first is more important.
> +		 */
> +		if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask)
> +			cik_emit_prefetch_L2(sctx);
> +	} else {
> +		/* If we don't wait for idle, start prefetches first, then set
> +		 * states, and draw at the end.
> +		 */
> +		if (sctx->b.flags)
> +			si_emit_cache_flush(sctx);
>   
> -		if (!state || sctx->emitted.array[i] == state)
> -			continue;
> +		if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask)
> +			cik_emit_prefetch_L2(sctx);
>   
> -		si_pm4_emit(sctx, state);
> -		sctx->emitted.array[i] = state;
> -	}
> -	sctx->dirty_states = 0;
> +		if (!si_upload_graphics_shader_descriptors(sctx))
> +			return;
>   
> -	si_emit_rasterizer_prim_state(sctx);
> -	if (sctx->tes_shader.cso)
> -		si_emit_derived_tess_state(sctx, info, &num_patches);
> -	si_emit_vs_state(sctx, info);
> -	si_emit_draw_registers(sctx, info, num_patches);
> +		si_emit_all_states(sctx, info, 0);
> +		si_ce_pre_draw_synchronization(sctx);
> +		si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
> +	}
>   
> -	si_ce_pre_draw_synchronization(sctx);
> -	si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
>   	si_ce_post_draw_synchronization(sctx);
>   
>   	if (sctx->trace_buf)
>   		si_trace_emit(sctx);
>   
>   	/* Workaround for a VGT hang when streamout is enabled.
>   	 * It must be done after drawing. */
>   	if ((sctx->b.family == CHIP_HAWAII ||
>   	     sctx->b.family == CHIP_TONGA ||
>   	     sctx->b.family == CHIP_FIJI) &&
> 

-- 
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.