[Mesa-dev] [PATCH 06/11] radeonsi: add a separate dirty mask for prefetches

Mon Aug 7 06:47:01 UTC 2017

On 07.08.2017 00:20, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
> 
> so that we don't rely on si_pm4_state_enabled_and_changed, allowing us
> to move prefetches after draw calls.
> ---
>   src/gallium/drivers/radeonsi/si_cp_dma.c        | 16 ++++++++--------
>   src/gallium/drivers/radeonsi/si_descriptors.c   |  3 +--
>   src/gallium/drivers/radeonsi/si_hw_context.c    | 16 ++++++++++++++--
>   src/gallium/drivers/radeonsi/si_pipe.h          | 10 +++++++++-
>   src/gallium/drivers/radeonsi/si_state_draw.c    |  2 +-
>   src/gallium/drivers/radeonsi/si_state_shaders.c | 16 ++++++++++++++--
>   6 files changed, 47 insertions(+), 16 deletions(-)
> 
> diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
> index 24fa6fd..21202b3 100644
> --- a/src/gallium/drivers/radeonsi/si_cp_dma.c
> +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
> @@ -444,38 +444,38 @@ static void cik_prefetch_shader_async(struct si_context *sctx,
>   {
>   	struct pipe_resource *bo = &state->bo[0]->b.b;
>   	assert(state->nbo == 1);
>   
>   	cik_prefetch_TC_L2_async(sctx, bo, 0, bo->width0);
>   }
>   
>   void cik_emit_prefetch_L2(struct si_context *sctx)
>   {
>   	/* Prefetch shaders and VBO descriptors to TC L2. */
> -	if (si_pm4_state_enabled_and_changed(sctx, ls))
> +	if (sctx->prefetch_L2_mask & SI_PREFETCH_LS)
>   		cik_prefetch_shader_async(sctx, sctx->queued.named.ls);
> -	if (si_pm4_state_enabled_and_changed(sctx, hs))
> +	if (sctx->prefetch_L2_mask & SI_PREFETCH_HS)
>   		cik_prefetch_shader_async(sctx, sctx->queued.named.hs);
> -	if (si_pm4_state_enabled_and_changed(sctx, es))
> +	if (sctx->prefetch_L2_mask & SI_PREFETCH_ES)
>   		cik_prefetch_shader_async(sctx, sctx->queued.named.es);
> -	if (si_pm4_state_enabled_and_changed(sctx, gs))
> +	if (sctx->prefetch_L2_mask & SI_PREFETCH_GS)
>   		cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
> -	if (si_pm4_state_enabled_and_changed(sctx, vs))
> +	if (sctx->prefetch_L2_mask & SI_PREFETCH_VS)
>   		cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
>   
>   	/* Vertex buffer descriptors are uploaded uncached, so prefetch
>   	 * them right after the VS binary. */
> -	if (sctx->vertex_buffer_pointer_dirty) {
> +	if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) {
>   		cik_prefetch_TC_L2_async(sctx, &sctx->vertex_buffers.buffer->b.b,
>   					 sctx->vertex_buffers.buffer_offset,
>   					 sctx->vertex_elements->desc_list_byte_size);
>   	}
> -	if (si_pm4_state_enabled_and_changed(sctx, ps))
> +	if (sctx->prefetch_L2_mask & SI_PREFETCH_PS)
>   		cik_prefetch_shader_async(sctx, sctx->queued.named.ps);
>   
> -	sctx->prefetch_L2 = false;
> +	sctx->prefetch_L2_mask = 0;
>   }
>   
>   void si_init_cp_dma_functions(struct si_context *sctx)
>   {
>   	sctx->b.clear_buffer = si_clear_buffer;
>   }
> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
> index 917b0e1..43f1792 100644
> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
> @@ -1169,24 +1169,23 @@ bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
>   					      (struct r600_resource*)vb->buffer.resource,
>   					      RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
>   		}
>   	}
>   
>   	/* Don't flush the const cache. It would have a very negative effect
>   	 * on performance (confirmed by testing). New descriptors are always
>   	 * uploaded to a fresh new buffer, so I don't think flushing the const
>   	 * cache is needed. */
>   	si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
> -	if (sctx->b.chip_class >= CIK)
> -		sctx->prefetch_L2 = true;
>   	sctx->vertex_buffers_dirty = false;
>   	sctx->vertex_buffer_pointer_dirty = true;
> +	sctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS;
>   	return true;
>   }
>   
>   
>   /* CONSTANT BUFFERS */
>   
>   static unsigned
>   si_const_and_shader_buffer_descriptors_idx(unsigned shader)
>   {
>   	return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS +
> diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
> index 756b159..3582cd7 100644
> --- a/src/gallium/drivers/radeonsi/si_hw_context.c
> +++ b/src/gallium/drivers/radeonsi/si_hw_context.c
> @@ -209,22 +209,34 @@ void si_begin_new_cs(struct si_context *ctx)
>   		si_pm4_emit(ctx, ctx->init_config_gs_rings);
>   
>   	if (ctx->ce_preamble_ib)
>   		si_ce_enable_loads(ctx->ce_preamble_ib);
>   	else if (ctx->ce_ib)
>   		si_ce_enable_loads(ctx->ce_ib);
>   
>   	if (ctx->ce_ib)
>   		si_ce_restore_all_descriptors_at_ib_start(ctx);
>   
> -	if (ctx->b.chip_class >= CIK)
> -		ctx->prefetch_L2 = true;
> +	if (ctx->queued.named.ls)
> +		ctx->prefetch_L2_mask |= SI_PREFETCH_LS;
> +	if (ctx->queued.named.hs)
> +		ctx->prefetch_L2_mask |= SI_PREFETCH_HS;
> +	if (ctx->queued.named.es)
> +		ctx->prefetch_L2_mask |= SI_PREFETCH_ES;
> +	if (ctx->queued.named.gs)
> +		ctx->prefetch_L2_mask |= SI_PREFETCH_GS;
> +	if (ctx->queued.named.vs)
> +		ctx->prefetch_L2_mask |= SI_PREFETCH_VS;
> +	if (ctx->queued.named.ps)
> +		ctx->prefetch_L2_mask |= SI_PREFETCH_PS;
> +	if (ctx->vertex_buffers.buffer)
> +		ctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS;
>   
>   	/* CLEAR_STATE disables all colorbuffers, so only enable bound ones. */
>   	ctx->framebuffer.dirty_cbufs =
>   		u_bit_consecutive(0, ctx->framebuffer.state.nr_cbufs);
>   	/* CLEAR_STATE disables the zbuffer, so only enable it if it's bound. */
>   	ctx->framebuffer.dirty_zsbuf = ctx->framebuffer.state.zsbuf != NULL;
>   	/* This should always be marked as dirty to set the framebuffer scissor
>   	 * at least. */
>   	si_mark_atom_dirty(ctx, &ctx->framebuffer.atom);
>   
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
> index d213886..62b64e1 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.h
> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> @@ -61,20 +61,28 @@
>   /* Framebuffer caches. */
>   #define SI_CONTEXT_FLUSH_AND_INV_DB	(R600_CONTEXT_PRIVATE_FLAG << 7)
>   #define SI_CONTEXT_FLUSH_AND_INV_CB	(R600_CONTEXT_PRIVATE_FLAG << 8)
>   /* Engine synchronization. */
>   #define SI_CONTEXT_VS_PARTIAL_FLUSH	(R600_CONTEXT_PRIVATE_FLAG << 9)
>   #define SI_CONTEXT_PS_PARTIAL_FLUSH	(R600_CONTEXT_PRIVATE_FLAG << 10)
>   #define SI_CONTEXT_CS_PARTIAL_FLUSH	(R600_CONTEXT_PRIVATE_FLAG << 11)
>   #define SI_CONTEXT_VGT_FLUSH		(R600_CONTEXT_PRIVATE_FLAG << 12)
>   #define SI_CONTEXT_VGT_STREAMOUT_SYNC	(R600_CONTEXT_PRIVATE_FLAG << 13)
>   
> +#define SI_PREFETCH_VBO_DESCRIPTORS	(1 << 0)
> +#define SI_PREFETCH_LS			(1 << 1)
> +#define SI_PREFETCH_HS			(1 << 2)
> +#define SI_PREFETCH_ES			(1 << 3)
> +#define SI_PREFETCH_GS			(1 << 4)
> +#define SI_PREFETCH_VS			(1 << 5)
> +#define SI_PREFETCH_PS			(1 << 6)
> +
>   #define SI_MAX_BORDER_COLORS	4096
>   #define SIX_BITS		0x3F
>   
>   struct si_compute;
>   struct hash_table;
>   struct u_suballocator;
>   
>   struct si_screen {
>   	struct r600_common_screen	b;
>   	unsigned			gs_table_depth;
> @@ -272,25 +280,25 @@ struct si_context {
>   	struct si_shader_ctx_state	fixed_func_tcs_shader;
>   	struct r600_resource		*wait_mem_scratch;
>   	unsigned			wait_mem_number;
>   
>   	struct radeon_winsys_cs		*ce_ib;
>   	struct radeon_winsys_cs		*ce_preamble_ib;
>   	struct r600_resource		*ce_ram_saved_buffer;
>   	struct u_suballocator		*ce_suballocator;
>   	unsigned			ce_ram_saved_offset;
>   	uint16_t			total_ce_ram_allocated;
> +	uint16_t			prefetch_L2_mask;
>   	bool				ce_need_synchronization:1;
>   
>   	bool				gfx_flush_in_progress:1;
>   	bool				compute_is_busy:1;
> -	bool				prefetch_L2:1;
>   
>   	/* Atoms (direct states). */
>   	union si_state_atoms		atoms;
>   	unsigned			dirty_atoms; /* mask */
>   	/* PM4 states (precomputed immutable states) */
>   	unsigned			dirty_states;
>   	union si_state			queued;
>   	union si_state			emitted;
>   
>   	/* Atom declarations. */
> diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
> index 3f933fe..c78450c 100644
> --- a/src/gallium/drivers/radeonsi/si_state_draw.c
> +++ b/src/gallium/drivers/radeonsi/si_state_draw.c
> @@ -1339,21 +1339,21 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
>   	/* GFX9 scissor bug workaround. There is also a more efficient but
>   	 * more involved alternative workaround. */
>   	if (sctx->b.chip_class == GFX9 &&
>   	    si_is_atom_dirty(sctx, &sctx->b.scissors.atom))
>   		sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
>   
>   	/* Flush caches before the first state atom, which does L2 prefetches. */
>   	if (sctx->b.flags)
>   		si_emit_cache_flush(sctx);
>   
> -	if (sctx->prefetch_L2)
> +	if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask)

Maybe guard all the places where the mask is set by the chip_class 
instead, to be consistent and save the check here.

I also wonder if it might not be better to keep prefetching as an atom, 
and check the mask when there's anything to be prefetched. But in terms 
of code, both are fine, so whether you want to take a look at the 
numbers or not, patches 1-7 are

Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>

>   		cik_emit_prefetch_L2(sctx);
>   
>   	/* Emit state atoms. */
>   	mask = sctx->dirty_atoms;
>   	while (mask) {
>   		struct r600_atom *atom = sctx->atoms.array[u_bit_scan(&mask)];
>   
>   		atom->emit(&sctx->b, atom);
>   	}
>   	sctx->dirty_atoms = 0;
> diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
> index cb5a23e..5b8f907 100644
> --- a/src/gallium/drivers/radeonsi/si_state_shaders.c
> +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
> @@ -3300,22 +3300,34 @@ bool si_update_shaders(struct si_context *sctx)
>   	if (si_pm4_state_enabled_and_changed(sctx, ls) ||
>   	    si_pm4_state_enabled_and_changed(sctx, hs) ||
>   	    si_pm4_state_enabled_and_changed(sctx, es) ||
>   	    si_pm4_state_enabled_and_changed(sctx, gs) ||
>   	    si_pm4_state_enabled_and_changed(sctx, vs) ||
>   	    si_pm4_state_enabled_and_changed(sctx, ps)) {
>   		if (!si_update_spi_tmpring_size(sctx))
>   			return false;
>   	}
>   
> -	if (sctx->b.chip_class >= CIK)
> -		sctx->prefetch_L2 = true;
> +	if (sctx->b.chip_class >= CIK) {
> +		if (si_pm4_state_enabled_and_changed(sctx, ls))
> +			sctx->prefetch_L2_mask |= SI_PREFETCH_LS;
> +		if (si_pm4_state_enabled_and_changed(sctx, hs))
> +			sctx->prefetch_L2_mask |= SI_PREFETCH_HS;
> +		if (si_pm4_state_enabled_and_changed(sctx, es))
> +			sctx->prefetch_L2_mask |= SI_PREFETCH_ES;
> +		if (si_pm4_state_enabled_and_changed(sctx, gs))
> +			sctx->prefetch_L2_mask |= SI_PREFETCH_GS;
> +		if (si_pm4_state_enabled_and_changed(sctx, vs))
> +			sctx->prefetch_L2_mask |= SI_PREFETCH_VS;
> +		if (si_pm4_state_enabled_and_changed(sctx, ps))
> +			sctx->prefetch_L2_mask |= SI_PREFETCH_PS;
> +	}
>   
>   	sctx->do_update_shaders = false;
>   	return true;
>   }
>   
>   static void si_emit_scratch_state(struct si_context *sctx,
>   				  struct r600_atom *atom)
>   {
>   	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
>   
> 

-- 
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.