[Mesa-dev] [PATCH 3/3] radeonsi: add TC L2 prefetch for shaders and VBO descriptors

Fri Jan 6 11:55:56 UTC 2017

On 02.01.2017 21:18, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
>
> ---
>  src/gallium/drivers/radeonsi/si_cp_dma.c     | 12 +++++++++
>  src/gallium/drivers/radeonsi/si_pipe.h       |  2 ++
>  src/gallium/drivers/radeonsi/si_state_draw.c | 37 +++++++++++++++++++++++++++-
>  3 files changed, 50 insertions(+), 1 deletion(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
> index 653021e..13b901b 100644
> --- a/src/gallium/drivers/radeonsi/si_cp_dma.c
> +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
> @@ -360,14 +360,26 @@ void si_copy_buffer(struct si_context *sctx,
>  					 &is_first);
>
>  	if (tc_l2_flag)
>  		r600_resource(dst)->TC_L2_dirty = true;
>
>  	/* If it's not a prefetch... */
>  	if (dst_offset != src_offset)
>  		sctx->b.num_cp_dma_calls++;
>  }
>
> +void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf,
> +			      uint64_t offset, unsigned size)
> +{
> +	assert(sctx->b.chip_class >= CIK);
> +
> +	si_copy_buffer(sctx, buf, buf, offset, offset, size,
> +		       SI_CPDMA_SKIP_CHECK_CS_SPACE |
> +		       SI_CPDMA_SKIP_SYNC_AFTER |
> +		       SI_CPDMA_SKIP_SYNC_BEFORE |
> +		       SI_CPDMA_SKIP_GFX_SYNC);
> +}
> +
>  void si_init_cp_dma_functions(struct si_context *sctx)
>  {
>  	sctx->b.clear_buffer = si_clear_buffer;
>  }
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
> index dc37c8d..c0a4636 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.h
> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> @@ -374,20 +374,22 @@ void si_resource_copy_region(struct pipe_context *ctx,
>  /* si_cp_dma.c */
>  #define SI_CPDMA_SKIP_CHECK_CS_SPACE	(1 << 0) /* don't call need_cs_space */
>  #define SI_CPDMA_SKIP_SYNC_AFTER	(1 << 1) /* don't wait for DMA after the copy */
>  #define SI_CPDMA_SKIP_SYNC_BEFORE	(1 << 2) /* don't wait for DMA before the copy (RAW hazards) */
>  #define SI_CPDMA_SKIP_GFX_SYNC		(1 << 3) /* don't flush caches and don't wait for PS/CS */
>
>  void si_copy_buffer(struct si_context *sctx,
>  		    struct pipe_resource *dst, struct pipe_resource *src,
>  		    uint64_t dst_offset, uint64_t src_offset, unsigned size,
>  		    unsigned user_flags);
> +void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf,
> +			      uint64_t offset, unsigned size);
>  void si_init_cp_dma_functions(struct si_context *sctx);
>
>  /* si_debug.c */
>  void si_init_debug_functions(struct si_context *sctx);
>  void si_check_vm_faults(struct r600_common_context *ctx,
>  			struct radeon_saved_cs *saved, enum ring_type ring);
>  bool si_replace_shader(unsigned num, struct radeon_shader_binary *binary);
>
>  /* si_dma.c */
>  void si_init_dma_functions(struct si_context *sctx);
> diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
> index b3f664e..7b75602 100644
> --- a/src/gallium/drivers/radeonsi/si_state_draw.c
> +++ b/src/gallium/drivers/radeonsi/si_state_draw.c
> @@ -930,20 +930,31 @@ void si_ce_pre_draw_synchronization(struct si_context *sctx)
>  void si_ce_post_draw_synchronization(struct si_context *sctx)
>  {
>  	if (sctx->ce_need_synchronization) {
>  		radeon_emit(sctx->b.gfx.cs, PKT3(PKT3_INCREMENT_DE_COUNTER, 0, 0));
>  		radeon_emit(sctx->b.gfx.cs, 0);
>
>  		sctx->ce_need_synchronization = false;
>  	}
>  }
>
> +static void cik_prefetch_shader_async(struct si_context *sctx,
> +				      struct si_pm4_state *state)
> +{
> +	if (state) {
> +		struct pipe_resource *bo = &state->bo[0]->b.b;
> +		assert(state->nbo == 1);
> +
> +		cik_prefetch_TC_L2_async(sctx, bo, 0, bo->width0);
> +	}
> +}
> +
>  void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
>  {
>  	struct si_context *sctx = (struct si_context *)ctx;
>  	struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
>  	struct pipe_index_buffer ib = {};
>  	unsigned mask, dirty_fb_counter, dirty_tex_counter, rast_prim;
>
>  	if (likely(!info->indirect)) {
>  		/* SI-CI treat instance_count==0 as instance_count==1. There is
>  		 * no workaround for indirect draws, but we can at least skip
> @@ -1107,24 +1118,48 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
>
>  	si_need_cs_space(sctx);
>
>  	/* Since we've called r600_context_add_resource_size for vertex buffers,
>  	 * this must be called after si_need_cs_space, because we must let
>  	 * need_cs_space flush before we add buffers to the buffer list.
>  	 */
>  	if (!si_upload_vertex_buffer_descriptors(sctx))
>  		return;
>
> -	/* Flushed caches prior to emitting states. */
> +	/* Flushed caches prior to prefetching shaders. */
>  	if (sctx->b.flags)
>  		si_emit_cache_flush(sctx);
>
> +	/* Prefetch shaders and VBO descriptors to TC L2. */
> +	if (sctx->b.chip_class >= CIK) {
> +		if (si_pm4_state_changed(sctx, ls))
> +			cik_prefetch_shader_async(sctx, sctx->queued.named.ls);
> +		if (si_pm4_state_changed(sctx, hs))
> +			cik_prefetch_shader_async(sctx, sctx->queued.named.hs);
> +		if (si_pm4_state_changed(sctx, es))
> +			cik_prefetch_shader_async(sctx, sctx->queued.named.es);
> +		if (si_pm4_state_changed(sctx, gs))
> +			cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
> +		if (si_pm4_state_changed(sctx, vs))
> +			cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
> +
> +		/* Vertex buffer descriptors are uploaded uncached, so prefetch
> +		 * them right after the VS binary. */
> +		if (sctx->vertex_buffers.pointer_dirty) {
> +			cik_prefetch_TC_L2_async(sctx, &sctx->vertex_buffers.buffer->b.b,
> +						sctx->vertex_buffers.buffer_offset,
> +						sctx->vertex_elements->count * 16);
> +		}

Logically, the VBO descriptor prefetch should come directly after the API
vertex shader, right? So you're basically putting it in a sub-optimal
place for tessellation/geometry pipelines in order to simplify the code?
Okay, that may be a reasonable trade-off given how rare those pipelines are.

For the series:

Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>

> +		if (si_pm4_state_changed(sctx, ps))
> +			cik_prefetch_shader_async(sctx, sctx->queued.named.ps);
> +	}
> +
>  	/* Emit states. */
>  	mask = sctx->dirty_atoms;
>  	while (mask) {
>  		struct r600_atom *atom = sctx->atoms.array[u_bit_scan(&mask)];
>
>  		atom->emit(&sctx->b, atom);
>  	}
>  	sctx->dirty_atoms = 0;
>
>  	si_pm4_emit_dirty(sctx);
>