[Mesa-dev] [PATCH 07/17] radeonsi: atomize L2 prefetches

Thu Jan 26 16:04:23 UTC 2017

From: Marek Olšák <marek.olsak at amd.com>

Turn the L2 prefetches into a state atom in order to move the big
conditional statement out of si_draw_vbo.
---
 src/gallium/drivers/radeonsi/si_cp_dma.c        | 39 +++++++++++++++++++++++++
 src/gallium/drivers/radeonsi/si_descriptors.c   |  2 ++
 src/gallium/drivers/radeonsi/si_hw_context.c    |  3 ++
 src/gallium/drivers/radeonsi/si_pipe.h          |  1 +
 src/gallium/drivers/radeonsi/si_state.h         |  1 +
 src/gallium/drivers/radeonsi/si_state_draw.c    | 37 +----------------------
 src/gallium/drivers/radeonsi/si_state_shaders.c |  3 ++
 7 files changed, 50 insertions(+), 36 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index 582e599..b398256 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -399,14 +399,53 @@ void si_copy_buffer(struct si_context *sctx,
 }
 
 void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf,
 			      uint64_t offset, unsigned size)
 {
 	assert(sctx->b.chip_class >= CIK);
 
 	si_copy_buffer(sctx, buf, buf, offset, offset, size, SI_CPDMA_SKIP_ALL);
 }
 
+static void cik_prefetch_shader_async(struct si_context *sctx,
+				      struct si_pm4_state *state)
+{
+	if (state) {
+		struct pipe_resource *bo = &state->bo[0]->b.b;
+		assert(state->nbo == 1);
+
+		cik_prefetch_TC_L2_async(sctx, bo, 0, bo->width0);
+	}
+}
+
+static void cik_emit_prefetch_L2(struct si_context *sctx, struct r600_atom *atom)
+{
+	/* Prefetch shaders and VBO descriptors to TC L2. */
+	if (si_pm4_state_changed(sctx, ls))
+		cik_prefetch_shader_async(sctx, sctx->queued.named.ls);
+	if (si_pm4_state_changed(sctx, hs))
+		cik_prefetch_shader_async(sctx, sctx->queued.named.hs);
+	if (si_pm4_state_changed(sctx, es))
+		cik_prefetch_shader_async(sctx, sctx->queued.named.es);
+	if (si_pm4_state_changed(sctx, gs))
+		cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
+	if (si_pm4_state_changed(sctx, vs))
+		cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
+
+	/* Vertex buffer descriptors are uploaded uncached, so prefetch
+	 * them right after the VS binary. */
+	if (sctx->vertex_buffer_pointer_dirty) {
+		cik_prefetch_TC_L2_async(sctx, &sctx->vertex_buffers.buffer->b.b,
+					 sctx->vertex_buffers.buffer_offset,
+					 sctx->vertex_elements->count * 16);
+	}
+	if (si_pm4_state_changed(sctx, ps))
+		cik_prefetch_shader_async(sctx, sctx->queued.named.ps);
+}
+
 void si_init_cp_dma_functions(struct si_context *sctx)
 {
 	sctx->b.clear_buffer = si_clear_buffer;
+
+	si_init_atom(sctx, &sctx->prefetch_L2, &sctx->atoms.s.prefetch_L2,
+		     cik_emit_prefetch_L2);
 }
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 4a9fcd0..4c1120a 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -1031,20 +1031,22 @@ bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
 					      (struct r600_resource*)vb->buffer,
 					      RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
 		}
 	}
 
 	/* Don't flush the const cache. It would have a very negative effect
 	 * on performance (confirmed by testing). New descriptors are always
 	 * uploaded to a fresh new buffer, so I don't think flushing the const
 	 * cache is needed. */
 	si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
+	if (sctx->b.chip_class >= CIK)
+		si_mark_atom_dirty(sctx, &sctx->prefetch_L2);
 	sctx->vertex_buffers_dirty = false;
 	sctx->vertex_buffer_pointer_dirty = true;
 	return true;
 }
 
 
 /* CONSTANT BUFFERS */
 
 static unsigned
 si_const_buffer_descriptors_idx(unsigned shader)
diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
index 57eaac9..d862e26 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -200,20 +200,23 @@ void si_begin_new_cs(struct si_context *ctx)
 		si_pm4_emit(ctx, ctx->init_config_gs_rings);
 
 	if (ctx->ce_preamble_ib)
 		si_ce_enable_loads(ctx->ce_preamble_ib);
 	else if (ctx->ce_ib)
 		si_ce_enable_loads(ctx->ce_ib);
 
 	if (ctx->ce_preamble_ib)
 		si_ce_reinitialize_all_descriptors(ctx);
 
+	if (ctx->b.chip_class >= CIK)
+		si_mark_atom_dirty(ctx, &ctx->prefetch_L2);
+
 	ctx->framebuffer.dirty_cbufs = (1 << 8) - 1;
 	ctx->framebuffer.dirty_zsbuf = true;
 	si_mark_atom_dirty(ctx, &ctx->framebuffer.atom);
 
 	si_mark_atom_dirty(ctx, &ctx->clip_regs);
 	si_mark_atom_dirty(ctx, &ctx->clip_state.atom);
 	ctx->msaa_sample_locs.nr_samples = 0;
 	si_mark_atom_dirty(ctx, &ctx->msaa_sample_locs.atom);
 	si_mark_atom_dirty(ctx, &ctx->msaa_config);
 	si_mark_atom_dirty(ctx, &ctx->sample_mask.atom);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 6558474..b6474e6 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -225,20 +225,21 @@ struct si_context {
 	bool				compute_is_busy;
 
 	/* Atoms (direct states). */
 	union si_state_atoms		atoms;
 	unsigned			dirty_atoms; /* mask */
 	/* PM4 states (precomputed immutable states) */
 	union si_state			queued;
 	union si_state			emitted;
 
 	/* Atom declarations. */
+	struct r600_atom		prefetch_L2;
 	struct si_framebuffer		framebuffer;
 	struct si_sample_locs		msaa_sample_locs;
 	struct r600_atom		db_render_state;
 	struct r600_atom		msaa_config;
 	struct si_sample_mask		sample_mask;
 	struct r600_atom		cb_render_state;
 	struct si_blend_color		blend_color;
 	struct r600_atom		clip_regs;
 	struct si_clip_state		clip_state;
 	struct si_shader_data		shader_userdata;
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 03e5011..915a8eb 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -125,20 +125,21 @@ union si_state {
 		struct si_pm4_state		*vgt_shader_config;
 		struct si_pm4_state		*vs;
 		struct si_pm4_state		*ps;
 	} named;
 	struct si_pm4_state	*array[0];
 };
 
 union si_state_atoms {
 	struct {
 		/* The order matters. */
+		struct r600_atom *prefetch_L2;
 		struct r600_atom *render_cond;
 		struct r600_atom *streamout_begin;
 		struct r600_atom *streamout_enable; /* must be after streamout_begin */
 		struct r600_atom *framebuffer;
 		struct r600_atom *msaa_sample_locs;
 		struct r600_atom *db_render_state;
 		struct r600_atom *msaa_config;
 		struct r600_atom *sample_mask;
 		struct r600_atom *cb_render_state;
 		struct r600_atom *blend_color;
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index db671c9..0374841 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -933,31 +933,20 @@ void si_ce_pre_draw_synchronization(struct si_context *sctx)
 void si_ce_post_draw_synchronization(struct si_context *sctx)
 {
 	if (sctx->ce_need_synchronization) {
 		radeon_emit(sctx->b.gfx.cs, PKT3(PKT3_INCREMENT_DE_COUNTER, 0, 0));
 		radeon_emit(sctx->b.gfx.cs, 0);
 
 		sctx->ce_need_synchronization = false;
 	}
 }
 
-static void cik_prefetch_shader_async(struct si_context *sctx,
-				      struct si_pm4_state *state)
-{
-	if (state) {
-		struct pipe_resource *bo = &state->bo[0]->b.b;
-		assert(state->nbo == 1);
-
-		cik_prefetch_TC_L2_async(sctx, bo, 0, bo->width0);
-	}
-}
-
 void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
 	struct pipe_index_buffer ib = {};
 	unsigned mask, dirty_fb_counter, dirty_tex_counter, rast_prim;
 
 	if (likely(!info->indirect)) {
 		/* SI-CI treat instance_count==0 as instance_count==1. There is
 		 * no workaround for indirect draws, but we can at least skip
@@ -1122,48 +1111,24 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 
 	si_need_cs_space(sctx);
 
 	/* Since we've called r600_context_add_resource_size for vertex buffers,
 	 * this must be called after si_need_cs_space, because we must let
 	 * need_cs_space flush before we add buffers to the buffer list.
 	 */
 	if (!si_upload_vertex_buffer_descriptors(sctx))
 		return;
 
-	/* Flushed caches prior to prefetching shaders. */
+	/* Flush caches before the first state atom, which does L2 prefetches. */
 	if (sctx->b.flags)
 		si_emit_cache_flush(sctx);
 
-	/* Prefetch shaders and VBO descriptors to TC L2. */
-	if (sctx->b.chip_class >= CIK) {
-		if (si_pm4_state_changed(sctx, ls))
-			cik_prefetch_shader_async(sctx, sctx->queued.named.ls);
-		if (si_pm4_state_changed(sctx, hs))
-			cik_prefetch_shader_async(sctx, sctx->queued.named.hs);
-		if (si_pm4_state_changed(sctx, es))
-			cik_prefetch_shader_async(sctx, sctx->queued.named.es);
-		if (si_pm4_state_changed(sctx, gs))
-			cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
-		if (si_pm4_state_changed(sctx, vs))
-			cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
-
-		/* Vertex buffer descriptors are uploaded uncached, so prefetch
-		 * them right after the VS binary. */
-		if (sctx->vertex_buffer_pointer_dirty) {
-			cik_prefetch_TC_L2_async(sctx, &sctx->vertex_buffers.buffer->b.b,
-						sctx->vertex_buffers.buffer_offset,
-						sctx->vertex_elements->count * 16);
-		}
-		if (si_pm4_state_changed(sctx, ps))
-			cik_prefetch_shader_async(sctx, sctx->queued.named.ps);
-	}
-
 	/* Emit states. */
 	mask = sctx->dirty_atoms;
 	while (mask) {
 		struct r600_atom *atom = sctx->atoms.array[u_bit_scan(&mask)];
 
 		atom->emit(&sctx->b, atom);
 	}
 	sctx->dirty_atoms = 0;
 
 	si_pm4_emit_dirty(sctx);
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index b3616dc..02f8d6c 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -2518,20 +2518,23 @@ bool si_update_shaders(struct si_context *sctx)
 	if (si_pm4_state_changed(sctx, ls) ||
 	    si_pm4_state_changed(sctx, hs) ||
 	    si_pm4_state_changed(sctx, es) ||
 	    si_pm4_state_changed(sctx, gs) ||
 	    si_pm4_state_changed(sctx, vs) ||
 	    si_pm4_state_changed(sctx, ps)) {
 		if (!si_update_spi_tmpring_size(sctx))
 			return false;
 	}
 
+	if (sctx->b.chip_class >= CIK)
+		si_mark_atom_dirty(sctx, &sctx->prefetch_L2);
+
 	sctx->do_update_shaders = false;
 	return true;
 }
 
 void si_init_shader_functions(struct si_context *sctx)
 {
 	si_init_atom(sctx, &sctx->spi_map, &sctx->atoms.s.spi_map, si_emit_spi_map);
 
 	sctx->b.b.create_vs_state = si_create_shader_selector;
 	sctx->b.b.create_tcs_state = si_create_shader_selector;
-- 
2.7.4