[Mesa-dev] [PATCH 06/11] radeonsi: add a separate dirty mask for prefetches

Sun Aug 6 22:20:21 UTC 2017

From: Marek Olšák <marek.olsak at amd.com>

Track pending L2 prefetches in a dedicated dirty mask (prefetch_L2_mask)
so that we don't rely on si_pm4_state_enabled_and_changed, allowing us
to move prefetches after draw calls.
---
 src/gallium/drivers/radeonsi/si_cp_dma.c        | 16 ++++++++--------
 src/gallium/drivers/radeonsi/si_descriptors.c   |  3 +--
 src/gallium/drivers/radeonsi/si_hw_context.c    | 16 ++++++++++++++--
 src/gallium/drivers/radeonsi/si_pipe.h          | 10 +++++++++-
 src/gallium/drivers/radeonsi/si_state_draw.c    |  2 +-
 src/gallium/drivers/radeonsi/si_state_shaders.c | 16 ++++++++++++++--
 6 files changed, 47 insertions(+), 16 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index 24fa6fd..21202b3 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -444,38 +444,38 @@ static void cik_prefetch_shader_async(struct si_context *sctx,
 {
 	struct pipe_resource *bo = &state->bo[0]->b.b;
 	assert(state->nbo == 1);
 
 	cik_prefetch_TC_L2_async(sctx, bo, 0, bo->width0);
 }
 
 void cik_emit_prefetch_L2(struct si_context *sctx)
 {
 	/* Prefetch shaders and VBO descriptors to TC L2. */
-	if (si_pm4_state_enabled_and_changed(sctx, ls))
+	if (sctx->prefetch_L2_mask & SI_PREFETCH_LS)
 		cik_prefetch_shader_async(sctx, sctx->queued.named.ls);
-	if (si_pm4_state_enabled_and_changed(sctx, hs))
+	if (sctx->prefetch_L2_mask & SI_PREFETCH_HS)
 		cik_prefetch_shader_async(sctx, sctx->queued.named.hs);
-	if (si_pm4_state_enabled_and_changed(sctx, es))
+	if (sctx->prefetch_L2_mask & SI_PREFETCH_ES)
 		cik_prefetch_shader_async(sctx, sctx->queued.named.es);
-	if (si_pm4_state_enabled_and_changed(sctx, gs))
+	if (sctx->prefetch_L2_mask & SI_PREFETCH_GS)
 		cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
-	if (si_pm4_state_enabled_and_changed(sctx, vs))
+	if (sctx->prefetch_L2_mask & SI_PREFETCH_VS)
 		cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
 
 	/* Vertex buffer descriptors are uploaded uncached, so prefetch
 	 * them right after the VS binary. */
-	if (sctx->vertex_buffer_pointer_dirty) {
+	if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) {
 		cik_prefetch_TC_L2_async(sctx, &sctx->vertex_buffers.buffer->b.b,
 					 sctx->vertex_buffers.buffer_offset,
 					 sctx->vertex_elements->desc_list_byte_size);
 	}
-	if (si_pm4_state_enabled_and_changed(sctx, ps))
+	if (sctx->prefetch_L2_mask & SI_PREFETCH_PS)
 		cik_prefetch_shader_async(sctx, sctx->queued.named.ps);
 
-	sctx->prefetch_L2 = false;
+	sctx->prefetch_L2_mask = 0;
 }
 
 void si_init_cp_dma_functions(struct si_context *sctx)
 {
 	sctx->b.clear_buffer = si_clear_buffer;
 }
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 917b0e1..43f1792 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -1169,24 +1169,23 @@ bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
 					      (struct r600_resource*)vb->buffer.resource,
 					      RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
 		}
 	}
 
 	/* Don't flush the const cache. It would have a very negative effect
 	 * on performance (confirmed by testing). New descriptors are always
 	 * uploaded to a fresh new buffer, so I don't think flushing the const
 	 * cache is needed. */
 	si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
-	if (sctx->b.chip_class >= CIK)
-		sctx->prefetch_L2 = true;
 	sctx->vertex_buffers_dirty = false;
 	sctx->vertex_buffer_pointer_dirty = true;
+	sctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS;
 	return true;
 }
 
 
 /* CONSTANT BUFFERS */
 
 static unsigned
 si_const_and_shader_buffer_descriptors_idx(unsigned shader)
 {
 	return SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS +
diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
index 756b159..3582cd7 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -209,22 +209,34 @@ void si_begin_new_cs(struct si_context *ctx)
 		si_pm4_emit(ctx, ctx->init_config_gs_rings);
 
 	if (ctx->ce_preamble_ib)
 		si_ce_enable_loads(ctx->ce_preamble_ib);
 	else if (ctx->ce_ib)
 		si_ce_enable_loads(ctx->ce_ib);
 
 	if (ctx->ce_ib)
 		si_ce_restore_all_descriptors_at_ib_start(ctx);
 
-	if (ctx->b.chip_class >= CIK)
-		ctx->prefetch_L2 = true;
+	if (ctx->queued.named.ls)
+		ctx->prefetch_L2_mask |= SI_PREFETCH_LS;
+	if (ctx->queued.named.hs)
+		ctx->prefetch_L2_mask |= SI_PREFETCH_HS;
+	if (ctx->queued.named.es)
+		ctx->prefetch_L2_mask |= SI_PREFETCH_ES;
+	if (ctx->queued.named.gs)
+		ctx->prefetch_L2_mask |= SI_PREFETCH_GS;
+	if (ctx->queued.named.vs)
+		ctx->prefetch_L2_mask |= SI_PREFETCH_VS;
+	if (ctx->queued.named.ps)
+		ctx->prefetch_L2_mask |= SI_PREFETCH_PS;
+	if (ctx->vertex_buffers.buffer)
+		ctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS;
 
 	/* CLEAR_STATE disables all colorbuffers, so only enable bound ones. */
 	ctx->framebuffer.dirty_cbufs =
 		u_bit_consecutive(0, ctx->framebuffer.state.nr_cbufs);
 	/* CLEAR_STATE disables the zbuffer, so only enable it if it's bound. */
 	ctx->framebuffer.dirty_zsbuf = ctx->framebuffer.state.zsbuf != NULL;
 	/* This should always be marked as dirty to set the framebuffer scissor
 	 * at least. */
 	si_mark_atom_dirty(ctx, &ctx->framebuffer.atom);
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index d213886..62b64e1 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -61,20 +61,28 @@
 /* Framebuffer caches. */
 #define SI_CONTEXT_FLUSH_AND_INV_DB	(R600_CONTEXT_PRIVATE_FLAG << 7)
 #define SI_CONTEXT_FLUSH_AND_INV_CB	(R600_CONTEXT_PRIVATE_FLAG << 8)
 /* Engine synchronization. */
 #define SI_CONTEXT_VS_PARTIAL_FLUSH	(R600_CONTEXT_PRIVATE_FLAG << 9)
 #define SI_CONTEXT_PS_PARTIAL_FLUSH	(R600_CONTEXT_PRIVATE_FLAG << 10)
 #define SI_CONTEXT_CS_PARTIAL_FLUSH	(R600_CONTEXT_PRIVATE_FLAG << 11)
 #define SI_CONTEXT_VGT_FLUSH		(R600_CONTEXT_PRIVATE_FLAG << 12)
 #define SI_CONTEXT_VGT_STREAMOUT_SYNC	(R600_CONTEXT_PRIVATE_FLAG << 13)
 
+#define SI_PREFETCH_VBO_DESCRIPTORS	(1 << 0)
+#define SI_PREFETCH_LS			(1 << 1)
+#define SI_PREFETCH_HS			(1 << 2)
+#define SI_PREFETCH_ES			(1 << 3)
+#define SI_PREFETCH_GS			(1 << 4)
+#define SI_PREFETCH_VS			(1 << 5)
+#define SI_PREFETCH_PS			(1 << 6)
+
 #define SI_MAX_BORDER_COLORS	4096
 #define SIX_BITS		0x3F
 
 struct si_compute;
 struct hash_table;
 struct u_suballocator;
 
 struct si_screen {
 	struct r600_common_screen	b;
 	unsigned			gs_table_depth;
@@ -272,25 +280,25 @@ struct si_context {
 	struct si_shader_ctx_state	fixed_func_tcs_shader;
 	struct r600_resource		*wait_mem_scratch;
 	unsigned			wait_mem_number;
 
 	struct radeon_winsys_cs		*ce_ib;
 	struct radeon_winsys_cs		*ce_preamble_ib;
 	struct r600_resource		*ce_ram_saved_buffer;
 	struct u_suballocator		*ce_suballocator;
 	unsigned			ce_ram_saved_offset;
 	uint16_t			total_ce_ram_allocated;
+	uint16_t			prefetch_L2_mask;
 	bool				ce_need_synchronization:1;
 
 	bool				gfx_flush_in_progress:1;
 	bool				compute_is_busy:1;
-	bool				prefetch_L2:1;
 
 	/* Atoms (direct states). */
 	union si_state_atoms		atoms;
 	unsigned			dirty_atoms; /* mask */
 	/* PM4 states (precomputed immutable states) */
 	unsigned			dirty_states;
 	union si_state			queued;
 	union si_state			emitted;
 
 	/* Atom declarations. */
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 3f933fe..c78450c 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -1339,21 +1339,21 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 	/* GFX9 scissor bug workaround. There is also a more efficient but
 	 * more involved alternative workaround. */
 	if (sctx->b.chip_class == GFX9 &&
 	    si_is_atom_dirty(sctx, &sctx->b.scissors.atom))
 		sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
 
 	/* Flush caches before the first state atom, which does L2 prefetches. */
 	if (sctx->b.flags)
 		si_emit_cache_flush(sctx);
 
-	if (sctx->prefetch_L2)
+	if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask)
 		cik_emit_prefetch_L2(sctx);
 
 	/* Emit state atoms. */
 	mask = sctx->dirty_atoms;
 	while (mask) {
 		struct r600_atom *atom = sctx->atoms.array[u_bit_scan(&mask)];
 
 		atom->emit(&sctx->b, atom);
 	}
 	sctx->dirty_atoms = 0;
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index cb5a23e..5b8f907 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -3300,22 +3300,34 @@ bool si_update_shaders(struct si_context *sctx)
 	if (si_pm4_state_enabled_and_changed(sctx, ls) ||
 	    si_pm4_state_enabled_and_changed(sctx, hs) ||
 	    si_pm4_state_enabled_and_changed(sctx, es) ||
 	    si_pm4_state_enabled_and_changed(sctx, gs) ||
 	    si_pm4_state_enabled_and_changed(sctx, vs) ||
 	    si_pm4_state_enabled_and_changed(sctx, ps)) {
 		if (!si_update_spi_tmpring_size(sctx))
 			return false;
 	}
 
-	if (sctx->b.chip_class >= CIK)
-		sctx->prefetch_L2 = true;
+	if (sctx->b.chip_class >= CIK) {
+		if (si_pm4_state_enabled_and_changed(sctx, ls))
+			sctx->prefetch_L2_mask |= SI_PREFETCH_LS;
+		if (si_pm4_state_enabled_and_changed(sctx, hs))
+			sctx->prefetch_L2_mask |= SI_PREFETCH_HS;
+		if (si_pm4_state_enabled_and_changed(sctx, es))
+			sctx->prefetch_L2_mask |= SI_PREFETCH_ES;
+		if (si_pm4_state_enabled_and_changed(sctx, gs))
+			sctx->prefetch_L2_mask |= SI_PREFETCH_GS;
+		if (si_pm4_state_enabled_and_changed(sctx, vs))
+			sctx->prefetch_L2_mask |= SI_PREFETCH_VS;
+		if (si_pm4_state_enabled_and_changed(sctx, ps))
+			sctx->prefetch_L2_mask |= SI_PREFETCH_PS;
+	}
 
 	sctx->do_update_shaders = false;
 	return true;
 }
 
 static void si_emit_scratch_state(struct si_context *sctx,
 				  struct r600_atom *atom)
 {
 	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 
-- 
2.7.4