[Mesa-dev] [PATCH 6/7] radeonsi: use optimal packet order when doing a pipeline sync

Fri Aug 4 10:05:54 UTC 2017

From: Marek Olšák <marek.olsak at amd.com>

Process new SET packets in parallel with previous draw calls.

This slightly decreases the [CP busy / SPI busy] ratio (verified with
GRBM performance counters) and probably yields a very small FPS increase
for apps that do pipeline syncs often.
---
 src/gallium/drivers/radeonsi/si_state_draw.c | 54 ++++++++++++++++++++++++----
 1 file changed, 48 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index ae48115..06a18c1 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -1173,30 +1173,31 @@ static bool si_cache_flush_and_prefetch(struct si_context *sctx)
 	 */
 	if (!si_upload_graphics_shader_descriptors(sctx))
 		return false;
 
 	if (sctx->prefetch_L2)
 		cik_emit_prefetch_L2(sctx);
 
 	return true;
 }
 
-static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info)
+static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info,
+			       unsigned skip_atom_mask)
 {
 	/* Emit state atoms. */
-	unsigned mask = sctx->dirty_atoms;
+	unsigned mask = sctx->dirty_atoms & ~skip_atom_mask;
 	while (mask) {
 		struct r600_atom *atom = sctx->atoms.array[u_bit_scan(&mask)];
 
 		atom->emit(&sctx->b, atom);
 	}
-	sctx->dirty_atoms = 0;
+	sctx->dirty_atoms &= skip_atom_mask;
 
 	/* Emit states. */
 	mask = sctx->dirty_states;
 	while (mask) {
 		unsigned i = u_bit_scan(&mask);
 		struct si_pm4_state *state = sctx->queued.array[i];
 
 		if (!state || sctx->emitted.array[i] == state)
 			continue;
 
@@ -1384,23 +1385,64 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 	 */
 	if (!si_upload_vertex_buffer_descriptors(sctx))
 		return;
 
 	/* GFX9 scissor bug workaround. There is also a more efficient but
 	 * more involved alternative workaround. */
 	if (sctx->b.chip_class == GFX9 &&
 	    si_is_atom_dirty(sctx, &sctx->b.scissors.atom))
 		sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
 
-	if (!si_cache_flush_and_prefetch(sctx))
-		return;
-	si_emit_all_states(sctx, info);
+	/* Use an optimal packet order based on whether we need to sync the pipeline. */
+	if (unlikely(sctx->b.flags & (SI_CONTEXT_FLUSH_AND_INV_CB |
+				      SI_CONTEXT_FLUSH_AND_INV_DB |
+				      SI_CONTEXT_PS_PARTIAL_FLUSH |
+				      SI_CONTEXT_CS_PARTIAL_FLUSH))) {
+		/* If we have to wait for idle, set all states first, so that all
+		 * SET packets are processed in parallel with previous draw calls.
+		 * Sequence:
+		 * - process SET packets except SET_SH packets for shader pointers
+		 * - flush caches and wait for previous draw calls
+		 * - start CE dumps (might already be ongoing if there is no CE-DE barrier)
+		 * - start prefetches
+		 * - process SET_SH packets for shader pointers
+		 * - wait for CE dumps
+		 * - draw
+		 */
+		struct r600_atom *shader_pointers = &sctx->shader_userdata.atom;
+
+		/* Emit all states except shader pointers. */
+		si_emit_all_states(sctx, info, 1 << shader_pointers->id);
+
+		if (!si_cache_flush_and_prefetch(sctx))
+			return;
+
+		/* Set shader pointers last. */
+		if (si_is_atom_dirty(sctx, shader_pointers)) {
+			shader_pointers->emit(&sctx->b, NULL);
+			sctx->dirty_atoms = 0;
+		}
+	} else {
+		/* If we don't wait for idle, do CE dumps and start prefetches
+		 * first, so that they are being done in parallel with all SET
+		 * packets. Sequence:
+		 * - flush caches
+		 * - start CE dumps (might already be ongoing if CE is ahead)
+		 * - start prefetches
+		 * - process SET packets
+		 * - wait for CE dumps
+		 * - draw
+		 */
+		if (!si_cache_flush_and_prefetch(sctx))
+			return;
+		si_emit_all_states(sctx, info, 0);
+	}
 
 	si_ce_pre_draw_synchronization(sctx);
 	si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
 	si_ce_post_draw_synchronization(sctx);
 
 	if (sctx->trace_buf)
 		si_trace_emit(sctx);
 
 	/* Workaround for a VGT hang when streamout is enabled.
 	 * It must be done after drawing. */
-- 
2.7.4