[Mesa-dev] [PATCH 3/4] radeonsi/gfx9: ACQUIRE_MEM rolls the context, rework scissor bug workaround

Marek Olšák maraeo at gmail.com
Wed Apr 17 23:39:12 UTC 2019


From: Marek Olšák <marek.olsak at amd.com>

The rework is needed so that the workaround also covers ACQUIRE_MEM; the
workaround logic moves out of si_emit_all_states and into si_draw_vbo.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=110355

Cc: 19.0 <mesa-stable at lists.freedesktop.org>
---
 src/gallium/drivers/radeonsi/si_state_draw.c | 60 +++++++++++++++++-----------
 1 file changed, 36 insertions(+), 24 deletions(-)
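
For context while reading the diff: below is a minimal, self-contained sketch
(not driver code; the struct and emit helpers are simplified stand-ins for
si_context and the real functions) of the control flow this patch moves into
si_draw_vbo. The counter is reset before the non-draw packets, anything that
rolls the context (now including ACQUIRE_MEM in si_emit_cache_flush) bumps it,
and the scissor registers are re-emitted right before the draw if it ended up
nonzero.

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-in for si_context; only the fields the sketch needs. */
struct ctx {
	unsigned context_roll_counter;
	bool has_gfx9_scissor_bug;
	bool scissors_atom_dirty;
};

/* Any packet that may roll the context (context register writes, and on
 * gfx9 an ACQUIRE_MEM while the context is busy) bumps the counter. */
static void note_context_roll(struct ctx *c)
{
	c->context_roll_counter++;
}

/* Placeholders for the real emit helpers. */
static void emit_all_states(struct ctx *c)   { note_context_roll(c); }
static void emit_cache_flush(struct ctx *c)  { note_context_roll(c); /* ACQUIRE_MEM */ }
static void emit_scissors(struct ctx *c)     { (void)c; puts("re-emit PA_SC_VPORT_SCISSOR_*"); }
static void emit_draw_packets(struct ctx *c) { (void)c; puts("draw"); }

static void draw(struct ctx *c)
{
	/* Vega10/Raven workaround: if the context rolls, the scissor
	 * registers must be rewritten too (unless they are dirty anyway). */
	bool handle_scissor_bug = c->has_gfx9_scissor_bug &&
				  !c->scissors_atom_dirty;

	/* If this is > 0 after all the non-draw packets, a context roll occurred. */
	c->context_roll_counter = 0;

	emit_all_states(c);	/* state atoms / PM4 states may roll the context */
	emit_cache_flush(c);	/* ACQUIRE_MEM may roll it too (the new case) */

	if (handle_scissor_bug && c->context_roll_counter)
		emit_scissors(c);

	emit_draw_packets(c);
}

int main(void)
{
	struct ctx c = { 0, true, false };
	draw(&c);
	return 0;
}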

diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index e2fba41..b673c2f 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -889,20 +889,25 @@ static void si_emit_surface_sync(struct si_context *sctx,
 		radeon_emit(cs, 0);		/* CP_COHER_BASE_HI */
 		radeon_emit(cs, 0x0000000A);	/* POLL_INTERVAL */
 	} else {
 		/* ACQUIRE_MEM is only required on a compute ring. */
 		radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0));
 		radeon_emit(cs, cp_coher_cntl);   /* CP_COHER_CNTL */
 		radeon_emit(cs, 0xffffffff);      /* CP_COHER_SIZE */
 		radeon_emit(cs, 0);               /* CP_COHER_BASE */
 		radeon_emit(cs, 0x0000000A);      /* POLL_INTERVAL */
 	}
+
+	/* ACQUIRE_MEM has an implicit context roll if the current context
+	 * is busy. */
+	if (sctx->has_graphics)
+		sctx->context_roll_counter++;
 }
 
 void si_emit_cache_flush(struct si_context *sctx)
 {
 	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	uint32_t flags = sctx->flags;
 
 	if (!sctx->has_graphics) {
 		/* Only process compute flags. */
 		flags &= SI_CONTEXT_INV_ICACHE |
@@ -1216,40 +1221,25 @@ static void si_get_draw_start_count(struct si_context *sctx,
 	} else {
 		*start = info->start;
 		*count = info->count;
 	}
 }
 
 static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info,
 			       unsigned skip_atom_mask)
 {
 	unsigned num_patches = 0;
-	/* Vega10/Raven scissor bug workaround. When any context register is
-	 * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR
-	 * registers must be written too.
-	 */
-	bool handle_scissor_bug = sctx->screen->has_gfx9_scissor_bug &&
-				  !si_is_atom_dirty(sctx, &sctx->atoms.s.scissors);
-	bool context_roll = false; /* set correctly for GFX9 only */
 
-	context_roll |= si_emit_rasterizer_prim_state(sctx);
+	sctx->context_roll_counter |= si_emit_rasterizer_prim_state(sctx);
 	if (sctx->tes_shader.cso)
-		context_roll |= si_emit_derived_tess_state(sctx, info, &num_patches);
-
-	if (handle_scissor_bug &&
-	    (info->count_from_stream_output ||
-	     sctx->dirty_atoms & si_atoms_that_always_roll_context() ||
-	     sctx->dirty_states & si_states_that_always_roll_context() ||
-	     si_prim_restart_index_changed(sctx, info)))
-		context_roll = true;
-
-	sctx->context_roll_counter = 0;
+		sctx->context_roll_counter |=
+			si_emit_derived_tess_state(sctx, info, &num_patches);
 
 	/* Emit state atoms. */
 	unsigned mask = sctx->dirty_atoms & ~skip_atom_mask;
 	while (mask)
 		sctx->atoms.array[u_bit_scan(&mask)].emit(sctx);
 
 	sctx->dirty_atoms &= skip_atom_mask;
 
 	/* Emit states. */
 	mask = sctx->dirty_states;
@@ -1258,26 +1248,20 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i
 		struct si_pm4_state *state = sctx->queued.array[i];
 
 		if (!state || sctx->emitted.array[i] == state)
 			continue;
 
 		si_pm4_emit(sctx, state);
 		sctx->emitted.array[i] = state;
 	}
 	sctx->dirty_states = 0;
 
-	if (handle_scissor_bug &&
-	    (context_roll || sctx->context_roll_counter)) {
-		sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
-		sctx->atoms.s.scissors.emit(sctx);
-	}
-
 	/* Emit draw states. */
 	si_emit_vs_state(sctx, info);
 	si_emit_draw_registers(sctx, info, num_patches);
 }
 
 static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
 	struct pipe_resource *indexbuf = info->index.resource;
@@ -1462,20 +1446,37 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
 
 	si_need_gfx_cs_space(sctx);
 
 	/* Since we've called si_context_add_resource_size for vertex buffers,
 	 * this must be called after si_need_cs_space, because we must let
 	 * need_cs_space flush before we add buffers to the buffer list.
 	 */
 	if (!si_upload_vertex_buffer_descriptors(sctx))
 		goto return_cleanup;
 
+	/* Vega10/Raven scissor bug workaround. When any context register is
+	 * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR
+	 * registers must be written too.
+	 */
+	bool handle_scissor_bug = sctx->screen->has_gfx9_scissor_bug &&
+				  !si_is_atom_dirty(sctx, &sctx->atoms.s.scissors);
+
+	/* If this is > 0 after all the non-draw packets, a context roll occurred. */
+	sctx->context_roll_counter = 0;
+
+	if (handle_scissor_bug &&
+	    (info->count_from_stream_output ||
+	     sctx->dirty_atoms & si_atoms_that_always_roll_context() ||
+	     sctx->dirty_states & si_states_that_always_roll_context() ||
+	     si_prim_restart_index_changed(sctx, info)))
+		sctx->context_roll_counter++;
+
 	/* Use optimal packet order based on whether we need to sync the pipeline. */
 	if (unlikely(sctx->flags & (SI_CONTEXT_FLUSH_AND_INV_CB |
 				      SI_CONTEXT_FLUSH_AND_INV_DB |
 				      SI_CONTEXT_PS_PARTIAL_FLUSH |
 				      SI_CONTEXT_CS_PARTIAL_FLUSH))) {
 		/* If we have to wait for idle, set all states first, so that all
 		 * SET packets are processed in parallel with previous draw calls.
 		 * Then draw and prefetch at the end. This ensures that the time
 		 * the CUs are idle is very short.
 		 */
@@ -1489,20 +1490,25 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
 
 		/* Emit all states except possibly render condition. */
 		si_emit_all_states(sctx, info, masked_atoms);
 		si_emit_cache_flush(sctx);
 		/* <-- CUs are idle here. */
 
 		if (si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond))
 			sctx->atoms.s.render_cond.emit(sctx);
 		sctx->dirty_atoms = 0;
 
+		if (handle_scissor_bug && sctx->context_roll_counter) {
+			sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
+			sctx->atoms.s.scissors.emit(sctx);
+		}
+
 		si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
 		/* <-- CUs are busy here. */
 
 		/* Start prefetches after the draw has been started. Both will run
 		 * in parallel, but starting the draw first is more important.
 		 */
 		if (sctx->chip_class >= CIK && sctx->prefetch_L2_mask)
 			cik_emit_prefetch_L2(sctx, false);
 	} else {
 		/* If we don't wait for idle, start prefetches first, then set
@@ -1512,20 +1518,26 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
 			si_emit_cache_flush(sctx);
 
 		/* Only prefetch the API VS and VBO descriptors. */
 		if (sctx->chip_class >= CIK && sctx->prefetch_L2_mask)
 			cik_emit_prefetch_L2(sctx, true);
 
 		if (!si_upload_graphics_shader_descriptors(sctx))
 			return;
 
 		si_emit_all_states(sctx, info, 0);
+
+		if (handle_scissor_bug && sctx->context_roll_counter) {
+			sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
+			sctx->atoms.s.scissors.emit(sctx);
+		}
+
 		si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
 
 		/* Prefetch the remaining shaders after the draw has been
 		 * started. */
 		if (sctx->chip_class >= CIK && sctx->prefetch_L2_mask)
 			cik_emit_prefetch_L2(sctx, false);
 	}
 
 	if (unlikely(sctx->current_saved_cs)) {
 		si_trace_emit(sctx);
-- 
2.7.4


