[Mesa-dev] [PATCH 4/4] radeonsi: track context rolls better for the Vega scissor bug workaround

Marek Olšák maraeo at gmail.com
Sun Oct 7 07:05:50 UTC 2018


From: Marek Olšák <marek.olsak at amd.com>

We should get fewer context rolls with the SET_CONTEXT_REG optimization,
but the benefit would be lost if the scissor state rolled the context
anyway. Don't emit the scissor state when there is no context roll.
---
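Note for reviewers, on the core trick: radeon_opt_set_context_reg only emits
a packet when the register value actually changes, so comparing the
command-stream write count (cs->current.cdw) before and after a group of
such calls tells us whether the context really rolled. Below is a minimal
standalone sketch of that pattern with stand-in types and names (cmdbuf,
opt_set_reg); the real code uses radeon_cmdbuf and
si_context::context_roll_counter.

#include <stdio.h>

struct cmdbuf { unsigned cdw; unsigned dw[64]; };  /* stand-in for radeon_cmdbuf */

struct ctx {
	struct cmdbuf cs;
	unsigned saved[8];        /* last emitted value per tracked register */
	unsigned roll_counter;    /* stand-in for context_roll_counter */
};

/* Like radeon_opt_set_context_reg: skip the write if the value is unchanged. */
static void opt_set_reg(struct ctx *c, unsigned reg, unsigned value)
{
	if (c->saved[reg] == value)
		return;                        /* redundant set: no packet, no roll */
	c->cs.dw[c->cs.cdw++] = reg;
	c->cs.dw[c->cs.cdw++] = value;
	c->saved[reg] = value;
}

static void emit_some_state(struct ctx *c, unsigned a, unsigned b)
{
	unsigned initial_cdw = c->cs.cdw;      /* snapshot before the reg writes */
	opt_set_reg(c, 0, a);
	opt_set_reg(c, 1, b);
	if (initial_cdw != c->cs.cdw)          /* at least one packet went out */
		c->roll_counter++;
}

int main(void)
{
	struct ctx c = {0};
	c.roll_counter = 0;                    /* reset once per draw call */
	emit_some_state(&c, 1, 2);             /* both regs change -> one roll */
	emit_some_state(&c, 1, 2);             /* all redundant -> no roll */
	printf("context rolls: %u\n", c.roll_counter);  /* prints 1 */
	return 0;
}

Every si_emit_* function touched by this patch brackets its register writes
exactly like emit_some_state() above.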
 src/gallium/drivers/radeonsi/si_pipe.h        |  1 +
 src/gallium/drivers/radeonsi/si_state.c       | 31 ++++++++++++++----
 src/gallium/drivers/radeonsi/si_state.h       | 17 ++--------
 .../drivers/radeonsi/si_state_binning.c       |  7 ++++
 src/gallium/drivers/radeonsi/si_state_draw.c  | 32 +++++++++++--------
 .../drivers/radeonsi/si_state_shaders.c       | 23 +++++++++++++
 .../drivers/radeonsi/si_state_viewport.c      |  3 ++
 7 files changed, 80 insertions(+), 34 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 7ae17435ab6..6edc06cece7 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1016,20 +1016,21 @@ struct si_context {
 	unsigned			num_vs_flushes;
 	unsigned			num_ps_flushes;
 	unsigned			num_cs_flushes;
 	unsigned			num_cb_cache_flushes;
 	unsigned			num_db_cache_flushes;
 	unsigned			num_L2_invalidates;
 	unsigned			num_L2_writebacks;
 	unsigned			num_resident_handles;
 	uint64_t			num_alloc_tex_transfer_bytes;
 	unsigned			last_tex_ps_draw_ratio; /* for query */
+	unsigned			context_roll_counter;
 
 	/* Queries. */
 	/* Maintain the list of active queries for pausing between IBs. */
 	int				num_occlusion_queries;
 	int				num_perfect_occlusion_queries;
 	struct list_head		active_queries;
 	unsigned			num_cs_dw_queries_suspend;
 
 	/* Render condition. */
 	struct pipe_query		*render_cond;
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index d3c63406dd4..fa1fea5289c 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -81,34 +81,35 @@ static void si_emit_cb_render_state(struct si_context *sctx)
 	 * but there is not enough color outputs. This is undefined behavior,
 	 * so disable color writes completely.
 	 *
 	 * Reproducible with Unigine Heaven 4.0 and drirc missing.
 	 */
 	if (blend && blend->dual_src_blend &&
 	    sctx->ps_shader.cso &&
 	    (sctx->ps_shader.cso->info.colors_written & 0x3) != 0x3)
 		cb_target_mask = 0;
 
-	radeon_opt_set_context_reg(sctx, R_028238_CB_TARGET_MASK,
-				   SI_TRACKED_CB_TARGET_MASK, cb_target_mask);
-
 	/* GFX9: Flush DFSM when CB_TARGET_MASK changes.
 	 * I think we don't have to do anything between IBs.
 	 */
 	if (sctx->screen->dfsm_allowed &&
 	    sctx->last_cb_target_mask != cb_target_mask) {
 		sctx->last_cb_target_mask = cb_target_mask;
 
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
 		radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
 	}
 
+	unsigned initial_cdw = cs->current.cdw;
+	radeon_opt_set_context_reg(sctx, R_028238_CB_TARGET_MASK,
+				   SI_TRACKED_CB_TARGET_MASK, cb_target_mask);
+
 	if (sctx->chip_class >= VI) {
 		/* DCC MSAA workaround for blending.
 		 * Alternatively, we can set CB_COLORi_DCC_CONTROL.OVERWRITE_-
 		 * COMBINER_DISABLE, but that would be more complicated.
 		 */
 		bool oc_disable = (sctx->chip_class == VI ||
 				   sctx->chip_class == GFX9) &&
 				  blend &&
 				  blend->blend_enable_4bit & cb_target_mask &&
 				  sctx->framebuffer.nr_samples >= 2;
@@ -245,20 +246,22 @@ static void si_emit_cb_render_state(struct si_context *sctx)
 				break;
 			}
 		}
 
 		/* SX_PS_DOWNCONVERT, SX_BLEND_OPT_EPSILON, SX_BLEND_OPT_CONTROL */
 		radeon_opt_set_context_reg3(sctx, R_028754_SX_PS_DOWNCONVERT,
 					    SI_TRACKED_SX_PS_DOWNCONVERT,
 					    sx_ps_downconvert, sx_blend_opt_epsilon,
 					    sx_blend_opt_control);
 	}
+	if (initial_cdw != cs->current.cdw)
+		sctx->context_roll_counter++;
 }
 
 /*
  * Blender functions
  */
 
 static uint32_t si_translate_blend_function(int blend_func)
 {
 	switch (blend_func) {
 	case PIPE_BLEND_ADD:
@@ -766,31 +769,35 @@ static void si_emit_clip_regs(struct si_context *sctx)
 
 	/* Clip distances on points have no effect, so need to be implemented
 	 * as cull distances. This applies for the clipvertex case as well.
 	 *
 	 * Setting this for primitives other than points should have no adverse
 	 * effects.
 	 */
 	clipdist_mask &= rs->clip_plane_enable;
 	culldist_mask |= clipdist_mask;
 
+	unsigned initial_cdw = sctx->gfx_cs->current.cdw;
 	radeon_opt_set_context_reg(sctx, R_02881C_PA_CL_VS_OUT_CNTL,
 		SI_TRACKED_PA_CL_VS_OUT_CNTL,
 		vs_sel->pa_cl_vs_out_cntl |
 		S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0F) != 0) |
 		S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xF0) != 0) |
 		clipdist_mask | (culldist_mask << 8));
 	radeon_opt_set_context_reg(sctx, R_028810_PA_CL_CLIP_CNTL,
 		SI_TRACKED_PA_CL_CLIP_CNTL,
 		rs->pa_cl_clip_cntl |
 		ucp_mask |
 		S_028810_CLIP_DISABLE(window_space));
+
+	if (initial_cdw != sctx->gfx_cs->current.cdw)
+		sctx->context_roll_counter++;
 }
 
 /*
  * inferred state between framebuffer and rasterizer
  */
 static void si_update_poly_offset_state(struct si_context *sctx)
 {
 	struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
 
 	if (!rs || !rs->uses_poly_offset || !sctx->framebuffer.state.zsbuf) {
@@ -1345,20 +1352,21 @@ void si_save_qbo_state(struct si_context *sctx, struct si_qbo_state *st)
 	st->saved_compute = sctx->cs_shader_state.program;
 
 	si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &st->saved_const0);
 	si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo);
 }
 
 static void si_emit_db_render_state(struct si_context *sctx)
 {
 	struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
 	unsigned db_shader_control, db_render_control, db_count_control;
+	unsigned initial_cdw = sctx->gfx_cs->current.cdw;
 
 	/* DB_RENDER_CONTROL */
 	if (sctx->dbcb_depth_copy_enabled ||
 	    sctx->dbcb_stencil_copy_enabled) {
 		db_render_control =
 			S_028000_DEPTH_COPY(sctx->dbcb_depth_copy_enabled) |
 			S_028000_STENCIL_COPY(sctx->dbcb_stencil_copy_enabled) |
 			S_028000_COPY_CENTROID(1) |
 			S_028000_COPY_SAMPLE(sctx->dbcb_copy_sample);
 	} else if (sctx->db_flush_depth_inplace || sctx->db_flush_stencil_inplace) {
@@ -1427,20 +1435,23 @@ static void si_emit_db_render_state(struct si_context *sctx)
 	/* Disable the gl_SampleMask fragment shader output if MSAA is disabled. */
 	if (!rs->multisample_enable)
 		db_shader_control &= C_02880C_MASK_EXPORT_ENABLE;
 
 	if (sctx->screen->has_rbplus &&
 	    !sctx->screen->rbplus_allowed)
 		db_shader_control |= S_02880C_DUAL_QUAD_DISABLE(1);
 
 	radeon_opt_set_context_reg(sctx, R_02880C_DB_SHADER_CONTROL,
 				   SI_TRACKED_DB_SHADER_CONTROL, db_shader_control);
+
+	if (initial_cdw != sctx->gfx_cs->current.cdw)
+		sctx->context_roll_counter++;
 }
 
 /*
  * format translation
  */
 static uint32_t si_translate_colorformat(enum pipe_format format)
 {
 	const struct util_format_description *desc = util_format_description(format);
 	if (!desc)
 		return V_028C70_COLOR_INVALID;
@@ -3482,35 +3493,41 @@ static void si_emit_msaa_config(struct si_context *sctx)
 			db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_z_samples) |
 				   S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) |
 				   S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) |
 				   S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples);
 			sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1);
 		} else if (sctx->smoothing_enabled) {
 			db_eqaa |= S_028804_OVERRASTERIZATION_AMOUNT(log_samples);
 		}
 	}
 
+	unsigned initial_cdw = cs->current.cdw;
+
 	/* R_028BDC_PA_SC_LINE_CNTL, R_028BE0_PA_SC_AA_CONFIG */
 	radeon_opt_set_context_reg2(sctx, R_028BDC_PA_SC_LINE_CNTL,
 				    SI_TRACKED_PA_SC_LINE_CNTL, sc_line_cntl,
 				    sc_aa_config);
 	/* R_028804_DB_EQAA */
 	radeon_opt_set_context_reg(sctx, R_028804_DB_EQAA, SI_TRACKED_DB_EQAA,
 				   db_eqaa);
 	/* R_028A4C_PA_SC_MODE_CNTL_1 */
 	radeon_opt_set_context_reg(sctx, R_028A4C_PA_SC_MODE_CNTL_1,
 				   SI_TRACKED_PA_SC_MODE_CNTL_1, sc_mode_cntl_1);
 
-	/* GFX9: Flush DFSM when the AA mode changes. */
-	if (sctx->screen->dfsm_allowed) {
-		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-		radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
+	if (initial_cdw != cs->current.cdw) {
+		sctx->context_roll_counter++;
+
+		/* GFX9: Flush DFSM when the AA mode changes. */
+		if (sctx->screen->dfsm_allowed) {
+			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+			radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
+		}
 	}
 }
 
 void si_update_ps_iter_samples(struct si_context *sctx)
 {
 	if (sctx->framebuffer.nr_samples > 1)
 		si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
 	if (sctx->screen->dpbb_allowed)
 		si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
 }
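
A side benefit visible in the si_emit_msaa_config hunk above: the GFX9 DFSM
flush used to be issued unconditionally, and is now gated on the same cdw
delta, so a fully redundant MSAA state update no longer costs an event
packet either. A tiny sketch of that gating, with hypothetical names; the
real code emits PKT3_EVENT_WRITE with V_028A90_FLUSH_DFSM and also bumps
context_roll_counter:

#include <stdbool.h>
#include <stdio.h>

struct cmdbuf { unsigned cdw; unsigned dw[16]; };

/* Elide the write when the shadowed value is unchanged. */
static void maybe_set_reg(struct cmdbuf *cs, unsigned *shadow, unsigned value)
{
	if (*shadow != value)
		cs->dw[cs->cdw++] = *shadow = value;
}

static void emit_msaa_config(struct cmdbuf *cs, unsigned *shadow,
			     unsigned aa_config, bool dfsm_allowed)
{
	unsigned initial_cdw = cs->cdw;
	maybe_set_reg(cs, shadow, aa_config);

	if (initial_cdw != cs->cdw) {
		/* the AA registers really changed; on GFX9, flush DFSM */
		if (dfsm_allowed)
			puts("EVENT_WRITE(FLUSH_DFSM)");
	}
}

int main(void)
{
	struct cmdbuf cs = {0};
	unsigned shadow = 0;
	emit_msaa_config(&cs, &shadow, 8, true);  /* changed -> flush */
	emit_msaa_config(&cs, &shadow, 8, true);  /* redundant -> no flush */
	return 0;
}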
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index f52296d1119..83589e6918c 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -164,31 +164,27 @@ union si_state {
 		struct si_pm4_state		*ps;
 	} named;
 	struct si_pm4_state	*array[0];
 };
 
 #define SI_STATE_IDX(name) \
 	(offsetof(union si_state, named.name) / sizeof(struct si_pm4_state *))
 #define SI_STATE_BIT(name) (1 << SI_STATE_IDX(name))
 #define SI_NUM_STATES (sizeof(union si_state) / sizeof(struct si_pm4_state *))
 
-static inline unsigned si_states_that_roll_context(void)
+static inline unsigned si_states_that_always_roll_context(void)
 {
 	return (SI_STATE_BIT(blend) |
 		SI_STATE_BIT(rasterizer) |
 		SI_STATE_BIT(dsa) |
 		SI_STATE_BIT(poly_offset) |
-		SI_STATE_BIT(es) |
-		SI_STATE_BIT(gs) |
-		SI_STATE_BIT(vgt_shader_config) |
-		SI_STATE_BIT(vs) |
-		SI_STATE_BIT(ps));
+		SI_STATE_BIT(vgt_shader_config));
 }
 
 union si_state_atoms {
 	struct {
 		/* The order matters. */
 		struct si_atom render_cond;
 		struct si_atom streamout_begin;
 		struct si_atom streamout_enable; /* must be after streamout_begin */
 		struct si_atom framebuffer;
 		struct si_atom msaa_sample_locs;
@@ -209,39 +205,32 @@ union si_state_atoms {
 		struct si_atom scratch_state;
 		struct si_atom window_rectangles;
 	} s;
 	struct si_atom array[0];
 };
 
 #define SI_ATOM_BIT(name) (1 << (offsetof(union si_state_atoms, s.name) / \
 			         sizeof(struct si_atom)))
 #define SI_NUM_ATOMS (sizeof(union si_state_atoms)/sizeof(struct si_atom*))
 
-static inline unsigned si_atoms_that_roll_context(void)
+static inline unsigned si_atoms_that_always_roll_context(void)
 {
 	return (SI_ATOM_BIT(streamout_begin) |
 		SI_ATOM_BIT(streamout_enable) |
 		SI_ATOM_BIT(framebuffer) |
 		SI_ATOM_BIT(msaa_sample_locs) |
-		SI_ATOM_BIT(db_render_state) |
-		SI_ATOM_BIT(dpbb_state) |
-		SI_ATOM_BIT(msaa_config) |
 		SI_ATOM_BIT(sample_mask) |
-		SI_ATOM_BIT(cb_render_state) |
 		SI_ATOM_BIT(blend_color) |
-		SI_ATOM_BIT(clip_regs) |
 		SI_ATOM_BIT(clip_state) |
-		SI_ATOM_BIT(guardband) |
 		SI_ATOM_BIT(scissors) |
 		SI_ATOM_BIT(viewports) |
 		SI_ATOM_BIT(stencil_ref) |
-		SI_ATOM_BIT(spi_map) |
 		SI_ATOM_BIT(scratch_state));
 }
 
 struct si_shader_data {
 	uint32_t		sh_base[SI_NUM_SHADERS];
 };
 
 /* The list of registers whose emitted values are remembered by si_context. */
 enum si_tracked_reg {
 	SI_TRACKED_DB_RENDER_CONTROL, /* 2 consecutive registers */
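
The masks above shrink because atoms whose emit functions can elide all of
their writes (db_render_state, msaa_config, cb_render_state, clip_regs,
guardband, spi_map, dpbb_state, and the shader states) no longer count as
guaranteed rolls; they report through context_roll_counter instead. For
reference, a minimal sketch of the offsetof-based bit scheme these helpers
use, with made-up atom names rather than the driver's full list:

#include <stddef.h>
#include <stdio.h>

struct atom { void (*emit)(void); };

/* Mirrors union si_state_atoms: named members overlaid with an array. */
union atoms {
	struct { struct atom framebuffer, scissors, viewports; } s;
	struct atom array[3];
};

#define ATOM_BIT(name) \
	(1u << (offsetof(union atoms, s.name) / sizeof(struct atom)))

static unsigned atoms_that_always_roll_context(void)
{
	/* only atoms that unconditionally write context registers */
	return ATOM_BIT(framebuffer) | ATOM_BIT(scissors);
}

int main(void)
{
	printf("mask: 0x%x\n", atoms_that_always_roll_context()); /* 0x3 */
	return 0;
}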
diff --git a/src/gallium/drivers/radeonsi/si_state_binning.c b/src/gallium/drivers/radeonsi/si_state_binning.c
index 4aad94d95f9..70c129242d1 100644
--- a/src/gallium/drivers/radeonsi/si_state_binning.c
+++ b/src/gallium/drivers/radeonsi/si_state_binning.c
@@ -303,28 +303,32 @@ static struct uvec2 si_get_depth_bin_size(struct si_context *sctx)
 				{      193,    0,    0 },
 			},
 		},
 	};
 
 	return si_find_bin_size(sctx->screen, table, sum);
 }
 
 static void si_emit_dpbb_disable(struct si_context *sctx)
 {
+	unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+
 	radeon_opt_set_context_reg(sctx, R_028C44_PA_SC_BINNER_CNTL_0,
 		SI_TRACKED_PA_SC_BINNER_CNTL_0,
 		S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) |
 		S_028C44_DISABLE_START_OF_PRIM(1));
 	radeon_opt_set_context_reg(sctx, R_028060_DB_DFSM_CONTROL,
 				   SI_TRACKED_DB_DFSM_CONTROL,
 				   S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF) |
 				   S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));
+	if (initial_cdw != sctx->gfx_cs->current.cdw)
+		sctx->context_roll_counter++;
 }
 
 void si_emit_dpbb_state(struct si_context *sctx)
 {
 	struct si_screen *sscreen = sctx->screen;
 	struct si_state_blend *blend = sctx->queued.named.blend;
 	struct si_state_dsa *dsa = sctx->queued.named.dsa;
 	unsigned db_shader_control = sctx->ps_db_shader_control;
 
 	assert(sctx->chip_class >= GFX9);
@@ -412,28 +416,31 @@ void si_emit_dpbb_state(struct si_context *sctx)
 		assert(0);
 	}
 
 	/* Emit registers. */
 	struct uvec2 bin_size_extend = {};
 	if (bin_size.x >= 32)
 		bin_size_extend.x = util_logbase2(bin_size.x) - 5;
 	if (bin_size.y >= 32)
 		bin_size_extend.y = util_logbase2(bin_size.y) - 5;
 
+	unsigned initial_cdw = sctx->gfx_cs->current.cdw;
 	radeon_opt_set_context_reg(
 		sctx, R_028C44_PA_SC_BINNER_CNTL_0,
 		SI_TRACKED_PA_SC_BINNER_CNTL_0,
 		S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) |
 		S_028C44_BIN_SIZE_X(bin_size.x == 16) |
 		S_028C44_BIN_SIZE_Y(bin_size.y == 16) |
 		S_028C44_BIN_SIZE_X_EXTEND(bin_size_extend.x) |
 		S_028C44_BIN_SIZE_Y_EXTEND(bin_size_extend.y) |
 		S_028C44_CONTEXT_STATES_PER_BIN(context_states_per_bin) |
 		S_028C44_PERSISTENT_STATES_PER_BIN(persistent_states_per_bin) |
 		S_028C44_DISABLE_START_OF_PRIM(disable_start_of_prim) |
 		S_028C44_FPOVS_PER_BATCH(fpovs_per_batch) |
 		S_028C44_OPTIMAL_BIN_SELECTION(1));
 	radeon_opt_set_context_reg(sctx, R_028060_DB_DFSM_CONTROL,
 				   SI_TRACKED_DB_DFSM_CONTROL,
 				   S_028060_PUNCHOUT_MODE(punchout_mode) |
 				   S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));
+	if (initial_cdw != sctx->gfx_cs->current.cdw)
+		sctx->context_roll_counter++;
 }
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 69f723e4e4a..83eb646b791 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -1182,40 +1182,40 @@ static void si_get_draw_start_count(struct si_context *sctx,
 	} else {
 		*start = info->start;
 		*count = info->count;
 	}
 }
 
 static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info,
 			       unsigned skip_atom_mask)
 {
 	unsigned num_patches = 0;
+	/* Vega10/Raven scissor bug workaround. When any context register is
+	 * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR
+	 * registers must be written too.
+	 */
+	bool handle_scissor_bug = (sctx->family == CHIP_VEGA10 || sctx->family == CHIP_RAVEN) &&
+				  !si_is_atom_dirty(sctx, &sctx->atoms.s.scissors);
 	bool context_roll = false; /* set correctly for GFX9 only */
 
 	context_roll |= si_emit_rasterizer_prim_state(sctx);
 	if (sctx->tes_shader.cso)
 		context_roll |= si_emit_derived_tess_state(sctx, info, &num_patches);
-	if (info->count_from_stream_output)
+
+	if (handle_scissor_bug &&
+	    (info->count_from_stream_output ||
+	     sctx->dirty_atoms & si_atoms_that_always_roll_context() ||
+	     sctx->dirty_states & si_states_that_always_roll_context() ||
+	     si_prim_restart_index_changed(sctx, info)))
 		context_roll = true;
 
-	/* Vega10/Raven scissor bug workaround. When any context register is
-	 * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR
-	 * registers must be written too.
-	 */
-	if ((sctx->family == CHIP_VEGA10 || sctx->family == CHIP_RAVEN) &&
-	    (context_roll ||
-	     sctx->dirty_atoms & si_atoms_that_roll_context() ||
-	     sctx->dirty_states & si_states_that_roll_context() ||
-	     si_prim_restart_index_changed(sctx, info))) {
-		sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
-		si_mark_atom_dirty(sctx, &sctx->atoms.s.scissors);
-	}
+	sctx->context_roll_counter = 0;
 
 	/* Emit state atoms. */
 	unsigned mask = sctx->dirty_atoms & ~skip_atom_mask;
 	while (mask)
 		sctx->atoms.array[u_bit_scan(&mask)].emit(sctx);
 
 	sctx->dirty_atoms &= skip_atom_mask;
 
 	/* Emit states. */
 	mask = sctx->dirty_states;
@@ -1224,20 +1224,26 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i
 		struct si_pm4_state *state = sctx->queued.array[i];
 
 		if (!state || sctx->emitted.array[i] == state)
 			continue;
 
 		si_pm4_emit(sctx, state);
 		sctx->emitted.array[i] = state;
 	}
 	sctx->dirty_states = 0;
 
+	if (handle_scissor_bug &&
+	    (context_roll || sctx->context_roll_counter)) {
+		sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
+		sctx->atoms.s.scissors.emit(sctx);
+	}
+
 	/* Emit draw states. */
 	si_emit_vs_state(sctx, info);
 	si_emit_draw_registers(sctx, info, num_patches);
 }
 
 void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
 	struct pipe_resource *indexbuf = info->index.resource;
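
Putting the si_emit_all_states hunk in one place: the workaround decision is
now made after the atoms have been emitted, once the counter says whether
anything actually rolled, and the scissor atom is emitted directly instead
of being marked dirty up front. A condensed sketch of that flow, using
hypothetical flags in place of the real dirty-mask bookkeeping:

#include <stdbool.h>
#include <stdio.h>

struct draw_ctx {
	bool is_vega10_or_raven;
	bool scissors_already_dirty;  /* scissors atom will be emitted anyway */
	bool dirty_always_rolls;      /* dirty state in the always-roll masks */
	unsigned roll_counter;        /* bumped by the emit functions */
};

static void emit_dirty_atoms(struct draw_ctx *c)
{
	/* each emit function bumps roll_counter when it really writes a
	 * context register; pretend one of them did */
	c->roll_counter++;
}

static void emit_full_scissors(void)
{
	puts("rewrite all PA_SC_VPORT_SCISSOR registers");
}

static void emit_all_states(struct draw_ctx *c)
{
	bool handle_scissor_bug = c->is_vega10_or_raven &&
				  !c->scissors_already_dirty;
	bool context_roll = c->dirty_always_rolls;   /* known up front */

	c->roll_counter = 0;
	emit_dirty_atoms(c);                         /* may bump the counter */

	/* only now do we know whether the context really rolled */
	if (handle_scissor_bug && (context_roll || c->roll_counter))
		emit_full_scissors();
}

int main(void)
{
	struct draw_ctx c = { .is_vega10_or_raven = true };
	emit_all_states(&c);   /* the pretend roll forces the scissor rewrite */
	return 0;
}

Note the counter only needs to be valid within one si_emit_all_states()
call, which is why resetting it at the top of the function is enough.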
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 2bdac33586b..ad7d21e7816 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -554,37 +554,41 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader)
 
 	if (sscreen->info.chip_class <= VI) {
 		si_pm4_set_reg(pm4, R_00B42C_SPI_SHADER_PGM_RSRC2_HS,
 			       shader->config.rsrc2);
 	}
 }
 
 static void si_emit_shader_es(struct si_context *sctx)
 {
 	struct si_shader *shader = sctx->queued.named.es->shader;
+	unsigned initial_cdw = sctx->gfx_cs->current.cdw;
 
 	if (!shader)
 		return;
 
 	radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
 				   SI_TRACKED_VGT_ESGS_RING_ITEMSIZE,
 				   shader->selector->esgs_itemsize / 4);
 
 	if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
 		radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM,
 					   SI_TRACKED_VGT_TF_PARAM,
 					   shader->vgt_tf_param);
 
 	if (shader->vgt_vertex_reuse_block_cntl)
 		radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
 					   SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
 					   shader->vgt_vertex_reuse_block_cntl);
+
+	if (initial_cdw != sctx->gfx_cs->current.cdw)
+		sctx->context_roll_counter++;
 }
 
 static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader)
 {
 	struct si_pm4_state *pm4;
 	unsigned num_user_sgprs;
 	unsigned vgpr_comp_cnt;
 	uint64_t va;
 	unsigned oc_lds_en;
 
@@ -755,20 +759,22 @@ static void gfx9_get_gs_info(struct si_shader_selector *es,
 	out->max_prims_per_subgroup = out->gs_inst_prims_in_subgroup *
 				      gs->gs_max_out_vertices;
 	out->lds_size = align(esgs_lds_size, 128) / 128;
 
 	assert(out->max_prims_per_subgroup <= max_out_prims);
 }
 
 static void si_emit_shader_gs(struct si_context *sctx)
 {
 	struct si_shader *shader = sctx->queued.named.gs->shader;
+	unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+
 	if (!shader)
 		return;
 
 	/* R_028A60_VGT_GSVS_RING_OFFSET_1, R_028A64_VGT_GSVS_RING_OFFSET_2
 	 * R_028A68_VGT_GSVS_RING_OFFSET_3, R_028A6C_VGT_GS_OUT_PRIM_TYPE */
 	radeon_opt_set_context_reg4(sctx, R_028A60_VGT_GSVS_RING_OFFSET_1,
 				    SI_TRACKED_VGT_GSVS_RING_OFFSET_1,
 				    shader->ctx_reg.gs.vgt_gsvs_ring_offset_1,
 				    shader->ctx_reg.gs.vgt_gsvs_ring_offset_2,
 				    shader->ctx_reg.gs.vgt_gsvs_ring_offset_3,
@@ -815,20 +821,23 @@ static void si_emit_shader_gs(struct si_context *sctx)
 
 		if (shader->key.part.gs.es->type == PIPE_SHADER_TESS_EVAL)
 			radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM,
 						   SI_TRACKED_VGT_TF_PARAM,
 						   shader->vgt_tf_param);
 		if (shader->vgt_vertex_reuse_block_cntl)
 			radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
 						   SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
 						   shader->vgt_vertex_reuse_block_cntl);
 	}
+
+	if (initial_cdw != sctx->gfx_cs->current.cdw)
+		sctx->context_roll_counter++;
 }
 
 static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
 {
 	struct si_shader_selector *sel = shader->selector;
 	const ubyte *num_components = sel->info.num_stream_output_components;
 	unsigned gs_num_invocations = sel->gs_num_invocations;
 	struct si_pm4_state *pm4;
 	uint64_t va;
 	unsigned max_stream = sel->max_gs_stream;
@@ -950,20 +959,22 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
 			       S_00B228_FLOAT_MODE(shader->config.float_mode));
 		si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
 			       S_00B22C_USER_SGPR(GFX6_GS_NUM_USER_SGPR) |
 			       S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
 	}
 }
 
 static void si_emit_shader_vs(struct si_context *sctx)
 {
 	struct si_shader *shader = sctx->queued.named.vs->shader;
+	unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+
 	if (!shader)
 		return;
 
 	radeon_opt_set_context_reg(sctx, R_028A40_VGT_GS_MODE,
 				   SI_TRACKED_VGT_GS_MODE,
 				   shader->ctx_reg.vs.vgt_gs_mode);
 	radeon_opt_set_context_reg(sctx, R_028A84_VGT_PRIMITIVEID_EN,
 				   SI_TRACKED_VGT_PRIMITIVEID_EN,
 				   shader->ctx_reg.vs.vgt_primitiveid_en);
 
@@ -987,20 +998,23 @@ static void si_emit_shader_vs(struct si_context *sctx)
 
 	if (shader->selector->type == PIPE_SHADER_TESS_EVAL)
 		radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM,
 					   SI_TRACKED_VGT_TF_PARAM,
 					   shader->vgt_tf_param);
 
 	if (shader->vgt_vertex_reuse_block_cntl)
 		radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
 					   SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
 					   shader->vgt_vertex_reuse_block_cntl);
+
+	if (initial_cdw != sctx->gfx_cs->current.cdw)
+		sctx->context_roll_counter++;
 }
 
 /**
  * Compute the state for \p shader, which will run as a vertex shader on the
  * hardware.
  *
  * If \p gs is non-NULL, it points to the geometry shader for which this shader
  * is the copy shader.
  */
 static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
@@ -1149,20 +1163,22 @@ static unsigned si_get_spi_shader_col_format(struct si_shader *shader)
 	for (i = 0; i < num_targets; i++)
 		if (!(value & (0xf << (i * 4))))
 			value |= V_028714_SPI_SHADER_32_R << (i * 4);
 
 	return value;
 }
 
 static void si_emit_shader_ps(struct si_context *sctx)
 {
 	struct si_shader *shader = sctx->queued.named.ps->shader;
+	unsigned initial_cdw = sctx->gfx_cs->current.cdw;
+
 	if (!shader)
 		return;
 
 	/* R_0286CC_SPI_PS_INPUT_ENA, R_0286D0_SPI_PS_INPUT_ADDR*/
 	radeon_opt_set_context_reg2(sctx, R_0286CC_SPI_PS_INPUT_ENA,
 				    SI_TRACKED_SPI_PS_INPUT_ENA,
 				    shader->ctx_reg.ps.spi_ps_input_ena,
 				    shader->ctx_reg.ps.spi_ps_input_addr);
 
 	radeon_opt_set_context_reg(sctx, R_0286E0_SPI_BARYC_CNTL,
@@ -1174,20 +1190,23 @@ static void si_emit_shader_ps(struct si_context *sctx)
 
 	/* R_028710_SPI_SHADER_Z_FORMAT, R_028714_SPI_SHADER_COL_FORMAT */
 	radeon_opt_set_context_reg2(sctx, R_028710_SPI_SHADER_Z_FORMAT,
 				    SI_TRACKED_SPI_SHADER_Z_FORMAT,
 				    shader->ctx_reg.ps.spi_shader_z_format,
 				    shader->ctx_reg.ps.spi_shader_col_format);
 
 	radeon_opt_set_context_reg(sctx, R_02823C_CB_SHADER_MASK,
 				   SI_TRACKED_CB_SHADER_MASK,
 				   shader->ctx_reg.ps.cb_shader_mask);
+
+	if (initial_cdw != sctx->gfx_cs->current.cdw)
+		sctx->context_roll_counter++;
 }
 
 static void si_shader_ps(struct si_shader *shader)
 {
 	struct tgsi_shader_info *info = &shader->selector->info;
 	struct si_pm4_state *pm4;
 	unsigned spi_ps_in_control, spi_shader_col_format, cb_shader_mask;
 	unsigned spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1);
 	uint64_t va;
 	unsigned input_ena = shader->config.spi_ps_input_ena;
@@ -2842,23 +2861,27 @@ static void si_emit_spi_map(struct si_context *sctx)
 			spi_ps_input_cntl[num_written++] =
 			  si_get_ps_input_cntl(sctx, vs, bcol, i, bcol_interp[i]);
 
 		}
 	}
 	assert(num_interp == num_written);
 
 	/* R_028644_SPI_PS_INPUT_CNTL_0 */
 	/* Dota 2: Only ~16% of SPI map updates set different values. */
 	/* Talos: Only ~9% of SPI map updates set different values. */
+	unsigned initial_cdw = sctx->gfx_cs->current.cdw;
 	radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0,
 				    spi_ps_input_cntl,
 				    sctx->tracked_regs.spi_ps_input_cntl, num_interp);
+
+	if (initial_cdw != sctx->gfx_cs->current.cdw)
+		sctx->context_roll_counter++;
 }
 
 /**
  * Writing CONFIG or UCONFIG VGT registers requires VGT_FLUSH before that.
  */
 static void si_init_config_add_vgt_flush(struct si_context *sctx)
 {
 	if (sctx->init_config_has_vgt_flush)
 		return;
 
diff --git a/src/gallium/drivers/radeonsi/si_state_viewport.c b/src/gallium/drivers/radeonsi/si_state_viewport.c
index 819c773ba8e..587422e50ca 100644
--- a/src/gallium/drivers/radeonsi/si_state_viewport.c
+++ b/src/gallium/drivers/radeonsi/si_state_viewport.c
@@ -251,33 +251,36 @@ static void si_emit_guardband(struct si_context *ctx)
 		/* Discard primitives that would lie entirely outside the clip
 		 * region. */
 		discard_x = MIN2(discard_x, guardband_x);
 		discard_y = MIN2(discard_y, guardband_y);
 	}
 
 	/* If any of the GB registers is updated, all of them must be updated.
 	 * R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, R_028BEC_PA_CL_GB_VERT_DISC_ADJ
 	 * R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ, R_028BF4_PA_CL_GB_HORZ_DISC_ADJ
 	 */
+	unsigned initial_cdw = ctx->gfx_cs->current.cdw;
 	radeon_opt_set_context_reg4(ctx, R_028BE8_PA_CL_GB_VERT_CLIP_ADJ,
 				    SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ,
 				    fui(guardband_y), fui(discard_y),
 				    fui(guardband_x), fui(discard_x));
 	radeon_opt_set_context_reg(ctx, R_028234_PA_SU_HARDWARE_SCREEN_OFFSET,
 				   SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET,
 				   S_028234_HW_SCREEN_OFFSET_X(hw_screen_offset_x >> 4) |
 				   S_028234_HW_SCREEN_OFFSET_Y(hw_screen_offset_y >> 4));
 	radeon_opt_set_context_reg(ctx, R_028BE4_PA_SU_VTX_CNTL,
 				   SI_TRACKED_PA_SU_VTX_CNTL,
 				   S_028BE4_PIX_CENTER(rs->half_pixel_center) |
 				   S_028BE4_QUANT_MODE(V_028BE4_X_16_8_FIXED_POINT_1_256TH +
 						       vp_as_scissor.quant_mode));
+	if (initial_cdw != ctx->gfx_cs->current.cdw)
+		ctx->context_roll_counter++;
 }
 
 static void si_emit_scissors(struct si_context *ctx)
 {
 	struct radeon_cmdbuf *cs = ctx->gfx_cs;
 	struct pipe_scissor_state *states = ctx->scissors.states;
 	unsigned mask = ctx->scissors.dirty_mask;
 	bool scissor_enabled = ctx->queued.named.rasterizer->scissor_enable;
 
 	/* The simple case: Only 1 viewport is active. */
-- 
2.17.1


