[Mesa-dev] [PATCH 1/2] radeonsi: enable out-of-order rasterization when possible on VI and GFX9 dGPUs

Thu Sep 7 08:21:13 UTC 2017

On 07.09.2017 00:35, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
> 
> ---
>   src/gallium/drivers/radeonsi/si_pipe.c          |   2 +
>   src/gallium/drivers/radeonsi/si_pipe.h          |   1 +
>   src/gallium/drivers/radeonsi/si_state.c         | 143 +++++++++++++++++++++++-
>   src/gallium/drivers/radeonsi/si_state.h         |  10 +-
>   src/gallium/drivers/radeonsi/si_state_shaders.c |   5 +
>   5 files changed, 156 insertions(+), 5 deletions(-)
> 
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
> index 640b57c..9642edd 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.c
> +++ b/src/gallium/drivers/radeonsi/si_pipe.c
> @@ -1041,20 +1041,22 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws,
>   		 sscreen->b.info.pfp_fw_version >= 121 &&
>   		 sscreen->b.info.me_fw_version >= 87) ||
>   		(sscreen->b.chip_class == CIK &&
>   		 sscreen->b.info.pfp_fw_version >= 211 &&
>   		 sscreen->b.info.me_fw_version >= 173) ||
>   		(sscreen->b.chip_class == SI &&
>   		 sscreen->b.info.pfp_fw_version >= 79 &&
>   		 sscreen->b.info.me_fw_version >= 142);
>   
>   	sscreen->has_ds_bpermute = sscreen->b.chip_class >= VI;
> +	sscreen->has_out_of_order_rast = sscreen->b.chip_class >= VI &&
> +					 sscreen->b.info.max_se >= 2;
>   	sscreen->has_msaa_sample_loc_bug = (sscreen->b.family >= CHIP_POLARIS10 &&
>   					    sscreen->b.family <= CHIP_POLARIS12) ||
>   					   sscreen->b.family == CHIP_VEGA10 ||
>   					   sscreen->b.family == CHIP_RAVEN;
>   	sscreen->dpbb_allowed = sscreen->b.chip_class >= GFX9 &&
>   				!(sscreen->b.debug_flags & DBG_NO_DPBB);
>   	sscreen->dfsm_allowed = sscreen->dpbb_allowed &&
>   				!(sscreen->b.debug_flags & DBG_NO_DFSM);
>   
>   	/* While it would be nice not to have this flag, we are constrained
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
> index 8db7028..b8073ce 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.h
> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> @@ -88,20 +88,21 @@ struct hash_table;
>   struct u_suballocator;
>   
>   struct si_screen {
>   	struct r600_common_screen	b;
>   	unsigned			gs_table_depth;
>   	unsigned			tess_offchip_block_dw_size;
>   	bool				has_clear_state;
>   	bool				has_distributed_tess;
>   	bool				has_draw_indirect_multi;
>   	bool				has_ds_bpermute;
> +	bool				has_out_of_order_rast;
>   	bool				has_msaa_sample_loc_bug;
>   	bool				dpbb_allowed;
>   	bool				dfsm_allowed;
>   	bool				llvm_has_working_vgpr_indexing;
>   
>   	/* Whether shaders are monolithic (1-part) or separate (3-part). */
>   	bool				use_monolithic_shaders;
>   	bool				record_llvm_ir;
>   
>   	mtx_t			shader_parts_mutex;
> diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
> index 7e9140b..855ad27 100644
> --- a/src/gallium/drivers/radeonsi/si_state.c
> +++ b/src/gallium/drivers/radeonsi/si_state.c
> @@ -416,20 +416,21 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx,
>   	struct si_pm4_state *pm4 = &blend->pm4;
>   	uint32_t sx_mrt_blend_opt[8] = {0};
>   	uint32_t color_control = 0;
>   
>   	if (!blend)
>   		return NULL;
>   
>   	blend->alpha_to_coverage = state->alpha_to_coverage;
>   	blend->alpha_to_one = state->alpha_to_one;
>   	blend->dual_src_blend = util_blend_state_is_dual(state, 0);
> +	blend->logicop_enable = state->logicop_enable;
>   
>   	if (state->logicop_enable) {
>   		color_control |= S_028808_ROP3(state->logicop_func | (state->logicop_func << 4));
>   	} else {
>   		color_control |= S_028808_ROP3(0xcc);
>   	}
>   
>   	si_pm4_set_reg(pm4, R_028B70_DB_ALPHA_TO_MASK,
>   		       S_028B70_ALPHA_TO_MASK_ENABLE(state->alpha_to_coverage) |
>   		       S_028B70_ALPHA_TO_MASK_OFFSET0(2) |
> @@ -623,20 +624,27 @@ static void si_bind_blend_state(struct pipe_context *ctx, void *state)
>   	    old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
>   	    old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit)
>   		sctx->do_update_shaders = true;
>   
>   	if (sctx->screen->dpbb_allowed &&
>   	    (!old_blend ||
>   	     old_blend->alpha_to_coverage != blend->alpha_to_coverage ||
>   	     old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
>   	     old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit))
>   		si_mark_atom_dirty(sctx, &sctx->dpbb_state);
> +
> +	if (sctx->screen->has_out_of_order_rast &&
> +	    (!old_blend ||
> +	     (old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
> +	      old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit ||
> +	      old_blend->logicop_enable != blend->logicop_enable)))
> +		si_mark_atom_dirty(sctx, &sctx->msaa_config);
>   }
>   
>   static void si_delete_blend_state(struct pipe_context *ctx, void *state)
>   {
>   	struct si_context *sctx = (struct si_context *)ctx;
>   	si_pm4_delete_state(sctx, blend, (struct si_state_blend *)state);
>   }
>   
>   static void si_set_blend_color(struct pipe_context *ctx,
>   			       const struct pipe_blend_color *state)
> @@ -1118,20 +1126,47 @@ static void *si_create_dsa_state(struct pipe_context *ctx,
>   
>   	dsa->depth_enabled = state->depth.enabled;
>   	dsa->depth_write_enabled = state->depth.enabled &&
>   				   state->depth.writemask;
>   	dsa->stencil_enabled = state->stencil[0].enabled;
>   	dsa->stencil_write_enabled = state->stencil[0].enabled &&
>   				     (si_dsa_writes_stencil(&state->stencil[0]) ||
>   				      si_dsa_writes_stencil(&state->stencil[1]));
>   	dsa->db_can_write = dsa->depth_write_enabled ||
>   			    dsa->stencil_write_enabled;
> +	dsa->stencil_can_run_out_of_order = !dsa->stencil_enabled ||
> +					    !dsa->db_can_write;
> +
> +	bool depth_forces_ordering =
> +		dsa->depth_enabled &&
> +		(state->depth.func == PIPE_FUNC_NEVER ||
> +		 state->depth.func == PIPE_FUNC_LESS ||
> +		 state->depth.func == PIPE_FUNC_LEQUAL ||
> +		 state->depth.func == PIPE_FUNC_GREATER ||
> +		 state->depth.func == PIPE_FUNC_GEQUAL);
> +
> +	/* Option to allow inexact behavior with out-of-order rasterization
> +	 * when two overlapping pixels have the same Z, and depth and color
> +	 * writes are enabled, and the depth function is LESS/LEQUAL/GREATER/
> +	 * GEQUAL.
> +	 */
> +	const bool aggressive_opt_out_of_order = false;
> +
> +	/* Stencil isn't checked here and must be AND'ed with this. */
> +	dsa->depth_forces_ordering_color_off =
> +		!dsa->depth_enabled ||	     /* no depth tests and writes */
> +		!dsa->depth_write_enabled || /* any depth function is OK */
> +		depth_forces_ordering;
> +	dsa->depth_forces_ordering_color_on =
> +		depth_forces_ordering &&
> +	        (aggressive_opt_out_of_order || !dsa->depth_write_enabled);
> +
>   	return dsa;
>   }
>   
>   static void si_bind_dsa_state(struct pipe_context *ctx, void *state)
>   {
>           struct si_context *sctx = (struct si_context *)ctx;
>   	struct si_state_dsa *old_dsa = sctx->queued.named.dsa;
>           struct si_state_dsa *dsa = state;
>   
>           if (!state)
> @@ -1147,20 +1182,29 @@ static void si_bind_dsa_state(struct pipe_context *ctx, void *state)
>   
>   	if (!old_dsa || old_dsa->alpha_func != dsa->alpha_func)
>   		sctx->do_update_shaders = true;
>   
>   	if (sctx->screen->dpbb_allowed &&
>   	    (!old_dsa ||
>   	     (old_dsa->depth_enabled != dsa->depth_enabled ||
>   	      old_dsa->stencil_enabled != dsa->stencil_enabled ||
>   	      old_dsa->db_can_write != dsa->db_can_write)))
>   		si_mark_atom_dirty(sctx, &sctx->dpbb_state);
> +
> +	if (sctx->screen->has_out_of_order_rast &&
> +	    (!old_dsa ||
> +	     (old_dsa->depth_enabled != dsa->depth_enabled ||
> +	      old_dsa->stencil_enabled != dsa->stencil_enabled ||
> +	      old_dsa->stencil_can_run_out_of_order != dsa->stencil_can_run_out_of_order ||
> +	      old_dsa->depth_forces_ordering_color_off != dsa->depth_forces_ordering_color_off ||
> +	      old_dsa->depth_forces_ordering_color_on != dsa->depth_forces_ordering_color_on)))
> +		si_mark_atom_dirty(sctx, &sctx->msaa_config);
>   }
>   
>   static void si_delete_dsa_state(struct pipe_context *ctx, void *state)
>   {
>   	struct si_context *sctx = (struct si_context *)ctx;
>   	si_pm4_delete_state(sctx, dsa, (struct si_state_dsa *)state);
>   }
>   
>   static void *si_create_db_flush_dsa(struct si_context *sctx)
>   {
> @@ -2532,20 +2576,26 @@ static void si_dec_framebuffer_counters(const struct pipe_framebuffer_state *sta
>   
>   static void si_set_framebuffer_state(struct pipe_context *ctx,
>   				     const struct pipe_framebuffer_state *state)
>   {
>   	struct si_context *sctx = (struct si_context *)ctx;
>   	struct pipe_constant_buffer constbuf = {0};
>   	struct r600_surface *surf = NULL;
>   	struct r600_texture *rtex;
>   	bool old_any_dst_linear = sctx->framebuffer.any_dst_linear;
>   	unsigned old_nr_samples = sctx->framebuffer.nr_samples;
> +	unsigned old_colorbuf_enabled_4bit = sctx->framebuffer.colorbuf_enabled_4bit;
> +	bool old_has_zsbuf = !!sctx->framebuffer.state.zsbuf;
> +	bool old_has_stencil =
> +		old_has_zsbuf &&
> +		((struct r600_texture*)sctx->framebuffer.state.zsbuf)->surface.flags &
> +		RADEON_SURF_SBUFFER;
>   	bool unbound = false;
>   	int i;
>   
>   	si_update_fb_dirtiness_after_rendering(sctx);
>   
>   	for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
>   		if (!sctx->framebuffer.state.cbufs[i])
>   			continue;
>   
>   		rtex = (struct r600_texture*)sctx->framebuffer.state.cbufs[i]->texture;
> @@ -2689,44 +2739,52 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
>   
>   		p_atomic_inc(&rtex->framebuffers_bound);
>   
>   		if (rtex->dcc_gather_statistics) {
>   			/* Dirty tracking must be enabled for DCC usage analysis. */
>   			sctx->framebuffer.compressed_cb_mask |= 1 << i;
>   			vi_separate_dcc_start_query(ctx, rtex);
>   		}
>   	}
>   
> +	struct r600_texture *zstex = NULL;
> +
>   	if (state->zsbuf) {
>   		surf = (struct r600_surface*)state->zsbuf;
> -		rtex = (struct r600_texture*)surf->base.texture;
> +		zstex = (struct r600_texture*)surf->base.texture;
>   
>   		if (!surf->depth_initialized) {
>   			si_init_depth_surface(sctx, surf);
>   		}
>   
> -		if (vi_tc_compat_htile_enabled(rtex, surf->base.u.tex.level))
> +		if (vi_tc_compat_htile_enabled(zstex, surf->base.u.tex.level))
>   			sctx->framebuffer.DB_has_shader_readable_metadata = true;
>   
>   		r600_context_add_resource_size(ctx, surf->base.texture);
>   	}
>   
>   	si_update_poly_offset_state(sctx);
>   	si_mark_atom_dirty(sctx, &sctx->cb_render_state);
>   	si_mark_atom_dirty(sctx, &sctx->framebuffer.atom);
>   
>   	if (sctx->screen->dpbb_allowed)
>   		si_mark_atom_dirty(sctx, &sctx->dpbb_state);
>   
>   	if (sctx->framebuffer.any_dst_linear != old_any_dst_linear)
>   		si_mark_atom_dirty(sctx, &sctx->msaa_config);
>   
> +	if (sctx->screen->has_out_of_order_rast &&
> +	    (sctx->framebuffer.colorbuf_enabled_4bit != old_colorbuf_enabled_4bit ||
> +	     !!sctx->framebuffer.state.zsbuf != old_has_zsbuf ||
> +	     (zstex && !!(zstex->surface.flags & RADEON_SURF_SBUFFER) != old_has_stencil)))
> +		si_mark_atom_dirty(sctx, &sctx->msaa_config);
> +
>   	if (sctx->framebuffer.nr_samples != old_nr_samples) {
>   		si_mark_atom_dirty(sctx, &sctx->msaa_config);
>   		si_mark_atom_dirty(sctx, &sctx->db_render_state);
>   
>   		/* Set sample locations as fragment shader constants. */
>   		switch (sctx->framebuffer.nr_samples) {
>   		case 1:
>   			constbuf.user_buffer = sctx->b.sample_locations_1x;
>   			break;
>   		case 2:
> @@ -3049,30 +3107,111 @@ static void si_emit_msaa_sample_locs(struct si_context *sctx,
>   		if (has_msaa_sample_loc_bug &&
>   		    sctx->framebuffer.nr_samples > 1 &&
>   		    rs && !rs->multisample_enable)
>   			small_prim_filter_cntl &= C_028830_SMALL_PRIM_FILTER_ENABLE;
>   
>   		radeon_set_context_reg(cs, R_028830_PA_SU_SMALL_PRIM_FILTER_CNTL,
>   				       small_prim_filter_cntl);
>   	}
>   }
>   
> +static bool si_out_of_order_rasterization(struct si_context *sctx)
> +{
> +	struct si_state_blend *blend = sctx->queued.named.blend;
> +	struct si_state_dsa *dsa = sctx->queued.named.dsa;
> +
> +	if (!sctx->screen->has_out_of_order_rast)
> +		return false;
> +
> +	/* PS with memory stores can't run out-of-order. */
> +	if (sctx->ps_shader.cso &&
> +	    sctx->ps_shader.cso->info.writes_memory)
> +		return false;

I'm actually not sure this is necessary. The spec is quite relaxed about 
the order of pixel shader invocations and whether they happen at all.

> +
> +	unsigned colormask = sctx->framebuffer.colorbuf_enabled_4bit &
> +			     blend->cb_target_enabled_4bit;
> +
> +	/* No logic op. */
> +	if (colormask && blend->logicop_enable)
> +		return false;
> +
> +	struct r600_texture *zstex =
> +		(struct r600_texture*)sctx->framebuffer.state.zsbuf->texture;
> +	bool has_stencil = sctx->framebuffer.state.zsbuf &&
> +			   zstex->surface.flags & RADEON_SURF_SBUFFER;
> +	bool blend_enabled = (colormask & blend->blend_enable_4bit) != 0;
> +
> +	/* Out-of-order rasterization can be enabled for these cases:
> +	 *
> +	 * - color-only rendering:
> +	 *   + blending must be enabled and commutative
> +	 *   + only when inexact behavior due to rounding is allowed
> +	 *
> +	 * - depth-only rendering:
> +	 *   + depth must force ordering
> +	 *
> +	 * - stencil-only rendering:
> +	 *   + never --- can we do better here?
> +	 *
> +	 * - color rendering with read-only depth:
> +	 *   + blending must be disabled
> +	 *   + depth must force ordering
> +	 *
> +	 * - color rendering with read-only stencil:
> +	 *   + blending must be disabled
> +	 *
> +	 * - color+depth rendering:
> +	 *   + blending must be disabled
> +	 *   + depth must force ordering
> +	 *   + only when Z-fighting is allowed to result in inexact behavior
> +	 *
> +	 * - color+stencil rendering:
> +	 *   + never --- can we do better here?
> +	 *
> +	 * - color+depth+stencil rendering:
> +	 *   + never --- can we do better here?
> +	 */

I can't quite wrap my head around the logic here.

Here's a suggestion for cleaning it up conceptually:

- Record in DSA whether DSA *by itself* can run out-of-order or not, 
meaning that the final result in Z/S is unaffected by out-of-order 
rasterization
-- This is trivially the case when there are no Z/S writes
-- It is also the case when stencil writes are disabled and Zfunc is 
NEVER or one of the ordered ones ("depth_forces_ordering", currently)
-- It is also the case when depth writes are disabled and either Sfunc is 
ALWAYS and zpass_op/zfail_op are each one of KEEP, ZERO, REPLACE, INVERT, 
INCR_WRAP, DECR_WRAP, or Sfunc is NEVER and the same applies to fail_op 
[I think this allows out-of-order to be enabled for stencil shadow passes]

- Record in DSA whether the set of fragments passing DSA is unaffected 
by out-of-order
-- This is trivially the case when there are no Z/S writes
-- It is the case when stencil writes are disabled and Zfunc is ALWAYS 
or NEVER
-- It is the case when depth writes are disabled and Sfunc is ALWAYS or 
NEVER

- Record in DSA whether the *last* fragment passing DSA for each sample 
is unaffected by out-of-order
-- This is *never* the case if we're being honest, but we can enable it 
in an optional "aggressive" mode when stencil writes are disabled, Z 
writes are enabled and Z func is one of the ordered functions

The overall out-of-order enable is then:

- if DSA by itself cannot run out-of-order, disable
- if color writes are disabled, enable
- if logic op is enabled, disable
- if blending is enabled:
  o disable if non-commutative
  o enable if commutative and the set of fragments passing DSA is 
unaffected by out-of-order
- if blending is disabled, enable iff the *last* fragment passing DSA is 
unaffected

Thinking it through in this way, I believe I discovered at least one bug 
in the patch as-is, in the following configuration:

- blending disabled and no stencil
- depth is enabled, depth writes are disabled, and Zfunc == LESS

In this case, dsa->depth_forces_ordering_color_on will be true and 
out-of-order will be enabled. But that's not correct, because there may 
be multiple triangles with Z-values less than whatever's in the depth 
buffer.

On second thought, the whole "record in DSA" thing gets a bit more 
complicated because it interacts with whether Z/S buffers are actually 
present. The no-Z/S case is easy (the first two bits are "Yes", the last 
one is "No"), but we need to distinguish whether stencil is present or 
not. Maybe both of these can be pre-calculated and stored in DSA.

Cheers,
Nicolai

> +
> +	/* If depth and stencil are disabled, blending must be enabled and commutative. */
> +	if (!sctx->framebuffer.state.zsbuf ||
> +	    (!dsa->depth_enabled && (!has_stencil || !dsa->stencil_enabled))) {
> +		if (!colormask)
> +			return true; /* Color writes are disabled. */
> +
> +		/* If color writes are enabled, blending must be enabled and
> +		 * commutative.
> +		 */
> +		return false; /* TODO: check for commutative blending */
> +	}
> +
> +	/* Depth or stencil is enabled. */
> +	if (blend_enabled ||
> +	    (has_stencil && !dsa->stencil_can_run_out_of_order))
> +		return false;
> +
> +	return colormask ? dsa->depth_forces_ordering_color_on :
> +			   dsa->depth_forces_ordering_color_off;
> +}
> +
>   static void si_emit_msaa_config(struct si_context *sctx, struct r600_atom *atom)
>   {
>   	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
>   	unsigned num_tile_pipes = sctx->screen->b.info.num_tile_pipes;
>   	/* 33% faster rendering to linear color buffers */
>   	bool dst_is_linear = sctx->framebuffer.any_dst_linear;
> +	bool out_of_order_rast = si_out_of_order_rasterization(sctx);
>   	unsigned sc_mode_cntl_1 =
>   		S_028A4C_WALK_SIZE(dst_is_linear) |
>   		S_028A4C_WALK_FENCE_ENABLE(!dst_is_linear) |
>   		S_028A4C_WALK_FENCE_SIZE(num_tile_pipes == 2 ? 2 : 3) |
> +		S_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(out_of_order_rast) |
> +		S_028A4C_OUT_OF_ORDER_WATER_MARK(0x7) |
>   		/* always 1: */
>   		S_028A4C_WALK_ALIGN8_PRIM_FITS_ST(1) |
>   		S_028A4C_SUPERTILE_WALK_ORDER_ENABLE(1) |
>   		S_028A4C_TILE_WALK_ORDER_ENABLE(1) |
>   		S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(1) |
>   		S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
>   		S_028A4C_FORCE_EOV_REZ_ENABLE(1);
>   
>   	cayman_emit_msaa_config(cs, sctx->framebuffer.nr_samples,
>   				sctx->ps_iter_samples,
> diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
> index 17d210a..ed66a63 100644
> --- a/src/gallium/drivers/radeonsi/si_state.h
> +++ b/src/gallium/drivers/radeonsi/si_state.h
> @@ -42,29 +42,30 @@
>   #define SI_NUM_IMAGES			16
>   #define SI_NUM_SHADER_BUFFERS		16
>   
>   struct si_screen;
>   struct si_shader;
>   struct si_shader_selector;
>   
>   struct si_state_blend {
>   	struct si_pm4_state	pm4;
>   	uint32_t		cb_target_mask;
> -	bool			alpha_to_coverage;
> -	bool			alpha_to_one;
> -	bool			dual_src_blend;
>   	/* Set 0xf or 0x0 (4 bits) per render target if the following is
>   	 * true. ANDed with spi_shader_col_format.
>   	 */
>   	unsigned		cb_target_enabled_4bit;
>   	unsigned		blend_enable_4bit;
>   	unsigned		need_src_alpha_4bit;
> +	bool			alpha_to_coverage:1;
> +	bool			alpha_to_one:1;
> +	bool			dual_src_blend:1;
> +	bool			logicop_enable:1;
>   };
>   
>   struct si_state_rasterizer {
>   	struct si_pm4_state	pm4;
>   	/* poly offset states for 16-bit, 24-bit, and 32-bit zbuffers */
>   	struct si_pm4_state	*pm4_poly_offset;
>   	unsigned		pa_sc_line_stipple;
>   	unsigned		pa_cl_clip_cntl;
>   	unsigned		sprite_coord_enable:8;
>   	unsigned		clip_plane_enable:8;
> @@ -91,20 +92,23 @@ struct si_dsa_stencil_ref_part {
>   
>   struct si_state_dsa {
>   	struct si_pm4_state		pm4;
>   	struct si_dsa_stencil_ref_part	stencil_ref;
>   	ubyte				alpha_func:3;
>   	bool				depth_enabled:1;
>   	bool				depth_write_enabled:1;
>   	bool				stencil_enabled:1;
>   	bool				stencil_write_enabled:1;
>   	bool				db_can_write:1;
> +	bool				stencil_can_run_out_of_order:1;
> +	bool				depth_forces_ordering_color_off:1;
> +	bool				depth_forces_ordering_color_on:1;
>   };
>   
>   struct si_stencil_ref {
>   	struct r600_atom		atom;
>   	struct pipe_stencil_ref		state;
>   	struct si_dsa_stencil_ref_part	dsa_part;
>   };
>   
>   struct si_vertex_elements
>   {
> diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
> index fe25598..23ea1a3 100644
> --- a/src/gallium/drivers/radeonsi/si_state_shaders.c
> +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
> @@ -2405,20 +2405,25 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
>   	sctx->ps_shader.current = sel ? sel->first_variant : NULL;
>   
>   	si_update_common_shader_state(sctx);
>   	if (sel) {
>   		if (sctx->ia_multi_vgt_param_key.u.uses_tess)
>   			si_update_tess_uses_prim_id(sctx);
>   
>   		if (!old_sel ||
>   		    old_sel->info.colors_written != sel->info.colors_written)
>   			si_mark_atom_dirty(sctx, &sctx->cb_render_state);
> +
> +		if (sctx->screen->has_out_of_order_rast &&
> +		    (!old_sel ||
> +		     (old_sel->info.writes_memory != sel->info.writes_memory)))
> +			si_mark_atom_dirty(sctx, &sctx->msaa_config);
>   	}
>   	si_set_active_descriptors_for_shader(sctx, sel);
>   }
>   
>   static void si_delete_shader(struct si_context *sctx, struct si_shader *shader)
>   {
>   	if (shader->is_optimized) {
>   		util_queue_drop_job(&sctx->screen->shader_compiler_queue_low_priority,
>   				    &shader->optimized_ready);
>   		util_queue_fence_destroy(&shader->optimized_ready);
> 

-- 
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.