[Mesa-dev] [PATCH 5/5] radeonsi: allow out-of-order rasterization in commutative blending cases

Thu Sep 14 03:55:55 UTC 2017

For the series:

Tested-by: Dieter Nützel <Dieter at nuetzel-hh.de>

Greetings,
Dieter

Am 09.09.2017 12:43, schrieb Nicolai Hähnle:
> From: Nicolai HÃ¤hnle <nicolai.haehnle at amd.com>
> 
> We do not enable this by default for additive blending, since it 
> slightly
> breaks OpenGL invariance guarantees due to non-determinism.
> 
> Still, there may be some applications can benefit from white-listing
> via the radeonsi_commutative_blend_add drirc setting without any real
> visible artifacts.
> ---
>  src/gallium/drivers/radeonsi/driinfo_radeonsi.h |  1 +
>  src/gallium/drivers/radeonsi/si_pipe.c          |  2 +
>  src/gallium/drivers/radeonsi/si_pipe.h          |  1 +
>  src/gallium/drivers/radeonsi/si_state.c         | 67 
> +++++++++++++++++++++++--
>  src/gallium/drivers/radeonsi/si_state.h         |  1 +
>  src/util/xmlpool/t_options.h                    |  5 ++
>  6 files changed, 73 insertions(+), 4 deletions(-)
> 
> diff --git a/src/gallium/drivers/radeonsi/driinfo_radeonsi.h
> b/src/gallium/drivers/radeonsi/driinfo_radeonsi.h
> index 8be85289a0c..989e5175cc0 100644
> --- a/src/gallium/drivers/radeonsi/driinfo_radeonsi.h
> +++ b/src/gallium/drivers/radeonsi/driinfo_radeonsi.h
> @@ -1,5 +1,6 @@
>  // DriConf options specific to radeonsi
>  DRI_CONF_SECTION_PERFORMANCE
>      DRI_CONF_RADEONSI_ENABLE_SISCHED("false")
>      DRI_CONF_RADEONSI_ASSUME_NO_Z_FIGHTS("false")
> +    DRI_CONF_RADEONSI_COMMUTATIVE_BLEND_ADD("false")
>  DRI_CONF_SECTION_END
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.c
> b/src/gallium/drivers/radeonsi/si_pipe.c
> index b4972be739c..c44ea3be740 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.c
> +++ b/src/gallium/drivers/radeonsi/si_pipe.c
> @@ -1043,20 +1043,22 @@ struct pipe_screen
> *radeonsi_screen_create(struct radeon_winsys *ws,
>  		(sscreen->b.chip_class == SI &&
>  		 sscreen->b.info.pfp_fw_version >= 79 &&
>  		 sscreen->b.info.me_fw_version >= 142);
> 
>  	sscreen->has_ds_bpermute = sscreen->b.chip_class >= VI;
>  	sscreen->has_out_of_order_rast = sscreen->b.chip_class >= VI &&
>  					 sscreen->b.info.max_se >= 2 &&
>  					 !(sscreen->b.debug_flags & DBG_NO_OUT_OF_ORDER);
>  	sscreen->assume_no_z_fights =
>  		driQueryOptionb(config->options, "radeonsi_assume_no_z_fights");
> +	sscreen->commutative_blend_add =
> +		driQueryOptionb(config->options, "radeonsi_commutative_blend_add");
>  	sscreen->has_msaa_sample_loc_bug = (sscreen->b.family >= 
> CHIP_POLARIS10 &&
>  					    sscreen->b.family <= CHIP_POLARIS12) ||
>  					   sscreen->b.family == CHIP_VEGA10 ||
>  					   sscreen->b.family == CHIP_RAVEN;
>  	sscreen->dpbb_allowed = sscreen->b.chip_class >= GFX9 &&
>  				!(sscreen->b.debug_flags & DBG_NO_DPBB);
>  	sscreen->dfsm_allowed = sscreen->dpbb_allowed &&
>  				!(sscreen->b.debug_flags & DBG_NO_DFSM);
> 
>  	/* While it would be nice not to have this flag, we are constrained
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h
> b/src/gallium/drivers/radeonsi/si_pipe.h
> index d200c9f571f..27e2bc81172 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.h
> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> @@ -90,20 +90,21 @@ struct u_suballocator;
>  struct si_screen {
>  	struct r600_common_screen	b;
>  	unsigned			gs_table_depth;
>  	unsigned			tess_offchip_block_dw_size;
>  	bool				has_clear_state;
>  	bool				has_distributed_tess;
>  	bool				has_draw_indirect_multi;
>  	bool				has_ds_bpermute;
>  	bool				has_out_of_order_rast;
>  	bool				assume_no_z_fights;
> +	bool				commutative_blend_add;
>  	bool				has_msaa_sample_loc_bug;
>  	bool				dpbb_allowed;
>  	bool				dfsm_allowed;
>  	bool				llvm_has_working_vgpr_indexing;
> 
>  	/* Whether shaders are monolithic (1-part) or separate (3-part). */
>  	bool				use_monolithic_shaders;
>  	bool				record_llvm_ir;
> 
>  	mtx_t			shader_parts_mutex;
> diff --git a/src/gallium/drivers/radeonsi/si_state.c
> b/src/gallium/drivers/radeonsi/si_state.c
> index a8af5752771..6a063e7f7a6 100644
> --- a/src/gallium/drivers/radeonsi/si_state.c
> +++ b/src/gallium/drivers/radeonsi/si_state.c
> @@ -370,20 +370,62 @@ static uint32_t
> si_translate_blend_opt_factor(int blend_fact, bool is_alpha)
>  	case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
>  		return V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1;
>  	case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
>  		return is_alpha ? V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE
>  				: V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;
>  	default:
>  		return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
>  	}
>  }
> 
> +static void si_blend_check_commutativity(struct si_screen *sscreen,
> +					 struct si_state_blend *blend,
> +					 enum pipe_blend_func func,
> +					 enum pipe_blendfactor src,
> +					 enum pipe_blendfactor dst,
> +					 unsigned chanmask)
> +{
> +	/* Src factor is allowed when it does not depend on Dst */
> +	static const uint32_t src_allowed =
> +		(1u << PIPE_BLENDFACTOR_ONE) |
> +		(1u << PIPE_BLENDFACTOR_SRC_COLOR) |
> +		(1u << PIPE_BLENDFACTOR_SRC_ALPHA) |
> +		(1u << PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE) |
> +		(1u << PIPE_BLENDFACTOR_CONST_COLOR) |
> +		(1u << PIPE_BLENDFACTOR_CONST_ALPHA) |
> +		(1u << PIPE_BLENDFACTOR_SRC1_COLOR) |
> +		(1u << PIPE_BLENDFACTOR_SRC1_ALPHA) |
> +		(1u << PIPE_BLENDFACTOR_ZERO) |
> +		(1u << PIPE_BLENDFACTOR_INV_SRC_COLOR) |
> +		(1u << PIPE_BLENDFACTOR_INV_SRC_ALPHA) |
> +		(1u << PIPE_BLENDFACTOR_INV_CONST_COLOR) |
> +		(1u << PIPE_BLENDFACTOR_INV_CONST_ALPHA) |
> +		(1u << PIPE_BLENDFACTOR_INV_SRC1_COLOR) |
> +		(1u << PIPE_BLENDFACTOR_INV_SRC1_ALPHA);
> +
> +	if (dst == PIPE_BLENDFACTOR_ONE &&
> +	    (src_allowed & (1u << src))) {
> +		/* Addition is commutative, but floating point addition isn't
> +		 * associative: subtle changes can be introduced via different
> +		 * rounding.
> +		 *
> +		 * Out-of-order is also non-deterministic, which means that
> +		 * this breaks OpenGL invariance requirements. So only enable
> +		 * out-of-order additive blending if explicitly allowed by a
> +		 * setting.
> +		 */
> +		if (func == PIPE_BLEND_MAX || func == PIPE_BLEND_MIN ||
> +		    (func == PIPE_BLEND_ADD && sscreen->commutative_blend_add))
> +			blend->commutative_4bit |= chanmask;
> +	}
> +}
> +
>  /**
>   * Get rid of DST in the blend factors by commuting the operands:
>   *    func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
>   */
>  static void si_blend_remove_dst(unsigned *func, unsigned *src_factor,
>  				unsigned *dst_factor, unsigned expected_dst,
>  				unsigned replacement_src)
>  {
>  	if (*src_factor == expected_dst &&
>  	    *dst_factor == PIPE_BLENDFACTOR_ZERO) {
> @@ -486,20 +528,25 @@ static void *si_create_blend_state_mode(struct
> pipe_context *ctx,
>  		/* cb_render_state will disable unused ones */
>  		blend->cb_target_mask |= (unsigned)state->rt[j].colormask << (4 * 
> i);
>  		if (state->rt[j].colormask)
>  			blend->cb_target_enabled_4bit |= 0xf << (4 * i);
> 
>  		if (!state->rt[j].colormask || !state->rt[j].blend_enable) {
>  			si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, 
> blend_cntl);
>  			continue;
>  		}
> 
> +		si_blend_check_commutativity(sctx->screen, blend,
> +					     eqRGB, srcRGB, dstRGB, 0x7 << (4 * i));
> +		si_blend_check_commutativity(sctx->screen, blend,
> +					     eqA, srcA, dstA, 0x8 << (4 * i));
> +
>  		/* Blending optimizations for RB+.
>  		 * These transformations don't change the behavior.
>  		 *
>  		 * First, get rid of DST in the blend factors:
>  		 *    func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
>  		 */
>  		si_blend_remove_dst(&eqRGB, &srcRGB, &dstRGB,
>  				    PIPE_BLENDFACTOR_DST_COLOR,
>  				    PIPE_BLENDFACTOR_SRC_COLOR);
>  		si_blend_remove_dst(&eqA, &srcA, &dstA,
> @@ -629,20 +676,21 @@ static void si_bind_blend_state(struct
> pipe_context *ctx, void *state)
>  	    (!old_blend ||
>  	     old_blend->alpha_to_coverage != blend->alpha_to_coverage ||
>  	     old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
>  	     old_blend->cb_target_enabled_4bit != 
> blend->cb_target_enabled_4bit))
>  		si_mark_atom_dirty(sctx, &sctx->dpbb_state);
> 
>  	if (sctx->screen->has_out_of_order_rast &&
>  	    (!old_blend ||
>  	     (old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
>  	      old_blend->cb_target_enabled_4bit != 
> blend->cb_target_enabled_4bit ||
> +	      old_blend->commutative_4bit != blend->commutative_4bit ||
>  	      old_blend->logicop_enable != blend->logicop_enable)))
>  		si_mark_atom_dirty(sctx, &sctx->msaa_config);
>  }
> 
>  static void si_delete_blend_state(struct pipe_context *ctx, void 
> *state)
>  {
>  	struct si_context *sctx = (struct si_context *)ctx;
>  	si_pm4_delete_state(sctx, blend, (struct si_state_blend *)state);
>  }
> 
> @@ -3193,26 +3241,37 @@ static bool
> si_out_of_order_rasterization(struct si_context *sctx)
>  			return false;
> 
>  		if (sctx->b.num_perfect_occlusion_queries != 0 &&
>  		    !dsa_order_invariant.pass_set)
>  			return false;
>  	}
> 
>  	if (!colormask)
>  		return true;
> 
> -	bool blend_enabled = (colormask & blend->blend_enable_4bit) != 0;
> +	unsigned blendmask = colormask & blend->blend_enable_4bit;
> 
> -	if (blend_enabled)
> -		return false; /* TODO */
> +	if (blendmask) {
> +		/* Only commutative blending. */
> +		if (blendmask & ~blend->commutative_4bit)
> +			return false;
> +
> +		if (!dsa_order_invariant.pass_set)
> +			return false;
> +	}
> +
> +	if (colormask & ~blendmask) {
> +		if (!dsa_order_invariant.pass_last)
> +			return false;
> +	}
> 
> -	return dsa_order_invariant.pass_last;
> +	return true;
>  }
> 
>  static void si_emit_msaa_config(struct si_context *sctx, struct
> r600_atom *atom)
>  {
>  	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
>  	unsigned num_tile_pipes = sctx->screen->b.info.num_tile_pipes;
>  	/* 33% faster rendering to linear color buffers */
>  	bool dst_is_linear = sctx->framebuffer.any_dst_linear;
>  	bool out_of_order_rast = si_out_of_order_rasterization(sctx);
>  	unsigned sc_mode_cntl_1 =
> diff --git a/src/gallium/drivers/radeonsi/si_state.h
> b/src/gallium/drivers/radeonsi/si_state.h
> index 56e597a5813..5aa50c58932 100644
> --- a/src/gallium/drivers/radeonsi/si_state.h
> +++ b/src/gallium/drivers/radeonsi/si_state.h
> @@ -48,20 +48,21 @@ struct si_shader_selector;
> 
>  struct si_state_blend {
>  	struct si_pm4_state	pm4;
>  	uint32_t		cb_target_mask;
>  	/* Set 0xf or 0x0 (4 bits) per render target if the following is
>  	 * true. ANDed with spi_shader_col_format.
>  	 */
>  	unsigned		cb_target_enabled_4bit;
>  	unsigned		blend_enable_4bit;
>  	unsigned		need_src_alpha_4bit;
> +	unsigned		commutative_4bit;
>  	bool			alpha_to_coverage:1;
>  	bool			alpha_to_one:1;
>  	bool			dual_src_blend:1;
>  	bool			logicop_enable:1;
>  };
> 
>  struct si_state_rasterizer {
>  	struct si_pm4_state	pm4;
>  	/* poly offset states for 16-bit, 24-bit, and 32-bit zbuffers */
>  	struct si_pm4_state	*pm4_poly_offset;
> diff --git a/src/util/xmlpool/t_options.h 
> b/src/util/xmlpool/t_options.h
> index c92215183a5..214c7c359ee 100644
> --- a/src/util/xmlpool/t_options.h
> +++ b/src/util/xmlpool/t_options.h
> @@ -436,10 +436,15 @@ DRI_CONF_OPT_END
> 
>  #define DRI_CONF_RADEONSI_ENABLE_SISCHED(def) \
>  DRI_CONF_OPT_BEGIN_B(radeonsi_enable_sisched, def) \
>          DRI_CONF_DESC(en,gettext("Use the LLVM sisched option for
> shader compiles")) \
>  DRI_CONF_OPT_END
> 
>  #define DRI_CONF_RADEONSI_ASSUME_NO_Z_FIGHTS(def) \
>  DRI_CONF_OPT_BEGIN_B(radeonsi_assume_no_z_fights, def) \
>          DRI_CONF_DESC(en,gettext("Assume no Z fights (enables
> aggressive out-of-order rasterization to improve performance; may
> cause rendering errors)")) \
>  DRI_CONF_OPT_END
> +
> +#define DRI_CONF_RADEONSI_COMMUTATIVE_BLEND_ADD(def) \
> +DRI_CONF_OPT_BEGIN_B(radeonsi_commutative_blend_add, def) \
> +        DRI_CONF_DESC(en,gettext("Commutative additive blending
> optimizations (may cause rendering errors)")) \
> +DRI_CONF_OPT_END