[Mesa-dev] [PATCH] nir: Allow opt_peephole_sel to be more aggressive in flattening IFs.
Ian Romanick
idr at freedesktop.org
Wed Sep 21 07:21:59 UTC 2016
Reviewed-by: Ian Romanick <ian.d.romanick at intel.com>
On 09/10/2016 12:43 AM, Eric Anholt wrote:
> VC4 was running into a major performance regression from enabling control
> flow in the glmark2 conditionals test, because of short if statements
> containing an ffract.
>
> This pass seems like it was was trying to ensure that we only flattened
> IFs that should be entirely a win by guaranteeing that there would be
> fewer bcsels than there were MOVs otherwise. However, if the number of
> ALU ops is small, we can avoid the overhead of branching (which itself
> costs cycles) and still get a win, even if it means moving real
> instructions out of the THEN/ELSE blocks.
>
> For now, just turn on aggressive flattening on vc4. i965 will need some
> tuning to avoid regressions. It does looks like this may be useful to
> replace freedreno code.
>
> Improves glmark2 -b conditionals:fragment-steps=5:vertex-steps=0 from 47
> fps to 95 fps on vc4.
>
> vc4 shader-db:
> total instructions in shared programs: 83220 -> 78887 (-5.21%)
> instructions in affected programs: 21095 -> 16762 (-20.54%)
> total uniforms in shared programs: 25977 -> 25606 (-1.43%)
> uniforms in affected programs: 4196 -> 3825 (-8.84%)
> total estimated cycles in shared programs: 195563 -> 192153 (-1.74%)
> estimated cycles in affected programs: 30171 -> 26761 (-11.30%)
>
> (This doesn't even count the test being optimized, due to a limitation in
> shader-db-2)
> ---
> src/compiler/nir/nir.h | 2 +-
> src/compiler/nir/nir_opt_peephole_select.c | 82 ++++++++++++++++++++----------
> src/gallium/drivers/vc4/vc4_program.c | 2 +-
> src/mesa/drivers/dri/i965/brw_nir.c | 2 +-
> 4 files changed, 57 insertions(+), 31 deletions(-)
>
> diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
> index c1cf94001f65..f56f67690d34 100644
> --- a/src/compiler/nir/nir.h
> +++ b/src/compiler/nir/nir.h
> @@ -2589,7 +2589,7 @@ bool nir_opt_dead_cf(nir_shader *shader);
>
> void nir_opt_gcm(nir_shader *shader);
>
> -bool nir_opt_peephole_select(nir_shader *shader);
> +bool nir_opt_peephole_select(nir_shader *shader, unsigned limit);
>
> bool nir_opt_remove_phis(nir_shader *shader);
>
> diff --git a/src/compiler/nir/nir_opt_peephole_select.c b/src/compiler/nir/nir_opt_peephole_select.c
> index 633e9f486c08..6a73d737077c 100644
> --- a/src/compiler/nir/nir_opt_peephole_select.c
> +++ b/src/compiler/nir/nir_opt_peephole_select.c
> @@ -32,23 +32,33 @@
> * Implements a small peephole optimization that looks for
> *
> * if (cond) {
> - * <empty>
> + * <then SSA defs>
> * } else {
> - * <empty>
> + * <else SSA defs>
> * }
> * phi
> * ...
> * phi
> *
> - * and replaces it with a series of selects. It can also handle the case
> - * where, instead of being empty, the if may contain some move operations
> - * whose only use is one of the following phi nodes. This happens all the
> - * time when the SSA form comes from a conditional assignment with a
> - * swizzle.
> + * and replaces it with:
> + *
> + * <then SSA defs>
> + * <else SSA defs>
> + * bcsel
> + * ...
> + * bcsel
> + *
> + * where the SSA defs are ALU operations or other cheap instructions (not
> + * texturing, for example).
> + *
> + * If the number of ALU operations in the branches is greater than the limit
> + * parameter, then the optimization is skipped. In limit=0 mode, the SSA defs
> + * must only be MOVs which we expect to get copy-propagated away once they're
> + * out of the inner blocks.
> */
>
> static bool
> -block_check_for_allowed_instrs(nir_block *block)
> +block_check_for_allowed_instrs(nir_block *block, unsigned *count, bool alu_ok)
> {
> nir_foreach_instr(instr, block) {
> switch (instr->type) {
> @@ -67,6 +77,11 @@ block_check_for_allowed_instrs(nir_block *block)
> }
> break;
>
> + case nir_intrinsic_load_uniform:
> + if (!alu_ok)
> + return false;
> + break;
> +
> default:
> return false;
> }
> @@ -89,29 +104,36 @@ block_check_for_allowed_instrs(nir_block *block)
> case nir_op_vec2:
> case nir_op_vec3:
> case nir_op_vec4:
> - /* It must be a move-like operation. */
> break;
> default:
> - return false;
> + if (!alu_ok) {
> + /* It must be a move-like operation. */
> + return false;
> + }
> + break;
> }
>
> - /* Can't handle saturate */
> - if (mov->dest.saturate)
> - return false;
> -
> /* It must be SSA */
> if (!mov->dest.dest.is_ssa)
> return false;
>
> - /* It cannot have any if-uses */
> - if (!list_empty(&mov->dest.dest.ssa.if_uses))
> - return false;
> + if (alu_ok) {
> + (*count)++;
> + } else {
> + /* Can't handle saturate */
> + if (mov->dest.saturate)
> + return false;
>
> - /* The only uses of this definition must be phi's in the successor */
> - nir_foreach_use(use, &mov->dest.dest.ssa) {
> - if (use->parent_instr->type != nir_instr_type_phi ||
> - use->parent_instr->block != block->successors[0])
> + /* It cannot have any if-uses */
> + if (!list_empty(&mov->dest.dest.ssa.if_uses))
> return false;
> +
> + /* The only uses of this definition must be phi's in the successor */
> + nir_foreach_use(use, &mov->dest.dest.ssa) {
> + if (use->parent_instr->type != nir_instr_type_phi ||
> + use->parent_instr->block != block->successors[0])
> + return false;
> + }
> }
> break;
> }
> @@ -125,7 +147,7 @@ block_check_for_allowed_instrs(nir_block *block)
> }
>
> static bool
> -nir_opt_peephole_select_block(nir_block *block, void *mem_ctx)
> +nir_opt_peephole_select_block(nir_block *block, void *mem_ctx, unsigned limit)
> {
> if (nir_cf_node_is_first(&block->cf_node))
> return false;
> @@ -147,8 +169,12 @@ nir_opt_peephole_select_block(nir_block *block, void *mem_ctx)
> nir_block *else_block = nir_cf_node_as_block(else_node);
>
> /* ... and those blocks must only contain "allowed" instructions. */
> - if (!block_check_for_allowed_instrs(then_block) ||
> - !block_check_for_allowed_instrs(else_block))
> + unsigned count = 0;
> + if (!block_check_for_allowed_instrs(then_block, &count, limit != 0) ||
> + !block_check_for_allowed_instrs(else_block, &count, limit != 0))
> + return false;
> +
> + if (count > limit)
> return false;
>
> /* At this point, we know that the previous CFG node is an if-then
> @@ -212,13 +238,13 @@ nir_opt_peephole_select_block(nir_block *block, void *mem_ctx)
> }
>
> static bool
> -nir_opt_peephole_select_impl(nir_function_impl *impl)
> +nir_opt_peephole_select_impl(nir_function_impl *impl, unsigned limit)
> {
> void *mem_ctx = ralloc_parent(impl);
> bool progress = false;
>
> nir_foreach_block_safe(block, impl) {
> - progress |= nir_opt_peephole_select_block(block, mem_ctx);
> + progress |= nir_opt_peephole_select_block(block, mem_ctx, limit);
> }
>
> if (progress)
> @@ -228,13 +254,13 @@ nir_opt_peephole_select_impl(nir_function_impl *impl)
> }
>
> bool
> -nir_opt_peephole_select(nir_shader *shader)
> +nir_opt_peephole_select(nir_shader *shader, unsigned limit)
> {
> bool progress = false;
>
> nir_foreach_function(function, shader) {
> if (function->impl)
> - progress |= nir_opt_peephole_select_impl(function->impl);
> + progress |= nir_opt_peephole_select_impl(function->impl, limit);
> }
>
> return progress;
> diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
> index b1189bf4c6bc..cc1b7ca8c34b 100644
> --- a/src/gallium/drivers/vc4/vc4_program.c
> +++ b/src/gallium/drivers/vc4/vc4_program.c
> @@ -1431,7 +1431,7 @@ vc4_optimize_nir(struct nir_shader *s)
> NIR_PASS(progress, s, nir_opt_dce);
> NIR_PASS(progress, s, nir_opt_dead_cf);
> NIR_PASS(progress, s, nir_opt_cse);
> - NIR_PASS(progress, s, nir_opt_peephole_select);
> + NIR_PASS(progress, s, nir_opt_peephole_select, 8);
> NIR_PASS(progress, s, nir_opt_algebraic);
> NIR_PASS(progress, s, nir_opt_constant_folding);
> NIR_PASS(progress, s, nir_opt_undef);
> diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
> index e8dafaebc914..886a0b655b91 100644
> --- a/src/mesa/drivers/dri/i965/brw_nir.c
> +++ b/src/mesa/drivers/dri/i965/brw_nir.c
> @@ -387,7 +387,7 @@ nir_optimize(nir_shader *nir, bool is_scalar)
> OPT(nir_copy_prop);
> OPT(nir_opt_dce);
> OPT(nir_opt_cse);
> - OPT(nir_opt_peephole_select);
> + OPT(nir_opt_peephole_select, 0);
> OPT(nir_opt_algebraic);
> OPT(nir_opt_constant_folding);
> OPT(nir_opt_dead_cf);
>
More information about the mesa-dev
mailing list