[Mesa-dev] [PATCH] nir: Allow opt_peephole_sel to be more aggressive in flattening IFs.

Wed Sep 21 07:21:59 UTC 2016

Reviewed-by: Ian Romanick <ian.d.romanick at intel.com>

On 09/10/2016 12:43 AM, Eric Anholt wrote:
> VC4 was running into a major performance regression from enabling control
> flow in the glmark2 conditionals test, because of short if statements
> containing an ffract.
> 
> This pass seems like it was was trying to ensure that we only flattened
> IFs that should be entirely a win by guaranteeing that there would be
> fewer bcsels than there were MOVs otherwise.  However, if the number of
> ALU ops is small, we can avoid the overhead of branching (which itself
> costs cycles) and still get a win, even if it means moving real
> instructions out of the THEN/ELSE blocks.
> 
> For now, just turn on aggressive flattening on vc4.  i965 will need some
> tuning to avoid regressions.  It does looks like this may be useful to
> replace freedreno code.
> 
> Improves glmark2 -b conditionals:fragment-steps=5:vertex-steps=0 from 47
> fps to 95 fps on vc4.
> 
> vc4 shader-db:
> total instructions in shared programs: 83220 -> 78887 (-5.21%)
> instructions in affected programs:     21095 -> 16762 (-20.54%)
> total uniforms in shared programs: 25977 -> 25606 (-1.43%)
> uniforms in affected programs:     4196 -> 3825 (-8.84%)
> total estimated cycles in shared programs: 195563 -> 192153 (-1.74%)
> estimated cycles in affected programs:     30171 -> 26761 (-11.30%)
> 
> (This doesn't even count the test being optimized, due to a limitation in
> shader-db-2)
> ---
>  src/compiler/nir/nir.h                     |  2 +-
>  src/compiler/nir/nir_opt_peephole_select.c | 82 ++++++++++++++++++++----------
>  src/gallium/drivers/vc4/vc4_program.c      |  2 +-
>  src/mesa/drivers/dri/i965/brw_nir.c        |  2 +-
>  4 files changed, 57 insertions(+), 31 deletions(-)
> 
> diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
> index c1cf94001f65..f56f67690d34 100644
> --- a/src/compiler/nir/nir.h
> +++ b/src/compiler/nir/nir.h
> @@ -2589,7 +2589,7 @@ bool nir_opt_dead_cf(nir_shader *shader);
>  
>  void nir_opt_gcm(nir_shader *shader);
>  
> -bool nir_opt_peephole_select(nir_shader *shader);
> +bool nir_opt_peephole_select(nir_shader *shader, unsigned limit);
>  
>  bool nir_opt_remove_phis(nir_shader *shader);
>  
> diff --git a/src/compiler/nir/nir_opt_peephole_select.c b/src/compiler/nir/nir_opt_peephole_select.c
> index 633e9f486c08..6a73d737077c 100644
> --- a/src/compiler/nir/nir_opt_peephole_select.c
> +++ b/src/compiler/nir/nir_opt_peephole_select.c
> @@ -32,23 +32,33 @@
>   * Implements a small peephole optimization that looks for
>   *
>   * if (cond) {
> - *    <empty>
> + *    <then SSA defs>
>   * } else {
> - *    <empty>
> + *    <else SSA defs>
>   * }
>   * phi
>   * ...
>   * phi
>   *
> - * and replaces it with a series of selects.  It can also handle the case
> - * where, instead of being empty, the if may contain some move operations
> - * whose only use is one of the following phi nodes.  This happens all the
> - * time when the SSA form comes from a conditional assignment with a
> - * swizzle.
> + * and replaces it with:
> + *
> + * <then SSA defs>
> + * <else SSA defs>
> + * bcsel
> + * ...
> + * bcsel
> + *
> + * where the SSA defs are ALU operations or other cheap instructions (not
> + * texturing, for example).
> + *
> + * If the number of ALU operations in the branches is greater than the limit
> + * parameter, then the optimization is skipped.  In limit=0 mode, the SSA defs
> + * must only be MOVs which we expect to get copy-propagated away once they're
> + * out of the inner blocks.
>   */
>  
>  static bool
> -block_check_for_allowed_instrs(nir_block *block)
> +block_check_for_allowed_instrs(nir_block *block, unsigned *count, bool alu_ok)
>  {
>     nir_foreach_instr(instr, block) {
>        switch (instr->type) {
> @@ -67,6 +77,11 @@ block_check_for_allowed_instrs(nir_block *block)
>              }
>              break;
>  
> +         case nir_intrinsic_load_uniform:
> +            if (!alu_ok)
> +               return false;
> +            break;
> +
>           default:
>              return false;
>           }
> @@ -89,29 +104,36 @@ block_check_for_allowed_instrs(nir_block *block)
>           case nir_op_vec2:
>           case nir_op_vec3:
>           case nir_op_vec4:
> -            /* It must be a move-like operation. */
>              break;
>           default:
> -            return false;
> +            if (!alu_ok) {
> +               /* It must be a move-like operation. */
> +               return false;
> +            }
> +            break;
>           }
>  
> -         /* Can't handle saturate */
> -         if (mov->dest.saturate)
> -            return false;
> -
>           /* It must be SSA */
>           if (!mov->dest.dest.is_ssa)
>              return false;
>  
> -         /* It cannot have any if-uses */
> -         if (!list_empty(&mov->dest.dest.ssa.if_uses))
> -            return false;
> +         if (alu_ok) {
> +            (*count)++;
> +         } else {
> +            /* Can't handle saturate */
> +            if (mov->dest.saturate)
> +               return false;
>  
> -         /* The only uses of this definition must be phi's in the successor */
> -         nir_foreach_use(use, &mov->dest.dest.ssa) {
> -            if (use->parent_instr->type != nir_instr_type_phi ||
> -                use->parent_instr->block != block->successors[0])
> +            /* It cannot have any if-uses */
> +            if (!list_empty(&mov->dest.dest.ssa.if_uses))
>                 return false;
> +
> +            /* The only uses of this definition must be phi's in the successor */
> +            nir_foreach_use(use, &mov->dest.dest.ssa) {
> +               if (use->parent_instr->type != nir_instr_type_phi ||
> +                   use->parent_instr->block != block->successors[0])
> +                  return false;
> +            }
>           }
>           break;
>        }
> @@ -125,7 +147,7 @@ block_check_for_allowed_instrs(nir_block *block)
>  }
>  
>  static bool
> -nir_opt_peephole_select_block(nir_block *block, void *mem_ctx)
> +nir_opt_peephole_select_block(nir_block *block, void *mem_ctx, unsigned limit)
>  {
>     if (nir_cf_node_is_first(&block->cf_node))
>        return false;
> @@ -147,8 +169,12 @@ nir_opt_peephole_select_block(nir_block *block, void *mem_ctx)
>     nir_block *else_block = nir_cf_node_as_block(else_node);
>  
>     /* ... and those blocks must only contain "allowed" instructions. */
> -   if (!block_check_for_allowed_instrs(then_block) ||
> -       !block_check_for_allowed_instrs(else_block))
> +   unsigned count = 0;
> +   if (!block_check_for_allowed_instrs(then_block, &count, limit != 0) ||
> +       !block_check_for_allowed_instrs(else_block, &count, limit != 0))
> +      return false;
> +
> +   if (count > limit)
>        return false;
>  
>     /* At this point, we know that the previous CFG node is an if-then
> @@ -212,13 +238,13 @@ nir_opt_peephole_select_block(nir_block *block, void *mem_ctx)
>  }
>  
>  static bool
> -nir_opt_peephole_select_impl(nir_function_impl *impl)
> +nir_opt_peephole_select_impl(nir_function_impl *impl, unsigned limit)
>  {
>     void *mem_ctx = ralloc_parent(impl);
>     bool progress = false;
>  
>     nir_foreach_block_safe(block, impl) {
> -      progress |= nir_opt_peephole_select_block(block, mem_ctx);
> +      progress |= nir_opt_peephole_select_block(block, mem_ctx, limit);
>     }
>  
>     if (progress)
> @@ -228,13 +254,13 @@ nir_opt_peephole_select_impl(nir_function_impl *impl)
>  }
>  
>  bool
> -nir_opt_peephole_select(nir_shader *shader)
> +nir_opt_peephole_select(nir_shader *shader, unsigned limit)
>  {
>     bool progress = false;
>  
>     nir_foreach_function(function, shader) {
>        if (function->impl)
> -         progress |= nir_opt_peephole_select_impl(function->impl);
> +         progress |= nir_opt_peephole_select_impl(function->impl, limit);
>     }
>  
>     return progress;
> diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
> index b1189bf4c6bc..cc1b7ca8c34b 100644
> --- a/src/gallium/drivers/vc4/vc4_program.c
> +++ b/src/gallium/drivers/vc4/vc4_program.c
> @@ -1431,7 +1431,7 @@ vc4_optimize_nir(struct nir_shader *s)
>                  NIR_PASS(progress, s, nir_opt_dce);
>                  NIR_PASS(progress, s, nir_opt_dead_cf);
>                  NIR_PASS(progress, s, nir_opt_cse);
> -                NIR_PASS(progress, s, nir_opt_peephole_select);
> +                NIR_PASS(progress, s, nir_opt_peephole_select, 8);
>                  NIR_PASS(progress, s, nir_opt_algebraic);
>                  NIR_PASS(progress, s, nir_opt_constant_folding);
>                  NIR_PASS(progress, s, nir_opt_undef);
> diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
> index e8dafaebc914..886a0b655b91 100644
> --- a/src/mesa/drivers/dri/i965/brw_nir.c
> +++ b/src/mesa/drivers/dri/i965/brw_nir.c
> @@ -387,7 +387,7 @@ nir_optimize(nir_shader *nir, bool is_scalar)
>        OPT(nir_copy_prop);
>        OPT(nir_opt_dce);
>        OPT(nir_opt_cse);
> -      OPT(nir_opt_peephole_select);
> +      OPT(nir_opt_peephole_select, 0);
>        OPT(nir_opt_algebraic);
>        OPT(nir_opt_constant_folding);
>        OPT(nir_opt_dead_cf);
>