Mesa (master): nir/opt_peephole_select: Don't try to remove flow control around indirect loads

Mon Dec 17 21:50:09 UTC 2018

Module: Mesa
Branch: master
Commit: 09b7e1d8e4e07e7c51debb20e85e213ab209985f
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=09b7e1d8e4e07e7c51debb20e85e213ab209985f

Author: Ian Romanick <ian.d.romanick at intel.com>
Date:   Wed Jun 27 11:41:19 2018 -0700

nir/opt_peephole_select: Don't try to remove flow control around indirect loads

That flow control may be trying to avoid invalid loads.  On at least
some platforms, those loads can also be expensive.

No shader-db changes on any Intel platform (even with the later patch
"intel/compiler: More peephole select").

v2: Add an 'indirect_load_ok' flag to nir_opt_peephole_select.  Suggested
by Rob.  See also the big comment in src/intel/compiler/brw_nir.c.

v3: Use nir_deref_instr_has_indirect instead of deref_has_indirect (from
nir_lower_io_arrays_to_elements.c).

v4: Fix inverted condition in brw_nir.c.  Noticed by Lionel.

Signed-off-by: Ian Romanick <ian.d.romanick at intel.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin at intel.com>

---

 src/amd/vulkan/radv_shader.c                 |  2 +-
 src/broadcom/compiler/nir_to_vir.c           |  2 +-
 src/compiler/nir/nir.h                       |  3 ++-
 src/compiler/nir/nir_opt_peephole_select.c   | 37 ++++++++++++++++++++--------
 src/freedreno/ir3/ir3_nir.c                  |  2 +-
 src/gallium/drivers/radeonsi/si_shader_nir.c |  2 +-
 src/gallium/drivers/vc4/vc4_program.c        |  2 +-
 src/intel/compiler/brw_nir.c                 | 13 +++++++++-
 src/mesa/state_tracker/st_glsl_to_nir.cpp    |  2 +-
 9 files changed, 47 insertions(+), 18 deletions(-)

diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index 1ce6baebff..f778e85b8d 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -159,7 +159,7 @@ radv_optimize_nir(struct nir_shader *shader, bool optimize_conservatively,
                 NIR_PASS(progress, shader, nir_opt_if);
                 NIR_PASS(progress, shader, nir_opt_dead_cf);
                 NIR_PASS(progress, shader, nir_opt_cse);
-                NIR_PASS(progress, shader, nir_opt_peephole_select, 8);
+                NIR_PASS(progress, shader, nir_opt_peephole_select, 8, true);
                 NIR_PASS(progress, shader, nir_opt_algebraic);
                 NIR_PASS(progress, shader, nir_opt_constant_folding);
                 NIR_PASS(progress, shader, nir_opt_undef);
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index 167f00750b..9b1066467c 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -1241,7 +1241,7 @@ v3d_optimize_nir(struct nir_shader *s)
                 NIR_PASS(progress, s, nir_opt_dce);
                 NIR_PASS(progress, s, nir_opt_dead_cf);
                 NIR_PASS(progress, s, nir_opt_cse);
-                NIR_PASS(progress, s, nir_opt_peephole_select, 8);
+                NIR_PASS(progress, s, nir_opt_peephole_select, 8, true);
                 NIR_PASS(progress, s, nir_opt_algebraic);
                 NIR_PASS(progress, s, nir_opt_constant_folding);
                 NIR_PASS(progress, s, nir_opt_undef);
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index d99cc6b2d3..2bbfb3c6b1 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -3197,7 +3197,8 @@ bool nir_opt_move_comparisons(nir_shader *shader);
 
 bool nir_opt_move_load_ubo(nir_shader *shader);
 
-bool nir_opt_peephole_select(nir_shader *shader, unsigned limit);
+bool nir_opt_peephole_select(nir_shader *shader, unsigned limit,
+                             bool indirect_load_ok);
 
 bool nir_opt_remove_phis_impl(nir_function_impl *impl);
 bool nir_opt_remove_phis(nir_shader *shader);
diff --git a/src/compiler/nir/nir_opt_peephole_select.c b/src/compiler/nir/nir_opt_peephole_select.c
index ad9d0abec0..6808d3eda6 100644
--- a/src/compiler/nir/nir_opt_peephole_select.c
+++ b/src/compiler/nir/nir_opt_peephole_select.c
@@ -58,7 +58,8 @@
  */
 
 static bool
-block_check_for_allowed_instrs(nir_block *block, unsigned *count, bool alu_ok)
+block_check_for_allowed_instrs(nir_block *block, unsigned *count,
+                               bool alu_ok, bool indirect_load_ok)
 {
    nir_foreach_instr(instr, block) {
       switch (instr->type) {
@@ -66,16 +67,26 @@ block_check_for_allowed_instrs(nir_block *block, unsigned *count, bool alu_ok)
          nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
 
          switch (intrin->intrinsic) {
-         case nir_intrinsic_load_deref:
-            switch (nir_src_as_deref(intrin->src[0])->mode) {
+         case nir_intrinsic_load_deref: {
+            nir_deref_instr *const deref = nir_src_as_deref(intrin->src[0]);
+
+            switch (deref->mode) {
             case nir_var_shader_in:
             case nir_var_uniform:
+               /* Don't try to remove flow control around an indirect load
+                * because that flow control may be trying to avoid invalid
+                * loads.
+                */
+               if (!indirect_load_ok && nir_deref_instr_has_indirect(deref))
+                  return false;
+
                break;
 
             default:
                return false;
             }
             break;
+         }
 
          case nir_intrinsic_load_uniform:
             if (!alu_ok)
@@ -149,7 +160,7 @@ block_check_for_allowed_instrs(nir_block *block, unsigned *count, bool alu_ok)
 
 static bool
 nir_opt_peephole_select_block(nir_block *block, nir_shader *shader,
-                              unsigned limit)
+                              unsigned limit, bool indirect_load_ok)
 {
    if (nir_cf_node_is_first(&block->cf_node))
       return false;
@@ -169,8 +180,10 @@ nir_opt_peephole_select_block(nir_block *block, nir_shader *shader,
 
    /* ... and those blocks must only contain "allowed" instructions. */
    unsigned count = 0;
-   if (!block_check_for_allowed_instrs(then_block, &count, limit != 0) ||
-       !block_check_for_allowed_instrs(else_block, &count, limit != 0))
+   if (!block_check_for_allowed_instrs(then_block, &count, limit != 0,
+                                       indirect_load_ok) ||
+       !block_check_for_allowed_instrs(else_block, &count, limit != 0,
+                                       indirect_load_ok))
       return false;
 
    if (count > limit)
@@ -236,13 +249,15 @@ nir_opt_peephole_select_block(nir_block *block, nir_shader *shader,
 }
 
 static bool
-nir_opt_peephole_select_impl(nir_function_impl *impl, unsigned limit)
+nir_opt_peephole_select_impl(nir_function_impl *impl, unsigned limit,
+                             bool indirect_load_ok)
 {
    nir_shader *shader = impl->function->shader;
    bool progress = false;
 
    nir_foreach_block_safe(block, impl) {
-      progress |= nir_opt_peephole_select_block(block, shader, limit);
+      progress |= nir_opt_peephole_select_block(block, shader, limit,
+                                                indirect_load_ok);
    }
 
    if (progress)
@@ -252,13 +267,15 @@ nir_opt_peephole_select_impl(nir_function_impl *impl, unsigned limit)
 }
 
 bool
-nir_opt_peephole_select(nir_shader *shader, unsigned limit)
+nir_opt_peephole_select(nir_shader *shader, unsigned limit,
+                        bool indirect_load_ok)
 {
    bool progress = false;
 
    nir_foreach_function(function, shader) {
       if (function->impl)
-         progress |= nir_opt_peephole_select_impl(function->impl, limit);
+         progress |= nir_opt_peephole_select_impl(function->impl, limit,
+                                                  indirect_load_ok);
    }
 
    return progress;
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
index 70c01ee059..112c092852 100644
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -97,7 +97,7 @@ ir3_optimize_loop(nir_shader *s)
 			progress |= OPT(s, nir_opt_gcm, true);
 		else if (gcm == 2)
 			progress |= OPT(s, nir_opt_gcm, false);
-		progress |= OPT(s, nir_opt_peephole_select, 16);
+		progress |= OPT(s, nir_opt_peephole_select, 16, true);
 		progress |= OPT(s, nir_opt_intrinsics);
 		progress |= OPT(s, nir_opt_algebraic);
 		progress |= OPT(s, nir_opt_constant_folding);
diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c
index 0155986627..89acaab248 100644
--- a/src/gallium/drivers/radeonsi/si_shader_nir.c
+++ b/src/gallium/drivers/radeonsi/si_shader_nir.c
@@ -841,7 +841,7 @@ si_lower_nir(struct si_shader_selector* sel)
 		NIR_PASS(progress, sel->nir, nir_opt_if);
 		NIR_PASS(progress, sel->nir, nir_opt_dead_cf);
 		NIR_PASS(progress, sel->nir, nir_opt_cse);
-		NIR_PASS(progress, sel->nir, nir_opt_peephole_select, 8);
+		NIR_PASS(progress, sel->nir, nir_opt_peephole_select, 8, true);
 
 		/* Needed for algebraic lowering */
 		NIR_PASS(progress, sel->nir, nir_opt_algebraic);
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index 7053c66723..48d83061f9 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -1591,7 +1591,7 @@ vc4_optimize_nir(struct nir_shader *s)
                 NIR_PASS(progress, s, nir_opt_dce);
                 NIR_PASS(progress, s, nir_opt_dead_cf);
                 NIR_PASS(progress, s, nir_opt_cse);
-                NIR_PASS(progress, s, nir_opt_peephole_select, 8);
+                NIR_PASS(progress, s, nir_opt_peephole_select, 8, true);
                 NIR_PASS(progress, s, nir_opt_algebraic);
                 NIR_PASS(progress, s, nir_opt_constant_folding);
                 NIR_PASS(progress, s, nir_opt_undef);
diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index 594edde541..e0aa927f2f 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -568,7 +568,18 @@ brw_nir_optimize(nir_shader *nir, const struct brw_compiler *compiler,
       OPT(nir_copy_prop);
       OPT(nir_opt_dce);
       OPT(nir_opt_cse);
-      OPT(nir_opt_peephole_select, 0);
+
+      /* For indirect loads of uniforms (push constants), we assume that array
+       * indices will nearly always be in bounds and the cost of the load is
+       * low.  Therefore there shouldn't be a performance benefit to avoiding it.
+       * However, in vec4 tessellation shaders, these loads operate by
+       * actually pulling from memory.
+       */
+      const bool is_vec4_tessellation = !is_scalar &&
+         (nir->info.stage == MESA_SHADER_TESS_CTRL ||
+          nir->info.stage == MESA_SHADER_TESS_EVAL);
+      OPT(nir_opt_peephole_select, 0, !is_vec4_tessellation);
+
       OPT(nir_opt_intrinsics);
       OPT(nir_opt_idiv_const, 32);
       OPT(nir_opt_algebraic);
diff --git a/src/mesa/state_tracker/st_glsl_to_nir.cpp b/src/mesa/state_tracker/st_glsl_to_nir.cpp
index 7406e26e2f..e6d5c86bfb 100644
--- a/src/mesa/state_tracker/st_glsl_to_nir.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_nir.cpp
@@ -328,7 +328,7 @@ st_nir_opts(nir_shader *nir, bool scalar)
       NIR_PASS(progress, nir, nir_opt_if);
       NIR_PASS(progress, nir, nir_opt_dead_cf);
       NIR_PASS(progress, nir, nir_opt_cse);
-      NIR_PASS(progress, nir, nir_opt_peephole_select, 8);
+      NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true);
 
       NIR_PASS(progress, nir, nir_opt_algebraic);
       NIR_PASS(progress, nir, nir_opt_constant_folding);