Mesa (master): nir/loop_unroll: unroll more aggressively if it can improve load scheduling

Wed Jan 13 19:16:25 UTC 2021

Module: Mesa
Branch: master
Commit: dfe429eb414511170f3dfc960d247c4aa295f924
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=dfe429eb414511170f3dfc960d247c4aa295f924

Author: Rhys Perry <pendingchaos02 at gmail.com>
Date:   Tue Sep  1 11:55:58 2020 +0100

nir/loop_unroll: unroll more aggressively if it can improve load scheduling

Significantly improves performance of a Control compute shader. Also seems
to increase FPS at the very start of the game by ~5% (RX 580, 1080p,
medium settings, no MSAA).

fossil-db (Sienna):
Totals from 81 (0.06% of 139391) affected shaders:
SGPRs: 3848 -> 4362 (+13.36%); split: -0.99%, +14.35%
VGPRs: 4132 -> 4648 (+12.49%)
CodeSize: 275532 -> 659188 (+139.24%)
MaxWaves: 986 -> 906 (-8.11%)
Instrs: 54422 -> 126865 (+133.11%)
Cycles: 1057240 -> 750464 (-29.02%); split: -42.61%, +13.60%
VMEM: 26507 -> 61829 (+133.26%); split: +135.56%, -2.30%
SMEM: 4748 -> 5895 (+24.16%); split: +31.47%, -7.31%
VClause: 1933 -> 6802 (+251.89%); split: -0.72%, +252.61%
SClause: 1179 -> 1810 (+53.52%); split: -3.14%, +56.66%
Branches: 1174 -> 1157 (-1.45%); split: -23.94%, +22.49%
PreVGPRs: 3219 -> 3387 (+5.22%); split: -0.96%, +6.18%

Signed-off-by: Rhys Perry <pendingchaos02 at gmail.com>
Reviewed-by: Daniel Schürmann <daniel at schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6538>

---

 src/amd/vulkan/radv_shader.c           |  1 +
 src/compiler/nir/nir.h                 |  1 +
 src/compiler/nir/nir_opt_loop_unroll.c | 88 +++++++++++++++++++++++++++++++---
 3 files changed, 83 insertions(+), 7 deletions(-)

diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index 846913875dd..e25ac04c29e 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -83,6 +83,7 @@ static const struct nir_shader_compiler_options nir_options = {
 	.has_isub = true,
 	.use_scoped_barrier = true,
 	.max_unroll_iterations = 32,
+	.max_unroll_iterations_aggressive = 128,
 	.use_interpolated_input_intrinsics = true,
 	.vectorize_vec2_16bit = true,
 	/* nir_lower_int64() isn't actually called for the LLVM backend, but
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 5d43d6aaf72..a60b1c8928b 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -3390,6 +3390,7 @@ typedef struct nir_shader_compiler_options {
    bool support_16bit_alu;
 
    unsigned max_unroll_iterations;
+   unsigned max_unroll_iterations_aggressive;
 
    /* For the non-zero value of the enum corresponds multiplier when
     * calling lower_uniforms_to_ubo */
diff --git a/src/compiler/nir/nir_opt_loop_unroll.c b/src/compiler/nir/nir_opt_loop_unroll.c
index 7dc27a10387..86095e45e03 100644
--- a/src/compiler/nir/nir_opt_loop_unroll.c
+++ b/src/compiler/nir/nir_opt_loop_unroll.c
@@ -750,6 +750,77 @@ partial_unroll(nir_shader *shader, nir_loop *loop, unsigned trip_count)
    _mesa_hash_table_destroy(remap_table, NULL);
 }
 
+/* Is instr a UBO/SSBO/global or texture access with non-constant sources? */
+static bool
+is_indirect_load(nir_instr *instr)
+{
+   if (instr->type == nir_instr_type_intrinsic) {
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+      /* UBO/SSBO offsets are src[1]; load_global only has src[0], its address. */
+      if (((intrin->intrinsic == nir_intrinsic_load_ubo ||
+            intrin->intrinsic == nir_intrinsic_load_ssbo) &&
+           !nir_src_is_const(intrin->src[1])) ||
+          (intrin->intrinsic == nir_intrinsic_load_global &&
+           !nir_src_is_const(intrin->src[0]))) {
+         return true;
+      }
+      /* Deref access: search the deref chain for a non-constant array index. */
+      if (intrin->intrinsic == nir_intrinsic_load_deref ||
+          intrin->intrinsic == nir_intrinsic_store_deref) {
+         nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
+         nir_variable_mode mem_modes = nir_var_mem_ssbo | nir_var_mem_ubo | nir_var_mem_global;
+         if (!nir_deref_mode_may_be(deref, mem_modes))
+            return false;
+         while (deref) {
+            if ((deref->deref_type == nir_deref_type_array ||
+                 deref->deref_type == nir_deref_type_ptr_as_array) &&
+                !nir_src_is_const(deref->arr.index)) {
+               return true;
+            }
+            deref = nir_deref_instr_parent(deref);
+         }
+      }
+   } else if (instr->type == nir_instr_type_tex) {
+      nir_tex_instr *tex = nir_instr_as_tex(instr);
+      for (unsigned i = 0; i < tex->num_srcs; i++) {
+         if (!nir_src_is_const(tex->src[i].src))
+            return true;
+      }
+   }
+   return false;
+}
+
+/* True if the loop's exact trip count is known, its body holds only blocks
+ * (besides the limiting terminator) and one of them has an indirect load. */
+static bool
+can_pipeline_loads(nir_loop *loop)
+{
+   nir_loop_info *info = loop->info;
+   bool found_load = false;
+   if (!info->exact_trip_count_known)
+      return false;
+
+   foreach_list_typed(nir_cf_node, child, node, &loop->body) {
+      /* The if of the limiting terminator is not held against the loop. */
+      if (child == &info->limiting_terminator->nif->cf_node)
+         continue;
+      /* Control flow usually prevents useful scheduling */
+      if (child->type != nir_cf_node_block)
+         return false;
+      /* Keep walking after a hit: later nodes may still be control flow. */
+      if (found_load)
+         continue;
+      nir_block *blk = nir_cf_node_as_block(child);
+      nir_foreach_instr(instr, blk) {
+         found_load = is_indirect_load(instr);
+         if (found_load)
+            break;
+      }
+   }
+
+   return found_load;
+}
+
 /*
  * Returns true if we should unroll the loop, otherwise false.
  */
@@ -764,19 +835,22 @@ check_unrolling_restrictions(nir_shader *shader, nir_loop *loop)
 
    nir_loop_info *li = loop->info;
    unsigned max_iter = shader->options->max_unroll_iterations;
+   /* Unroll much more aggressively if it can hide load latency. */
+   if (shader->options->max_unroll_iterations_aggressive && can_pipeline_loads(loop))
+      max_iter = shader->options->max_unroll_iterations_aggressive;
    unsigned trip_count =
       li->max_trip_count ? li->max_trip_count : li->guessed_trip_count;
 
-   if (trip_count > max_iter)
-      return false;
-
-   if (li->force_unroll && !li->guessed_trip_count)
+   if (li->force_unroll && !li->guessed_trip_count && trip_count <= max_iter)
       return true;
 
-   bool loop_not_too_large =
-      li->instr_cost * trip_count <= max_iter * LOOP_UNROLL_LIMIT;
+   unsigned cost_limit = max_iter * LOOP_UNROLL_LIMIT;
+   unsigned cost = li->instr_cost * trip_count;
+
+   if (cost <= cost_limit && trip_count <= max_iter)
+      return true;
 
-   return loop_not_too_large;
+   return false;
 }
 
 static bool