Mesa (main): broadcom/compiler: define max number of tmu spills for compile strategies

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Fri Feb 18 09:01:49 UTC 2022


Module: Mesa
Branch: main
Commit: 40e091267dd02d729cc6d12d190309f103217111
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=40e091267dd02d729cc6d12d190309f103217111

Author: Iago Toral Quiroga <itoral at igalia.com>
Date:   Fri Feb  4 13:40:50 2022 +0100

broadcom/compiler: define max number of tmu spills for compile strategies

Define the maximum number of TMU spills for each strategy instead of only
whether spilling is allowed or not, which is more flexible. Also, while we do
not currently enable spilling on any 4-thread strategies, if we do that in the
future we should always prefer a 4-thread compile.

Reviewed-by: Alejandro Piñeiro <apinheiro at igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15041>

---

 src/broadcom/compiler/nir_to_vir.c            |  2 +-
 src/broadcom/compiler/v3d_compiler.h          |  2 +-
 src/broadcom/compiler/vir.c                   | 41 +++++++++++++++------------
 src/broadcom/compiler/vir_register_allocate.c | 11 ++-----
 4 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index abfbecc502f..b645ef5790a 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -4473,7 +4473,7 @@ v3d_nir_to_vir(struct v3d_compile *c)
         while (true) {
                 bool spilled;
                 temp_registers = v3d_register_allocate(c, &spilled);
-                if (spilled)
+                if (spilled && c->spills + c->fills <= c->max_tmu_spills)
                         continue;
 
                 if (temp_registers)
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index e42ea184c45..844a9603606 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -710,7 +710,7 @@ struct v3d_compile {
          * strategies that can reduce register pressure and hopefully reduce or
          * eliminate TMU spills in the shader.
          */
-        bool tmu_spilling_allowed;
+        uint32_t max_tmu_spills;
 
         /* The UBO index and block used with the last unifa load, as well as the
          * current unifa offset *after* emitting that load. This is used to skip
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
index a08206fdf30..4753b2e274e 100644
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -541,7 +541,7 @@ vir_compile_init(const struct v3d_compiler *compiler,
                  int program_id, int variant_id,
                  uint32_t max_threads,
                  uint32_t min_threads_for_reg_alloc,
-                 bool tmu_spilling_allowed,
+                 uint32_t max_tmu_spills,
                  bool disable_loop_unrolling,
                  bool disable_constant_ubo_load_sorting,
                  bool disable_tmu_pipelining,
@@ -559,7 +559,7 @@ vir_compile_init(const struct v3d_compiler *compiler,
         c->debug_output_data = debug_output_data;
         c->compilation_result = V3D_COMPILATION_SUCCEEDED;
         c->min_threads_for_reg_alloc = min_threads_for_reg_alloc;
-        c->tmu_spilling_allowed = tmu_spilling_allowed;
+        c->max_tmu_spills = max_tmu_spills;
         c->fallback_scheduler = fallback_scheduler;
         c->disable_tmu_pipelining = disable_tmu_pipelining;
         c->disable_constant_ubo_load_sorting = disable_constant_ubo_load_sorting;
@@ -1624,17 +1624,17 @@ struct v3d_compiler_strategy {
         bool disable_loop_unrolling;
         bool disable_ubo_load_sorting;
         bool disable_tmu_pipelining;
-        bool tmu_spilling_allowed;
+        uint32_t max_tmu_spills;
 } static const strategies[] = {
-  /*0*/ { "default",                        4, 4, false, false, false, false },
-  /*1*/ { "disable loop unrolling",         4, 4, true,  false, false, false },
-  /*2*/ { "disable UBO load sorting",       4, 4, true,  true,  false, false },
-  /*3*/ { "disable TMU pipelining",         4, 4, true,  true,  true,  false },
-  /*4*/ { "lower thread count",             2, 1, false, false, false, true },
-  /*5*/ { "disable loop unrolling (ltc)",   2, 1, true,  false, false, true },
-  /*6*/ { "disable UBO load sorting (ltc)", 2, 1, true,  true,  false, true },
-  /*7*/ { "disable TMU pipelining (ltc)",   2, 1, true,  true,  true,  true },
-  /*8*/ { "fallback scheduler",             2, 1, true,  true,  true,  true  }
+  /*0*/ { "default",                        4, 4, false, false, false,  0 },
+  /*1*/ { "disable loop unrolling",         4, 4, true,  false, false,  0 },
+  /*2*/ { "disable UBO load sorting",       4, 4, true,  true,  false,  0 },
+  /*3*/ { "disable TMU pipelining",         4, 4, true,  true,  true,   0 },
+  /*4*/ { "lower thread count",             2, 1, false, false, false, -1 },
+  /*5*/ { "disable loop unrolling (ltc)",   2, 1, true,  false, false, -1 },
+  /*6*/ { "disable UBO load sorting (ltc)", 2, 1, true,  true,  false, -1 },
+  /*7*/ { "disable TMU pipelining (ltc)",   2, 1, true,  true,  true,  -1 },
+  /*8*/ { "fallback scheduler",             2, 1, true,  true,  true,  -1 }
 };
 
 /**
@@ -1655,8 +1655,8 @@ skip_compile_strategy(struct v3d_compile *c, uint32_t idx)
    assert(idx > 0);
 
    /* Don't skip a strategy that changes spilling behavior */
-   if (strategies[idx].tmu_spilling_allowed !=
-       strategies[idx - 1].tmu_spilling_allowed) {
+   if (strategies[idx].max_tmu_spills !=
+       strategies[idx - 1].max_tmu_spills) {
            return false;
    }
 
@@ -1726,7 +1726,7 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
                                      program_id, variant_id,
                                      strategies[strat].max_threads,
                                      strategies[strat].min_threads,
-                                     strategies[strat].tmu_spilling_allowed,
+                                     strategies[strat].max_tmu_spills,
                                      strategies[strat].disable_loop_unrolling,
                                      strategies[strat].disable_ubo_load_sorting,
                                      strategies[strat].disable_tmu_pipelining,
@@ -1738,11 +1738,16 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
                 if (c->compilation_result == V3D_COMPILATION_FAILED)
                         break;
 
-                /* If we compiled without spills, choose this. Otherwise keep
-                 * going and track strategy with less spilling.
+                /* If we compiled without spills, choose this.
+                 * Otherwise if this is a 4-thread compile, choose this (these
+                 * have a very low cap on the allowed TMU spills so we assume
+                 * it will be better than a 2-thread compile without spills).
+                 * Otherwise, keep going while tracking the strategy with the
+                 * lowest spill count.
                  */
                 if (c->compilation_result == V3D_COMPILATION_SUCCEEDED) {
-                        if (c->spills == 0) {
+                        if (c->spills == 0 ||
+                            strategies[strat].min_threads == 4) {
                                 best_c = c;
                                 break;
                         } else if (c->spills + c->fills <
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
index e26b790c946..05b71e3369a 100644
--- a/src/broadcom/compiler/vir_register_allocate.c
+++ b/src/broadcom/compiler/vir_register_allocate.c
@@ -561,15 +561,10 @@ get_spill_batch_size(struct v3d_compile *c)
    return 20;
 }
 
-/* Don't emit spills using the TMU until we've dropped thread count first. We,
- * may also disable spilling when certain optimizations that are known to
- * increase register pressure are active so we favor recompiling with
- * optimizations disabled instead of spilling.
- */
 static inline bool
-tmu_spilling_allowed(struct v3d_compile *c, int thread_index)
+tmu_spilling_allowed(struct v3d_compile *c)
 {
-        return thread_index == 0 && c->tmu_spilling_allowed;
+        return c->spills + c->fills < c->max_tmu_spills;
 }
 
 #define CLASS_BIT_PHYS			(1 << 0)
@@ -818,7 +813,7 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled)
                         if (i > 0 && !is_uniform)
                                 break;
 
-                        if (is_uniform || tmu_spilling_allowed(c, thread_index)) {
+                        if (is_uniform || tmu_spilling_allowed(c)) {
                                 v3d_spill_reg(c, map[node].temp);
 
                                 /* Ask the outer loop to call back in. */



More information about the mesa-commit mailing list