Mesa (master): broadcom/compiler: sort constant UBO loads by index and offset

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Fri Apr 9 10:46:10 UTC 2021


Module: Mesa
Branch: master
Commit: 8998666de7e827f5fe62b51186c7f81d362a2be1
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=8998666de7e827f5fe62b51186c7f81d362a2be1

Author: Iago Toral Quiroga <itoral at igalia.com>
Date:   Tue Apr  6 13:53:36 2021 +0200

broadcom/compiler: sort constant UBO loads by index and offset

This implements a NIR pass that groups constant UBO loads for the same
UBO index in order of increasing offset, whenever the distance between
them is small enough to enable the "skip unifa write" optimization.
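
For illustration only, the ordering the pass aims for is "by UBO index,
then by increasing offset". The real pass reorders NIR load_ubo
intrinsics in place, and only when their constant offsets are within
MAX_UNIFA_SKIP_DISTANCE of each other (see the diff below); the
following is a minimal stand-alone sketch of that ordering, using a
hypothetical const_ubo_load struct instead of NIR instructions:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for a constant UBO load: just its constant
 * UBO index and byte offset.
 */
struct const_ubo_load {
        uint32_t index;
        uint32_t offset;
};

/* Order by UBO index first, then by increasing offset, which is the
 * ordering that lets consecutive loads skip the unifa write.
 */
static int
cmp_const_ubo_load(const void *a, const void *b)
{
        const struct const_ubo_load *la = a, *lb = b;
        if (la->index != lb->index)
                return la->index < lb->index ? -1 : 1;
        if (la->offset != lb->offset)
                return la->offset < lb->offset ? -1 : 1;
        return 0;
}

int
main(void)
{
        struct const_ubo_load loads[] = {
                { 0, 8 }, { 1, 0 }, { 0, 0 }, { 1, 4 },
        };

        qsort(loads, sizeof(loads) / sizeof(loads[0]), sizeof(loads[0]),
              cmp_const_ubo_load);

        /* Prints the loads grouped by UBO index and sorted by offset */
        for (int i = 0; i < 4; i++) {
                printf("ubo %u, offset %u\n",
                       (unsigned)loads[i].index, (unsigned)loads[i].offset);
        }
        return 0;
}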

This may increase register pressure, since it can move UBO loads
earlier in the program, so we also add a fallback compiler strategy
that disables the optimization if we would otherwise need to drop the
thread count to compile the shader with it enabled.
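
As a rough sketch of the fallback shape (the real strategy table and
retry loop live in v3d_compile(), partially visible in the diff below;
try_compile() here is a hypothetical stand-in for a compile attempt
that fails register allocation until the sorting pass is disabled):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical compile attempt: pretend register allocation only
 * succeeds once UBO load sorting is disabled.
 */
static bool
try_compile(uint32_t min_threads_for_reg_alloc, bool disable_ubo_load_sorting)
{
        (void)min_threads_for_reg_alloc;
        return disable_ubo_load_sorting;
}

int
main(void)
{
        static const struct {
                const char *name;
                uint32_t min_threads_for_reg_alloc;
        } strategies[] = {
                { "default",                  4 },
                { "disable UBO load sorting", 1 },
                { "disable TMU pipelining",   1 },
                { "fallback scheduler",       1 },
        };

        /* Try each strategy in turn; i > 0 disables the new sorting
         * pass, mirroring the flag this commit adds to
         * vir_compile_init().
         */
        for (int i = 0; i < 4; i++) {
                if (try_compile(strategies[i].min_threads_for_reg_alloc,
                                i > 0)) {
                        printf("compiled with strategy: %s\n",
                               strategies[i].name);
                        break;
                }
        }
        return 0;
}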

total instructions in shared programs: 13557555 -> 13550300 (-0.05%)
instructions in affected programs: 814684 -> 807429 (-0.89%)
helped: 4485
HURT: 2377
Instructions are helped.

total uniforms in shared programs: 3777243 -> 3760990 (-0.43%)
uniforms in affected programs: 112554 -> 96301 (-14.44%)
helped: 7226
HURT: 36
Uniforms are helped.

total max-temps in shared programs: 2318133 -> 2333761 (0.67%)
max-temps in affected programs: 63230 -> 78858 (24.72%)
helped: 23
HURT: 3044
Max-temps are HURT.

total sfu-stalls in shared programs: 32245 -> 32567 (1.00%)
sfu-stalls in affected programs: 389 -> 711 (82.78%)
helped: 139
HURT: 451
Inconclusive result.

total inst-and-stalls in shared programs: 13589800 -> 13582867 (-0.05%)
inst-and-stalls in affected programs: 817738 -> 810805 (-0.85%)
helped: 4478
HURT: 2395
Inst-and-stalls are helped.

total nops in shared programs: 354365 -> 342202 (-3.43%)
nops in affected programs: 31000 -> 18837 (-39.24%)
helped: 4405
HURT: 265
Nops are helped.

Reviewed-by: Alejandro Piñeiro <apinheiro at igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10100>

---

 src/broadcom/compiler/v3d_compiler.h |   6 +
 src/broadcom/compiler/vir.c          | 255 ++++++++++++++++++++++++++++++++++-
 2 files changed, 258 insertions(+), 3 deletions(-)

diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index 6ea0e8b5679..116273dd07a 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -647,6 +647,12 @@ struct v3d_compile {
          */
         bool disable_tmu_pipelining;
 
+        /* Disable sorting of UBO loads with constant offset. This may
+         * increase the chances of being able to compile shaders with high
+         * register pressure.
+         */
+        bool disable_constant_ubo_load_sorting;
+
         /* Emits ldunif for each new uniform, even if the uniform was already
          * emitted in the same block. Useful to compile shaders with high
          * register pressure or to disable the optimization during uniform
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
index 941dc5b0fbc..5468364a6b0 100644
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -526,6 +526,7 @@ vir_compile_init(const struct v3d_compiler *compiler,
                  void *debug_output_data,
                  int program_id, int variant_id,
                  uint32_t min_threads_for_reg_alloc,
+                 bool disable_constant_ubo_load_sorting,
                  bool disable_tmu_pipelining,
                  bool fallback_scheduler)
 {
@@ -543,6 +544,7 @@ vir_compile_init(const struct v3d_compiler *compiler,
         c->min_threads_for_reg_alloc = min_threads_for_reg_alloc;
         c->fallback_scheduler = fallback_scheduler;
         c->disable_tmu_pipelining = disable_tmu_pipelining;
+        c->disable_constant_ubo_load_sorting = disable_constant_ubo_load_sorting;
 
         s = nir_shader_clone(c, s);
         c->s = s;
@@ -1101,6 +1103,248 @@ should_split_wrmask(const nir_instr *instr, const void *data)
         }
 }
 
+static nir_intrinsic_instr *
+nir_instr_as_constant_ubo_load(nir_instr *inst)
+{
+        if (inst->type != nir_instr_type_intrinsic)
+                return NULL;
+
+        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(inst);
+        if (intr->intrinsic != nir_intrinsic_load_ubo)
+                return NULL;
+
+        assert(nir_src_is_const(intr->src[0]));
+        if (!nir_src_is_const(intr->src[1]))
+                return NULL;
+
+        return intr;
+}
+
+static bool
+v3d_nir_sort_constant_ubo_load(nir_block *block, nir_intrinsic_instr *ref)
+{
+        bool progress = false;
+
+        nir_instr *ref_inst = &ref->instr;
+        uint32_t ref_offset = nir_src_as_uint(ref->src[1]);
+        uint32_t ref_index = nir_src_as_uint(ref->src[0]);
+
+        /* Go through all instructions after ref searching for constant UBO
+         * loads for the same UBO index.
+         */
+        bool seq_break = false;
+        nir_instr *inst = &ref->instr;
+        nir_instr *next_inst = NULL;
+        while (true) {
+                inst = next_inst ? next_inst : nir_instr_next(inst);
+                if (!inst)
+                        break;
+
+                next_inst = NULL;
+
+                if (inst->type != nir_instr_type_intrinsic)
+                        continue;
+
+                nir_intrinsic_instr *intr = nir_instr_as_intrinsic(inst);
+                if (intr->intrinsic != nir_intrinsic_load_ubo)
+                        continue;
+
+                /* We only produce unifa sequences for non-divergent loads */
+                if (nir_src_is_divergent(intr->src[1]))
+                        continue;
+
+                /* Any UBO load between the reference load and another
+                 * constant load for the same index that is either not
+                 * constant or uses a different UBO index would break the
+                 * unifa sequence. Flag it so we then move all constant UBO
+                 * loads for the reference index before it, not just the
+                 * ones that are out of order, to avoid breaking the
+                 * sequence and reduce unifa writes.
+                 */
+                if (!nir_src_is_const(intr->src[1])) {
+                        seq_break = true;
+                        continue;
+                }
+                uint32_t offset = nir_src_as_uint(intr->src[1]);
+
+                assert(nir_src_is_const(intr->src[0]));
+                uint32_t index = nir_src_as_uint(intr->src[0]);
+                if (index != ref_index) {
+                       seq_break = true;
+                       continue;
+                }
+
+                /* Only move loads with an offset that is close enough to the
+                 * reference offset, since otherwise we would not be able to
+                 * skip the unifa write for them. See ntq_emit_load_ubo_unifa.
+                 */
+                if (abs(ref_offset - offset) > MAX_UNIFA_SKIP_DISTANCE)
+                        continue;
+
+                /* We will move this load if its offset is smaller than ref's
+                 * (in which case we will move it before ref) or if the offset
+                 * is larger than ref's but there are sequence breakers
+                 * in between (in which case we will move it after ref and
+                 * before the sequence breakers).
+                 */
+                if (!seq_break && offset >= ref_offset)
+                        continue;
+
+                /* Find where exactly we want to move this load:
+                 *
+                 * If we are moving it before ref, we want to check any other
+                 * UBO loads we placed before ref and make sure we insert this
+                 * one properly ordered with them. Likewise, if we are moving
+                 * it after ref.
+                 */
+                nir_instr *pos = ref_inst;
+                nir_instr *tmp = pos;
+                do {
+                        if (offset < ref_offset)
+                                tmp = nir_instr_prev(tmp);
+                        else
+                                tmp = nir_instr_next(tmp);
+
+                        if (!tmp || tmp == inst)
+                                break;
+
+                        /* Ignore non-unifa UBO loads */
+                        if (tmp->type != nir_instr_type_intrinsic)
+                                continue;
+
+                        nir_intrinsic_instr *tmp_intr =
+                                nir_instr_as_intrinsic(tmp);
+                        if (tmp_intr->intrinsic != nir_intrinsic_load_ubo)
+                                continue;
+
+                        if (nir_src_is_divergent(tmp_intr->src[1]))
+                                continue;
+
+                        /* Stop if we find a unifa UBO load that breaks the
+                         * sequence.
+                         */
+                        if (!nir_src_is_const(tmp_intr->src[1]))
+                                break;
+
+                        if (nir_src_as_uint(tmp_intr->src[0]) != index)
+                                break;
+
+                        uint32_t tmp_offset = nir_src_as_uint(tmp_intr->src[1]);
+                        if (offset < ref_offset) {
+                                if (tmp_offset < offset ||
+                                    tmp_offset >= ref_offset) {
+                                        break;
+                                } else {
+                                        pos = tmp;
+                                }
+                        } else {
+                                if (tmp_offset > offset ||
+                                    tmp_offset <= ref_offset) {
+                                        break;
+                                } else {
+                                        pos = tmp;
+                                }
+                        }
+                } while (true);
+
+                /* We can't move the UBO load before the instruction that
+                 * defines its constant offset. If that instruction is placed
+                 * in between the new location (pos) and the current location
+                 * of this load, we will have to move that instruction too.
+                 *
+                 * We don't care about the UBO index definition because that
+                 * is optimized to be reused by all UBO loads for the same
+                 * index and therefore is certain to be defined before the
+                 * first UBO load that uses it.
+                 */
+                nir_instr *offset_inst = NULL;
+                tmp = inst;
+                while ((tmp = nir_instr_prev(tmp)) != NULL) {
+                        if (pos == tmp) {
+                                /* We reached the target location without
+                                 * finding the instruction that defines the
+                                 * offset, so that instruction must be before
+                                 * the new position and we don't have to fix it.
+                                 */
+                                break;
+                        }
+                        if (intr->src[1].ssa->parent_instr == tmp) {
+                                offset_inst = tmp;
+                                break;
+                        }
+                }
+
+                if (offset_inst) {
+                        exec_node_remove(&offset_inst->node);
+                        exec_node_insert_node_before(&pos->node,
+                                                     &offset_inst->node);
+                }
+
+                /* Since we are moving the instruction before its current
+                 * location, grab its successor before the move so that
+                 * we can continue the next iteration of the main loop from
+                 * that instruction.
+                 */
+                next_inst = nir_instr_next(inst);
+
+                /* Move this load to the selected location */
+                exec_node_remove(&inst->node);
+                if (offset < ref_offset)
+                        exec_node_insert_node_before(&pos->node, &inst->node);
+                else
+                        exec_node_insert_after(&pos->node, &inst->node);
+
+                progress = true;
+        }
+
+        return progress;
+}
+
+static bool
+v3d_nir_sort_constant_ubo_loads_block(struct v3d_compile *c,
+                                      nir_block *block)
+{
+        bool progress = false;
+        bool local_progress;
+        do {
+                local_progress = false;
+                nir_foreach_instr_safe(inst, block) {
+                        nir_intrinsic_instr *intr =
+                                nir_instr_as_constant_ubo_load(inst);
+                        if (intr) {
+                                local_progress |=
+                                        v3d_nir_sort_constant_ubo_load(block, intr);
+                        }
+                }
+                progress |= local_progress;
+        } while (local_progress);
+
+        return progress;
+}
+
+/**
+ * Sorts constant UBO loads in each block by offset to maximize chances of
+ * skipping unifa writes when converting to VIR. This can increase register
+ * pressure.
+ */
+static bool
+v3d_nir_sort_constant_ubo_loads(nir_shader *s, struct v3d_compile *c)
+{
+        bool progress = false;
+        nir_foreach_function(function, s) {
+                if (function->impl) {
+                        nir_foreach_block(block, function->impl) {
+                                progress |=
+                                        v3d_nir_sort_constant_ubo_loads_block(c, block);
+                        }
+                        nir_metadata_preserve(function->impl,
+                                              nir_metadata_block_index |
+                                              nir_metadata_dominance);
+                }
+        }
+        return progress;
+}
+
 static void
 v3d_attempt_compile(struct v3d_compile *c)
 {
@@ -1211,6 +1455,9 @@ v3d_attempt_compile(struct v3d_compile *c)
         };
         NIR_PASS_V(c->s, nir_schedule, &schedule_options);
 
+        if (!c->disable_constant_ubo_load_sorting)
+                NIR_PASS_V(c->s, v3d_nir_sort_constant_ubo_loads, c);
+
         v3d_nir_to_vir(c);
 }
 
@@ -1284,7 +1531,8 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
                 const char *name;
                 uint32_t min_threads_for_reg_alloc;
         } static const strategies[] = {
-                { "default",                  1 },
+                { "default",                  4 },
+                { "disable UBO load sorting", 1 },
                 { "disable TMU pipelining",   1 },
                 { "fallback scheduler",       1 }
         };
@@ -1294,8 +1542,9 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
                                      debug_output, debug_output_data,
                                      program_id, variant_id,
                                      strategies[i].min_threads_for_reg_alloc,
-                                     i > 0, /* Disable TMU pipelining */
-                                     i > 1  /* Fallback_scheduler */);
+                                     i > 0, /* Disable UBO load sorting */
+                                     i > 1, /* Disable TMU pipelining */
+                                     i > 2  /* Fallback_scheduler */);
 
                 v3d_attempt_compile(c);
 


