Mesa (main): pan/bi: Use nodearrays for linear constraints

Thu Jun 2 17:30:01 UTC 2022

Module: Mesa
Branch: main
Commit: 1bfff407b96e286ce8ef4d2089f3f3123075351a
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=1bfff407b96e286ce8ef4d2089f3f3123075351a

Author: Icecream95 <ixn at disroot.org>
Date:   Thu Jun  2 11:57:08 2022 -0400

pan/bi: Use nodearrays for linear constraints

Speeds up compiling shaders/skia/781.shader_test in shader-db by 8x
(Icecream95).

...At least it did before I extended to support register allocation of vec8.  On
Valhall, texture instructions require up to 8 consecutive registers. To handle
this, provide for vec8 register allocation. Liveness was already (accidentally?)
vec8. The increased memory requirement is acceptable given that the interference
matrix is now stored sparsely (Alyssa).

Icecream95 reports the vec8 changes hurt RA performance by about 1% on average.
I consider this acceptable for now.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16780>

---

 src/panfrost/bifrost/bi_ra.c    | 115 +++++++++++++++++++++++++++++-----------
 src/panfrost/bifrost/compiler.h |   4 +-
 2 files changed, 86 insertions(+), 33 deletions(-)

diff --git a/src/panfrost/bifrost/bi_ra.c b/src/panfrost/bifrost/bi_ra.c
index 7bd8c9fc912..1b576394450 100644
--- a/src/panfrost/bifrost/bi_ra.c
+++ b/src/panfrost/bifrost/bi_ra.c
@@ -25,6 +25,7 @@
  */
 
 #include "compiler.h"
+#include "nodearray.h"
 #include "bi_builder.h"
 #include "util/u_memory.h"
 
@@ -32,16 +33,17 @@ struct lcra_state {
         unsigned node_count;
         uint64_t *affinity;
 
-        /* Linear constraints imposed. Nested array sized upfront, organized as
-         * linear[node_left][node_right]. That is, calculate indices as:
+        /* Linear constraints imposed. For each node there there is a
+         * 'nodearray' structure, which changes between a sparse and dense
+         * array depending on the number of elements.
          *
          * Each element is itself a bit field denoting whether (c_j - c_i) bias
          * is present or not, including negative biases.
          *
-         * Note for Bifrost, there are 4 components so the bias is in range
-         * [-3, 3] so encoded by 8-bit field. */
-
-        uint8_t *linear;
+         * We support up to 8 components so the bias is in range
+         * [-7, 7] encoded by a 16-bit field
+         */
+        nodearray *linear;
 
         /* Before solving, forced registers; after solving, solutions. */
         unsigned *solutions;
@@ -63,7 +65,7 @@ lcra_alloc_equations(unsigned node_count)
 
         l->node_count = node_count;
 
-        l->linear = calloc(sizeof(l->linear[0]), node_count * node_count);
+        l->linear = calloc(sizeof(l->linear[0]), node_count);
         l->solutions = calloc(sizeof(l->solutions[0]), node_count);
         l->affinity = calloc(sizeof(l->affinity[0]), node_count);
 
@@ -75,6 +77,9 @@ lcra_alloc_equations(unsigned node_count)
 static void
 lcra_free(struct lcra_state *l)
 {
+        for (unsigned i = 0; i < l->node_count; ++i)
+                nodearray_reset(&l->linear[i]);
+
         free(l->linear);
         free(l->affinity);
         free(l->solutions);
@@ -87,44 +92,65 @@ lcra_add_node_interference(struct lcra_state *l, unsigned i, unsigned cmask_i, u
         if (i == j)
                 return;
 
-        uint8_t constraint_fw = 0;
-        uint8_t constraint_bw = 0;
+        nodearray_value constraint_fw = 0;
+        nodearray_value constraint_bw = 0;
 
         /* The constraint bits are reversed from lcra.c so that register
          * allocation can be done in parallel for every possible solution,
          * with lower-order bits representing smaller registers. */
 
-        for (unsigned D = 0; D < 4; ++D) {
+        for (unsigned D = 0; D < 8; ++D) {
                 if (cmask_i & (cmask_j << D)) {
-                        constraint_fw |= (1 << (3 + D));
-                        constraint_bw |= (1 << (3 - D));
+                        constraint_fw |= (1 << (7 + D));
+                        constraint_bw |= (1 << (7 - D));
                 }
 
                 if (cmask_i & (cmask_j >> D)) {
-                        constraint_bw |= (1 << (3 + D));
-                        constraint_fw |= (1 << (3 - D));
+                        constraint_bw |= (1 << (7 + D));
+                        constraint_fw |= (1 << (7 - D));
                 }
         }
 
-        l->linear[j * l->node_count + i] |= constraint_fw;
-        l->linear[i * l->node_count + j] |= constraint_bw;
+        /* Use dense arrays after adding 256 elements */
+        nodearray_orr(&l->linear[j], i, constraint_fw, 256, l->node_count);
+        nodearray_orr(&l->linear[i], j, constraint_bw, 256, l->node_count);
 }
 
 static bool
 lcra_test_linear(struct lcra_state *l, unsigned *solutions, unsigned i)
 {
-        uint8_t *row = &l->linear[i * l->node_count];
         signed constant = solutions[i];
 
+        if (nodearray_is_sparse(&l->linear[i])) {
+                nodearray_sparse_foreach(&l->linear[i], elem) {
+                        unsigned j = nodearray_sparse_key(elem);
+                        nodearray_value constraint = nodearray_sparse_value(elem);
+
+                        if (solutions[j] == ~0) continue;
+
+                        signed lhs = constant - solutions[j];
+
+                        if (lhs < -7 || lhs > 7)
+                                continue;
+
+                        if (constraint & (1 << (lhs + 7)))
+                                return false;
+                }
+
+                return true;
+        }
+
+        nodearray_value *row = l->linear[i].dense;
+
         for (unsigned j = 0; j < l->node_count; ++j) {
                 if (solutions[j] == ~0) continue;
 
                 signed lhs = constant - solutions[j];
 
-                if (lhs < -3 || lhs > 3)
+                if (lhs < -7 || lhs > 7)
                         continue;
 
-                if (row[j] & (1 << (lhs + 3)))
+                if (row[j] & (1 << (lhs + 7)))
                         return false;
         }
 
@@ -166,10 +192,15 @@ static unsigned
 lcra_count_constraints(struct lcra_state *l, unsigned i)
 {
         unsigned count = 0;
-        uint8_t *constraints = &l->linear[i * l->node_count];
+        nodearray *constraints = &l->linear[i];
 
-        for (unsigned j = 0; j < l->node_count; ++j)
-                count += util_bitcount(constraints[j]);
+        if (nodearray_is_sparse(constraints)) {
+                nodearray_sparse_foreach(constraints, elem)
+                        count += util_bitcount(nodearray_sparse_value(elem));
+        } else {
+                nodearray_dense_foreach_64(constraints, elem)
+                        count += util_bitcount64(*elem);
+        }
 
         return count;
 }
@@ -522,18 +553,40 @@ bi_choose_spill_node(bi_context *ctx, struct lcra_state *l)
         unsigned best_benefit = 0.0;
         signed best_node = -1;
 
-        for (unsigned i = 0; i < l->node_count; ++i) {
-                if (BITSET_TEST(no_spill, i)) continue;
+        if (nodearray_is_sparse(&l->linear[l->spill_node])) {
+                nodearray_sparse_foreach(&l->linear[l->spill_node], elem) {
+                        unsigned i = nodearray_sparse_key(elem);
+                        unsigned constraint = nodearray_sparse_value(elem);
+
+                        /* Only spill nodes that interfere with the node failing
+                         * register allocation. It's pointless to spill anything else */
+                        if (!constraint) continue;
+
+                        if (BITSET_TEST(no_spill, i)) continue;
+
+                        unsigned benefit = lcra_count_constraints(l, i);
 
-                /* Only spill nodes that interfere with the node failing
-                 * register allocation. It's pointless to spill anything else */
-                if (!l->linear[(l->spill_node * l->node_count) + i]) continue;
+                        if (benefit > best_benefit) {
+                                best_benefit = benefit;
+                                best_node = i;
+                        }
+                }
+        } else {
+                nodearray_value *row = l->linear[l->spill_node].dense;
 
-                unsigned benefit = lcra_count_constraints(l, i);
+                for (unsigned i = 0; i < l->node_count; ++i) {
+                        /* Only spill nodes that interfere with the node failing
+                         * register allocation. It's pointless to spill anything else */
+                        if (!row[i]) continue;
 
-                if (benefit > best_benefit) {
-                        best_benefit = benefit;
-                        best_node = i;
+                        if (BITSET_TEST(no_spill, i)) continue;
+
+                        unsigned benefit = lcra_count_constraints(l, i);
+
+                        if (benefit > best_benefit) {
+                                best_benefit = benefit;
+                                best_node = i;
+                        }
                 }
         }
 
diff --git a/src/panfrost/bifrost/compiler.h b/src/panfrost/bifrost/compiler.h
index 7ca480ebc9e..f42dac0c40e 100644
--- a/src/panfrost/bifrost/compiler.h
+++ b/src/panfrost/bifrost/compiler.h
@@ -130,12 +130,12 @@ typedef struct {
          * write mask. Identity for the full 32-bit, H00 for only caring about
          * the lower half, other values unused. */
         enum bi_swizzle swizzle : 4;
-        uint32_t offset : 2;
+        uint32_t offset : 3;
         bool reg : 1;
         enum bi_index_type type : 3;
 
         /* Must be zeroed so we can hash the whole 64-bits at a time */
-        unsigned padding : (32 - 13);
+        unsigned padding : (32 - 14);
 } bi_index;
 
 static inline bi_index