Mesa (main): ra: Add fast-path support for register classes of contiguous regs.

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Fri Jun 4 19:32:16 UTC 2021


Module: Mesa
Branch: main
Commit: 2d7bcdaf6b3857c19fde007e1ba706bcf400f166
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=2d7bcdaf6b3857c19fde007e1ba706bcf400f166

Author: Eric Anholt <eric at anholt.net>
Date:   Thu Mar  4 13:51:36 2021 -0800

ra: Add fast-path support for register classes of contiguous regs.

In the fully general case of register classes, to expose an allocation
class of unaligned 2-contiguous-reg allocations, for example, you'd have
your base individual regs (128 on Intel), plus another set of 127 regs,
each of which conflicts with the corresponding pair of base regs.
Single-reg nodes would allocate from the 128, double-reg nodes would
allocate from the 127, and the user would remap from the 127 back down to
the base regs with some irritating table.
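
Concretely, the old scheme looks something like the following sketch
(hedged: "mem_ctx" is an assumed ralloc context, the function and class
names are illustrative, and no particular driver is being quoted):

#include "util/register_allocate.h"

static struct ra_regs *
build_pair_classes_the_old_way(void *mem_ctx)
{
   struct ra_regs *regs = ra_alloc_reg_set(mem_ctx, 128 + 127, true);

   /* Base individual regs: one class member per hardware reg. */
   struct ra_class *single = ra_alloc_reg_class(regs);
   for (unsigned i = 0; i < 128; i++)
      ra_class_add_reg(single, i);

   /* Pair regs: reg 128+i stands for the pair (i, i+1) and conflicts
    * with both base regs; transitive conflicts make overlapping pairs
    * conflict with each other, too.
    */
   struct ra_class *pair = ra_alloc_reg_class(regs);
   for (unsigned i = 0; i < 127; i++) {
      unsigned pair_reg = 128 + i;
      ra_add_transitive_reg_conflict(regs, i, pair_reg);
      ra_add_transitive_reg_conflict(regs, i + 1, pair_reg);
      ra_class_add_reg(pair, pair_reg);
   }

   ra_set_finalize(regs, NULL);
   /* After allocation, a pair node's ra_get_node_reg() result still
    * needs the remap: base_reg = allocated_reg - 128.
    */
   return regs;
}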

If you need many different contiguous allocation sizes (16 is a pretty
common number across drivers), the number of regs explodes, wasting
memory and making the q computation expensive at startup.
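
To put a rough number on that using the example above: covering
contiguous sizes 1 through 16 on a 128-reg base file needs one set of
128 - k + 1 regs for each size k, i.e.

   128 + 127 + ... + 113 = 1928

registers under the remapping scheme, versus 128 with the fast path, and
the startup q computation then walks conflict lists over that inflated
set.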

If all the user has is contiguous-reg classes, we can easily compute the q
values up front (as the Intel and nouveau drivers already do, for
example), and we only have to change a couple of places in the
conflict-checking logic so that the contiguous-reg classes can use the
base registers.
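
With this patch, the earlier pair example reduces to the new entry point
(a hedged sketch; "mem_ctx" is again an assumed ralloc context):

#include "util/register_allocate.h"

static struct ra_regs *
build_pair_classes_contig(void *mem_ctx)
{
   struct ra_regs *regs = ra_alloc_reg_set(mem_ctx, 128, true);

   struct ra_class *single = ra_alloc_contig_reg_class(regs, 1);
   struct ra_class *pair = ra_alloc_contig_reg_class(regs, 2);

   for (unsigned i = 0; i < 128; i++) {
      ra_class_add_reg(single, i);
      /* A pair based at i occupies regs i and i+1, so the last valid
       * pair base is 126 (ra_class_add_reg asserts this).
       */
      if (i + 1 < 128)
         ra_class_add_reg(pair, i);
   }

   ra_set_finalize(regs, NULL);
   /* Allocated regs are base register indexes directly: no remap table. */
   return regs;
}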

Reviewed-by: Jason Ekstrand <jason at jlekstrand.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9437>

---

 src/util/register_allocate.c          | 172 ++++++++++++++++++++++++++++------
 src/util/register_allocate.h          |   1 +
 src/util/register_allocate_internal.h |   9 ++
 src/util/register_allocate_test.cpp   | 141 +++++++++++++++++++++++++---
 4 files changed, 283 insertions(+), 40 deletions(-)

diff --git a/src/util/register_allocate.c b/src/util/register_allocate.c
index b9886e802fd..93a7a11709c 100644
--- a/src/util/register_allocate.c
+++ b/src/util/register_allocate.c
@@ -234,6 +234,24 @@ ra_alloc_reg_class(struct ra_regs *regs)
    return class;
 }
 
+/**
+ * Creates a register class for contiguous register groups of a base register
+ * set.
+ *
+ * A reg set using this type of register class must use only this type of
+ * register class.
+ */
+struct ra_class *
+ra_alloc_contig_reg_class(struct ra_regs *regs, int contig_len)
+{
+   struct ra_class *c = ra_alloc_reg_class(regs);
+
+   assert(contig_len != 0);
+   c->contig_len = contig_len;
+
+   return c;
+}
+
 struct ra_class *
 ra_get_class_from_index(struct ra_regs *regs, unsigned int class)
 {
@@ -250,6 +268,7 @@ void
 ra_class_add_reg(struct ra_class *class, unsigned int r)
 {
    assert(r < class->regset->count);
+   assert(r + class->contig_len <= class->regset->count);
 
    BITSET_SET(class->regs, r);
    class->p++;
@@ -291,21 +310,67 @@ ra_set_finalize(struct ra_regs *regs, unsigned int **q_values)
        */
       for (b = 0; b < regs->class_count; b++) {
          for (c = 0; c < regs->class_count; c++) {
-            unsigned int rc;
-            int max_conflicts = 0;
+            struct ra_class *class_b = regs->classes[b];
+            struct ra_class *class_c = regs->classes[c];
+
+            if (class_b->contig_len && class_c->contig_len) {
+               if (class_b->contig_len == 1 && class_c->contig_len == 1) {
+                  /* If both classes are single registers, then they only
+                   * conflict if there are any regs shared between them.  This
+                   * is a cheap test for a common case.
+                   */
+                  class_b->q[c] = 0;
+                  for (int i = 0; i < BITSET_WORDS(regs->count); i++) {
+                     if (class_b->regs[i] & class_c->regs[i]) {
+                        class_b->q[c] = 1;
+                        break;
+                     }
+                  }
+               } else {
+                  int max_possible_conflicts = class_b->contig_len + class_c->contig_len - 1;
+
+                  unsigned int max_conflicts = 0;
+                  unsigned int rc;
+                  BITSET_FOREACH_SET(rc, regs->classes[c]->regs, regs->count) {
+                     int start = MAX2(0, (int)rc - class_b->contig_len + 1);
+                     int end = MIN2(regs->count, rc + class_c->contig_len);
+                     unsigned int conflicts = 0;
+                     for (int i = start; i < end; i++) {
+                        if (BITSET_TEST(class_b->regs, i))
+                           conflicts++;
+                     }
+                     max_conflicts = MAX2(max_conflicts, conflicts);
+                     /* Unless a class has some restriction like the register
+                      * bases are all aligned, then we should quickly find this
+                      * limit and exit the loop.
+                      */
+                     if (max_conflicts == max_possible_conflicts)
+                        break;
+                  }
+                  class_b->q[c] = max_conflicts;
+               }
+            } else {
+               /* If you're doing contiguous classes, you have to be all in
+                * because I don't want to deal with it.
+                */
+               assert(!class_b->contig_len && !class_c->contig_len);
+
+               unsigned int rc;
+               int max_conflicts = 0;
 
-            BITSET_FOREACH_SET(rc, regs->classes[c]->regs, regs->count) {
-               int conflicts = 0;
+               BITSET_FOREACH_SET(rc, regs->classes[c]->regs, regs->count) {
+                  int conflicts = 0;
 
-               util_dynarray_foreach(&regs->regs[rc].conflict_list,
-                                     unsigned int, rbp) {
-                  unsigned int rb = *rbp;
-                  if (reg_belongs_to_class(rb, regs->classes[b]))
-                     conflicts++;
+                  util_dynarray_foreach(&regs->regs[rc].conflict_list,
+                                       unsigned int, rbp) {
+                     unsigned int rb = *rbp;
+                     if (reg_belongs_to_class(rb, regs->classes[b]))
+                        conflicts++;
+                  }
+                  max_conflicts = MAX2(max_conflicts, conflicts);
                }
-               max_conflicts = MAX2(max_conflicts, conflicts);
+               regs->classes[b]->q[c] = max_conflicts;
             }
-            regs->classes[b]->q[c] = max_conflicts;
          }
       }
    }
@@ -313,6 +378,20 @@ ra_set_finalize(struct ra_regs *regs, unsigned int **q_values)
    for (b = 0; b < regs->count; b++) {
       util_dynarray_fini(&regs->regs[b].conflict_list);
    }
+
+   bool all_contig = true;
+   for (int c = 0; c < regs->class_count; c++)
+      all_contig &= regs->classes[c]->contig_len != 0;
+   if (all_contig) {
+      /* In this case, we never need the conflicts lists (and it would probably
+       * be a mistake to look at conflicts when doing contiguous classes!), so
+       * free them.  TODO: Avoid the allocation in the first place.
+       */
+      for (int i = 0; i < regs->count; i++) {
+         ralloc_free(regs->regs[i].conflicts);
+         regs->regs[i].conflicts = NULL;
+      }
+   }
 }
 
 void
@@ -321,17 +400,23 @@ ra_set_serialize(const struct ra_regs *regs, struct blob *blob)
    blob_write_uint32(blob, regs->count);
    blob_write_uint32(blob, regs->class_count);
 
-   for (unsigned int r = 0; r < regs->count; r++) {
-      struct ra_reg *reg = &regs->regs[r];
-      blob_write_bytes(blob, reg->conflicts, BITSET_WORDS(regs->count) *
-                                             sizeof(BITSET_WORD));
-      assert(util_dynarray_num_elements(&reg->conflict_list, unsigned int) == 0);
+   bool is_contig = regs->classes[0]->contig_len != 0;
+   blob_write_uint8(blob, is_contig);
+
+   if (!is_contig) {
+      for (unsigned int r = 0; r < regs->count; r++) {
+         struct ra_reg *reg = &regs->regs[r];
+         blob_write_bytes(blob, reg->conflicts, BITSET_WORDS(regs->count) *
+                                                sizeof(BITSET_WORD));
+         assert(util_dynarray_num_elements(&reg->conflict_list, unsigned int) == 0);
+      }
    }
 
    for (unsigned int c = 0; c < regs->class_count; c++) {
       struct ra_class *class = regs->classes[c];
       blob_write_bytes(blob, class->regs, BITSET_WORDS(regs->count) *
                                           sizeof(BITSET_WORD));
+      blob_write_uint32(blob, class->contig_len);
       blob_write_uint32(blob, class->p);
       blob_write_bytes(blob, class->q, regs->class_count * sizeof(*class->q));
    }
@@ -344,14 +429,22 @@ ra_set_deserialize(void *mem_ctx, struct blob_reader *blob)
 {
    unsigned int reg_count = blob_read_uint32(blob);
    unsigned int class_count = blob_read_uint32(blob);
+   bool is_contig = blob_read_uint8(blob);
 
    struct ra_regs *regs = ra_alloc_reg_set(mem_ctx, reg_count, false);
    assert(regs->count == reg_count);
 
-   for (unsigned int r = 0; r < reg_count; r++) {
-      struct ra_reg *reg = &regs->regs[r];
-      blob_copy_bytes(blob, reg->conflicts, BITSET_WORDS(reg_count) *
-                                            sizeof(BITSET_WORD));
+   if (is_contig) {
+      for (int i = 0; i < regs->count; i++) {
+         ralloc_free(regs->regs[i].conflicts);
+         regs->regs[i].conflicts = NULL;
+      }
+   } else {
+      for (unsigned int r = 0; r < reg_count; r++) {
+         struct ra_reg *reg = &regs->regs[r];
+         blob_copy_bytes(blob, reg->conflicts, BITSET_WORDS(reg_count) *
+                                             sizeof(BITSET_WORD));
+      }
    }
 
    assert(regs->classes == NULL);
@@ -366,6 +459,7 @@ ra_set_deserialize(void *mem_ctx, struct blob_reader *blob)
       blob_copy_bytes(blob, class->regs, BITSET_WORDS(reg_count) *
                                          sizeof(BITSET_WORD));
 
+      class->contig_len = blob_read_uint32(blob);
       class->p = blob_read_uint32(blob);
 
       class->q = ralloc_array(regs->classes[c], unsigned int, class_count);
@@ -695,14 +789,31 @@ ra_simplify(struct ra_graph *g)
    g->tmp.stack_optimistic_start = stack_optimistic_start;
 }
 
+bool
+ra_class_allocations_conflict(struct ra_class *c1, unsigned int r1,
+                              struct ra_class *c2, unsigned int r2)
+{
+   if (c1->contig_len) {
+      assert(c2->contig_len);
+
+      int r1_end = r1 + c1->contig_len;
+      int r2_end = r2 + c2->contig_len;
+      return !(r2 >= r1_end || r1 >= r2_end);
+   } else {
+      return BITSET_TEST(c1->regset->regs[r1].conflicts, r2);
+   }
+}
+
 static bool
 ra_any_neighbors_conflict(struct ra_graph *g, unsigned int n, unsigned int r)
 {
    util_dynarray_foreach(&g->nodes[n].adjacency_list, unsigned int, n2p) {
       unsigned int n2 = *n2p;
 
+      /* If our adjacent node is in the stack, it's not allocated yet. */
       if (!BITSET_TEST(g->tmp.in_stack, n2) &&
-          BITSET_TEST(g->regs->regs[r].conflicts, g->nodes[n2].reg)) {
+          ra_class_allocations_conflict(g->regs->classes[g->nodes[n].class], r,
+                                        g->regs->classes[g->nodes[n2].class], g->nodes[n2].reg)) {
          return true;
       }
    }
@@ -728,12 +839,19 @@ ra_compute_available_regs(struct ra_graph *g, unsigned int n, BITSET_WORD *regs)
     * already colored.
     */
    util_dynarray_foreach(&g->nodes[n].adjacency_list, unsigned int, n2p) {
-      unsigned int n2 = *n2p;
-      unsigned int r = g->nodes[n2].reg;
-
-      if (!BITSET_TEST(g->tmp.in_stack, n2)) {
-         for (int j = 0; j < BITSET_WORDS(g->regs->count); j++)
-            regs[j] &= ~g->regs->regs[r].conflicts[j];
+      struct ra_node *n2 = &g->nodes[*n2p];
+      struct ra_class *n2c = g->regs->classes[n2->class];
+
+      if (!BITSET_TEST(g->tmp.in_stack, *n2p)) {
+         if (c->contig_len) {
+            int start = MAX2(0, (int)n2->reg - c->contig_len + 1);
+            int end = MIN2(g->regs->count, n2->reg + n2c->contig_len);
+            for (unsigned i = start; i < end; i++)
+               BITSET_CLEAR(regs, i);
+         } else {
+            for (int j = 0; j < BITSET_WORDS(g->regs->count); j++)
+               regs[j] &= ~g->regs->regs[n2->reg].conflicts[j];
+         }
       }
    }
 
diff --git a/src/util/register_allocate.h b/src/util/register_allocate.h
index 820f7569475..34dcc09650e 100644
--- a/src/util/register_allocate.h
+++ b/src/util/register_allocate.h
@@ -54,6 +54,7 @@ struct ra_regs *ra_alloc_reg_set(void *mem_ctx, unsigned int count,
                                  bool need_conflict_lists);
 void ra_set_allocate_round_robin(struct ra_regs *regs);
 struct ra_class *ra_alloc_reg_class(struct ra_regs *regs);
+struct ra_class *ra_alloc_contig_reg_class(struct ra_regs *regs, int contig_len);
 unsigned int ra_class_index(struct ra_class *c);
 void ra_add_reg_conflict(struct ra_regs *regs,
                          unsigned int r1, unsigned int r2);
diff --git a/src/util/register_allocate_internal.h b/src/util/register_allocate_internal.h
index de5a5564db9..b62b8e8143e 100644
--- a/src/util/register_allocate_internal.h
+++ b/src/util/register_allocate_internal.h
@@ -63,6 +63,12 @@ struct ra_class {
     */
    BITSET_WORD *regs;
 
+   /**
+    * Number of regs after each bit in *regs that are also conflicted by an
+    * allocation to that reg for this class.
+    */
+   int contig_len;
+
    /**
     * p(B) in Runeson/Nyström paper.
     *
@@ -164,6 +170,9 @@ struct ra_graph {
    } tmp;
 };
 
+bool ra_class_allocations_conflict(struct ra_class *c1, unsigned int r1,
+                                   struct ra_class *c2, unsigned int r2);
+
 #ifdef __cplusplus
 } /* extern "C" */
 
diff --git a/src/util/register_allocate_test.cpp b/src/util/register_allocate_test.cpp
index 09b499d9a54..9a0cb3a12ae 100644
--- a/src/util/register_allocate_test.cpp
+++ b/src/util/register_allocate_test.cpp
@@ -45,6 +45,44 @@ ra_test::~ra_test()
    ralloc_free(mem_ctx);
 }
 
+void
+thumb_checks(struct ra_regs *regs, unsigned reg32_base, unsigned reg64_base)
+{
+   struct ra_class *reg32low = ra_get_class_from_index(regs, 0);
+   struct ra_class *reg64low = ra_get_class_from_index(regs, 1);
+   struct ra_class *reg96 = ra_get_class_from_index(regs, 2);
+
+   /* Table 4.1 */
+   ASSERT_EQ(reg32low->p, 8);
+   ASSERT_EQ(reg32low->q[reg32low->index], 1);
+   ASSERT_EQ(reg32low->q[reg64low->index], 2);
+   ASSERT_EQ(reg32low->q[reg96->index], 3);
+   ASSERT_EQ(reg64low->p, 8);
+   ASSERT_EQ(reg64low->q[reg32low->index], 2);
+   ASSERT_EQ(reg64low->q[reg64low->index], 3);
+   ASSERT_EQ(reg64low->q[reg96->index], 4);
+   ASSERT_EQ(reg96->p, 2);
+   ASSERT_EQ(reg96->q[reg96->index], 2);
+   ASSERT_EQ(reg96->q[reg64low->index], 2);
+   ASSERT_EQ(reg96->q[reg96->index], 2);
+
+   /* These individual regs should conflict with themselves, but nothing else from their class */
+   for (int i = 0; i < 7; i++) {
+      ASSERT_FALSE(ra_class_allocations_conflict(reg32low, reg32_base + i, reg32low, reg32_base + i + 1));
+      ASSERT_TRUE(ra_class_allocations_conflict(reg32low, reg32_base + i, reg32low, reg32_base + i));
+   }
+
+   /* Check that reg64low conflicts with the pairs of reg32low but not neighbors */
+   ASSERT_TRUE(ra_class_allocations_conflict(reg64low, reg64_base + 0, reg32low, reg32_base + 0));
+   ASSERT_TRUE(ra_class_allocations_conflict(reg64low, reg64_base + 0, reg32low, reg32_base + 1));
+   ASSERT_FALSE(ra_class_allocations_conflict(reg64low, reg64_base + 0, reg32low, reg32_base + 2));
+
+   ASSERT_FALSE(ra_class_allocations_conflict(reg64low, reg64_base + 1, reg32low, reg32_base + 0));
+   ASSERT_TRUE(ra_class_allocations_conflict(reg64low, reg64_base + 1, reg32low, reg32_base + 1));
+   ASSERT_TRUE(ra_class_allocations_conflict(reg64low, reg64_base + 1, reg32low, reg32_base + 2));
+   ASSERT_FALSE(ra_class_allocations_conflict(reg64low, reg64_base + 1, reg32low, reg32_base + 3));
+}
+
 TEST_F(ra_test, thumb)
 {
    struct ra_regs *regs = ra_alloc_reg_set(mem_ctx, 100, true);
@@ -53,6 +91,7 @@ TEST_F(ra_test, thumb)
    int next_vreg = 16;
 
    /* reg32low is any of the low 8 registers. */
+   unsigned int reg32_base = next_vreg;
    struct ra_class *reg32low = ra_alloc_reg_class(regs);
    for (int i = 0; i < 8; i++) {
       int vreg = next_vreg++;
@@ -61,6 +100,7 @@ TEST_F(ra_test, thumb)
    }
 
    /* reg64low is pairs of the low 8 registers (with wraparound!) */
+   unsigned int reg64_base = next_vreg;
    struct ra_class *reg64low = ra_alloc_reg_class(regs);
    for (int i = 0; i < 8; i++) {
       int vreg = next_vreg++;
@@ -80,17 +120,92 @@ TEST_F(ra_test, thumb)
 
    ra_set_finalize(regs, NULL);
 
-   /* Table 4.1 */
-   ASSERT_EQ(reg32low->p, 8);
-   ASSERT_EQ(reg32low->q[reg32low->index], 1);
-   ASSERT_EQ(reg32low->q[reg64low->index], 2);
-   ASSERT_EQ(reg32low->q[reg96->index], 3);
-   ASSERT_EQ(reg64low->p, 8);
-   ASSERT_EQ(reg64low->q[reg32low->index], 2);
-   ASSERT_EQ(reg64low->q[reg64low->index], 3);
-   ASSERT_EQ(reg64low->q[reg96->index], 4);
-   ASSERT_EQ(reg96->p, 2);
-   ASSERT_EQ(reg96->q[reg96->index], 2);
-   ASSERT_EQ(reg96->q[reg64low->index], 2);
-   ASSERT_EQ(reg96->q[reg96->index], 2);
+   thumb_checks(regs, reg32_base, reg64_base);
+}
+
+TEST_F(ra_test, thumb_contigregs)
+{
+   struct ra_regs *regs = ra_alloc_reg_set(mem_ctx, 16, true);
+
+   /* reg32low is any of the low 8 registers. */
+   struct ra_class *reg32low = ra_alloc_contig_reg_class(regs, 1);
+   for (int i = 0; i < 8; i++)
+      ra_class_add_reg(reg32low, i);
+
+   /* reg64low is pairs of the low 8 registers (we're ignoring the wraparound thing here) */
+   struct ra_class *reg64low = ra_alloc_contig_reg_class(regs, 2);
+   for (int i = 0; i < 8; i++)
+      ra_class_add_reg(reg64low, i);
+
+   /* reg96 is one of either r[0..2] or r[1..3] */
+   struct ra_class *reg96 = ra_alloc_contig_reg_class(regs, 3);
+   for (int i = 0; i < 2; i++)
+      ra_class_add_reg(reg96, i);
+
+   ra_set_finalize(regs, NULL);
+
+   thumb_checks(regs, 0, 0);
+}
+
+TEST_F(ra_test, nonintersect_contigregs)
+{
+   struct ra_regs *regs = ra_alloc_reg_set(mem_ctx, 16, true);
+
+   struct ra_class *low = ra_alloc_contig_reg_class(regs, 1);
+   for (int i = 0; i < 8; i++)
+      ra_class_add_reg(low, i);
+
+   struct ra_class *high = ra_alloc_contig_reg_class(regs, 1);
+   for (int i = 8; i < 16; i++)
+      ra_class_add_reg(high, i);
+
+   ra_set_finalize(regs, NULL);
+
+   ASSERT_EQ(low->q[low->index], 1);
+   ASSERT_EQ(low->q[high->index], 0);
+   ASSERT_EQ(high->q[low->index], 0);
+   ASSERT_EQ(high->q[high->index], 1);
+}
+
+TEST_F(ra_test, aligned_contigregs)
+{
+   int base_regs = 32;
+   struct ra_regs *regs = ra_alloc_reg_set(mem_ctx, base_regs, true);
+
+   struct ra_class *c1 = ra_alloc_contig_reg_class(regs, 1);
+   for (int i = 0; i < base_regs; i++)
+      ra_class_add_reg(c1, i);
+
+   struct ra_class *c2 = ra_alloc_contig_reg_class(regs, 2);
+   for (int i = 8; i < base_regs; i += 2)
+      ra_class_add_reg(c2, i);
+
+   struct ra_class *c4 = ra_alloc_contig_reg_class(regs, 4);
+   for (int i = 8; i < base_regs; i += 4)
+      ra_class_add_reg(c4, i);
+
+   ra_set_finalize(regs, NULL);
+
+   ASSERT_EQ(c1->q[c1->index], 1);
+   ASSERT_EQ(c1->q[c2->index], 2);
+   ASSERT_EQ(c1->q[c4->index], 4);
+   ASSERT_EQ(c2->q[c1->index], 1);
+   ASSERT_EQ(c2->q[c2->index], 1);
+   ASSERT_EQ(c2->q[c4->index], 2);
+   ASSERT_EQ(c4->q[c1->index], 1);
+   ASSERT_EQ(c4->q[c2->index], 1);
+   ASSERT_EQ(c4->q[c4->index], 1);
+
+   /* Check conflicts for a c4 allocation at i against other classes. */
+   for (int i = 0; i < base_regs / 4; i += 4) {
+      for (int j = 0; j < base_regs; j++) {
+         ASSERT_EQ(ra_class_allocations_conflict(c4, i, c1, j),
+                   j >= i && j < i + 4);
+      }
+
+      for (int j = 0; j < base_regs; j += 2) {
+         ASSERT_EQ(ra_class_allocations_conflict(c4, i, c2, j),
+                   j >= i && j < i + 4);
+      }
+   }
 }
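
For reference, the early-exit bound in the ra_set_finalize hunk above is
just interval overlap: a length-len_b allocation starting at rb overlaps
a length-len_c allocation at rc exactly when rb falls in
[rc - len_b + 1, rc + len_c - 1], an interval containing
len_b + len_c - 1 possible starts.  A minimal sketch of that bound (an
illustrative helper, not part of the patch):

/* Upper bound on q(B, C) for contiguous classes with unaligned bases;
 * matches max_possible_conflicts in ra_set_finalize.
 */
static unsigned
contig_q_upper_bound(unsigned len_b, unsigned len_c)
{
   return len_b + len_c - 1;
}

Aligned classes can stay below this bound, which is what the
aligned_contigregs test exercises (e.g. c2->q[c4->index] is 2, not 5).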


