Mesa (main): intel/fs: Use ra_alloc_contig_reg_class() to speed up RA.

Fri Jun 4 19:32:16 UTC 2021

Module: Mesa
Branch: main
Commit: 40e1d798c6d5ab351c5393456a56aa1682f6c620
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=40e1d798c6d5ab351c5393456a56aa1682f6c620

Author: Eric Anholt <eric at anholt.net>
Date:   Fri Mar  5 09:20:01 2021 -0800

intel/fs: Use ra_alloc_contig_reg_class() to speed up RA.

By using the new class type, we don't need to make 1928 different
registers to represent each contigous reg size starting from the actual
128 HW register, or have a mapping between RA regs and HW base regs.  With
the number of regs reduced, and the fast q computation when using the new
classes, we no longer need to compute our own q.

This drops the FS RA initialization time on my CFL system from about 1ms to
50us.

Reviewed-by: Jason Ekstrand <jason at jlekstrand.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9437>

---

 src/intel/compiler/brw_compiler.h          |  16 ---
 src/intel/compiler/brw_fs_reg_allocate.cpp | 166 ++++-------------------------
 2 files changed, 19 insertions(+), 163 deletions(-)

diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index 7c7128634d9..4bb9e7be777 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -68,22 +68,6 @@ struct brw_compiler {
        */
       struct ra_class *classes[16];
 
-      /**
-       * Mapping from classes to ra_reg ranges.  Each of the per-size
-       * classes corresponds to a range of ra_reg nodes.  This array stores
-       * those ranges in the form of first ra_reg in each class and the
-       * total number of ra_reg elements in the last array element.  This
-       * way the range of the i'th class is given by:
-       * [ class_to_ra_reg_range[i], class_to_ra_reg_range[i+1] )
-       */
-      int class_to_ra_reg_range[17];
-
-      /**
-       * Mapping for register-allocated objects in *regs to the first
-       * GRF for that object.
-       */
-      uint8_t *ra_reg_to_grf;
-
       /**
        * ra class for the aligned barycentrics we use for PLN, which doesn't
        * appear in *classes.
diff --git a/src/intel/compiler/brw_fs_reg_allocate.cpp b/src/intel/compiler/brw_fs_reg_allocate.cpp
index f260325125f..028922a0416 100644
--- a/src/intel/compiler/brw_fs_reg_allocate.cpp
+++ b/src/intel/compiler/brw_fs_reg_allocate.cpp
@@ -118,13 +118,18 @@ brw_alloc_reg_set(struct brw_compiler *compiler, int dispatch_width)
    for (unsigned i = 0; i < MAX_VGRF_SIZE; i++)
       class_sizes[i] = i + 1;
 
-   memset(compiler->fs_reg_sets[index].class_to_ra_reg_range, 0,
-          sizeof(compiler->fs_reg_sets[index].class_to_ra_reg_range));
-   int *class_to_ra_reg_range = compiler->fs_reg_sets[index].class_to_ra_reg_range;
+   struct ra_regs *regs = ra_alloc_reg_set(compiler, BRW_MAX_GRF, false);
+   if (devinfo->ver >= 6)
+      ra_set_allocate_round_robin(regs);
+   struct ra_class **classes = ralloc_array(compiler, struct ra_class *, class_count);
+   struct ra_class *aligned_bary_class = NULL;
 
-   /* Compute the total number of registers across all classes. */
-   int ra_reg_count = 0;
+   /* Now, make the register classes for each size of contiguous register
+    * allocation we might need to make.
+    */
    for (int i = 0; i < class_count; i++) {
+      classes[i] = ra_alloc_contig_reg_class(regs, class_sizes[i]);
+
       if (devinfo->ver <= 5 && dispatch_width >= 16) {
          /* From the G45 PRM:
           *
@@ -136,166 +141,33 @@ brw_alloc_reg_set(struct brw_compiler *compiler, int dispatch_width)
           *   even 256-bit physical register with a region size equal to
           *   two 256-bit physical register
           */
-         ra_reg_count += (base_reg_count - (class_sizes[i] - 1)) / 2;
-      } else {
-         ra_reg_count += base_reg_count - (class_sizes[i] - 1);
-      }
-      /* Mark the last register. We'll fill in the beginnings later. */
-      class_to_ra_reg_range[class_sizes[i]] = ra_reg_count;
-   }
-
-   /* Fill out the rest of the range markers */
-   for (int i = 1; i < 17; ++i) {
-      if (class_to_ra_reg_range[i] == 0)
-         class_to_ra_reg_range[i] = class_to_ra_reg_range[i-1];
-   }
-
-   uint8_t *ra_reg_to_grf = ralloc_array(compiler, uint8_t, ra_reg_count);
-   struct ra_regs *regs = ra_alloc_reg_set(compiler, ra_reg_count, false);
-   if (devinfo->ver >= 6)
-      ra_set_allocate_round_robin(regs);
-   struct ra_class **classes = ralloc_array(compiler, struct ra_class *, class_count);
-   struct ra_class *aligned_bary_class = NULL;
-
-   /* Allocate space for q values.  We allocate class_count + 1 because we
-    * want to leave room for the aligned barycentric class if we have it.
-    */
-   unsigned int **q_values = ralloc_array(compiler, unsigned int *,
-                                          class_count + 1);
-   for (int i = 0; i < class_count + 1; ++i)
-      q_values[i] = ralloc_array(q_values, unsigned int, class_count + 1);
-
-   /* Now, add the registers to their classes, and add the conflicts
-    * between them and the base GRF registers (and also each other).
-    */
-   int reg = 0;
-   int aligned_bary_base_reg = 0;
-   int aligned_bary_reg_count = 0;
-   for (int i = 0; i < class_count; i++) {
-      int class_reg_count;
-      if (devinfo->ver <= 5 && dispatch_width >= 16) {
-         class_reg_count = (base_reg_count - (class_sizes[i] - 1)) / 2;
-
-         /* See comment below.  The only difference here is that we are
-          * dealing with pairs of registers instead of single registers.
-          * Registers of odd sizes simply get rounded up. */
-         for (int j = 0; j < class_count; j++)
-            q_values[i][j] = (class_sizes[i] + 1) / 2 +
-                             (class_sizes[j] + 1) / 2 - 1;
-      } else {
-         class_reg_count = base_reg_count - (class_sizes[i] - 1);
-
-         /* From register_allocate.c:
-          *
-          * q(B,C) (indexed by C, B is this register class) in
-          * Runeson/Nyström paper.  This is "how many registers of B could
-          * the worst choice register from C conflict with".
-          *
-          * If we just let the register allocation algorithm compute these
-          * values, is extremely expensive.  However, since all of our
-          * registers are laid out, we can very easily compute them
-          * ourselves.  View the register from C as fixed starting at GRF n
-          * somwhere in the middle, and the register from B as sliding back
-          * and forth.  Then the first register to conflict from B is the
-          * one starting at n - class_size[B] + 1 and the last register to
-          * conflict will start at n + class_size[B] - 1.  Therefore, the
-          * number of conflicts from B is class_size[B] + class_size[C] - 1.
-          *
-          *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
-          * B | | | | | |n| --> | | | | | | |
-          *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
-          *             +-+-+-+-+-+
-          * C           |n| | | | |
-          *             +-+-+-+-+-+
-          */
-         for (int j = 0; j < class_count; j++)
-            q_values[i][j] = class_sizes[i] + class_sizes[j] - 1;
-      }
-      classes[i] = ra_alloc_reg_class(regs);
-
-      /* Save this off for the aligned barycentric class at the end. */
-      if (class_sizes[i] == int(aligned_bary_size(dispatch_width))) {
-         aligned_bary_base_reg = reg;
-         aligned_bary_reg_count = class_reg_count;
-      }
-
-      if (devinfo->ver <= 5 && dispatch_width >= 16) {
-         for (int j = 0; j < class_reg_count; j++) {
+         for (int reg = 0; reg <= base_reg_count - class_sizes[i]; reg += 2)
             ra_class_add_reg(classes[i], reg);
-
-            ra_reg_to_grf[reg] = j * 2;
-
-            for (int base_reg = j;
-                 base_reg < j + (class_sizes[i] + 1) / 2;
-                 base_reg++) {
-               ra_add_reg_conflict(regs, base_reg, reg);
-            }
-
-            reg++;
-         }
       } else {
-         for (int j = 0; j < class_reg_count; j++) {
+         for (int reg = 0; reg <= base_reg_count - class_sizes[i]; reg++)
             ra_class_add_reg(classes[i], reg);
-
-            ra_reg_to_grf[reg] = j;
-
-            for (int base_reg = j;
-                 base_reg < j + class_sizes[i];
-                 base_reg++) {
-               ra_add_reg_conflict(regs, base_reg, reg);
-            }
-
-            reg++;
-         }
       }
    }
-   assert(reg == ra_reg_count);
-
-   /* Applying transitivity to all of the base registers gives us the
-    * appropreate register conflict relationships everywhere.
-    */
-   for (int reg = 0; reg < base_reg_count; reg++)
-      ra_make_reg_conflicts_transitive(regs, reg);
 
    /* Add a special class for aligned barycentrics, which we'll put the
     * first source of LINTERP on so that we can do PLN on Gen <= 6.
     */
    if (devinfo->has_pln && (devinfo->ver == 6 ||
                             (dispatch_width == 8 && devinfo->ver <= 5))) {
-      aligned_bary_class = ra_alloc_reg_class(regs);
+      int contig_len = aligned_bary_size(dispatch_width);
+      aligned_bary_class = ra_alloc_contig_reg_class(regs, contig_len);
 
-      for (int i = 0; i < aligned_bary_reg_count; i++) {
-	 if ((ra_reg_to_grf[aligned_bary_base_reg + i] & 1) == 0) {
-	    ra_class_add_reg(aligned_bary_class,
-                             aligned_bary_base_reg + i);
-	 }
-      }
-
-      for (int i = 0; i < class_count; i++) {
-         /* These are a little counter-intuitive because the barycentric
-          * registers are required to be aligned while the register they are
-          * potentially interferring with are not.  In the case where the size
-          * is even, the worst-case is that the register is odd-aligned.  In
-          * the odd-size case, it doesn't matter.
-          */
-         q_values[class_count][i] = class_sizes[i] / 2 +
-                                    aligned_bary_size(dispatch_width) / 2;
-         q_values[i][class_count] = class_sizes[i] +
-                                    aligned_bary_size(dispatch_width) - 1;
-      }
-      q_values[class_count][class_count] = aligned_bary_size(dispatch_width) - 1;
+      for (int i = 0; i <= base_reg_count - contig_len; i += 2)
+         ra_class_add_reg(aligned_bary_class, i);
    }
 
-   ra_set_finalize(regs, q_values);
-
-   ralloc_free(q_values);
+   ra_set_finalize(regs, NULL);
 
    compiler->fs_reg_sets[index].regs = regs;
    for (unsigned i = 0; i < ARRAY_SIZE(compiler->fs_reg_sets[index].classes); i++)
       compiler->fs_reg_sets[index].classes[i] = NULL;
    for (int i = 0; i < class_count; i++)
       compiler->fs_reg_sets[index].classes[class_sizes[i] - 1] = classes[i];
-   compiler->fs_reg_sets[index].ra_reg_to_grf = ra_reg_to_grf;
    compiler->fs_reg_sets[index].aligned_bary_class = aligned_bary_class;
 }
 
@@ -743,7 +615,7 @@ fs_reg_alloc::setup_inst_interference(const fs_inst *inst)
       const int vgrf = inst->opcode == SHADER_OPCODE_SEND ?
                        inst->src[2].nr : inst->src[0].nr;
       int size = fs->alloc.sizes[vgrf];
-      int reg = compiler->fs_reg_sets[rsi].class_to_ra_reg_range[size] - 1;
+      int reg = BRW_MAX_GRF - size;
 
       if (first_mrf_hack_node >= 0) {
          /* If something happened to spill, we want to push the EOT send
@@ -1342,7 +1214,7 @@ fs_reg_alloc::assign_regs(bool allow_spilling, bool spill_all)
    for (unsigned i = 0; i < fs->alloc.count; i++) {
       int reg = ra_get_node_reg(g, first_vgrf_node + i);
 
-      hw_reg_mapping[i] = compiler->fs_reg_sets[rsi].ra_reg_to_grf[reg];
+      hw_reg_mapping[i] = reg;
       fs->grf_used = MAX2(fs->grf_used,
 			  hw_reg_mapping[i] + fs->alloc.sizes[i]);
    }