Mesa (main): intel/vec4: Use ra_alloc_contig_reg_class() to reduce RA overhead.

Fri Jun 4 19:32:16 UTC 2021

Module: Mesa
Branch: main
Commit: cf33316ec0bc1040dfe96f11650a4887720dad71
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=cf33316ec0bc1040dfe96f11650a4887720dad71

Author: Eric Anholt <eric at anholt.net>
Date:   Fri Mar  5 10:11:07 2021 -0800

intel/vec4: Use ra_alloc_contig_reg_class() to reduce RA overhead.

This shrinks the register set from 1672 register-allocator (RA) registers to
the 128 real hardware registers.

Reviewed-by: Jason Ekstrand <jason at jlekstrand.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9437>

---

 src/intel/compiler/brw_compiler.h            |  6 ----
 src/intel/compiler/brw_vec4_reg_allocate.cpp | 54 ++++------------------------
 2 files changed, 7 insertions(+), 53 deletions(-)

diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index 4bb9e7be777..afb3a615e9f 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -51,12 +51,6 @@ struct brw_compiler {
        * block sizes used.
        */
       struct ra_class **classes;
-
-      /**
-       * Mapping for register-allocated objects in *regs to the first
-       * GRF for that object.
-       */
-      uint8_t *ra_reg_to_grf;
    } vec4_reg_set;
 
    struct {
diff --git a/src/intel/compiler/brw_vec4_reg_allocate.cpp b/src/intel/compiler/brw_vec4_reg_allocate.cpp
index 284af171b98..8d55e264314 100644
--- a/src/intel/compiler/brw_vec4_reg_allocate.cpp
+++ b/src/intel/compiler/brw_vec4_reg_allocate.cpp
@@ -104,16 +104,9 @@ brw_vec4_alloc_reg_set(struct brw_compiler *compiler)
    for (int i = 0; i < class_count; i++)
       class_sizes[i] = i + 1;
 
-   /* Compute the total number of registers across all classes. */
-   int ra_reg_count = 0;
-   for (int i = 0; i < class_count; i++) {
-      ra_reg_count += base_reg_count - (class_sizes[i] - 1);
-   }
 
-   ralloc_free(compiler->vec4_reg_set.ra_reg_to_grf);
-   compiler->vec4_reg_set.ra_reg_to_grf = ralloc_array(compiler, uint8_t, ra_reg_count);
    ralloc_free(compiler->vec4_reg_set.regs);
-   compiler->vec4_reg_set.regs = ra_alloc_reg_set(compiler, ra_reg_count, false);
+   compiler->vec4_reg_set.regs = ra_alloc_reg_set(compiler, base_reg_count, false);
    if (compiler->devinfo->ver >= 6)
       ra_set_allocate_round_robin(compiler->vec4_reg_set.regs);
    ralloc_free(compiler->vec4_reg_set.classes);
@@ -122,47 +115,16 @@ brw_vec4_alloc_reg_set(struct brw_compiler *compiler)
    /* Now, add the registers to their classes, and add the conflicts
     * between them and the base GRF registers (and also each other).
     */
-   int reg = 0;
-   unsigned *q_values[MAX_VGRF_SIZE];
    for (int i = 0; i < class_count; i++) {
       int class_reg_count = base_reg_count - (class_sizes[i] - 1);
-      compiler->vec4_reg_set.classes[i] = ra_alloc_reg_class(compiler->vec4_reg_set.regs);
-
-      q_values[i] = new unsigned[MAX_VGRF_SIZE];
-
-      for (int j = 0; j < class_reg_count; j++) {
-	 ra_class_add_reg(compiler->vec4_reg_set.classes[i], reg);
-
-	 compiler->vec4_reg_set.ra_reg_to_grf[reg] = j;
-
-	 for (int base_reg = j;
-	      base_reg < j + class_sizes[i];
-	      base_reg++) {
-	    ra_add_reg_conflict(compiler->vec4_reg_set.regs, base_reg, reg);
-	 }
+      compiler->vec4_reg_set.classes[i] =
+         ra_alloc_contig_reg_class(compiler->vec4_reg_set.regs, class_sizes[i]);
 
-	 reg++;
-      }
-
-      for (int j = 0; j < class_count; j++) {
-         /* Calculate the q values manually because the algorithm used by
-          * ra_set_finalize() to do it has higher complexity affecting the
-          * start-up time of some applications.  q(i, j) is just the maximum
-          * number of registers from class i a register from class j can
-          * conflict with.
-          */
-         q_values[i][j] = class_sizes[i] + class_sizes[j] - 1;
-      }
+      for (int j = 0; j < class_reg_count; j++)
+         ra_class_add_reg(compiler->vec4_reg_set.classes[i], j);
    }
-   assert(reg == ra_reg_count);
 
-   for (int reg = 0; reg < base_reg_count; reg++)
-      ra_make_reg_conflicts_transitive(compiler->vec4_reg_set.regs, reg);
-
-   ra_set_finalize(compiler->vec4_reg_set.regs, q_values);
-
-   for (int i = 0; i < MAX_VGRF_SIZE; i++)
-      delete[] q_values[i];
+   ra_set_finalize(compiler->vec4_reg_set.regs, NULL);
 }
 
 void
@@ -258,9 +220,7 @@ vec4_visitor::reg_allocate()
     */
    prog_data->total_grf = payload_reg_count;
    for (unsigned i = 0; i < alloc.count; i++) {
-      int reg = ra_get_node_reg(g, i);
-
-      hw_reg_mapping[i] = compiler->vec4_reg_set.ra_reg_to_grf[reg];
+      hw_reg_mapping[i] = ra_get_node_reg(g, i);
       prog_data->total_grf = MAX2(prog_data->total_grf,
 				  hw_reg_mapping[i] + alloc.sizes[i]);
    }