[Mesa-dev] [PATCH 2/2] i965/fs: Register allocator shoudn't use grf127 for sends dest

Thu Apr 12 02:30:58 UTC 2018

Since Gen8+ Intel PRM states that "r127 must not be used for
return address  when there is a src and dest overlap in send
instruction."

This patch implements this restriction creating new register allocator
classes that are copies of the normal classes. These new classes
exclude in their set of registers the last one of the original classes
(the only one that includes the grf127).

So vgrf that are used as destination of send messages sent from a grf are
re-assigned to one of these new classes based on its size. So the register
allocator would never assign to these vgrf a register that involves the
grf127.

If dispatch_width > 8 we don't re-assign to the new classes because all
instructions have a node interference between source and destination. And
that is enought to avoid the r127 restriction.

This fixes CTS tests that raised this issue as they were executed as SIMD8:
  dEQP-VK.spirv_assembly.instruction.graphics.16bit_storage.uniform_16struct_to_32struct.uniform_buffer_block_vert
  dEQP-VK.spirv_assembly.instruction.graphics.16bit_storage.uniform_16struct_to_32struct.uniform_buffer_block_tessc

Shader-db results on Skylake:

    total instructions in shared programs: 7686798 -> 7686790 (<.01%)
    instructions in affected programs: 1476 -> 1468 (-0.54%)
    helped: 4
    HURT: 0

    total cycles in shared programs: 337092322 -> 337095944 (<.01%)
    cycles in affected programs: 765861 -> 769483 (0.47%)
    helped: 167
    HURT: 161

Shader-db results on Broadwell:

    total instructions in shared programs: 7658574 -> 7658561 (<.01%)
    instructions in affected programs: 2355 -> 2342 (-0.55%)
    helped: 5
    HURT: 0

    total cycles in shared programs: 340694553 -> 340689774 (<.01%)
    cycles in affected programs: 1200517 -> 1195738 (-0.40%)
    helped: 204
    HURT: 267

    total spills in shared programs: 4300 -> 4299 (-0.02%)
    spills in affected programs: 72 -> 71 (-1.39%)
    helped: 1
    HURT: 0

    total fills in shared programs: 5370 -> 5369 (-0.02%)
    fills in affected programs: 58 -> 57 (-1.72%)
    helped: 1
    HURT: 0

As expected Shader-db reports no changes on previous generations.

Cc: Jason Ekstrand <jason at jlekstrand.net>
Cc: Francisco Jerez <currojerez at riseup.net>
---
 src/intel/compiler/brw_compiler.h          |  7 ++-
 src/intel/compiler/brw_fs_reg_allocate.cpp | 70 +++++++++++++++++++++++-------
 2 files changed, 59 insertions(+), 18 deletions(-)

diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index d3ae6499b91..572d373ff0c 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -61,9 +61,12 @@ struct brw_compiler {
 
       /**
        * Array of the ra classes for the unaligned contiguous register
-       * block sizes used, indexed by register size.
+       * block sizes used, indexed by register size. Classes starting at
+       * index 16 are classes for registers that should be used for
+       * send destination registers. They are equivalent to 0-15 classes
+       * but not including grf127.
        */
-      int classes[16];
+      int classes[32];
 
       /**
        * Mapping from classes to ra_reg ranges.  Each of the per-size
diff --git a/src/intel/compiler/brw_fs_reg_allocate.cpp b/src/intel/compiler/brw_fs_reg_allocate.cpp
index ec8e116cb38..66e4a342d0d 100644
--- a/src/intel/compiler/brw_fs_reg_allocate.cpp
+++ b/src/intel/compiler/brw_fs_reg_allocate.cpp
@@ -102,19 +102,31 @@ brw_alloc_reg_set(struct brw_compiler *compiler, int dispatch_width)
     * Additionally, on gen5 we need aligned pairs of registers for the PLN
     * instruction, and on gen4 we need 8 contiguous regs for workaround simd16
     * texturing.
+    *
+    * For Gen8+ we duplicate classes with a new type of classes for vgrfs when
+    * they are used as destination of SEND messages. The only difference is
+    * that these classes don't include grf127 to implement the restriction of
+    * not overlaping source and destination when register grf127 is the
+    * destination of a SEND message. So 0-15 are the normal classes indexed by
+    * size and 16-31 classes reuse the same registers but don't include
+    * grf127.
     */
-   const int class_count = MAX_VGRF_SIZE;
-   int class_sizes[MAX_VGRF_SIZE];
-   for (unsigned i = 0; i < MAX_VGRF_SIZE; i++)
-      class_sizes[i] = i + 1;
+   const int class_types = devinfo->gen >= 8 ? 2 : 1;
+   const int class_count = class_types * MAX_VGRF_SIZE;
+   int class_sizes[class_count];
+   for (int i = 0; i < class_count; i++)
+      class_sizes[i] = (i % MAX_VGRF_SIZE) + 1;
 
    memset(compiler->fs_reg_sets[index].class_to_ra_reg_range, 0,
           sizeof(compiler->fs_reg_sets[index].class_to_ra_reg_range));
    int *class_to_ra_reg_range = compiler->fs_reg_sets[index].class_to_ra_reg_range;
 
-   /* Compute the total number of registers across all classes. */
+   /* Compute the total number of registers across all classes. The duplicated
+    * classes to avoid grf127 reuse the registers of the normal classes. That is
+    * the reason we don't iterate over them here.
+    */
    int ra_reg_count = 0;
-   for (int i = 0; i < class_count; i++) {
+   for (int i = 0; i < class_count / class_types; i++) {
       if (devinfo->gen <= 5 && dispatch_width >= 16) {
          /* From the G45 PRM:
           *
@@ -131,11 +143,11 @@ brw_alloc_reg_set(struct brw_compiler *compiler, int dispatch_width)
          ra_reg_count += base_reg_count - (class_sizes[i] - 1);
       }
       /* Mark the last register. We'll fill in the beginnings later. */
-      class_to_ra_reg_range[class_sizes[i]] = ra_reg_count;
+      class_to_ra_reg_range[i + 1] = ra_reg_count;
    }
 
    /* Fill out the rest of the range markers */
-   for (int i = 1; i < 17; ++i) {
+   for (unsigned i = 1; i < MAX_VGRF_SIZE + 1; ++i) {
       if (class_to_ra_reg_range[i] == 0)
          class_to_ra_reg_range[i] = class_to_ra_reg_range[i-1];
    }
@@ -160,11 +172,8 @@ brw_alloc_reg_set(struct brw_compiler *compiler, int dispatch_width)
    int reg = 0;
    int pairs_base_reg = 0;
    int pairs_reg_count = 0;
-   for (int i = 0; i < class_count; i++) {
-      int class_reg_count;
+   for (int i = 0; i < class_count / class_types; i++) {
       if (devinfo->gen <= 5 && dispatch_width >= 16) {
-         class_reg_count = (base_reg_count - (class_sizes[i] - 1)) / 2;
-
          /* See comment below.  The only difference here is that we are
           * dealing with pairs of registers instead of single registers.
           * Registers of odd sizes simply get rounded up. */
@@ -172,8 +181,6 @@ brw_alloc_reg_set(struct brw_compiler *compiler, int dispatch_width)
             q_values[i][j] = (class_sizes[i] + 1) / 2 +
                              (class_sizes[j] + 1) / 2 - 1;
       } else {
-         class_reg_count = base_reg_count - (class_sizes[i] - 1);
-
          /* From register_allocate.c:
           *
           * q(B,C) (indexed by C, B is this register class) in
@@ -201,6 +208,8 @@ brw_alloc_reg_set(struct brw_compiler *compiler, int dispatch_width)
             q_values[i][j] = class_sizes[i] + class_sizes[j] - 1;
       }
       classes[i] = ra_alloc_reg_class(regs);
+      int class_reg_count = class_to_ra_reg_range[i % MAX_VGRF_SIZE + 1] -
+                            class_to_ra_reg_range[i % MAX_VGRF_SIZE];
 
       /* Save this off for the aligned pair class at the end. */
       if (class_sizes[i] == 2) {
@@ -233,13 +242,28 @@ brw_alloc_reg_set(struct brw_compiler *compiler, int dispatch_width)
                  base_reg++) {
                ra_add_reg_conflict(regs, base_reg, reg);
             }
-
             reg++;
          }
       }
    }
    assert(reg == ra_reg_count);
 
+   /* If there are more than MAX_VGRF_SIZE classes we generate duplicated
+    * classes that don't haver registers using grf127. New clases copy the
+    * q_values of the original ones and associate all previous registers
+    * with the original ones.
+    */
+   for (int i = MAX_VGRF_SIZE; i < class_count; i++) {
+      classes[i] = ra_alloc_reg_class(regs);
+      memcpy(q_values[i], q_values[i - MAX_VGRF_SIZE],
+             sizeof(unsigned int) * (class_count + 1));
+      for (int reg = class_to_ra_reg_range[i % MAX_VGRF_SIZE];
+           reg < class_to_ra_reg_range[i % MAX_VGRF_SIZE + 1] - 1;
+           reg++) {
+         ra_class_add_reg(regs, classes[i], reg);
+      }
+   }
+
    /* Applying transitivity to all of the base registers gives us the
     * appropreate register conflict relationships everywhere.
     */
@@ -279,7 +303,7 @@ brw_alloc_reg_set(struct brw_compiler *compiler, int dispatch_width)
    for (unsigned i = 0; i < ARRAY_SIZE(compiler->fs_reg_sets[index].classes); i++)
       compiler->fs_reg_sets[index].classes[i] = -1;
    for (int i = 0; i < class_count; i++)
-      compiler->fs_reg_sets[index].classes[class_sizes[i] - 1] = classes[i];
+      compiler->fs_reg_sets[index].classes[i] = classes[i];
    compiler->fs_reg_sets[index].ra_reg_to_grf = ra_reg_to_grf;
    compiler->fs_reg_sets[index].aligned_pairs_class = aligned_pairs_class;
 }
@@ -516,6 +540,7 @@ setup_mrf_hack_interference(fs_visitor *v, struct ra_graph *g,
       /* Since we don't have any live/dead analysis on the MRFs, just mark all
        * that are used as conflicting with all virtual GRFs.
        */
+
       if (mrf_used[i]) {
          if (i < *first_used_mrf)
             *first_used_mrf = i;
@@ -653,6 +678,19 @@ fs_visitor::assign_regs(bool allow_spilling, bool spill_all)
       }
    }
 
+   if (devinfo->gen >= 8 && dispatch_width == 8) {
+      /* For Gen8+ with SIMD8 we reasign VGRF destinations of SEND
+       * instructions to a different register class that doesn't includes the
+       * GRF127. These classes are also indexed by size and start after
+       * MAX_VGRF_SIZE position.
+       */
+         foreach_block_and_inst(block, fs_inst, inst, cfg) {
+         if (inst->is_send_from_grf() && inst->dst.file == VGRF) {
+            ra_set_node_class(g, inst->dst.nr, this->alloc.sizes[inst->dst.nr] - 1 + MAX_VGRF_SIZE);
+         }
+      }
+   }
+
    /* Debug of register spilling: Go spill everything. */
    if (unlikely(spill_all)) {
       int reg = choose_spill_reg(g);
-- 
2.16.3