[Mesa-dev] [PATCH 08/21] etnaviv: nir: add virtual register classes

Tue Jun 5 14:38:32 UTC 2018

Since all threads share a global temporary vec4 register file, it is
important to reduce temporary register use of shaders.
Using the source swizzles and destination write masks of ALU operations,
we can layer smaller virtual registers on top of the physical base
registers. Each virtual register overlaps with its base register and
partially with the other virtual registers on the same base register:

 +----+---------+-------------+---------+
 |VEC4|  VEC3   |    VEC2     | SCALAR  |
 +----+---------+-------------+---------+
 |  X | X X X   | X X X       | X       |
 |  Y | Y Y   Y | Y     Y Y   |   Y     |
 |  Z | Z   Z Z |   Z   Z   Z |     Z   |
 |  W |   W W W |     W   W W |       W |
 +----+---------+-------------+---------+

There are four possible virtual vec3 registers that leave the remaining
component usable as a scalar virtual register, six possible vec2
registers, and four possible scalar registers that only use a single
component.

This patch adds an interference graph for virtual registers to the
register allocator, using information about SSA interference and virtual
register overlap. Where possible, SSA values with fewer than four
components are allocated from the unused components of temporary
registers that are already partially in use.

Signed-off-by: Philipp Zabel <p.zabel at pengutronix.de>
Signed-off-by: Michael Tretter <m.tretter at pengutronix.de>
---
 src/gallium/drivers/etnaviv/etnaviv_nir.c | 282 ++++++++++++++++++++--
 1 file changed, 259 insertions(+), 23 deletions(-)

diff --git a/src/gallium/drivers/etnaviv/etnaviv_nir.c b/src/gallium/drivers/etnaviv/etnaviv_nir.c
index b73d4be31bc6..752e87248e31 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_nir.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_nir.c
@@ -375,11 +375,111 @@ etna_instr_replaceable_ssa_dest(nir_instr *instr)
    return NULL;
 }
 
-/* Return the NIR global register corresponding to a given temporary register,
- * creating it if necessary.
+/* Swizzles and write masks can be used to layer virtual non-interfering
+ * registers on top of the real VEC4 registers. For example, the virtual
+ * VEC3_XYZ register and the virtual SCALAR_W register that use the same
+ * physical VEC4 base register do not interfere.
+ */
+enum etna_reg_class {            /* tagged type, not a stray global variable */
+   ETNA_REG_CLASS_VEC4,          /* whole physical register, all 4 components */
+   ETNA_REG_CLASS_VIRT_VEC3,     /* virtual register using 3 components */
+   ETNA_REG_CLASS_VIRT_VEC2,     /* virtual register using 2 components */
+   ETNA_REG_CLASS_VIRT_SCALAR,   /* virtual register using 1 component */
+   ETNA_NUM_REG_CLASSES,         /* count, used to size per-class arrays */
+};
+
+enum etna_reg_type {             /* order is relied on by the positional reswizzle tables */
+   ETNA_REG_TYPE_VEC4,
+   ETNA_REG_TYPE_VIRT_VEC3_XYZ,
+   ETNA_REG_TYPE_VIRT_VEC3_XYW,
+   ETNA_REG_TYPE_VIRT_VEC3_XZW,
+   ETNA_REG_TYPE_VIRT_VEC3_YZW,
+   ETNA_REG_TYPE_VIRT_VEC2_XY,
+   ETNA_REG_TYPE_VIRT_VEC2_XZ,
+   ETNA_REG_TYPE_VIRT_VEC2_XW,
+   ETNA_REG_TYPE_VIRT_VEC2_YZ,
+   ETNA_REG_TYPE_VIRT_VEC2_YW,
+   ETNA_REG_TYPE_VIRT_VEC2_ZW,
+   ETNA_REG_TYPE_VIRT_SCALAR_X,
+   ETNA_REG_TYPE_VIRT_SCALAR_Y,
+   ETNA_REG_TYPE_VIRT_SCALAR_Z,
+   ETNA_REG_TYPE_VIRT_SCALAR_W,
+   ETNA_NUM_REG_TYPES,           /* number of register indices per base register */
+};
+
+static const uint8_t /* component mask per type: bit 0 = X, 1 = Y, 2 = Z, 3 = W */
+etna_reg_writemask[ETNA_NUM_REG_TYPES] = {
+   [ETNA_REG_TYPE_VEC4] = 0xf,          /* XYZW */
+   [ETNA_REG_TYPE_VIRT_VEC3_XYZ] = 0x7, /* XYZ. */
+   [ETNA_REG_TYPE_VIRT_VEC3_XYW] = 0xb, /* XY.W */
+   [ETNA_REG_TYPE_VIRT_VEC3_XZW] = 0xd, /* X.ZW */
+   [ETNA_REG_TYPE_VIRT_VEC3_YZW] = 0xe, /* .YZW */
+   [ETNA_REG_TYPE_VIRT_VEC2_XY] = 0x3,  /* XY.. */
+   [ETNA_REG_TYPE_VIRT_VEC2_XZ] = 0x5,  /* X.Z. */
+   [ETNA_REG_TYPE_VIRT_VEC2_XW] = 0x9,  /* X..W */
+   [ETNA_REG_TYPE_VIRT_VEC2_YZ] = 0x6,  /* .YZ. */
+   [ETNA_REG_TYPE_VIRT_VEC2_YW] = 0xa,  /* .Y.W */
+   [ETNA_REG_TYPE_VIRT_VEC2_ZW] = 0xc,  /* ..ZW */
+   [ETNA_REG_TYPE_VIRT_SCALAR_X] = 0x1, /* X... */
+   [ETNA_REG_TYPE_VIRT_SCALAR_Y] = 0x2, /* .Y.. */
+   [ETNA_REG_TYPE_VIRT_SCALAR_Z] = 0x4, /* ..Z. */
+   [ETNA_REG_TYPE_VIRT_SCALAR_W] = 0x8, /* ...W */
+};
+
+static inline int etna_reg_get_type(int virt_reg) /* returns an ETNA_REG_TYPE_* value */
+{
+   return virt_reg % ETNA_NUM_REG_TYPES; /* index = base * ETNA_NUM_REG_TYPES + type */
+}
+
+static inline int etna_reg_get_base(int virt_reg) /* returns the base temporary register index */
+{
+   return virt_reg / ETNA_NUM_REG_TYPES; /* index = base * ETNA_NUM_REG_TYPES + type */
+}
+
+static inline int etna_reg_get_class(int virt_reg) /* returns an ETNA_REG_CLASS_* value */
+{
+   switch (etna_reg_get_type(virt_reg)) { /* class depends only on the type part */
+   case ETNA_REG_TYPE_VEC4:
+      return ETNA_REG_CLASS_VEC4;
+   case ETNA_REG_TYPE_VIRT_VEC3_XYZ:
+   case ETNA_REG_TYPE_VIRT_VEC3_XYW:
+   case ETNA_REG_TYPE_VIRT_VEC3_XZW:
+   case ETNA_REG_TYPE_VIRT_VEC3_YZW:
+      return ETNA_REG_CLASS_VIRT_VEC3;
+   case ETNA_REG_TYPE_VIRT_VEC2_XY:
+   case ETNA_REG_TYPE_VIRT_VEC2_XZ:
+   case ETNA_REG_TYPE_VIRT_VEC2_XW:
+   case ETNA_REG_TYPE_VIRT_VEC2_YZ:
+   case ETNA_REG_TYPE_VIRT_VEC2_YW:
+   case ETNA_REG_TYPE_VIRT_VEC2_ZW:
+      return ETNA_REG_CLASS_VIRT_VEC2;
+   case ETNA_REG_TYPE_VIRT_SCALAR_X:
+   case ETNA_REG_TYPE_VIRT_SCALAR_Y:
+   case ETNA_REG_TYPE_VIRT_SCALAR_Z:
+   case ETNA_REG_TYPE_VIRT_SCALAR_W:
+      return ETNA_REG_CLASS_VIRT_SCALAR;
+   }
+
+   unreachable("invalid virtual register type"); /* no fall-off-the-end with NDEBUG */
+}
+
+/* q_val[a][b]: how many class-b registers on the same base register overlap
+ * one class-a register, not counting that register itself.
+ * NOTE(review): confirm ra_set_finalize() expects this index order.
+ */
+static const unsigned int
+q_val[ETNA_NUM_REG_CLASSES][ETNA_NUM_REG_CLASSES] = {
+   { 0, 4, 6, 4 }, /* a = VEC4:   overlaps every virtual register on its base */
+   { 1, 3, 6, 3 }, /* a = VEC3:   the 3 other vec3s, all 6 vec2s, 3 scalars */
+   { 1, 4, 4, 2 }, /* a = VEC2:   all 4 vec3s, 4 other vec2s, 2 scalars */
+   { 1, 3, 3, 0 }, /* a = SCALAR: 3 vec3s, 3 vec2s, no other scalar */
+};
+
+/* Return a NIR global register corresponding to a given temporary register.
+ * The register is created if necessary.
  */
 static nir_register *
-etna_ensure_temporary(nir_shader *shader, int index)
+etna_ensure_register(nir_shader *shader, int index)
 {
    nir_foreach_register(reg, &shader->registers) {
       if (reg->index == index)
@@ -387,13 +487,9 @@ etna_ensure_temporary(nir_shader *shader, int index)
    }
 
    nir_register *reg = nir_global_reg_create(shader);
+   shader->reg_alloc = MAX2(shader->reg_alloc - 1, index + 1);
    reg->num_components = 4;
-   reg->num_array_elems = 0;
-   reg->bit_size = 32;
    reg->index = index;
-   if (shader->reg_alloc < index + 1)
-      shader->reg_alloc = index + 1;
-   reg->name = NULL;
 
    return reg;
 }
@@ -405,15 +501,19 @@ etna_ensure_temporary(nir_shader *shader, int index)
  * and all store intrinsics to be moved to the end of the function already, so
  * that interference between input, output, and temporary values is described
  * correctly.
+ * All SSAs that qualify will be replaced with the assigned registers.
+ * Destination SSAs to constant/uniform load and output store intrinsics
+ * as well as undefined assignments are kept and will be removed later.
  */
 static void
 etna_assign_registers(nir_shader *shader)
 {
-   struct ra_regs *regs = ra_alloc_reg_set(NULL, 64, false);
-   int class = ra_alloc_reg_class(regs);
+   struct ra_regs *regs = ra_alloc_reg_set(NULL, ETNA_MAX_TEMPS *
+					   ETNA_NUM_REG_TYPES, false);
+   int class[ETNA_NUM_REG_CLASSES];
    unsigned int **q_values;
-   unsigned int *input_reg;
-   unsigned int *output_reg;
+   unsigned int *input_reg = NULL;
+   unsigned int *output_reg = NULL;
 
    /* Input/output registers only have to be assigned manually to the beginning
     * of the temporary register range in the fragment shader. Otherwise the
@@ -427,13 +527,29 @@ etna_assign_registers(nir_shader *shader)
    }
 
    /* A single register file with 64 registers is available to each running
-    * shader, with no conflicts between them.
+    * shader, with no conflicts between them. We add virtual registers on
+    * top of that.
     */
-   for (int r = 0; r < 64; r++)
-      ra_class_add_reg(regs, class, r);
-   q_values = ralloc_array(regs, unsigned *, 1);
-   q_values[0] = rzalloc_array(q_values, unsigned, 1);
-   q_values[0][0] = 0;
+   for (int c = 0; c < ETNA_NUM_REG_CLASSES; c++)
+      class[c] = ra_alloc_reg_class(regs);
+   for (int r = 0; r < ETNA_NUM_REG_TYPES * ETNA_MAX_TEMPS; r++)
+      ra_class_add_reg(regs, class[etna_reg_get_class(r)], r);
+   q_values = ralloc_array(regs, unsigned *, ETNA_NUM_REG_CLASSES);
+   for (int i = 0; i < ETNA_NUM_REG_CLASSES; i++) {
+      q_values[i] = rzalloc_array(q_values, unsigned, ETNA_NUM_REG_CLASSES);
+      for (int j = 0; j < ETNA_NUM_REG_CLASSES; j++)
+         q_values[i][j] = q_val[i][j];
+   }
+   for (int r = 0; r < ETNA_MAX_TEMPS; r++) {
+      for (int i = 0; i < ETNA_NUM_REG_TYPES; i++) {
+         for (int j = 0; j < i; j++) {
+            if (etna_reg_writemask[i] & etna_reg_writemask[j]) {
+               ra_add_reg_conflict(regs, ETNA_NUM_REG_TYPES * r + i,
+                                         ETNA_NUM_REG_TYPES * r + j);
+            }
+         }
+      }
+   }
    ra_set_finalize(regs, q_values);
 
    nir_foreach_function(function, shader) {
@@ -462,7 +578,54 @@ etna_assign_registers(nir_shader *shader)
          }
       }
 
-      struct ra_graph *g = ra_alloc_interference_graph(regs, count);
+      int num_nodes = count;
+
+      /* Add space for one dummy node, to grab the position register */
+      if (shader->info.stage == MESA_SHADER_FRAGMENT)
+         num_nodes++;
+
+      struct ra_graph *g = ra_alloc_interference_graph(regs, num_nodes);
+
+      /* Assign nodes to the appropriate register class */
+      for (i = 0; i < count; i++) {
+         bool can_use_virt = list_empty(&ssa_defs[i]->if_uses);
+         can_use_virt &= ssa_defs[i]->parent_instr->type == nir_instr_type_alu;
+         if (can_use_virt) {
+            nir_foreach_use(use_src, ssa_defs[i]) {
+               if (use_src->parent_instr->type != nir_instr_type_alu) {
+                  can_use_virt = false;
+                  break;
+               }
+               /* These instructions are scalar and only read src.x */
+               nir_alu_instr *alu = nir_instr_as_alu(use_src->parent_instr);
+               if (alu->op == nir_op_fexp2 ||
+                   alu->op == nir_op_flog2) {
+                  can_use_virt = false;
+                  break;
+               }
+            }
+         }
+
+         /* Only choose virtual registers if all uses can be swizzled */
+         if (can_use_virt && ssa_defs[i]->num_components == 1)
+            ra_set_node_class(g, i, ETNA_REG_CLASS_VIRT_SCALAR);
+         else if (can_use_virt && ssa_defs[i]->num_components == 2)
+            ra_set_node_class(g, i, ETNA_REG_CLASS_VIRT_VEC2);
+         else if (can_use_virt && ssa_defs[i]->num_components == 3)
+            ra_set_node_class(g, i, ETNA_REG_CLASS_VIRT_VEC3);
+         else
+            ra_set_node_class(g, i, ETNA_REG_CLASS_VEC4);
+      }
+
+      /* Prevent writes to the position register (temporary register 0) by
+       * assigning it to a dummy node that interferes with all other nodes.
+       */
+      if (shader->info.stage == MESA_SHADER_FRAGMENT) {
+         ra_set_node_class(g, num_nodes - 1, ETNA_REG_CLASS_VEC4);
+         ra_set_node_reg(g, num_nodes - 1, 0);
+         for (int i = 0; i < count; i++)
+            ra_add_node_interference(g, i, num_nodes - 1);
+      }
 
       /* Collect SSA interference information and force input loads to
        * the correct registers in the fragment shader.
@@ -491,7 +654,7 @@ etna_assign_registers(nir_shader *shader)
 
             assert(offset == 0);
 
-            ra_set_node_reg(g, i, base);
+            ra_set_node_reg(g, i, base * ETNA_NUM_REG_TYPES);
          }
       }
 
@@ -517,7 +680,7 @@ etna_assign_registers(nir_shader *shader)
                /* Find the replaceable SSA used as source */
                for (i = 0; i < count; i++) {
                   if (ssa_defs[i] == intr->src[0].ssa)
-                     ra_set_node_reg(g, i, base);
+                     ra_set_node_reg(g, i, base * ETNA_NUM_REG_TYPES);
                }
             }
          }
@@ -530,19 +693,92 @@ etna_assign_registers(nir_shader *shader)
       /* Replace SSA assignments with allocated registers */
       for (i = 0; i < count; i++) {
          int r = ra_get_node_reg(g, i);
-         nir_register *reg = etna_ensure_temporary(shader, r);
+         nir_register *reg = etna_ensure_register(shader, etna_reg_get_base(r));
          nir_ssa_def *ssa = ssa_defs[i];
 
-         nir_ssa_def_rewrite_uses(ssa, nir_src_for_reg(reg));
+         /* Rewrite uses */
+         if (etna_reg_get_type(r) == ETNA_REG_TYPE_VEC4) {
+            nir_ssa_def_rewrite_uses(ssa, nir_src_for_reg(reg));
+         } else {
+            nir_src new_src = nir_src_for_reg(reg);
+            nir_foreach_use_safe(use_src, ssa) {
+               static const unsigned reswizzle[ETNA_NUM_REG_TYPES][4] = {
+                  { 0, 1, 2, 3 }, /* XYZW */
+                  { 0, 1, 2, 2 }, /* XYZ */
+                  { 0, 1, 3, 3 }, /* XYW */
+                  { 0, 2, 3, 3 }, /* XZW */
+                  { 1, 2, 3, 3 }, /* YZW */
+                  { 0, 1, 1, 1 }, /* XY */
+                  { 0, 2, 2, 2 }, /* XZ */
+                  { 0, 3, 3, 3 }, /* XW */
+                  { 1, 2, 2, 2 }, /* YZ */
+                  { 1, 3, 3, 3 }, /* YW */
+                  { 2, 3, 3, 3 }, /* ZW */
+                  { 0, 0, 0, 0 }, /* X */
+                  { 1, 1, 1 ,1 }, /* Y */
+                  { 2, 2, 2, 2 }, /* Z */
+                  { 3, 3, 3, 3 }, /* W */
+               };
+               nir_instr_rewrite_src(use_src->parent_instr, use_src, new_src);
+               nir_alu_src *alu_src = container_of(use_src, alu_src, src);
+               int t = etna_reg_get_type(r);
+               alu_src->swizzle[0] = reswizzle[t][alu_src->swizzle[0]];
+               alu_src->swizzle[1] = reswizzle[t][alu_src->swizzle[1]];
+               alu_src->swizzle[2] = reswizzle[t][alu_src->swizzle[2]];
+               alu_src->swizzle[3] = reswizzle[t][alu_src->swizzle[3]];
+            }
+         }
+
          assert(list_empty(&ssa->uses) && list_empty(&ssa->if_uses));
 
          nir_instr *instr = ssa->parent_instr;
 
+         /* Rewrite destination */
          if (instr->type == nir_instr_type_alu) {
             nir_alu_instr *alu = nir_instr_as_alu(instr);
 
             nir_instr_rewrite_dest(&alu->instr, &alu->dest.dest,
                                    nir_dest_for_reg(reg));
+            int t = etna_reg_get_type(r);
+            alu->dest.write_mask = etna_reg_writemask[t];
+            /* The dot product instructions broadcast their result to all
+             * destination components. There is no need to reswizzle their
+             * sources here.
+             */
+            if (alu->op != nir_op_fdot2 &&
+                alu->op != nir_op_fdot3 &&
+                alu->op != nir_op_fdot4) {
+               unsigned num_srcs = nir_op_infos[alu->op].num_inputs;
+               for (unsigned i = 0; i < num_srcs; i++) {
+                  static const unsigned reswizzle[ETNA_NUM_REG_TYPES][4] = {
+                     { 0, 1, 2, 3 }, /* XYZW */
+                     { 0, 1, 2, 2 }, /* XYZ */
+                     { 0, 1, 1, 2 }, /* XYW */
+                     { 0, 0, 1, 2 }, /* XZW */
+                     { 0, 0, 1, 2 }, /* YZW */
+                     { 0, 1, 1, 1 }, /* XY */
+                     { 0, 0, 1, 1 }, /* XZ */
+                     { 0, 0, 0, 1 }, /* XW */
+                     { 0, 0, 1, 1 }, /* YZ */
+                     { 0, 0, 0, 1 }, /* YW */
+                     { 0, 0, 0, 1 }, /* ZW */
+                     { 0, 0, 0, 0 }, /* X */
+                     { 0, 0, 0, 0 }, /* Y */
+                     { 0, 0, 0, 0 }, /* Z */
+                     { 0, 0, 0, 0 }, /* W */
+                  };
+                  nir_alu_src *alu_src = &alu->src[i];
+                  uint8_t swizzle[4];
+                  swizzle[0] = alu_src->swizzle[0];
+                  swizzle[1] = alu_src->swizzle[1];
+                  swizzle[2] = alu_src->swizzle[2];
+                  swizzle[3] = alu_src->swizzle[3];
+                  alu_src->swizzle[0] = swizzle[reswizzle[t][0]];
+                  alu_src->swizzle[1] = swizzle[reswizzle[t][1]];
+                  alu_src->swizzle[2] = swizzle[reswizzle[t][2]];
+                  alu_src->swizzle[3] = swizzle[reswizzle[t][3]];
+               }
+            }
          } else if (instr->type == nir_instr_type_tex) {
             nir_tex_instr *tex = nir_instr_as_tex(instr);
 
-- 
2.17.1