[Mesa-dev] [PATCH 08/21] etnaviv: nir: add virtual register classes

Rob Clark robdclark at gmail.com
Tue Jun 5 15:39:00 UTC 2018


On Tue, Jun 5, 2018 at 10:38 AM, Philipp Zabel <p.zabel at pengutronix.de> wrote:
> Since all threads share a global temporary vec4 register file, it is
> important to reduce temporary register use of shaders.
> Using source swizzles and destination write mask of ALU operations we
> can layer smaller virtual registers on top of the physical base
> registers that overlap with their base register and partially with each
> other:
>
>  +----+---------+-------------+---------+
>  |VEC4|  VEC3   |    VEC2     | SCALAR  |
>  +----+---------+-------------+---------+
>  |  X | X X X   | X X X       | X       |
>  |  Y | Y Y   Y | Y     Y Y   |   Y     |
>  |  Z | Z   Z Z |   Z   Z   Z |     Z   |
>  |  W |   W W W |     W   W W |       W |
>  +----+---------+-------------+---------+
>
> There are four possible virtual vec3 registers that leave the remaining
> component usable as a scalar virtual register, six possible vec2
> registers, and four possible scalar registers that only use a single
> component.
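
(For what it's worth, those counts are just the ways of picking components
out of four: C(4,3) = 4 vec3 layouts, C(4,2) = 6 vec2 layouts, C(4,1) = 4
scalars, which together with the full vec4 gives the 15 register types used
in the patch below.)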
>
> This patch adds an interference graph for virtual registers to the
> register allocator, using information about SSA interference and virtual
> register overlap. If possible, SSAs with smaller num_components are
> allocated from the unused components of already partially used temporary
> registers.
>
> Signed-off-by: Philipp Zabel <p.zabel at pengutronix.de>
> Signed-off-by: Michael Tretter <m.tretter at pengutronix.de>
> ---

So one quick note: constructing the register classes can be
expensive.  You probably only want to do this once and then re-use them
for each shader -- roughly like the sketch below.
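
A rough, untested sketch of what I mean -- the etna_compiler struct and
the function name here are made up, only the ra_*() calls are the ones
the patch already uses:

   /* built once, e.g. when the screen/compiler context is created */
   struct etna_compiler {
      struct ra_regs *regs;
      unsigned reg_class[ETNA_NUM_REG_CLASSES];
   };

   static void
   etna_compiler_init_ra(struct etna_compiler *c)
   {
      c->regs = ra_alloc_reg_set(NULL, ETNA_MAX_TEMPS * ETNA_NUM_REG_TYPES,
                                 false);
      for (int i = 0; i < ETNA_NUM_REG_CLASSES; i++)
         c->reg_class[i] = ra_alloc_reg_class(c->regs);
      /* ... ra_class_add_reg() / q_values / ra_add_reg_conflict() exactly
       * as in the patch, then ra_set_finalize(c->regs, q_values) ... */
   }

Then etna_assign_registers() would only build the per-shader interference
graph with ra_alloc_interference_graph(c->regs, num_nodes) and leave the
register set itself alone.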

BR,
-R

>  src/gallium/drivers/etnaviv/etnaviv_nir.c | 282 ++++++++++++++++++++--
>  1 file changed, 259 insertions(+), 23 deletions(-)
>
> diff --git a/src/gallium/drivers/etnaviv/etnaviv_nir.c b/src/gallium/drivers/etnaviv/etnaviv_nir.c
> index b73d4be31bc6..752e87248e31 100644
> --- a/src/gallium/drivers/etnaviv/etnaviv_nir.c
> +++ b/src/gallium/drivers/etnaviv/etnaviv_nir.c
> @@ -375,11 +375,111 @@ etna_instr_replaceable_ssa_dest(nir_instr *instr)
>     return NULL;
>  }
>
> -/* Return the NIR global register corresponding to a given temporary register,
> - * creating it if necessary.
> +/* Swizzles and write masks can be used to layer virtual non-interfering
> + * registers on top of the real VEC4 registers. For example, the virtual
> + * VEC3_XYZ register and the virtual SCALAR_W register that use the same
> + * physical VEC4 base register do not interfere.
> + */
> +enum etna_reg_class {
> +   ETNA_REG_CLASS_VEC4,
> +   ETNA_REG_CLASS_VIRT_VEC3,
> +   ETNA_REG_CLASS_VIRT_VEC2,
> +   ETNA_REG_CLASS_VIRT_SCALAR,
> +   ETNA_NUM_REG_CLASSES,
> +};
> +
> +enum etna_reg_type {
> +   ETNA_REG_TYPE_VEC4,
> +   ETNA_REG_TYPE_VIRT_VEC3_XYZ,
> +   ETNA_REG_TYPE_VIRT_VEC3_XYW,
> +   ETNA_REG_TYPE_VIRT_VEC3_XZW,
> +   ETNA_REG_TYPE_VIRT_VEC3_YZW,
> +   ETNA_REG_TYPE_VIRT_VEC2_XY,
> +   ETNA_REG_TYPE_VIRT_VEC2_XZ,
> +   ETNA_REG_TYPE_VIRT_VEC2_XW,
> +   ETNA_REG_TYPE_VIRT_VEC2_YZ,
> +   ETNA_REG_TYPE_VIRT_VEC2_YW,
> +   ETNA_REG_TYPE_VIRT_VEC2_ZW,
> +   ETNA_REG_TYPE_VIRT_SCALAR_X,
> +   ETNA_REG_TYPE_VIRT_SCALAR_Y,
> +   ETNA_REG_TYPE_VIRT_SCALAR_Z,
> +   ETNA_REG_TYPE_VIRT_SCALAR_W,
> +   ETNA_NUM_REG_TYPES,
> +};
> +
> +static const uint8_t
> +etna_reg_writemask[ETNA_NUM_REG_TYPES] = {
> +   [ETNA_REG_TYPE_VEC4] = 0xf,
> +   [ETNA_REG_TYPE_VIRT_SCALAR_X] = 0x1,
> +   [ETNA_REG_TYPE_VIRT_SCALAR_Y] = 0x2,
> +   [ETNA_REG_TYPE_VIRT_VEC2_XY] = 0x3,
> +   [ETNA_REG_TYPE_VIRT_SCALAR_Z] = 0x4,
> +   [ETNA_REG_TYPE_VIRT_VEC2_XZ] = 0x5,
> +   [ETNA_REG_TYPE_VIRT_VEC2_YZ] = 0x6,
> +   [ETNA_REG_TYPE_VIRT_VEC3_XYZ] = 0x7,
> +   [ETNA_REG_TYPE_VIRT_SCALAR_W] = 0x8,
> +   [ETNA_REG_TYPE_VIRT_VEC2_XW] = 0x9,
> +   [ETNA_REG_TYPE_VIRT_VEC2_YW] = 0xa,
> +   [ETNA_REG_TYPE_VIRT_VEC3_XYW] = 0xb,
> +   [ETNA_REG_TYPE_VIRT_VEC2_ZW] = 0xc,
> +   [ETNA_REG_TYPE_VIRT_VEC3_XZW] = 0xd,
> +   [ETNA_REG_TYPE_VIRT_VEC3_YZW] = 0xe,
> +};
> +
> +static inline int etna_reg_get_type(int virt_reg)
> +{
> +   return virt_reg % ETNA_NUM_REG_TYPES;
> +}
> +
> +static inline int etna_reg_get_base(int virt_reg)
> +{
> +   return virt_reg / ETNA_NUM_REG_TYPES;
> +}
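
(To spell out the encoding these helpers assume: a virtual register index
packs a base temporary and a type as

   virt_reg = base * ETNA_NUM_REG_TYPES + type

so with ETNA_NUM_REG_TYPES == 15, e.g. virt_reg 17 -> base 1, type 2,
i.e. VIRT_VEC3_XYW layered on temporary t1.)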
> +
> +static inline int etna_reg_get_class(int virt_reg)
> +{
> +   switch (etna_reg_get_type(virt_reg)) {
> +   case ETNA_REG_TYPE_VEC4:
> +      return ETNA_REG_CLASS_VEC4;
> +   case ETNA_REG_TYPE_VIRT_VEC3_XYZ:
> +   case ETNA_REG_TYPE_VIRT_VEC3_XYW:
> +   case ETNA_REG_TYPE_VIRT_VEC3_XZW:
> +   case ETNA_REG_TYPE_VIRT_VEC3_YZW:
> +      return ETNA_REG_CLASS_VIRT_VEC3;
> +   case ETNA_REG_TYPE_VIRT_VEC2_XY:
> +   case ETNA_REG_TYPE_VIRT_VEC2_XZ:
> +   case ETNA_REG_TYPE_VIRT_VEC2_XW:
> +   case ETNA_REG_TYPE_VIRT_VEC2_YZ:
> +   case ETNA_REG_TYPE_VIRT_VEC2_YW:
> +   case ETNA_REG_TYPE_VIRT_VEC2_ZW:
> +      return ETNA_REG_CLASS_VIRT_VEC2;
> +   case ETNA_REG_TYPE_VIRT_SCALAR_X:
> +   case ETNA_REG_TYPE_VIRT_SCALAR_Y:
> +   case ETNA_REG_TYPE_VIRT_SCALAR_Z:
> +   case ETNA_REG_TYPE_VIRT_SCALAR_W:
> +      return ETNA_REG_CLASS_VIRT_SCALAR;
> +   }
> +
> +   unreachable("invalid register type");
> +}
> +
> +/* Q values for the full register set. Each virtual register interferes
> + * with exactly one base register, and possibly with other virtual
> + * registers on top of the same base register.
> + */
> +static const unsigned int
> +q_val[ETNA_NUM_REG_CLASSES][ETNA_NUM_REG_CLASSES] = {
> +   { 0, 4, 6, 4 },
> +   { 1, 3, 6, 3 },
> +   { 1, 4, 4, 2 },
> +   { 1, 3, 3, 0 },
> +};
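
(If I read this right, rows and columns are both ordered
VEC4/VEC3/VEC2/SCALAR, so e.g. the VIRT_VEC3 row { 1, 3, 6, 3 } says one
virtual vec3 overlaps its single vec4 base, three of the other vec3
layouts, all six vec2 layouts and three scalars -- which matches the write
masks above.)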
> +
> +/* Return a NIR global register corresponding to a given temporary register.
> + * The register is created if necessary.
>   */
>  static nir_register *
> -etna_ensure_temporary(nir_shader *shader, int index)
> +etna_ensure_register(nir_shader *shader, int index)
>  {
>     nir_foreach_register(reg, &shader->registers) {
>        if (reg->index == index)
> @@ -387,13 +487,9 @@ etna_ensure_temporary(nir_shader *shader, int index)
>     }
>
>     nir_register *reg = nir_global_reg_create(shader);
> +   shader->reg_alloc = MAX2(shader->reg_alloc, index + 1);
>     reg->num_components = 4;
> -   reg->num_array_elems = 0;
> -   reg->bit_size = 32;
>     reg->index = index;
> -   if (shader->reg_alloc < index + 1)
> -      shader->reg_alloc = index + 1;
> -   reg->name = NULL;
>
>     return reg;
>  }
> @@ -405,15 +501,19 @@ etna_ensure_temporary(nir_shader *shader, int index)
>   * and all store intrinsics to be moved to the end of the function already, so
>   * that interference between input, output, and temporary values is described
>   * correctly.
> + * All SSAs that qualify will be replaced with the assigned registers.
> + * Destination SSAs to constant/uniform load and output store intrinsics
> + * as well as undefined assignments are kept and will be removed later.
>   */
>  static void
>  etna_assign_registers(nir_shader *shader)
>  {
> -   struct ra_regs *regs = ra_alloc_reg_set(NULL, 64, false);
> -   int class = ra_alloc_reg_class(regs);
> +   struct ra_regs *regs = ra_alloc_reg_set(NULL, ETNA_MAX_TEMPS *
> +                                          ETNA_NUM_REG_TYPES, false);
> +   int class[ETNA_NUM_REG_CLASSES];
>     unsigned int **q_values;
> -   unsigned int *input_reg;
> -   unsigned int *output_reg;
> +   unsigned int *input_reg = NULL;
> +   unsigned int *output_reg = NULL;
>
>     /* Input/output registers only have to be assigned manually to the beginning
>      * of the temporary register range in the fragment shader. Otherwise the
> @@ -427,13 +527,29 @@ etna_assign_registers(nir_shader *shader)
>     }
>
>     /* A single register file with 64 registers is available to each running
> -    * shader, with no conflicts between them.
> +    * shader, with no conflicts between them. We add virtual registers on
> +    * top of that.
>      */
> -   for (int r = 0; r < 64; r++)
> -      ra_class_add_reg(regs, class, r);
> -   q_values = ralloc_array(regs, unsigned *, 1);
> -   q_values[0] = rzalloc_array(q_values, unsigned, 1);
> -   q_values[0][0] = 0;
> +   for (int c = 0; c < ETNA_NUM_REG_CLASSES; c++)
> +      class[c] = ra_alloc_reg_class(regs);
> +   for (int r = 0; r < ETNA_NUM_REG_TYPES * ETNA_MAX_TEMPS; r++)
> +      ra_class_add_reg(regs, class[etna_reg_get_class(r)], r);
> +   q_values = ralloc_array(regs, unsigned *, ETNA_NUM_REG_CLASSES);
> +   for (int i = 0; i < ETNA_NUM_REG_CLASSES; i++) {
> +      q_values[i] = rzalloc_array(q_values, unsigned, ETNA_NUM_REG_CLASSES);
> +      for (int j = 0; j < ETNA_NUM_REG_CLASSES; j++)
> +         q_values[i][j] = q_val[i][j];
> +   }
> +   for (int r = 0; r < ETNA_MAX_TEMPS; r++) {
> +      for (int i = 0; i < ETNA_NUM_REG_TYPES; i++) {
> +         for (int j = 0; j < i; j++) {
> +            if (etna_reg_writemask[i] & etna_reg_writemask[j]) {
> +               ra_add_reg_conflict(regs, ETNA_NUM_REG_TYPES * r + i,
> +                                         ETNA_NUM_REG_TYPES * r + j);
> +            }
> +         }
> +      }
> +   }
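
To make the conflict rule above concrete: two virtual registers on the same
base conflict exactly when their write masks intersect, e.g.

   0x3 /* VEC2_XY  */ & 0x1 /* SCALAR_X */ != 0  -> conflict
   0x7 /* VEC3_XYZ */ & 0x8 /* SCALAR_W */ == 0  -> no conflict, can share a base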
>     ra_set_finalize(regs, q_values);
>
>     nir_foreach_function(function, shader) {
> @@ -462,7 +578,54 @@ etna_assign_registers(nir_shader *shader)
>           }
>        }
>
> -      struct ra_graph *g = ra_alloc_interference_graph(regs, count);
> +      int num_nodes = count;
> +
> +      /* Add space for one dummy node, to grab the position register */
> +      if (shader->info.stage == MESA_SHADER_FRAGMENT)
> +         num_nodes++;
> +
> +      struct ra_graph *g = ra_alloc_interference_graph(regs, num_nodes);
> +
> +      /* Assign nodes to the appropriate register class */
> +      for (i = 0; i < count; i++) {
> +         bool can_use_virt = list_empty(&ssa_defs[i]->if_uses);
> +         can_use_virt &= ssa_defs[i]->parent_instr->type == nir_instr_type_alu;
> +         if (can_use_virt) {
> +            nir_foreach_use(use_src, ssa_defs[i]) {
> +               if (use_src->parent_instr->type != nir_instr_type_alu) {
> +                  can_use_virt = false;
> +                  break;
> +               }
> +               /* These instructions are scalar and only read src.x */
> +               nir_alu_instr *alu = nir_instr_as_alu(use_src->parent_instr);
> +               if (alu->op == nir_op_fexp2 ||
> +                   alu->op == nir_op_flog2) {
> +                  can_use_virt = false;
> +                  break;
> +               }
> +            }
> +         }
> +
> +         /* Only choose virtual registers if all uses can be swizzled */
> +         if (can_use_virt && ssa_defs[i]->num_components == 1)
> +            ra_set_node_class(g, i, ETNA_REG_CLASS_VIRT_SCALAR);
> +         else if (can_use_virt && ssa_defs[i]->num_components == 2)
> +            ra_set_node_class(g, i, ETNA_REG_CLASS_VIRT_VEC2);
> +         else if (can_use_virt && ssa_defs[i]->num_components == 3)
> +            ra_set_node_class(g, i, ETNA_REG_CLASS_VIRT_VEC3);
> +         else
> +            ra_set_node_class(g, i, ETNA_REG_CLASS_VEC4);
> +      }
> +
> +      /* Prevent writes to the position register (temporary register 0) by
> +       * assigning it to a dummy node that interferes with all other nodes.
> +       */
> +      if (shader->info.stage == MESA_SHADER_FRAGMENT) {
> +         ra_set_node_class(g, num_nodes - 1, ETNA_REG_CLASS_VEC4);
> +         ra_set_node_reg(g, num_nodes - 1, 0);
> +         for (int i = 0; i < count; i++)
> +            ra_add_node_interference(g, i, num_nodes - 1);
> +      }
>
>        /* Collect SSA interference information and force input loads to
>         * the correct registers in the fragment shader.
> @@ -491,7 +654,7 @@ etna_assign_registers(nir_shader *shader)
>
>              assert(offset == 0);
>
> -            ra_set_node_reg(g, i, base);
> +            ra_set_node_reg(g, i, base * ETNA_NUM_REG_TYPES);
>           }
>        }
>
> @@ -517,7 +680,7 @@ etna_assign_registers(nir_shader *shader)
>                 /* Find the replaceable SSA used as source */
>                 for (i = 0; i < count; i++) {
>                    if (ssa_defs[i] == intr->src[0].ssa)
> -                     ra_set_node_reg(g, i, base);
> +                     ra_set_node_reg(g, i, base * ETNA_NUM_REG_TYPES);
>                 }
>              }
>           }
> @@ -530,19 +693,92 @@ etna_assign_registers(nir_shader *shader)
>        /* Replace SSA assignments with allocated registers */
>        for (i = 0; i < count; i++) {
>           int r = ra_get_node_reg(g, i);
> -         nir_register *reg = etna_ensure_temporary(shader, r);
> +         nir_register *reg = etna_ensure_register(shader, etna_reg_get_base(r));
>           nir_ssa_def *ssa = ssa_defs[i];
>
> -         nir_ssa_def_rewrite_uses(ssa, nir_src_for_reg(reg));
> +         /* Rewrite uses */
> +         if (etna_reg_get_type(r) == ETNA_REG_TYPE_VEC4) {
> +            nir_ssa_def_rewrite_uses(ssa, nir_src_for_reg(reg));
> +         } else {
> +            nir_src new_src = nir_src_for_reg(reg);
> +            nir_foreach_use_safe(use_src, ssa) {
> +               static const unsigned reswizzle[ETNA_NUM_REG_TYPES][4] = {
> +                  { 0, 1, 2, 3 }, /* XYZW */
> +                  { 0, 1, 2, 2 }, /* XYZ */
> +                  { 0, 1, 3, 3 }, /* XYW */
> +                  { 0, 2, 3, 3 }, /* XZW */
> +                  { 1, 2, 3, 3 }, /* YZW */
> +                  { 0, 1, 1, 1 }, /* XY */
> +                  { 0, 2, 2, 2 }, /* XZ */
> +                  { 0, 3, 3, 3 }, /* XW */
> +                  { 1, 2, 2, 2 }, /* YZ */
> +                  { 1, 3, 3, 3 }, /* YW */
> +                  { 2, 3, 3, 3 }, /* ZW */
> +                  { 0, 0, 0, 0 }, /* X */
> +                  { 1, 1, 1, 1 }, /* Y */
> +                  { 2, 2, 2, 2 }, /* Z */
> +                  { 3, 3, 3, 3 }, /* W */
> +               };
> +               nir_instr_rewrite_src(use_src->parent_instr, use_src, new_src);
> +               nir_alu_src *alu_src = container_of(use_src, alu_src, src);
> +               int t = etna_reg_get_type(r);
> +               alu_src->swizzle[0] = reswizzle[t][alu_src->swizzle[0]];
> +               alu_src->swizzle[1] = reswizzle[t][alu_src->swizzle[1]];
> +               alu_src->swizzle[2] = reswizzle[t][alu_src->swizzle[2]];
> +               alu_src->swizzle[3] = reswizzle[t][alu_src->swizzle[3]];
> +            }
> +         }
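
As a concrete example of the use rewrite: a 2-component SSA that got
allocated to VIRT_VEC2_ZW now lives in .z/.w of the base register, so a use
that read ".xy" is rewritten to read ".zw" via reswizzle[ZW] = { 2, 3, 3, 3 }.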
> +
>           assert(list_empty(&ssa->uses) && list_empty(&ssa->if_uses));
>
>           nir_instr *instr = ssa->parent_instr;
>
> +         /* Rewrite destination */
>           if (instr->type == nir_instr_type_alu) {
>              nir_alu_instr *alu = nir_instr_as_alu(instr);
>
>              nir_instr_rewrite_dest(&alu->instr, &alu->dest.dest,
>                                     nir_dest_for_reg(reg));
> +            int t = etna_reg_get_type(r);
> +            alu->dest.write_mask = etna_reg_writemask[t];
> +            /* The dot product instructions broadcast their result to all
> +             * destination components. There is no need to reswizzle their
> +             * sources here.
> +             */
> +            if (alu->op != nir_op_fdot2 &&
> +                alu->op != nir_op_fdot3 &&
> +                alu->op != nir_op_fdot4) {
> +               unsigned num_srcs = nir_op_infos[alu->op].num_inputs;
> +               for (unsigned i = 0; i < num_srcs; i++) {
> +                  static const unsigned reswizzle[ETNA_NUM_REG_TYPES][4] = {
> +                     { 0, 1, 2, 3 }, /* XYZW */
> +                     { 0, 1, 2, 2 }, /* XYZ */
> +                     { 0, 1, 1, 2 }, /* XYW */
> +                     { 0, 0, 1, 2 }, /* XZW */
> +                     { 0, 0, 1, 2 }, /* YZW */
> +                     { 0, 1, 1, 1 }, /* XY */
> +                     { 0, 0, 1, 1 }, /* XZ */
> +                     { 0, 0, 0, 1 }, /* XW */
> +                     { 0, 0, 1, 1 }, /* YZ */
> +                     { 0, 0, 0, 1 }, /* YW */
> +                     { 0, 0, 0, 1 }, /* ZW */
> +                     { 0, 0, 0, 0 }, /* X */
> +                     { 0, 0, 0, 0 }, /* Y */
> +                     { 0, 0, 0, 0 }, /* Z */
> +                     { 0, 0, 0, 0 }, /* W */
> +                  };
> +                  nir_alu_src *alu_src = &alu->src[i];
> +                  uint8_t swizzle[4];
> +                  swizzle[0] = alu_src->swizzle[0];
> +                  swizzle[1] = alu_src->swizzle[1];
> +                  swizzle[2] = alu_src->swizzle[2];
> +                  swizzle[3] = alu_src->swizzle[3];
> +                  alu_src->swizzle[0] = swizzle[reswizzle[t][0]];
> +                  alu_src->swizzle[1] = swizzle[reswizzle[t][1]];
> +                  alu_src->swizzle[2] = swizzle[reswizzle[t][2]];
> +                  alu_src->swizzle[3] = swizzle[reswizzle[t][3]];
> +               }
> +            }
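
And for the destination side of the same VIRT_VEC2_ZW example: the write
mask becomes 0xc (.zw) and reswizzle[ZW] = { 0, 0, 0, 1 } moves each
source's per-channel swizzle so that channel z reads what channel x read
before and channel w what channel y read, keeping the sources lined up with
the components that are now actually written.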
>           } else if (instr->type == nir_instr_type_tex) {
>              nir_tex_instr *tex = nir_instr_as_tex(instr);
>
> --
> 2.17.1
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev