[Mesa-dev] [PATCH v2 11/11] nir/lower_vec_to_movs: Coalesce into destinations of fdot instructions

Fri Sep 11 08:53:00 PDT 2015

Now that we have a replicating fdot instruction, we can actually coalesce
into the destinations of fdot instructions in vec4 programs.  We couldn't
really do this before because, if the result had to end up in .z, we
couldn't reswizzle the instruction to put it there.  With a replicated
destination, the result ends up in all channels, so we can just set the
writemask and we're done.

Shader-db results for vec4 programs on Haswell:

   total instructions in shared programs: 1778849 -> 1751223 (-1.55%)
   instructions in affected programs:     763104 -> 735478 (-3.62%)
   helped:                                7067
   HURT:                                  26

It turns out that dot-products matter...

Cc: Eduardo Lima Mitev <elima at igalia.com>
---
 src/glsl/nir/nir_lower_vec_to_movs.c | 49 ++++++++++++++++++++++++++----------
 1 file changed, 36 insertions(+), 13 deletions(-)

diff --git a/src/glsl/nir/nir_lower_vec_to_movs.c b/src/glsl/nir/nir_lower_vec_to_movs.c
index 9ff86ea..2cb0457 100644
--- a/src/glsl/nir/nir_lower_vec_to_movs.c
+++ b/src/glsl/nir/nir_lower_vec_to_movs.c
@@ -79,6 +79,14 @@ insert_mov(nir_alu_instr *vec, unsigned start_idx, nir_shader *shader)
    return mov->dest.write_mask;
 }
 
+static bool
+has_replicated_dest(nir_alu_instr *alu)
+{
+   return alu->op == nir_op_fdot_replicated2 ||
+          alu->op == nir_op_fdot_replicated3 ||
+          alu->op == nir_op_fdot_replicated4;
+}
+
 /* Attempts to coalesce the "move" from the given source of the vec to the
  * destination of the instruction generating the value. If, for whatever
  * reason, we cannot coalesce the mmove, it does nothing and returns 0.  We
@@ -116,19 +124,28 @@ try_coalesce(nir_alu_instr *vec, unsigned start_idx, nir_shader *shader)
    nir_alu_instr *src_alu =
       nir_instr_as_alu(vec->src[start_idx].src.ssa->parent_instr);
 
-   /* We only care about being able to re-swizzle the instruction if it is
-    * something that we can reswizzle.  It must be per-component.
-    */
-   if (nir_op_infos[src_alu->op].output_size != 0)
-      return 0;
-
-   /* If we are going to reswizzle the instruction, we can't have any
-    * non-per-component sources either.
-    */
-   for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
-      if (nir_op_infos[src_alu->op].input_sizes[j] != 0)
+   if (has_replicated_dest(src_alu)) {
+      /* The replicated fdot instructions are special: they replicate their
+       * result to all components.  This means that we can always rewrite
+       * their destination and we don't need to swizzle anything.
+       */
+   } else {
+      /* We can only coalesce into an instruction that we are able to
+       * re-swizzle, so it must be per-component.  The one exception is the
+       * fdot_replicatedN instructions, handled above, which implicitly splat
+       * their result out to all channels.
+       */
+      if (nir_op_infos[src_alu->op].output_size != 0)
          return 0;
 
+      /* If we are going to reswizzle the instruction, we can't have any
+       * non-per-component sources either.
+       */
+      for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
+         if (nir_op_infos[src_alu->op].input_sizes[j] != 0)
+            return 0;
+   }
+
    /* Stash off all of the ALU instruction's swizzles. */
    uint8_t swizzles[4][4];
    for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
@@ -148,8 +165,14 @@ try_coalesce(nir_alu_instr *vec, unsigned start_idx, nir_shader *shader)
        * instruction so we can re-swizzle that component to match.
        */
       write_mask |= 1 << i;
-      for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
-         src_alu->src[j].swizzle[i] = swizzles[j][vec->src[i].swizzle[0]];
+      if (has_replicated_dest(src_alu)) {
+         /* Since the destination is a single replicated value, we don't need
+          * to do any reswizzling.
+          */
+      } else {
+         for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
+            src_alu->src[j].swizzle[i] = swizzles[j][vec->src[i].swizzle[0]];
+      }
 
       /* Clear the no longer needed vec source */
       nir_instr_rewrite_src(&vec->instr, &vec->src[i].src, NIR_SRC_INIT);
-- 
2.5.0.400.gff86faf