[Mesa-dev] [PATCH 4/5] i965/vs: Extend opt_compute_to_mrf to handle limited "reswizzling"

Tue Dec 11 13:43:01 PST 2012

The way our visitor works, scalar expression/swizzle results that get
stored in channels other than .x will have an intermediate MOV from
their result in the .x channel to the real .y (or whatever) channel, and
similarly for vec2/vec3 results.

By teaching opt_compute_to_mrf how to adjust DP4-type instructions so that a
swizzled MOV can be optimized out, we can reduce instruction counts in common
matrix multiplication cases.
---
 src/mesa/drivers/dri/i965/brw_vec4.cpp             |   99 ++++++++++++++++++--
 src/mesa/drivers/dri/i965/brw_vec4.h               |    2 +
 .../dri/i965/test_vec4_register_coalesce.cpp       |   21 +++++
 3 files changed, 113 insertions(+), 9 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index 436ba97..7ab37e7 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -600,6 +600,85 @@ vec4_visitor::move_push_constants_to_pull_constants()
    pack_uniform_registers();
 }
 
+bool
+vec4_instruction::can_reswizzle_dst(int dst_writemask,
+                                    int swizzle,
+                                    int swizzle_mask)
+{
+   /* If this instruction writes a channel outside swizzle_mask (the channels
+    * the swizzle reads), retargeting its dst through the swizzle would break it.
+    */
+   if (dst.writemask & ~swizzle_mask)
+      return false;
+
+   switch (opcode) {
+   case BRW_OPCODE_DP4:
+   case BRW_OPCODE_DP3:
+   case BRW_OPCODE_DP2:
+      return true;
+   default:
+      /* Other opcodes are accepted only if the swizzle is a no-op for them. */
+      for (int c = 0; c < 4; c++) {
+         int bit = 1 << BRW_GET_SWZ(swizzle, c);
+         /* Skip channel c if dst_writemask does not enable it. */
+         if (!(dst_writemask & (1 << c)))
+            continue;
+
+         /* reswizzle_dst() cannot rewrite this opcode yet, so every enabled
+          * channel c must already read component c through the swizzle.
+          */
+         if (bit != (1 << c))
+            return false;
+      }
+      return true;
+   }
+}
+
+/**
+ * For any channels in the swizzle's source that were populated by this
+ * instruction, rewrite the instruction to put the appropriate result directly
+ * in those channels.  Only the DP4/DP3/DP2 writemask is rewritten so far; for
+ * other opcodes this just asserts that no reswizzling is needed.  The goal:
+ * e.g. for swizzle=yywx, MUL a.xy b c -> MUL a.yy_x b.yy c.yy_x
+ */
+void
+vec4_instruction::reswizzle_dst(int dst_writemask, int swizzle)
+{
+   int new_writemask = 0;
+
+   switch (opcode) {
+   case BRW_OPCODE_DP4:
+   case BRW_OPCODE_DP3:
+   case BRW_OPCODE_DP2:
+      for (int c = 0; c < 4; c++) {
+         int bit = 1 << BRW_GET_SWZ(swizzle, c);
+         /* Skip channel c if dst_writemask does not enable it. */
+         if (!(dst_writemask & (1 << c)))
+            continue;
+         /* If this instruction wrote the component that channel c reads
+          * through the swizzle, it must now write channel c directly.
+          */
+         if (dst.writemask & bit)
+            new_writemask |= (1 << c);
+      }
+      dst.writemask = new_writemask;
+      break;
+   default:
+      for (int c = 0; c < 4; c++) {
+         int bit = 1 << BRW_GET_SWZ(swizzle, c);
+         /* Skip channel c if dst_writemask does not enable it. */
+         if (!(dst_writemask & (1 << c)))
+            continue;
+
+         /* Not implemented for this opcode.  can_reswizzle_dst() returns
+          * false for any case this assert would reject.
+          */
+         assert(bit == (1 << c));
+      }
+      break;
+   }
+}
+
 /*
  * Tries to reduce extra MOV instructions by taking GRFs that get just
  * written and then MOVed into an MRF and making the original write of
@@ -641,26 +720,20 @@ vec4_visitor::opt_compute_to_mrf()
        */
       bool chans_needed[4] = {false, false, false, false};
       int chans_remaining = 0;
+      int swizzle_mask = 0;
       for (int i = 0; i < 4; i++) {
 	 int chan = BRW_GET_SWZ(inst->src[0].swizzle, i);
 
 	 if (!(inst->dst.writemask & (1 << i)))
 	    continue;
 
-	 /* We don't handle compute-to-MRF across a swizzle.  We would
-	  * need to be able to rewrite instructions above to output
-	  * results to different channels.
-	  */
-	 if (chan != i)
-	    chans_remaining = 5;
+         swizzle_mask |= (1 << chan);
 
 	 if (!chans_needed[chan]) {
 	    chans_needed[chan] = true;
 	    chans_remaining++;
 	 }
       }
-      if (chans_remaining > 4)
-	 continue;
 
       /* Now walk up the instruction stream trying to see if we can
        * rewrite everything writing to the GRF into the MRF instead.
@@ -689,6 +762,13 @@ vec4_visitor::opt_compute_to_mrf()
 	       }
 	    }
 
+            /* If we can't handle the swizzle, bail. */
+            if (!scan_inst->can_reswizzle_dst(inst->dst.writemask,
+                                              inst->src[0].swizzle,
+                                              swizzle_mask)) {
+               break;
+            }
+
 	    /* Mark which channels we found unconditional writes for. */
 	    if (!scan_inst->predicate) {
 	       for (int i = 0; i < 4; i++) {
@@ -759,10 +839,11 @@ vec4_visitor::opt_compute_to_mrf()
 	    if (scan_inst->dst.file == GRF &&
 		scan_inst->dst.reg == inst->src[0].reg &&
 		scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
+               scan_inst->reswizzle_dst(inst->dst.writemask,
+                                        inst->src[0].swizzle);
 	       scan_inst->dst.file = MRF;
 	       scan_inst->dst.reg = mrf;
 	       scan_inst->dst.reg_offset = 0;
-	       scan_inst->dst.writemask &= inst->dst.writemask;
 	       scan_inst->saturate |= inst->saturate;
 	    }
 	    scan_inst = (vec4_instruction *)scan_inst->next;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index 92d7bfd..b4d4e7e 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -194,6 +194,8 @@ public:
 
    bool is_tex();
    bool is_math();
+   bool can_reswizzle_dst(int dst_writemask, int swizzle, int swizzle_mask);
+   void reswizzle_dst(int dst_writemask, int swizzle);
 };
 
 /**
diff --git a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
index c79b0fd..fa9c155 100644
--- a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
+++ b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
@@ -122,3 +122,24 @@ TEST_F(register_coalesce_test, test_multiple_use)
 
    EXPECT_NE(mul->dst.file, MRF);
 }
+
+TEST_F(register_coalesce_test, test_dp4_mrf)
+{
+   /* A DP4 result MOVed into m0.y should end up written to m0.y directly. */
+   src_reg some_src_1 = src_reg(v, glsl_type::vec4_type);
+   src_reg some_src_2 = src_reg(v, glsl_type::vec4_type);
+
+   dst_reg m0 = dst_reg(MRF, 0);
+   m0.writemask = WRITEMASK_Y;
+   m0.type = BRW_REGISTER_TYPE_F;
+
+   dst_reg temp = dst_reg(v, glsl_type::float_type);
+
+   vec4_instruction *dp4 = v->emit(v->DP4(temp, some_src_1, some_src_2));
+   v->emit(v->MOV(m0, src_reg(temp)));
+
+   register_coalesce(v);
+
+   EXPECT_EQ(dp4->dst.file, MRF);
+   EXPECT_EQ(dp4->dst.writemask, WRITEMASK_Y);
+}
-- 
1.7.10.4