[Mesa-dev] [PATCH v2 1/1] i965: add opportunistic behaviour to opt_vector_float()

Wed Dec 2 07:43:09 PST 2015

opt_vector_float() transforms several scalar MOV operations to a single
vectorial MOV.

This is done when those MOV covers all the components of the destination
register. So something like:

mov vgrf3.0.xy:D, 0D
mov vgrf3.0.w:D, 1065353216D
mov vgrf3.0.z:D, 0D

is transformed in:

mov vgrf3.0:F, [0F, 0F, 0F, 1F]

But there are cases where not all the components are written. For
example, in:

mov vgrf2.0.x:D, 1073741824D
mov vgrf3.0.xy:D, 0D
mov vgrf3.0.w:D, 1065353216D
mov vgrf4.0.xy:D, 1065353216D
mov vgrf4.0.w:D, 0D
mov vgrf6.0:UD, u4.xyzw:UD

Nor vgrf3 nor vgrf4 .z components are written, so the optimization is
not applied.

But it could be applied anyway with the components covered, using a
writemask to select the ones written. So we could transform it in:

mov vgrf2.0.x:D, 1073741824D
mov vgrf3.0.xyw:F, [0F, 0F, 0F, 1F]
mov vgrf4.0.xyw:F, [1F, 1F, 0F, 0F]
mov vgrf6.0:UD, u4.xyzw:UD

This commit does precisely that: opportunistically apply
opt_vector_float() when possible.

The improvement obtained regarding current upstream (56aff6bb4eaf) is:

total instructions in shared programs: 6819484 -> 6811698 (-0.11%)
instructions in affected programs:     387245 -> 379459 (-2.01%)
total loops in shared programs:        1971 -> 1971 (0.00%)
helped:                                3980
HURT:                                  0
GAINED:                                3
LOST:                                  0

Signed-off-by: Juan A. Suarez Romero <jasuarez at igalia.com>
---
 src/mesa/drivers/dri/i965/brw_vec4.cpp | 60 +++++++++++++++++++++-------------
 src/mesa/drivers/dri/i965/brw_vec4.h   |  3 ++
 2 files changed, 41 insertions(+), 22 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index a697bdf..f9bf820 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -309,6 +309,28 @@ src_reg::equals(const src_reg &r) const
 }
 
 bool
+vec4_visitor::vectorize_mov(vec4_instruction *current_inst, vec4_instruction *imm_inst[],
+              int inst_count, uint writemask, uint8_t *imm, bblock_t *block)
+{
+   if (inst_count < 2) {
+      return false;
+   }
+
+   unsigned vf;
+   memcpy(&vf, imm, sizeof(vf));
+   vec4_instruction *mov = MOV(imm_inst[0]->dst, brw_imm_vf(vf));
+   mov->dst.type = BRW_REGISTER_TYPE_F;
+   mov->dst.writemask = writemask;
+   current_inst->insert_before(block, mov);
+
+   for (int i = 0; i < inst_count; i++) {
+      imm_inst[i]->remove(block);
+   }
+
+   return true;
+}
+
+bool
 vec4_visitor::opt_vector_float()
 {
    bool progress = false;
@@ -316,27 +338,36 @@ vec4_visitor::opt_vector_float()
    int last_reg = -1, last_reg_offset = -1;
    enum brw_reg_file last_reg_file = BAD_FILE;
 
-   int remaining_channels = 0;
-   uint8_t imm[4];
+   uint8_t imm[4] = { 0 };
    int inst_count = 0;
    vec4_instruction *imm_inst[4];
+   unsigned int writemask = 0;
 
    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
       if (last_reg != inst->dst.nr ||
           last_reg_offset != inst->dst.reg_offset ||
           last_reg_file != inst->dst.file) {
+
+         progress |= vectorize_mov(inst, imm_inst, inst_count, writemask, imm, block);
+
+         inst_count = 0;
+         writemask = 0;
          last_reg = inst->dst.nr;
          last_reg_offset = inst->dst.reg_offset;
          last_reg_file = inst->dst.file;
-         remaining_channels = WRITEMASK_XYZW;
-
-         inst_count = 0;
+         for (int i = 0; i < 4; i++) {
+            imm[i] = 0;
+         }
       }
 
       if (inst->opcode != BRW_OPCODE_MOV ||
           inst->dst.writemask == WRITEMASK_XYZW ||
-          inst->src[0].file != IMM)
+          inst->src[0].file != IMM) {
+         progress |= vectorize_mov(inst, imm_inst, inst_count, writemask, imm, block);
+         inst_count = 0;
+         last_reg = -1;
          continue;
+      }
 
       int vf = brw_float_to_vf(inst->src[0].f);
       if (vf == -1)
@@ -351,23 +382,8 @@ vec4_visitor::opt_vector_float()
       if ((inst->dst.writemask & WRITEMASK_W) != 0)
          imm[3] = vf;
 
+      writemask |= inst->dst.writemask;
       imm_inst[inst_count++] = inst;
-
-      remaining_channels &= ~inst->dst.writemask;
-      if (remaining_channels == 0) {
-         unsigned vf;
-         memcpy(&vf, imm, sizeof(vf));
-         vec4_instruction *mov = MOV(inst->dst, brw_imm_vf(vf));
-         mov->dst.type = BRW_REGISTER_TYPE_F;
-         mov->dst.writemask = WRITEMASK_XYZW;
-         inst->insert_after(block, mov);
-         last_reg = -1;
-
-         for (int i = 0; i < inst_count; i++) {
-            imm_inst[i]->remove(block);
-         }
-         progress = true;
-      }
    }
 
    if (progress)
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index 25b1139..bf96704 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -365,6 +365,9 @@ protected:
    virtual void gs_end_primitive();
 
 private:
+   bool vectorize_mov(vec4_instruction *current_inst, vec4_instruction *imm_inst[],
+                      int inst_count, uint writemask, uint8_t *imm, bblock_t *block);
+
    /**
     * If true, then register allocation should fail instead of spilling.
     */
-- 
2.5.0