[Mesa-dev] [PATCH 1/2] i965/vec4: Lower integer multiplication after optimizations.

Matt Turner mattst88 at gmail.com
Mon Apr 18 23:14:15 UTC 2016


Analogous to commit 1e4e17fbd in the i965/fs backend.

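Because the copy propagation pass in the vec4 backend is strictly local,
we cannot rely on it to turn a constant operand into the immediate that
the new lowering pass checks for in src[1]. Instead, we look at the
constant values coming from NIR and, on Gen7+, emit the MUL with the
16-bit immediate already in src1. If the copy propagation pass becomes
smarter in the future, we can reduce the nir_op_imul case in
brw_vec4_nir.cpp to a single multiply.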

total instructions in shared programs: 7082311 -> 7081953 (-0.01%)
instructions in affected programs: 59581 -> 59223 (-0.60%)
helped: 293

total cycles in shared programs: 65765712 -> 65764796 (-0.00%)
cycles in affected programs: 854112 -> 853196 (-0.11%)
helped: 154
HURT: 73
---
 src/mesa/drivers/dri/i965/brw_vec4.cpp     | 67 ++++++++++++++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_vec4.h       |  1 +
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 48 +++++++++------------
 3 files changed, 88 insertions(+), 28 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index b9cf3f6..1644d4d 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -1671,6 +1671,71 @@ vec4_visitor::lower_minmax()
    return progress;
 }
 
+bool
+vec4_visitor::lower_integer_multiplication()
+{
+   bool progress = false;
+   /* Lower D/UD MULs where the hardware reads only 16 bits of one source. */
+   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+      const vec4_builder ibld(this, block, inst);
+
+      if (inst->opcode == BRW_OPCODE_MUL) {
+         if (inst->dst.is_accumulator() ||
+             (inst->src[1].type != BRW_REGISTER_TYPE_D &&
+              inst->src[1].type != BRW_REGISTER_TYPE_UD))
+            continue;
+
+         /* Gen8+ MUL does a 32-bit x 32-bit -> 32-bit multiply directly, so
+          * nothing needs lowering there -- except on CHV/BXT, which cannot.
+          */
+         if (devinfo->gen >= 8 &&
+             !devinfo->is_cherryview && !devinfo->is_broxton)
+            continue;
+
+         if (inst->src[1].file == IMM &&
+             inst->src[1].ud < (1 << 16)) {
+            /* Integer MUL isn't commutative: Gen <= 6 reads only the low 16
+             * bits of src0, Gen >= 7 only the low 16 bits of src1. An immediate
+             * that fits in 16 bits therefore needs just one MUL, with the
+             * immediate in that position (via a temporary on Gen <= 6).
+             * NOTE(review): unlike the MACH path below, conditional_mod is not
+             * carried over to the replacement MUL here -- confirm intended.
+             */
+            if (devinfo->gen < 7) {
+               dst_reg imm(VGRF, alloc.allocate(1), inst->dst.type,
+                           inst->dst.writemask);
+               ibld.MOV(imm, inst->src[1]);
+               ibld.MUL(inst->dst, src_reg(imm), inst->src[0]);
+            } else {
+               ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
+            }
+         } else { /* Otherwise: MUL + MACH via the accumulator, then MOV. */
+            const dst_reg acc(brw_writemask(retype(brw_acc_reg(8),
+                                                   inst->dst.type),
+                                            inst->dst.writemask));
+            const dst_reg null(brw_writemask(retype(brw_null_reg(),
+                                                    inst->dst.type),
+                                             inst->dst.writemask));
+
+            ibld.MUL(acc, inst->src[0], inst->src[1]);
+            ibld.MACH(null, inst->src[0], inst->src[1]);
+            set_condmod(inst->conditional_mod,
+                        ibld.MOV(inst->dst, src_reg(acc)));
+         }
+      } else {
+         continue;
+      }
+
+      inst->remove(block); /* superseded by the instructions emitted above */
+      progress = true;
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
 src_reg
 vec4_visitor::get_timestamp()
 {
@@ -1950,6 +2015,8 @@ vec4_visitor::run()
       OPT(dead_code_eliminate);
    }
 
+   OPT(lower_integer_multiplication);
+
    if (failed)
       return false;
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index d43a5a8..f6f8b12 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -301,6 +301,7 @@ public:
    void resolve_ud_negate(src_reg *reg);
 
    bool lower_minmax();
+   bool lower_integer_multiplication();
 
    src_reg get_timestamp();
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index e4e8c38..10e2f54 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -1039,35 +1039,27 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       break;
 
    case nir_op_imul: {
-      if (devinfo->gen < 8) {
-         nir_const_value *value0 = nir_src_as_const_value(instr->src[0].src);
-         nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);
-
-         /* For integer multiplication, the MUL uses the low 16 bits of one of
-          * the operands (src0 through SNB, src1 on IVB and later). The MACH
-          * accumulates in the contribution of the upper 16 bits of that
-          * operand. If we can determine that one of the args is in the low
-          * 16 bits, though, we can just emit a single MUL.
-          */
-         if (value0 && value0->u32[0] < (1 << 16)) {
-            if (devinfo->gen < 7)
-               emit(MUL(dst, op[0], op[1]));
-            else
-               emit(MUL(dst, op[1], op[0]));
-         } else if (value1 && value1->u32[0] < (1 << 16)) {
-            if (devinfo->gen < 7)
-               emit(MUL(dst, op[1], op[0]));
-            else
-               emit(MUL(dst, op[0], op[1]));
-         } else {
-            struct brw_reg acc = retype(brw_acc_reg(8), dst.type);
-
-            emit(MUL(acc, op[0], op[1]));
-            emit(MACH(dst_null_d(), op[0], op[1]));
-            emit(MOV(dst, src_reg(acc)));
-         }
+      nir_const_value *value0 = nir_src_as_const_value(instr->src[0].src);
+      nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);
+
+      /* For integer multiplication, the MUL uses the low 16 bits of one of
+       * the operands (src0 through SNB, src1 on IVB and later). The MACH
+       * accumulates in the contribution of the upper 16 bits of that
+       * operand. If we can determine that one of the args is in the low
+       * 16 bits, though, we can just emit a single MUL.
+       */
+      if (value0 && value0->u32[0] < (1 << 16)) {
+         if (devinfo->gen < 7)
+            emit(MUL(dst, op[0], op[1]));
+         else
+            emit(MUL(dst, op[1], retype(brw_imm_ud(value0->u32[0]), dst.type)));
+      } else if (value1 && value1->u32[0] < (1 << 16)) {
+         if (devinfo->gen < 7)
+            emit(MUL(dst, op[1], op[0]));
+         else
+            emit(MUL(dst, op[0], retype(brw_imm_ud(value1->u32[0]), dst.type)));
       } else {
-	 emit(MUL(dst, op[0], op[1]));
+         emit(MUL(dst, op[0], op[1]));
       }
       break;
    }
-- 
2.7.3



More information about the mesa-dev mailing list