[Mesa-dev] [PATCH 16/21] i965/fs: Extract dword multiplies

Mon Dec 22 19:29:26 PST 2014

As previously mentioned, dword multiplies have a lot of quirks. Since there are
a few places where we might wish to do such multiplies, extract the
functionality.

This was initially requested by Jason to assist in the NIR porting effort, but
it generally makes sense.

The existing GLSL IR visitor cases (ir_binop_mul, ir_binop_imul_high) are
converted here; other callers will be updated separately for easier bisection.

Cc: Jason Ekstrand <jason.ekstrand at intel.com>
Signed-off-by: Ben Widawsky <ben at bwidawsk.net>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp         | 106 +++++++++++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_fs.h           |   4 +
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp |  87 ++--------------------
 3 files changed, 115 insertions(+), 82 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 3639ed2..30b4e67 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -1650,6 +1650,112 @@ fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 }
 
 void
+fs_visitor::emit_mul_dw(fs_reg dst, fs_reg src0, fs_reg src1,
+                        bool src0_u16, bool src1_u16)
+{
+   assert(type_is_dword(src0.type) && type_is_dword(src1.type));
+
+   /* Starting with GEN8, native dword multiplies are supported; the resulting
+    * high 32b are discarded. Prior to GEN8 the MACH can likewise be skipped
+    * when the caller flags one of the operands as a 16-bit value (see below).
+    */
+   if (brw->gen >= 8) {
+      emit(MUL(dst, src0, src1));
+      return;
+   }
+
+   /* For integer multiplication, the MUL uses the low 16 bits of one of the
+    * operands (src0 on gen6, src1 on gen7); the MACH accumulates in the upper
+    * 16 bits of that operand. Put a flagged 16-bit operand in that slot instead.
+    */
+   if (src0_u16) {
+      if (brw->gen < 7)
+         emit(MUL(dst, src0, src1));
+      else
+         emit(MUL(dst, src1, src0));
+   } else if (src1_u16) {
+      if (brw->gen < 7)
+         emit(MUL(dst, src1, src0));
+      else
+         emit(MUL(dst, src0, src1));
+   } else {
+      unsigned width = brw->gen == 7 ? 8 : dispatch_width;
+      fs_reg acc = fs_reg(retype(brw_acc_reg(width), dst.type));
+      fs_reg null = fs_reg(retype(brw_null_vec(width), dst.type));
+
+      if (brw->gen == 7 && dispatch_width == 16) {
+         emit(MUL(acc, half(src0, 0), half(src1, 0)));
+         emit(MACH(null, half(src0, 0), half(src1, 0)));
+         fs_inst *mov = emit(MOV(half(dst, 0), acc));
+         mov->force_sechalf = true; /* NOTE(review): on half 0? confirm */
+
+         emit(MUL(acc, half(src0, 1), half(src1, 1)));
+         emit(MACH(null, half(src0, 1), half(src1, 1)));
+         mov = emit(MOV(half(dst, 1), acc));
+         mov->force_sechalf = true;
+      } else {
+         emit(MUL(acc, src0, src1));
+         emit(MACH(null, src0, src1));
+         emit(MOV(dst, acc));
+      }
+   }
+}
+
+static enum brw_reg_type
+qw_type_for_mul(fs_reg src0, fs_reg src1)
+{
+   /* Pick the 64-bit type of the temporary that receives the full 32x32
+    * multiply on Gen8: signed (Q) if either source is D, otherwise UQ.
+    *
+    * The docs say direct addressing cannot be used for a destination of more
+    * than 2 registers, which is the case in SIMD16, so like Gen7 the caller
+    * must downgrade the operation to two SIMD8 muls.
+    *
+    * Oddly, the results can be gathered with 1 mov operation, even though the
+    * docs suggest that shouldn't work.
+    */
+   const bool any_signed = src0.type == BRW_REGISTER_TYPE_D ||
+                           src1.type == BRW_REGISTER_TYPE_D;
+   return any_signed ? BRW_REGISTER_TYPE_Q : BRW_REGISTER_TYPE_UQ;
+}
+
+/* Emit the high half of the dword multiply src0 * src1 (ir_binop_imul_high)
+ * into dst: QWORD MUL + MOV64 on Gen8+, accumulator MUL + MACH before that. */
+void
+fs_visitor::emit_mul_dw_high(fs_reg dst, fs_reg src0, fs_reg src1)
+{
+   assert(type_is_dword(src0.type) && type_is_dword(src1.type));
+
+   if (brw->gen >= 8) {
+      fs_reg temp = fs_reg(GRF, virtual_grf_alloc(dispatch_width / 8),
+                           qw_type_for_mul(src0, src1), dispatch_width);
+      if (dispatch_width == 16) {
+         fs_inst *mul = emit(MUL(temp, half(src0, 0), half(src1, 0)));
+         mul->exec_size = 8;
+
+         mul = emit(MUL(temp, half(src0, 1), half(src1, 1)));
+         mul->exec_size = 8;
+      } else {
+         emit(MUL(temp, src0, src1));
+      }
+      emit(SHADER_OPCODE_MOV64, dst, temp);
+   } else {
+      unsigned width = brw->gen == 7 ? 8 : dispatch_width;
+      fs_reg acc = fs_reg(retype(brw_acc_reg(width), dst.type));
+
+      if (brw->gen == 7 && dispatch_width == 16) {
+         emit(MUL(acc, half(src0, 0), half(src1, 0)));
+         emit(MACH(half(dst, 0), half(src0, 0), half(src1, 0)));
+         emit(MUL(acc, half(src0, 1), half(src1, 1)));
+         emit(MACH(half(dst, 1), half(src0, 1), half(src1, 1)));
+      } else {
+         emit(MUL(acc, src0, src1));
+         emit(MACH(dst, src0, src1));
+      }
+   }
+}
+
+void
 fs_visitor::assign_curb_setup()
 {
    if (dispatch_width == 8) {
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 06575a5..6a168ff 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -521,6 +521,10 @@ public:
    fs_reg fix_math_operand(fs_reg src);
    fs_inst *emit_math(enum opcode op, fs_reg dst, fs_reg src0);
    fs_inst *emit_math(enum opcode op, fs_reg dst, fs_reg src0, fs_reg src1);
+   void emit_mul_dw(fs_reg dst, fs_reg src0, fs_reg src1,
+                    bool src0_u16 = false,
+                    bool src1_u16 = false);
+   void emit_mul_dw_high(fs_reg dst, fs_reg src0, fs_reg src1);
    void emit_lrp(const fs_reg &dst, const fs_reg &x, const fs_reg &y,
                  const fs_reg &a);
    void emit_minmax(enum brw_conditional_mod conditionalmod, const fs_reg &dst,
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 96ed17e..29117d6 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -757,93 +757,16 @@ fs_visitor::visit(ir_expression *ir)
       unreachable("not reached: should be handled by ir_sub_to_add_neg");
 
    case ir_binop_mul:
-      if (brw->gen < 8 && ir->type->is_integer()) {
-	 /* For integer multiplication, the MUL uses the low 16 bits
-	  * of one of the operands (src0 on gen6, src1 on gen7).  The
-	  * MACH accumulates in the contribution of the upper 16 bits
-	  * of that operand.
-          */
-         if (ir->operands[0]->is_uint16_constant()) {
-            if (brw->gen < 7)
-               emit(MUL(this->result, op[0], op[1]));
-            else
-               emit(MUL(this->result, op[1], op[0]));
-         } else if (ir->operands[1]->is_uint16_constant()) {
-            if (brw->gen < 7)
-               emit(MUL(this->result, op[1], op[0]));
-            else
-               emit(MUL(this->result, op[0], op[1]));
-         } else {
-            unsigned width = brw->gen == 7 ? 8 : dispatch_width;
-            fs_reg acc = fs_reg(retype(brw_acc_reg(width), this->result.type));
-            fs_reg null = fs_reg(retype(brw_null_vec(width), this->result.type));
-
-            if (brw->gen == 7 && dispatch_width == 16) {
-               emit(MUL(acc, half(op[0], 0), half(op[1], 0)));
-               emit(MACH(null, half(op[0], 0), half(op[1], 0)));
-               fs_inst *mov = emit(MOV(half(this->result, 0), acc));
-               mov->force_sechalf = true;
-
-               emit(MUL(acc, half(op[0], 1), half(op[1], 1)));
-               emit(MACH(null, half(op[0], 1), half(op[1], 1)));
-               mov = emit(MOV(half(this->result, 1), acc));
-               mov->force_sechalf = true;
-            } else {
-               emit(MUL(acc, op[0], op[1]));
-               emit(MACH(null, op[0], op[1]));
-               emit(MOV(this->result, acc));
-            }
-         }
+      if (ir->type->is_integer()) {
+         this->emit_mul_dw(this->result, op[0], op[1],
+                           ir->operands[0]->is_uint16_constant(),
+                           ir->operands[1]->is_uint16_constant());
       } else {
 	 emit(MUL(this->result, op[0], op[1]));
       }
       break;
    case ir_binop_imul_high:
-      if (brw->gen >= 8) {
-         /* Gen8 is able to do the full 32x32 multiply into a QWORD. 
-          * The docs say you cannot use direct addressing for a destination of
-          * more than 2 registers, which is the case in SIMD16. Therefore, like
-          * Gen7, the operation must be downgraded to two SIMD8 muls.
-          *
-          * Oddly, the results can be gathered with 1 mov operation, even though
-          * the docs suggest that shouldn't work.
-          */
-         enum brw_reg_type type = BRW_REGISTER_TYPE_UQ;
-
-         if (op[1].type == BRW_REGISTER_TYPE_D ||
-             op[0].type == BRW_REGISTER_TYPE_D)
-            type = BRW_REGISTER_TYPE_Q;
-
-         fs_reg temp = fs_reg(GRF, virtual_grf_alloc(dispatch_width / 8),
-                              type, dispatch_width);
-         if (dispatch_width == 16) {
-            fs_inst *mul = emit(MUL(temp, half(op[0], 0), half(op[1], 0)));
-            mul->exec_size = 8;
-
-            mul = emit(MUL(temp, half(op[0], 1), half(op[1], 1)));
-            mul->exec_size = 8;
-
-            emit(SHADER_OPCODE_MOV64, this->result, temp);
-         } else {
-            emit(MUL(temp, op[0], op[1]));
-            emit(SHADER_OPCODE_MOV64, this->result, temp);
-         }
-      } else {
-         unsigned width = brw->gen == 7 ? 8 : dispatch_width;
-         fs_reg acc = fs_reg(retype(brw_acc_reg(width), this->result.type));
-
-         if (brw->gen == 7 && dispatch_width == 16) {
-            emit(MUL(acc, half(op[0], 0), half(op[1], 0)));
-            emit(MACH(half(this->result, 0), half(op[0], 0), half(op[1], 0)));
-            emit(MUL(acc, half(op[0], 1), half(op[1], 1)));
-            emit(MACH(half(this->result, 1), half(op[0], 1),
-                     half(op[1], 1)));
-         } else {
-            emit(MUL(acc, op[0], op[1]));
-            emit(MACH(this->result, op[0], op[1]));
-         }
-      }
-
+      emit_mul_dw_high(this->result, op[0], op[1]);
       break;
    case ir_binop_div:
       /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
-- 
2.2.1