[Mesa-dev] [PATCH 07/21] i965/fs: Implement SIMD16 64-bit integer multiplies on Gen 8.

Ben Widawsky benjamin.widawsky at intel.com
Mon Dec 22 19:29:17 PST 2014


This patch uses the new QWORD type introduced on Gen8, which lets us compute
the high 32 bits of a 32x32 multiply (ir_binop_imul_high) without the additional MACH.

As on Gen7, it seems we must split a SIMD16 multiply into two SIMD8 multiplies.
On the bright side, we get the result in 3 instructions (two MULs and one MOV)
with no MACH. MACH is undesirable because it requires the accumulator write
flag, which can hinder optimization passes.

Signed-off-by: Ben Widawsky <ben at bwidawsk.net>
---
 src/mesa/drivers/dri/i965/brw_defines.h        |  2 +
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 12 +++++
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp   | 73 ++++++++++++++------------
 src/mesa/drivers/dri/i965/brw_shader.cpp       |  2 +
 4 files changed, 56 insertions(+), 33 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 28e398d..102ba4a 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -1100,6 +1100,8 @@ enum opcode {
     *   and number of SO primitives needed.
     */
    GS_OPCODE_FF_SYNC_SET_PRIMITIVES,
+
+   SHADER_OPCODE_MOV64,
 };
 
 enum brw_urb_write_flags {
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index c652d65..3a15837 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -1619,6 +1619,18 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
       case BRW_OPCODE_MUL:
 	 brw_MUL(p, dst, src[0], src[1]);
 	 break;
+      case SHADER_OPCODE_MOV64:
+         /* Moves the result of a native QWORD mul (SIMD8, or SIMD16 done as 2
+          * SIMD8s) into dst: the dword at byte offset 4 of each QWORD is read.
+          */
+         assert(brw->gen >= 8);
+         src[0].subnr = 4;
+         src[0].type = dst.type;
+         src[0] = stride(src[0], 8, 4, 2);
+         assert(dst.type == BRW_REGISTER_TYPE_UD ||
+                dst.type == BRW_REGISTER_TYPE_D);
+         brw_MOV(p, dst, src[0]);
+         break;
       case BRW_OPCODE_AVG:
 	 brw_AVG(p, dst, src[0], src[1]);
 	 break;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index de03618..98f1b0d 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -798,46 +798,53 @@ fs_visitor::visit(ir_expression *ir)
 	 emit(MUL(this->result, op[0], op[1]));
       }
       break;
-   case ir_binop_imul_high: {
-      if (brw->gen >= 8)
-         no16("SIMD16 explicit accumulator operands unsupported\n");
+   case ir_binop_imul_high:
+      if (brw->gen >= 8) {
+         /* Gen8 is able to do the full 32x32 multiply into a QWORD.
+          * The docs say you cannot use direct addressing for a destination of
+          * more than 2 registers, which is the case in SIMD16. Therefore, like
+          * Gen7, the operation must be downgraded to two SIMD8 muls.
+          *
+          * Oddly, the results can be gathered with 1 mov operation, even though
+          * the docs suggest that shouldn't work.
+          */
+         enum brw_reg_type type = BRW_REGISTER_TYPE_UQ;
 
-      unsigned width = brw->gen >= 7 ? 8 : dispatch_width;
-      fs_reg acc = fs_reg(retype(brw_acc_reg(width), this->result.type));
+         if (op[1].type == BRW_REGISTER_TYPE_D ||
+             op[0].type == BRW_REGISTER_TYPE_D)
+            type = BRW_REGISTER_TYPE_Q;
 
-      if (brw->gen == 7 && dispatch_width == 16) {
-         emit(MUL(acc, half(op[0], 0), half(op[1], 0)));
-         emit(MACH(half(this->result, 0), half(op[0], 0), half(op[1], 0)));
-         emit(MUL(acc, half(op[0], 1), half(op[1], 1)));
-         emit(MACH(half(this->result, 1), half(op[0], 1),
-                     half(op[1], 1)));
-      } else {
-         fs_inst *mul = emit(MUL(acc, op[0], op[1]));
-         emit(MACH(this->result, op[0], op[1]));
+         fs_reg temp = fs_reg(GRF, virtual_grf_alloc(dispatch_width / 8),
+                              type, dispatch_width);
+         if (dispatch_width == 16) {
+            fs_inst *mul = emit(MUL(temp, half(op[0], 0), half(op[1], 0)));
+            mul->exec_size = 8;
 
-         /* Until Gen8, integer multiplies read 32-bits from one source, and
-          * 16-bits from the other, and relying on the MACH instruction to
-          * generate the high bits of the result.
-          *
-          * On Gen8, the multiply instruction does a full 32x32-bit multiply,
-          * but in order to do a 64x64-bit multiply we have to simulate the
-          * previous behavior and then use a MACH instruction.
-          *
-          * FINISHME: Don't use source modifiers on src1.
-          */
-         if (brw->gen >= 8) {
-            assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
-                   mul->src[1].type == BRW_REGISTER_TYPE_UD);
-            if (mul->src[1].type == BRW_REGISTER_TYPE_D) {
-               mul->src[1].type = BRW_REGISTER_TYPE_W;
-            } else {
-               mul->src[1].type = BRW_REGISTER_TYPE_UW;
-            }
+            mul = emit(MUL(temp, half(op[0], 1), half(op[1], 1)));
+            mul->exec_size = 8;
+
+            emit(SHADER_OPCODE_MOV64, this->result, temp);
+         } else {
+            emit(MUL(temp, op[0], op[1]));
+            emit(SHADER_OPCODE_MOV64, this->result, temp);
+         }
+      } else {
+         unsigned width = brw->gen == 7 ? 8 : dispatch_width;
+         fs_reg acc = fs_reg(retype(brw_acc_reg(width), this->result.type));
+
+         if (brw->gen == 7 && dispatch_width == 16) {
+            emit(MUL(acc, half(op[0], 0), half(op[1], 0)));
+            emit(MACH(half(this->result, 0), half(op[0], 0), half(op[1], 0)));
+            emit(MUL(acc, half(op[0], 1), half(op[1], 1)));
+            emit(MACH(half(this->result, 1), half(op[0], 1),
+                     half(op[1], 1)));
+         } else {
+            emit(MUL(acc, op[0], op[1]));
+            emit(MACH(this->result, op[0], op[1]));
          }
       }
 
       break;
-   }
    case ir_binop_div:
       /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
       assert(ir->type->is_integer());
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 1e5227c..5373b75 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -404,6 +404,8 @@ brw_instruction_name(enum opcode op)
    case FS_OPCODE_REP_FB_WRITE:
       return "rep_fb_write";
 
+   case SHADER_OPCODE_MOV64:
+      return "mov64";
    case SHADER_OPCODE_RCP:
       return "rcp";
    case SHADER_OPCODE_RSQ:
-- 
2.2.1



More information about the mesa-dev mailing list