[Mesa-dev] [PATCH 1/6] i965/fs: Lower integer multiplication after optimizations.

Sat May 16 08:37:48 PDT 2015

On Fri, May 15, 2015 at 2:02 PM, Matt Turner <mattst88 at gmail.com> wrote:
> 32-bit x 32-bit integer multiplication requires multiple instructions
> until Broadwell. This patch just lets us treat the MUL instruction in
> the FS backend like it operates on Broadwell, and after optimizations
> we lower it into a sequence of instructions on older platforms.

Any reason why you chose to re-use the MUL instruction with D types
rather than making a logical IMUL instruction?  I think I have a minor
preference for IMUL but I also understand that it can be a pain to
plumb through.

> Doing this will allow us to some extra optimization on integer
> multiplies.
> ---
>  src/mesa/drivers/dri/i965/brw_fs.cpp         | 66 ++++++++++++++++++++++++++++
>  src/mesa/drivers/dri/i965/brw_fs.h           |  1 +
>  src/mesa/drivers/dri/i965/brw_fs_nir.cpp     | 36 +--------------
>  src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 31 +------------
>  4 files changed, 70 insertions(+), 64 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
> index 08664cf..33199dd 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -3523,6 +3523,71 @@ fs_visitor::lower_load_payload()
>     return progress;
>  }
>
> +bool
> +fs_visitor::lower_integer_multiplication()
> +{
> +   bool progress = false;
> +
> +   /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
> +    * directly, but Cherryview cannot.
> +    */
> +   if (devinfo->gen >= 8 && !devinfo->is_cherryview)
> +      return false;
> +
> +   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
> +      if (inst->opcode != BRW_OPCODE_MUL ||
> +          inst->dst.is_accumulator() ||
> +          (inst->dst.type != BRW_REGISTER_TYPE_D &&
> +           inst->dst.type != BRW_REGISTER_TYPE_UD))
> +         continue;
> +
> +#define insert(instr) inst->insert_before(block, instr)
> +
> +      /* The MUL instruction isn't commutative. On Gen <= 6, only the low
> +       * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
> +       * src1 are used.
> +       *
> +       * If multiplying by an immediate value that fits in 16-bits, do a
> +       * single MUL instruction with that value in the proper location.
> +       */
> +      if (inst->src[1].file == IMM &&
> +          inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
> +         if (devinfo->gen < 7) {
> +            fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
> +                       inst->dst.type, dispatch_width);
> +            insert(MOV(imm, inst->src[1]));
> +            insert(MUL(inst->dst, imm, inst->src[0]));
> +         } else {
> +            insert(MUL(inst->dst, inst->src[0], inst->src[1]));
> +         }
> +      } else {
> +         if (devinfo->gen >= 7)
> +            no16("SIMD16 integer multiply unsupported\n");
> +
> +         const unsigned channels = dispatch_width;
> +         const enum brw_reg_type type = inst->dst.type;
> +         const fs_reg acc(retype(brw_acc_reg(channels), type));
> +         const fs_reg null(retype(brw_null_vec(channels), type));
> +
> +         const fs_reg &src0 = inst->src[0];
> +         const fs_reg &src1 = inst->src[1];
> +
> +         insert(MUL(acc, src0, src1));
> +         insert(MACH(null, src0, src1));
> +         insert(MOV(inst->dst, acc));
> +      }
> +#undef insert
> +
> +      inst->remove(block);
> +      progress = true;
> +   }
> +
> +   if (progress)
> +      invalidate_live_intervals();
> +
> +   return progress;
> +}
> +
>  void
>  fs_visitor::dump_instructions()
>  {
> @@ -4001,6 +4066,7 @@ fs_visitor::optimize()
>     }
>
>     OPT(opt_combine_constants);
> +   OPT(lower_integer_multiplication);
>
>     lower_uniform_pull_constant_loads();
>  }
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
> index 1d7de2e..f94fd1c 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.h
> +++ b/src/mesa/drivers/dri/i965/brw_fs.h
> @@ -257,6 +257,7 @@ public:
>     void no16(const char *msg, ...);
>     void lower_uniform_pull_constant_loads();
>     bool lower_load_payload();
> +   bool lower_integer_multiplication();
>     bool opt_combine_constants();
>
>     void emit_dummy_fs();
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> index 9cfd0e7..5dd8363 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> @@ -780,41 +780,9 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
>        inst->saturate = instr->dest.saturate;
>        break;
>
> -   case nir_op_imul: {
> -      if (devinfo->gen >= 8) {
> -         emit(MUL(result, op[0], op[1]));
> -         break;
> -      } else {
> -         nir_const_value *value0 = nir_src_as_const_value(instr->src[0].src);
> -         nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);
> -
> -         if (value0 && value0->u[0] < (1 << 16)) {
> -            if (devinfo->gen < 7) {
> -               emit(MUL(result, op[0], op[1]));
> -            } else {
> -               emit(MUL(result, op[1], op[0]));
> -            }
> -            break;
> -         } else if (value1 && value1->u[0] < (1 << 16)) {
> -            if (devinfo->gen < 7) {
> -               emit(MUL(result, op[1], op[0]));
> -            } else {
> -               emit(MUL(result, op[0], op[1]));
> -            }
> -            break;
> -         }
> -      }
> -
> -      if (devinfo->gen >= 7)
> -         no16("SIMD16 explicit accumulator operands unsupported\n");
> -
> -      struct brw_reg acc = retype(brw_acc_reg(dispatch_width), result.type);
> -
> -      emit(MUL(acc, op[0], op[1]));
> -      emit(MACH(reg_null_d, op[0], op[1]));
> -      emit(MOV(result, fs_reg(acc)));
> +   case nir_op_imul:
> +      emit(MUL(result, op[0], op[1]));
>        break;
> -   }
>
>     case nir_op_imul_high:
>     case nir_op_umul_high: {
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> index 78f269e..fa3cff4 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> @@ -873,36 +873,7 @@ fs_visitor::visit(ir_expression *ir)
>        unreachable("not reached: should be handled by ir_sub_to_add_neg");
>
>     case ir_binop_mul:
> -      if (devinfo->gen < 8 && ir->type->is_integer()) {
> -        /* For integer multiplication, the MUL uses the low 16 bits
> -         * of one of the operands (src0 on gen6, src1 on gen7).  The
> -         * MACH accumulates in the contribution of the upper 16 bits
> -         * of that operand.
> -          */
> -         if (ir->operands[0]->is_uint16_constant()) {
> -            if (devinfo->gen < 7)
> -               emit(MUL(this->result, op[0], op[1]));
> -            else
> -               emit(MUL(this->result, op[1], op[0]));
> -         } else if (ir->operands[1]->is_uint16_constant()) {
> -            if (devinfo->gen < 7)
> -               emit(MUL(this->result, op[1], op[0]));
> -            else
> -               emit(MUL(this->result, op[0], op[1]));
> -         } else {
> -            if (devinfo->gen >= 7)
> -               no16("SIMD16 explicit accumulator operands unsupported\n");
> -
> -            struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
> -                                        this->result.type);
> -
> -            emit(MUL(acc, op[0], op[1]));
> -            emit(MACH(reg_null_d, op[0], op[1]));
> -            emit(MOV(this->result, fs_reg(acc)));
> -         }
> -      } else {
> -        emit(MUL(this->result, op[0], op[1]));
> -      }
> +      emit(MUL(this->result, op[0], op[1]));
>        break;
>     case ir_binop_imul_high: {
>        if (devinfo->gen >= 7)
> --
> 2.3.6
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev