[Mesa-dev] [PATCH 1/6] i965/fs: Lower integer multiplication after optimizations.
Jason Ekstrand
jason at jlekstrand.net
Sat May 16 08:37:48 PDT 2015
On Fri, May 15, 2015 at 2:02 PM, Matt Turner <mattst88 at gmail.com> wrote:
> 32-bit x 32-bit integer multiplication requires multiple instructions
> until Broadwell. This patch just lets us treat the MUL instruction in
> the FS backend like it operates on Broadwell, and after optimizations
> we lower it into a sequence of instructions on older platforms.
Any reason why you chose to re-use the MUL instruction with D types
rather than making a logical IMUL instruction? I think I have a minor
preference for IMUL but I also understand that it can be a pain to
plumb through.
> Doing this will allow us to some extra optimization on integer
> multiplies.
> ---
> src/mesa/drivers/dri/i965/brw_fs.cpp | 66 ++++++++++++++++++++++++++++
> src/mesa/drivers/dri/i965/brw_fs.h | 1 +
> src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 36 +--------------
> src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 31 +------------
> 4 files changed, 70 insertions(+), 64 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
> index 08664cf..33199dd 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -3523,6 +3523,71 @@ fs_visitor::lower_load_payload()
> return progress;
> }
>
> +bool
> +fs_visitor::lower_integer_multiplication()
> +{
> + bool progress = false;
> +
> + /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
> + * directly, but Cherryview cannot.
> + */
> + if (devinfo->gen >= 8 && !devinfo->is_cherryview)
> + return false;
> +
> + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
> + if (inst->opcode != BRW_OPCODE_MUL ||
> + inst->dst.is_accumulator() ||
> + (inst->dst.type != BRW_REGISTER_TYPE_D &&
> + inst->dst.type != BRW_REGISTER_TYPE_UD))
> + continue;
> +
> +#define insert(instr) inst->insert_before(block, instr)
> +
> + /* The MUL instruction isn't commutative. On Gen <= 6, only the low
> + * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
> + * src1 are used.
> + *
> + * If multiplying by an immediate value that fits in 16-bits, do a
> + * single MUL instruction with that value in the proper location.
> + */
> + if (inst->src[1].file == IMM &&
> + inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
> + if (devinfo->gen < 7) {
> + fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
> + inst->dst.type, dispatch_width);
> + insert(MOV(imm, inst->src[1]));
> + insert(MUL(inst->dst, imm, inst->src[0]));
> + } else {
> + insert(MUL(inst->dst, inst->src[0], inst->src[1]));
> + }
> + } else {
> + if (devinfo->gen >= 7)
> + no16("SIMD16 integer multiply unsupported\n");
> +
> + const unsigned channels = dispatch_width;
> + const enum brw_reg_type type = inst->dst.type;
> + const fs_reg acc(retype(brw_acc_reg(channels), type));
> + const fs_reg null(retype(brw_null_vec(channels), type));
> +
> + const fs_reg &src0 = inst->src[0];
> + const fs_reg &src1 = inst->src[1];
> +
> + insert(MUL(acc, src0, src1));
> + insert(MACH(null, src0, src1));
> + insert(MOV(inst->dst, acc));
> + }
> +#undef insert
> +
> + inst->remove(block);
> + progress = true;
> + }
> +
> + if (progress)
> + invalidate_live_intervals();
> +
> + return progress;
> +}
> +
> void
> fs_visitor::dump_instructions()
> {
> @@ -4001,6 +4066,7 @@ fs_visitor::optimize()
> }
>
> OPT(opt_combine_constants);
> + OPT(lower_integer_multiplication);
>
> lower_uniform_pull_constant_loads();
> }
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
> index 1d7de2e..f94fd1c 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.h
> +++ b/src/mesa/drivers/dri/i965/brw_fs.h
> @@ -257,6 +257,7 @@ public:
> void no16(const char *msg, ...);
> void lower_uniform_pull_constant_loads();
> bool lower_load_payload();
> + bool lower_integer_multiplication();
> bool opt_combine_constants();
>
> void emit_dummy_fs();
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> index 9cfd0e7..5dd8363 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> @@ -780,41 +780,9 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
> inst->saturate = instr->dest.saturate;
> break;
>
> - case nir_op_imul: {
> - if (devinfo->gen >= 8) {
> - emit(MUL(result, op[0], op[1]));
> - break;
> - } else {
> - nir_const_value *value0 = nir_src_as_const_value(instr->src[0].src);
> - nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);
> -
> - if (value0 && value0->u[0] < (1 << 16)) {
> - if (devinfo->gen < 7) {
> - emit(MUL(result, op[0], op[1]));
> - } else {
> - emit(MUL(result, op[1], op[0]));
> - }
> - break;
> - } else if (value1 && value1->u[0] < (1 << 16)) {
> - if (devinfo->gen < 7) {
> - emit(MUL(result, op[1], op[0]));
> - } else {
> - emit(MUL(result, op[0], op[1]));
> - }
> - break;
> - }
> - }
> -
> - if (devinfo->gen >= 7)
> - no16("SIMD16 explicit accumulator operands unsupported\n");
> -
> - struct brw_reg acc = retype(brw_acc_reg(dispatch_width), result.type);
> -
> - emit(MUL(acc, op[0], op[1]));
> - emit(MACH(reg_null_d, op[0], op[1]));
> - emit(MOV(result, fs_reg(acc)));
> + case nir_op_imul:
> + emit(MUL(result, op[0], op[1]));
> break;
> - }
>
> case nir_op_imul_high:
> case nir_op_umul_high: {
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> index 78f269e..fa3cff4 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> @@ -873,36 +873,7 @@ fs_visitor::visit(ir_expression *ir)
> unreachable("not reached: should be handled by ir_sub_to_add_neg");
>
> case ir_binop_mul:
> - if (devinfo->gen < 8 && ir->type->is_integer()) {
> - /* For integer multiplication, the MUL uses the low 16 bits
> - * of one of the operands (src0 on gen6, src1 on gen7). The
> - * MACH accumulates in the contribution of the upper 16 bits
> - * of that operand.
> - */
> - if (ir->operands[0]->is_uint16_constant()) {
> - if (devinfo->gen < 7)
> - emit(MUL(this->result, op[0], op[1]));
> - else
> - emit(MUL(this->result, op[1], op[0]));
> - } else if (ir->operands[1]->is_uint16_constant()) {
> - if (devinfo->gen < 7)
> - emit(MUL(this->result, op[1], op[0]));
> - else
> - emit(MUL(this->result, op[0], op[1]));
> - } else {
> - if (devinfo->gen >= 7)
> - no16("SIMD16 explicit accumulator operands unsupported\n");
> -
> - struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
> - this->result.type);
> -
> - emit(MUL(acc, op[0], op[1]));
> - emit(MACH(reg_null_d, op[0], op[1]));
> - emit(MOV(this->result, fs_reg(acc)));
> - }
> - } else {
> - emit(MUL(this->result, op[0], op[1]));
> - }
> + emit(MUL(this->result, op[0], op[1]));
> break;
> case ir_binop_imul_high: {
> if (devinfo->gen >= 7)
> --
> 2.3.6
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev
More information about the mesa-dev
mailing list