[Mesa-dev] [PATCH 1/2] i965/vec4: Lower integer multiplication after optimizations.
Ian Romanick
idr at freedesktop.org
Tue Apr 19 00:08:31 UTC 2016
On 04/18/2016 04:14 PM, Matt Turner wrote:
> Analogous to commit 1e4e17fbd in the i965/fs backend.
>
> Because the copy propagation pass in the vec4 backend is strictly local,
> we look at the immediate values coming from NIR and emit the multiplies
> we need directly. If the copy propagation pass becomes smarter in the
> future, we can reduce the nir_op_imul case in brw_vec4_nir.cpp to a
> single multiply.
>
> total instructions in shared programs: 7082311 -> 7081953 (-0.01%)
> instructions in affected programs: 59581 -> 59223 (-0.60%)
> helped: 293
>
> total cycles in shared programs: 65765712 -> 65764796 (-0.00%)
> cycles in affected programs: 854112 -> 853196 (-0.11%)
> helped: 154
> HURT: 73
> ---
> src/mesa/drivers/dri/i965/brw_vec4.cpp | 67 ++++++++++++++++++++++++++++++
> src/mesa/drivers/dri/i965/brw_vec4.h | 1 +
> src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 48 +++++++++------------
> 3 files changed, 88 insertions(+), 28 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
> index b9cf3f6..1644d4d 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
> @@ -1671,6 +1671,71 @@ vec4_visitor::lower_minmax()
> return progress;
> }
>
> +bool
> +vec4_visitor::lower_integer_multiplication()
> +{
> + bool progress = false;
> +
> + foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
> + const vec4_builder ibld(this, block, inst);
> +
> + if (inst->opcode == BRW_OPCODE_MUL) {
> + if (inst->dst.is_accumulator() ||
> + (inst->src[1].type != BRW_REGISTER_TYPE_D &&
> + inst->src[1].type != BRW_REGISTER_TYPE_UD))
> + continue;
> +
> + /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit
> + * operation directly, but CHV/BXT cannot.
> + */
> + if (devinfo->gen >= 8 &&
> + !devinfo->is_cherryview && !devinfo->is_broxton)
> + continue;
Shouldn't this whole method just bail if we're Gen >= 8 and !CHV and
!BXT? Or does this structure simplify future changes?
> +
> + if (inst->src[1].file == IMM &&
> + inst->src[1].ud < (1 << 16)) {
> + /* The MUL instruction isn't commutative. On Gen <= 6, only the low
> + * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
> + * src1 are used.
> + *
> + * If multiplying by an immediate value that fits in 16-bits, do a
> + * single MUL instruction with that value in the proper location.
> + */
> + if (devinfo->gen < 7) {
> + dst_reg imm(VGRF, alloc.allocate(1), inst->dst.type,
> + inst->dst.writemask);
> + ibld.MOV(imm, inst->src[1]);
> + ibld.MUL(inst->dst, src_reg(imm), inst->src[0]);
> + } else {
> + ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
> + }
> + } else {
> + const dst_reg acc(brw_writemask(retype(brw_acc_reg(8),
> + inst->dst.type),
> + inst->dst.writemask));
> + const dst_reg null(brw_writemask(retype(brw_null_reg(),
> + inst->dst.type),
> + inst->dst.writemask));
> +
> + ibld.MUL(acc, inst->src[0], inst->src[1]);
> + ibld.MACH(null, inst->src[0], inst->src[1]);
> + set_condmod(inst->conditional_mod,
> + ibld.MOV(inst->dst, src_reg(acc)));
> + }
> + } else {
> + continue;
> + }
> +
> + inst->remove(block);
> + progress = true;
> + }
> +
> + if (progress)
> + invalidate_live_intervals();
> +
> + return progress;
> +}
> +
> src_reg
> vec4_visitor::get_timestamp()
> {
> @@ -1950,6 +2015,8 @@ vec4_visitor::run()
> OPT(dead_code_eliminate);
> }
>
> + OPT(lower_integer_multiplication);
> +
> if (failed)
> return false;
>
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
> index d43a5a8..f6f8b12 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4.h
> +++ b/src/mesa/drivers/dri/i965/brw_vec4.h
> @@ -301,6 +301,7 @@ public:
> void resolve_ud_negate(src_reg *reg);
>
> bool lower_minmax();
> + bool lower_integer_multiplication();
>
> src_reg get_timestamp();
>
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
> index e4e8c38..10e2f54 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
> @@ -1039,35 +1039,27 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
> break;
>
> case nir_op_imul: {
> - if (devinfo->gen < 8) {
> - nir_const_value *value0 = nir_src_as_const_value(instr->src[0].src);
> - nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);
> -
> - /* For integer multiplication, the MUL uses the low 16 bits of one of
> - * the operands (src0 through SNB, src1 on IVB and later). The MACH
> - * accumulates in the contribution of the upper 16 bits of that
> - * operand. If we can determine that one of the args is in the low
> - * 16 bits, though, we can just emit a single MUL.
> - */
> - if (value0 && value0->u32[0] < (1 << 16)) {
> - if (devinfo->gen < 7)
> - emit(MUL(dst, op[0], op[1]));
> - else
> - emit(MUL(dst, op[1], op[0]));
> - } else if (value1 && value1->u32[0] < (1 << 16)) {
> - if (devinfo->gen < 7)
> - emit(MUL(dst, op[1], op[0]));
> - else
> - emit(MUL(dst, op[0], op[1]));
> - } else {
> - struct brw_reg acc = retype(brw_acc_reg(8), dst.type);
> -
> - emit(MUL(acc, op[0], op[1]));
> - emit(MACH(dst_null_d(), op[0], op[1]));
> - emit(MOV(dst, src_reg(acc)));
> - }
> + nir_const_value *value0 = nir_src_as_const_value(instr->src[0].src);
> + nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);
> +
> + /* For integer multiplication, the MUL uses the low 16 bits of one of
> + * the operands (src0 through SNB, src1 on IVB and later). The MACH
> + * accumulates in the contribution of the upper 16 bits of that
> + * operand. If we can determine that one of the args is in the low
> + * 16 bits, though, we can just emit a single MUL.
> + */
> + if (value0 && value0->u32[0] < (1 << 16)) {
> + if (devinfo->gen < 7)
> + emit(MUL(dst, op[0], op[1]));
> + else
> + emit(MUL(dst, op[1], retype(brw_imm_ud(value0->u32[0]), dst.type)));
> + } else if (value1 && value1->u32[0] < (1 << 16)) {
> + if (devinfo->gen < 7)
> + emit(MUL(dst, op[1], op[0]));
> + else
> + emit(MUL(dst, op[0], retype(brw_imm_ud(value1->u32[0]), dst.type)));
> } else {
> - emit(MUL(dst, op[0], op[1]));
> + emit(MUL(dst, op[0], op[1]));
> }
> break;
> }
>
More information about the mesa-dev
mailing list