[Mesa-dev] [PATCH 1/2] i965/vec4: Lower integer multiplication after optimizations.

Tue Apr 19 00:08:31 UTC 2016

On 04/18/2016 04:14 PM, Matt Turner wrote:
> Analogous to commit 1e4e17fbd in the i965/fs backend.
> 
> Because the copy propagation pass in the vec4 backend is strictly local,
> we look at the immediate values coming from NIR and emit the multiplies
> we need directly. If the copy propagation pass becomes smarter in the
> future, we can reduce the nir_op_imul case in brw_vec4_nir.cpp to a
> single multiply.
> 
> total instructions in shared programs: 7082311 -> 7081953 (-0.01%)
> instructions in affected programs: 59581 -> 59223 (-0.60%)
> helped: 293
> 
> total cycles in shared programs: 65765712 -> 65764796 (-0.00%)
> cycles in affected programs: 854112 -> 853196 (-0.11%)
> helped: 154
> HURT: 73
> ---
>  src/mesa/drivers/dri/i965/brw_vec4.cpp     | 67 ++++++++++++++++++++++++++++++
>  src/mesa/drivers/dri/i965/brw_vec4.h       |  1 +
>  src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 48 +++++++++------------
>  3 files changed, 88 insertions(+), 28 deletions(-)
> 
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
> index b9cf3f6..1644d4d 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
> @@ -1671,6 +1671,71 @@ vec4_visitor::lower_minmax()
>     return progress;
>  }
>  
> +bool
> +vec4_visitor::lower_integer_multiplication()
> +{
> +   bool progress = false;
> +
> +   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
> +      const vec4_builder ibld(this, block, inst);
> +
> +      if (inst->opcode == BRW_OPCODE_MUL) {
> +         if (inst->dst.is_accumulator() ||
> +             (inst->src[1].type != BRW_REGISTER_TYPE_D &&
> +              inst->src[1].type != BRW_REGISTER_TYPE_UD))
> +            continue;
> +
> +         /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit
> +          * operation directly, but CHV/BXT cannot.
> +          */
> +         if (devinfo->gen >= 8 &&
> +             !devinfo->is_cherryview && !devinfo->is_broxton)
> +            continue;

Shouldn't this whole method just bail if we're Gen >= 8 and !CHV and
!BXT?  Or does this structure simplify future changes?

> +
> +         if (inst->src[1].file == IMM &&
> +             inst->src[1].ud < (1 << 16)) {
> +            /* The MUL instruction isn't commutative. On Gen <= 6, only the low
> +             * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
> +             * src1 are used.
> +             *
> +             * If multiplying by an immediate value that fits in 16-bits, do a
> +             * single MUL instruction with that value in the proper location.
> +             */
> +            if (devinfo->gen < 7) {
> +               dst_reg imm(VGRF, alloc.allocate(1), inst->dst.type,
> +                           inst->dst.writemask);
> +               ibld.MOV(imm, inst->src[1]);
> +               ibld.MUL(inst->dst, src_reg(imm), inst->src[0]);
> +            } else {
> +               ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
> +            }
> +         } else {
> +            const dst_reg acc(brw_writemask(retype(brw_acc_reg(8),
> +                                                   inst->dst.type),
> +                                            inst->dst.writemask));
> +            const dst_reg null(brw_writemask(retype(brw_null_reg(),
> +                                                    inst->dst.type),
> +                                             inst->dst.writemask));
> +
> +            ibld.MUL(acc, inst->src[0], inst->src[1]);
> +            ibld.MACH(null, inst->src[0], inst->src[1]);
> +            set_condmod(inst->conditional_mod,
> +                        ibld.MOV(inst->dst, src_reg(acc)));
> +         }
> +      } else {
> +         continue;
> +      }
> +
> +      inst->remove(block);
> +      progress = true;
> +   }
> +
> +   if (progress)
> +      invalidate_live_intervals();
> +
> +   return progress;
> +}
> +
>  src_reg
>  vec4_visitor::get_timestamp()
>  {
> @@ -1950,6 +2015,8 @@ vec4_visitor::run()
>        OPT(dead_code_eliminate);
>     }
>  
> +   OPT(lower_integer_multiplication);
> +
>     if (failed)
>        return false;
>  
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
> index d43a5a8..f6f8b12 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4.h
> +++ b/src/mesa/drivers/dri/i965/brw_vec4.h
> @@ -301,6 +301,7 @@ public:
>     void resolve_ud_negate(src_reg *reg);
>  
>     bool lower_minmax();
> +   bool lower_integer_multiplication();
>  
>     src_reg get_timestamp();
>  
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
> index e4e8c38..10e2f54 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
> @@ -1039,35 +1039,27 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
>        break;
>  
>     case nir_op_imul: {
> -      if (devinfo->gen < 8) {
> -         nir_const_value *value0 = nir_src_as_const_value(instr->src[0].src);
> -         nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);
> -
> -         /* For integer multiplication, the MUL uses the low 16 bits of one of
> -          * the operands (src0 through SNB, src1 on IVB and later). The MACH
> -          * accumulates in the contribution of the upper 16 bits of that
> -          * operand. If we can determine that one of the args is in the low
> -          * 16 bits, though, we can just emit a single MUL.
> -          */
> -         if (value0 && value0->u32[0] < (1 << 16)) {
> -            if (devinfo->gen < 7)
> -               emit(MUL(dst, op[0], op[1]));
> -            else
> -               emit(MUL(dst, op[1], op[0]));
> -         } else if (value1 && value1->u32[0] < (1 << 16)) {
> -            if (devinfo->gen < 7)
> -               emit(MUL(dst, op[1], op[0]));
> -            else
> -               emit(MUL(dst, op[0], op[1]));
> -         } else {
> -            struct brw_reg acc = retype(brw_acc_reg(8), dst.type);
> -
> -            emit(MUL(acc, op[0], op[1]));
> -            emit(MACH(dst_null_d(), op[0], op[1]));
> -            emit(MOV(dst, src_reg(acc)));
> -         }
> +      nir_const_value *value0 = nir_src_as_const_value(instr->src[0].src);
> +      nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);
> +
> +      /* For integer multiplication, the MUL uses the low 16 bits of one of
> +       * the operands (src0 through SNB, src1 on IVB and later). The MACH
> +       * accumulates in the contribution of the upper 16 bits of that
> +       * operand. If we can determine that one of the args is in the low
> +       * 16 bits, though, we can just emit a single MUL.
> +       */
> +      if (value0 && value0->u32[0] < (1 << 16)) {
> +         if (devinfo->gen < 7)
> +            emit(MUL(dst, op[0], op[1]));
> +         else
> +            emit(MUL(dst, op[1], retype(brw_imm_ud(value0->u32[0]), dst.type)));
> +      } else if (value1 && value1->u32[0] < (1 << 16)) {
> +         if (devinfo->gen < 7)
> +            emit(MUL(dst, op[1], op[0]));
> +         else
> +            emit(MUL(dst, op[0], retype(brw_imm_ud(value1->u32[0]), dst.type)));
>        } else {
> -	 emit(MUL(dst, op[0], op[1]));
> +         emit(MUL(dst, op[0], op[1]));
>        }
>        break;
>     }
>