[Mesa-dev] [PATCH 3/5] i965/fs: Indent the implementation of 32x32-bit MUL lowering by one level.

Wed Aug 5 11:39:10 PDT 2015

On Wed, Aug 5, 2015 at 10:52 AM, Francisco Jerez <currojerez at riseup.net> wrote:
> In order to make room for the code that will lower the MULH virtual
> instruction.  Also move the hardware generation and execution type
> checks into the same branch, they are going to have to be different
> for MULH.
> ---
>  src/mesa/drivers/dri/i965/brw_fs.cpp | 265 ++++++++++++++++++-----------------
>  1 file changed, 136 insertions(+), 129 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
> index fc9f007..911e32b 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -3125,155 +3125,162 @@ fs_visitor::lower_integer_multiplication()
>  {
>     bool progress = false;
>
> -   /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
> -    * directly, but CHV/BXT cannot.
> -    */
> -   if (devinfo->gen >= 8 && !devinfo->is_cherryview && !devinfo->is_broxton)
> -      return false;
> -
>     foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
> -      if (inst->opcode != BRW_OPCODE_MUL ||
> -          inst->dst.is_accumulator() ||
> -          (inst->dst.type != BRW_REGISTER_TYPE_D &&
> -           inst->dst.type != BRW_REGISTER_TYPE_UD))
> -         continue;
> -
>        const fs_builder ibld(this, block, inst);
>
> -      /* The MUL instruction isn't commutative. On Gen <= 6, only the low
> -       * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
> -       * src1 are used.
> -       *
> -       * If multiplying by an immediate value that fits in 16-bits, do a
> -       * single MUL instruction with that value in the proper location.
> -       */
> -      if (inst->src[1].file == IMM &&
> -          inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
> -         if (devinfo->gen < 7) {
> -            fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
> -                       inst->dst.type);
> -            ibld.MOV(imm, inst->src[1]);
> -            ibld.MUL(inst->dst, imm, inst->src[0]);
> -         } else {
> -            ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
> -         }
> -      } else {
> -         /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
> -          * do 32-bit integer multiplication in one instruction, but instead
> -          * must do a sequence (which actually calculates a 64-bit result):
> -          *
> -          *    mul(8)  acc0<1>D   g3<8,8,1>D      g4<8,8,1>D
> -          *    mach(8) null       g3<8,8,1>D      g4<8,8,1>D
> -          *    mov(8)  g2<1>D     acc0<8,8,1>D
> -          *
> -          * But on Gen > 6, the ability to use second accumulator register
> -          * (acc1) for non-float data types was removed, preventing a simple
> -          * implementation in SIMD16. A 16-channel result can be calculated by
> -          * executing the three instructions twice in SIMD8, once with quarter
> -          * control of 1Q for the first eight channels and again with 2Q for
> -          * the second eight channels.
> -          *
> -          * Which accumulator register is implicitly accessed (by AccWrEnable
> -          * for instance) is determined by the quarter control. Unfortunately
> -          * Ivybridge (and presumably Baytrail) has a hardware bug in which an
> -          * implicit accumulator access by an instruction with 2Q will access
> -          * acc1 regardless of whether the data type is usable in acc1.
> -          *
> -          * Specifically, the 2Q mach(8) writes acc1 which does not exist for
> -          * integer data types.
> -          *
> -          * Since we only want the low 32-bits of the result, we can do two
> -          * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
> -          * adjust the high result and add them (like the mach is doing):
> -          *
> -          *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<8,8,1>UW
> -          *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<8,8,1>UW
> -          *    shl(8)  g9<1>D     g8<8,8,1>D      16D
> -          *    add(8)  g2<1>D     g7<8,8,1>D      g8<8,8,1>D
> -          *
> -          * We avoid the shl instruction by realizing that we only want to add
> -          * the low 16-bits of the "high" result to the high 16-bits of the
> -          * "low" result and using proper regioning on the add:
> -          *
> -          *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<16,8,2>UW
> -          *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<16,8,2>UW
> -          *    add(8)  g7.1<2>UW  g7.1<16,8,2>UW  g8<16,8,2>UW
> -          *
> -          * Since it does not use the (single) accumulator register, we can
> -          * schedule multi-component multiplications much better.
> +      if (inst->opcode == BRW_OPCODE_MUL) {
> +         if (inst->dst.is_accumulator() ||
> +             (inst->dst.type != BRW_REGISTER_TYPE_D &&
> +              inst->dst.type != BRW_REGISTER_TYPE_UD))
> +            continue;
> +
> +         /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit
> +          * operation directly, but CHV/BXT cannot.
>            */
> +         if (devinfo->gen >= 8 &&
> +             !devinfo->is_cherryview && !devinfo->is_broxton)
> +            continue;
>
> -         if (inst->conditional_mod && inst->dst.is_null()) {
> -            inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
> -                               inst->dst.type);
> -         }
> -         fs_reg low = inst->dst;
> -         fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
> -                     inst->dst.type);
> +         if (inst->src[1].file == IMM &&
> +             inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
> +            /* The MUL instruction isn't commutative. On Gen <= 6, only the low
> +             * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
> +             * src1 are used.
> +             *
> +             * If multiplying by an immediate value that fits in 16-bits, do a
> +             * single MUL instruction with that value in the proper location.
> +             */
> +            if (devinfo->gen < 7) {
> +               fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
> +                          inst->dst.type);
> +               ibld.MOV(imm, inst->src[1]);
> +               ibld.MUL(inst->dst, imm, inst->src[0]);
> +            } else {
> +               ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
> +            }
> +         } else {
> +            /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
> +             * do 32-bit integer multiplication in one instruction, but instead
> +             * must do a sequence (which actually calculates a 64-bit result):
> +             *
> +             *    mul(8)  acc0<1>D   g3<8,8,1>D      g4<8,8,1>D
> +             *    mach(8) null       g3<8,8,1>D      g4<8,8,1>D
> +             *    mov(8)  g2<1>D     acc0<8,8,1>D
> +             *
> +             * But on Gen > 6, the ability to use second accumulator register
> +             * (acc1) for non-float data types was removed, preventing a simple
> +             * implementation in SIMD16. A 16-channel result can be calculated by
> +             * executing the three instructions twice in SIMD8, once with quarter
> +             * control of 1Q for the first eight channels and again with 2Q for
> +             * the second eight channels.
> +             *
> +             * Which accumulator register is implicitly accessed (by AccWrEnable
> +             * for instance) is determined by the quarter control. Unfortunately
> +             * Ivybridge (and presumably Baytrail) has a hardware bug in which an
> +             * implicit accumulator access by an instruction with 2Q will access
> +             * acc1 regardless of whether the data type is usable in acc1.
> +             *
> +             * Specifically, the 2Q mach(8) writes acc1 which does not exist for
> +             * integer data types.
> +             *
> +             * Since we only want the low 32-bits of the result, we can do two
> +             * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
> +             * adjust the high result and add them (like the mach is doing):
> +             *
> +             *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<8,8,1>UW
> +             *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<8,8,1>UW
> +             *    shl(8)  g9<1>D     g8<8,8,1>D      16D
> +             *    add(8)  g2<1>D     g7<8,8,1>D      g8<8,8,1>D
> +             *
> +             * We avoid the shl instruction by realizing that we only want to add
> +             * the low 16-bits of the "high" result to the high 16-bits of the
> +             * "low" result and using proper regioning on the add:
> +             *
> +             *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<16,8,2>UW
> +             *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<16,8,2>UW
> +             *    add(8)  g7.1<2>UW  g7.1<16,8,2>UW  g8<16,8,2>UW
> +             *
> +             * Since it does not use the (single) accumulator register, we can
> +             * schedule multi-component multiplications much better.
> +             */
> +
> +            if (inst->conditional_mod && inst->dst.is_null()) {
> +               inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
> +                                  inst->dst.type);
> +            }
> +            fs_reg low = inst->dst;
> +            fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
> +                        inst->dst.type);
>
> -         if (devinfo->gen >= 7) {
> -            fs_reg src1_0_w = inst->src[1];
> -            fs_reg src1_1_w = inst->src[1];
> +            if (devinfo->gen >= 7) {
> +               fs_reg src1_0_w = inst->src[1];
> +               fs_reg src1_1_w = inst->src[1];
>
> -            if (inst->src[1].file == IMM) {
> -               src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
> -               src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
> -            } else {
> -               src1_0_w.type = BRW_REGISTER_TYPE_UW;
> -               if (src1_0_w.stride != 0) {
> -                  assert(src1_0_w.stride == 1);
> -                  src1_0_w.stride = 2;
> +               if (inst->src[1].file == IMM) {
> +                  src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
> +                  src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
> +               } else {
> +                  src1_0_w.type = BRW_REGISTER_TYPE_UW;
> +                  if (src1_0_w.stride != 0) {
> +                     assert(src1_0_w.stride == 1);
> +                     src1_0_w.stride = 2;
> +                  }
> +
> +                  src1_1_w.type = BRW_REGISTER_TYPE_UW;
> +                  if (src1_1_w.stride != 0) {
> +                     assert(src1_1_w.stride == 1);
> +                     src1_1_w.stride = 2;
> +                  }
> +                  src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
>                 }
> +               ibld.MUL(low, inst->src[0], src1_0_w);
> +               ibld.MUL(high, inst->src[0], src1_1_w);
> +            } else {
> +               fs_reg src0_0_w = inst->src[0];
> +               fs_reg src0_1_w = inst->src[0];
>
> -               src1_1_w.type = BRW_REGISTER_TYPE_UW;
> -               if (src1_1_w.stride != 0) {
> -                  assert(src1_1_w.stride == 1);
> -                  src1_1_w.stride = 2;
> +               src0_0_w.type = BRW_REGISTER_TYPE_UW;
> +               if (src0_0_w.stride != 0) {
> +                  assert(src0_0_w.stride == 1);
> +                  src0_0_w.stride = 2;
>                 }
> -               src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
> -            }
> -            ibld.MUL(low, inst->src[0], src1_0_w);
> -            ibld.MUL(high, inst->src[0], src1_1_w);
> -         } else {
> -            fs_reg src0_0_w = inst->src[0];
> -            fs_reg src0_1_w = inst->src[0];
>
> -            src0_0_w.type = BRW_REGISTER_TYPE_UW;
> -            if (src0_0_w.stride != 0) {
> -               assert(src0_0_w.stride == 1);
> -               src0_0_w.stride = 2;
> -            }
> +               src0_1_w.type = BRW_REGISTER_TYPE_UW;
> +               if (src0_1_w.stride != 0) {
> +                  assert(src0_1_w.stride == 1);
> +                  src0_1_w.stride = 2;
> +               }
> +               src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
>
> -            src0_1_w.type = BRW_REGISTER_TYPE_UW;
> -            if (src0_1_w.stride != 0) {
> -               assert(src0_1_w.stride == 1);
> -               src0_1_w.stride = 2;
> +               ibld.MUL(low, src0_0_w, inst->src[1]);
> +               ibld.MUL(high, src0_1_w, inst->src[1]);
>              }
> -            src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
>
> -            ibld.MUL(low, src0_0_w, inst->src[1]);
> -            ibld.MUL(high, src0_1_w, inst->src[1]);
> -         }
> +            fs_reg dst = inst->dst;
> +            dst.type = BRW_REGISTER_TYPE_UW;
> +            dst.subreg_offset = 2;
> +            dst.stride = 2;
>
> -         fs_reg dst = inst->dst;
> -         dst.type = BRW_REGISTER_TYPE_UW;
> -         dst.subreg_offset = 2;
> -         dst.stride = 2;
> +            high.type = BRW_REGISTER_TYPE_UW;
> +            high.stride = 2;
>
> -         high.type = BRW_REGISTER_TYPE_UW;
> -         high.stride = 2;
> +            low.type = BRW_REGISTER_TYPE_UW;
> +            low.subreg_offset = 2;
> +            low.stride = 2;
>
> -         low.type = BRW_REGISTER_TYPE_UW;
> -         low.subreg_offset = 2;
> -         low.stride = 2;
> +            ibld.ADD(dst, low, high);
>
> -         ibld.ADD(dst, low, high);
> +            if (inst->conditional_mod) {
> +               fs_reg null(retype(ibld.null_reg_f(), inst->dst.type));
> +               set_condmod(inst->conditional_mod,
> +                           ibld.MOV(null, inst->dst));
> +            }
> +         }
>
> -         if (inst->conditional_mod) {
> -            fs_reg null(retype(ibld.null_reg_f(), inst->dst.type));
> -            set_condmod(inst->conditional_mod,
> -                        ibld.MOV(null, inst->dst));
>           }
> +

Extra newline before the else.

> +      } else {
> +         continue;
>        }

There's a build-break here, probably a rebase problem in splitting
this from the next patch. There seems to be an extra } that should be
in the next patch.

With that fixed, these five patches are

Reviewed-by: Matt Turner <mattst88 at gmail.com>