[Mesa-dev] [PATCH 3/5] i965/fs: Indent the implementation of 32x32-bit MUL lowering by one level.

Wed Aug 5 11:42:54 PDT 2015

Matt Turner <mattst88 at gmail.com> writes:

> On Wed, Aug 5, 2015 at 10:52 AM, Francisco Jerez <currojerez at riseup.net> wrote:
>> In order to make room for the code that will lower the MULH virtual
>> instruction.  Also move the hardware generation and execution type
>> checks into the same branch, they are going to have to be different
>> for MULH.
>> ---
>>  src/mesa/drivers/dri/i965/brw_fs.cpp | 265 ++++++++++++++++++-----------------
>>  1 file changed, 136 insertions(+), 129 deletions(-)
>>
>> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
>> index fc9f007..911e32b 100644
>> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
>> @@ -3125,155 +3125,162 @@ fs_visitor::lower_integer_multiplication()
>>  {
>>     bool progress = false;
>>
>> -   /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
>> -    * directly, but CHV/BXT cannot.
>> -    */
>> -   if (devinfo->gen >= 8 && !devinfo->is_cherryview && !devinfo->is_broxton)
>> -      return false;
>> -
>>     foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
>> -      if (inst->opcode != BRW_OPCODE_MUL ||
>> -          inst->dst.is_accumulator() ||
>> -          (inst->dst.type != BRW_REGISTER_TYPE_D &&
>> -           inst->dst.type != BRW_REGISTER_TYPE_UD))
>> -         continue;
>> -
>>        const fs_builder ibld(this, block, inst);
>>
>> -      /* The MUL instruction isn't commutative. On Gen <= 6, only the low
>> -       * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
>> -       * src1 are used.
>> -       *
>> -       * If multiplying by an immediate value that fits in 16-bits, do a
>> -       * single MUL instruction with that value in the proper location.
>> -       */
>> -      if (inst->src[1].file == IMM &&
>> -          inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
>> -         if (devinfo->gen < 7) {
>> -            fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
>> -                       inst->dst.type);
>> -            ibld.MOV(imm, inst->src[1]);
>> -            ibld.MUL(inst->dst, imm, inst->src[0]);
>> -         } else {
>> -            ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
>> -         }
>> -      } else {
>> -         /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
>> -          * do 32-bit integer multiplication in one instruction, but instead
>> -          * must do a sequence (which actually calculates a 64-bit result):
>> -          *
>> -          *    mul(8)  acc0<1>D   g3<8,8,1>D      g4<8,8,1>D
>> -          *    mach(8) null       g3<8,8,1>D      g4<8,8,1>D
>> -          *    mov(8)  g2<1>D     acc0<8,8,1>D
>> -          *
>> -          * But on Gen > 6, the ability to use second accumulator register
>> -          * (acc1) for non-float data types was removed, preventing a simple
>> -          * implementation in SIMD16. A 16-channel result can be calculated by
>> -          * executing the three instructions twice in SIMD8, once with quarter
>> -          * control of 1Q for the first eight channels and again with 2Q for
>> -          * the second eight channels.
>> -          *
>> -          * Which accumulator register is implicitly accessed (by AccWrEnable
>> -          * for instance) is determined by the quarter control. Unfortunately
>> -          * Ivybridge (and presumably Baytrail) has a hardware bug in which an
>> -          * implicit accumulator access by an instruction with 2Q will access
>> -          * acc1 regardless of whether the data type is usable in acc1.
>> -          *
>> -          * Specifically, the 2Q mach(8) writes acc1 which does not exist for
>> -          * integer data types.
>> -          *
>> -          * Since we only want the low 32-bits of the result, we can do two
>> -          * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
>> -          * adjust the high result and add them (like the mach is doing):
>> -          *
>> -          *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<8,8,1>UW
>> -          *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<8,8,1>UW
>> -          *    shl(8)  g9<1>D     g8<8,8,1>D      16D
>> -          *    add(8)  g2<1>D     g7<8,8,1>D      g8<8,8,1>D
>> -          *
>> -          * We avoid the shl instruction by realizing that we only want to add
>> -          * the low 16-bits of the "high" result to the high 16-bits of the
>> -          * "low" result and using proper regioning on the add:
>> -          *
>> -          *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<16,8,2>UW
>> -          *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<16,8,2>UW
>> -          *    add(8)  g7.1<2>UW  g7.1<16,8,2>UW  g8<16,8,2>UW
>> -          *
>> -          * Since it does not use the (single) accumulator register, we can
>> -          * schedule multi-component multiplications much better.
>> +      if (inst->opcode == BRW_OPCODE_MUL) {
>> +         if (inst->dst.is_accumulator() ||
>> +             (inst->dst.type != BRW_REGISTER_TYPE_D &&
>> +              inst->dst.type != BRW_REGISTER_TYPE_UD))
>> +            continue;
>> +
>> +         /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit
>> +          * operation directly, but CHV/BXT cannot.
>>            */
>> +         if (devinfo->gen >= 8 &&
>> +             !devinfo->is_cherryview && !devinfo->is_broxton)
>> +            continue;
>>
>> -         if (inst->conditional_mod && inst->dst.is_null()) {
>> -            inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
>> -                               inst->dst.type);
>> -         }
>> -         fs_reg low = inst->dst;
>> -         fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
>> -                     inst->dst.type);
>> +         if (inst->src[1].file == IMM &&
>> +             inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
>> +            /* The MUL instruction isn't commutative. On Gen <= 6, only the low
>> +             * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
>> +             * src1 are used.
>> +             *
>> +             * If multiplying by an immediate value that fits in 16-bits, do a
>> +             * single MUL instruction with that value in the proper location.
>> +             */
>> +            if (devinfo->gen < 7) {
>> +               fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
>> +                          inst->dst.type);
>> +               ibld.MOV(imm, inst->src[1]);
>> +               ibld.MUL(inst->dst, imm, inst->src[0]);
>> +            } else {
>> +               ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
>> +            }
>> +         } else {
>> +            /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
>> +             * do 32-bit integer multiplication in one instruction, but instead
>> +             * must do a sequence (which actually calculates a 64-bit result):
>> +             *
>> +             *    mul(8)  acc0<1>D   g3<8,8,1>D      g4<8,8,1>D
>> +             *    mach(8) null       g3<8,8,1>D      g4<8,8,1>D
>> +             *    mov(8)  g2<1>D     acc0<8,8,1>D
>> +             *
>> +             * But on Gen > 6, the ability to use second accumulator register
>> +             * (acc1) for non-float data types was removed, preventing a simple
>> +             * implementation in SIMD16. A 16-channel result can be calculated by
>> +             * executing the three instructions twice in SIMD8, once with quarter
>> +             * control of 1Q for the first eight channels and again with 2Q for
>> +             * the second eight channels.
>> +             *
>> +             * Which accumulator register is implicitly accessed (by AccWrEnable
>> +             * for instance) is determined by the quarter control. Unfortunately
>> +             * Ivybridge (and presumably Baytrail) has a hardware bug in which an
>> +             * implicit accumulator access by an instruction with 2Q will access
>> +             * acc1 regardless of whether the data type is usable in acc1.
>> +             *
>> +             * Specifically, the 2Q mach(8) writes acc1 which does not exist for
>> +             * integer data types.
>> +             *
>> +             * Since we only want the low 32-bits of the result, we can do two
>> +             * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
>> +             * adjust the high result and add them (like the mach is doing):
>> +             *
>> +             *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<8,8,1>UW
>> +             *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<8,8,1>UW
>> +             *    shl(8)  g9<1>D     g8<8,8,1>D      16D
>> +             *    add(8)  g2<1>D     g7<8,8,1>D      g8<8,8,1>D
>> +             *
>> +             * We avoid the shl instruction by realizing that we only want to add
>> +             * the low 16-bits of the "high" result to the high 16-bits of the
>> +             * "low" result and using proper regioning on the add:
>> +             *
>> +             *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<16,8,2>UW
>> +             *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<16,8,2>UW
>> +             *    add(8)  g7.1<2>UW  g7.1<16,8,2>UW  g8<16,8,2>UW
>> +             *
>> +             * Since it does not use the (single) accumulator register, we can
>> +             * schedule multi-component multiplications much better.
>> +             */
>> +
>> +            if (inst->conditional_mod && inst->dst.is_null()) {
>> +               inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
>> +                                  inst->dst.type);
>> +            }
>> +            fs_reg low = inst->dst;
>> +            fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
>> +                        inst->dst.type);
>>
>> -         if (devinfo->gen >= 7) {
>> -            fs_reg src1_0_w = inst->src[1];
>> -            fs_reg src1_1_w = inst->src[1];
>> +            if (devinfo->gen >= 7) {
>> +               fs_reg src1_0_w = inst->src[1];
>> +               fs_reg src1_1_w = inst->src[1];
>>
>> -            if (inst->src[1].file == IMM) {
>> -               src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
>> -               src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
>> -            } else {
>> -               src1_0_w.type = BRW_REGISTER_TYPE_UW;
>> -               if (src1_0_w.stride != 0) {
>> -                  assert(src1_0_w.stride == 1);
>> -                  src1_0_w.stride = 2;
>> +               if (inst->src[1].file == IMM) {
>> +                  src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
>> +                  src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
>> +               } else {
>> +                  src1_0_w.type = BRW_REGISTER_TYPE_UW;
>> +                  if (src1_0_w.stride != 0) {
>> +                     assert(src1_0_w.stride == 1);
>> +                     src1_0_w.stride = 2;
>> +                  }
>> +
>> +                  src1_1_w.type = BRW_REGISTER_TYPE_UW;
>> +                  if (src1_1_w.stride != 0) {
>> +                     assert(src1_1_w.stride == 1);
>> +                     src1_1_w.stride = 2;
>> +                  }
>> +                  src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
>>                 }
>> +               ibld.MUL(low, inst->src[0], src1_0_w);
>> +               ibld.MUL(high, inst->src[0], src1_1_w);
>> +            } else {
>> +               fs_reg src0_0_w = inst->src[0];
>> +               fs_reg src0_1_w = inst->src[0];
>>
>> -               src1_1_w.type = BRW_REGISTER_TYPE_UW;
>> -               if (src1_1_w.stride != 0) {
>> -                  assert(src1_1_w.stride == 1);
>> -                  src1_1_w.stride = 2;
>> +               src0_0_w.type = BRW_REGISTER_TYPE_UW;
>> +               if (src0_0_w.stride != 0) {
>> +                  assert(src0_0_w.stride == 1);
>> +                  src0_0_w.stride = 2;
>>                 }
>> -               src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
>> -            }
>> -            ibld.MUL(low, inst->src[0], src1_0_w);
>> -            ibld.MUL(high, inst->src[0], src1_1_w);
>> -         } else {
>> -            fs_reg src0_0_w = inst->src[0];
>> -            fs_reg src0_1_w = inst->src[0];
>>
>> -            src0_0_w.type = BRW_REGISTER_TYPE_UW;
>> -            if (src0_0_w.stride != 0) {
>> -               assert(src0_0_w.stride == 1);
>> -               src0_0_w.stride = 2;
>> -            }
>> +               src0_1_w.type = BRW_REGISTER_TYPE_UW;
>> +               if (src0_1_w.stride != 0) {
>> +                  assert(src0_1_w.stride == 1);
>> +                  src0_1_w.stride = 2;
>> +               }
>> +               src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
>>
>> -            src0_1_w.type = BRW_REGISTER_TYPE_UW;
>> -            if (src0_1_w.stride != 0) {
>> -               assert(src0_1_w.stride == 1);
>> -               src0_1_w.stride = 2;
>> +               ibld.MUL(low, src0_0_w, inst->src[1]);
>> +               ibld.MUL(high, src0_1_w, inst->src[1]);
>>              }
>> -            src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
>>
>> -            ibld.MUL(low, src0_0_w, inst->src[1]);
>> -            ibld.MUL(high, src0_1_w, inst->src[1]);
>> -         }
>> +            fs_reg dst = inst->dst;
>> +            dst.type = BRW_REGISTER_TYPE_UW;
>> +            dst.subreg_offset = 2;
>> +            dst.stride = 2;
>>
>> -         fs_reg dst = inst->dst;
>> -         dst.type = BRW_REGISTER_TYPE_UW;
>> -         dst.subreg_offset = 2;
>> -         dst.stride = 2;
>> +            high.type = BRW_REGISTER_TYPE_UW;
>> +            high.stride = 2;
>>
>> -         high.type = BRW_REGISTER_TYPE_UW;
>> -         high.stride = 2;
>> +            low.type = BRW_REGISTER_TYPE_UW;
>> +            low.subreg_offset = 2;
>> +            low.stride = 2;
>>
>> -         low.type = BRW_REGISTER_TYPE_UW;
>> -         low.subreg_offset = 2;
>> -         low.stride = 2;
>> +            ibld.ADD(dst, low, high);
>>
>> -         ibld.ADD(dst, low, high);
>> +            if (inst->conditional_mod) {
>> +               fs_reg null(retype(ibld.null_reg_f(), inst->dst.type));
>> +               set_condmod(inst->conditional_mod,
>> +                           ibld.MOV(null, inst->dst));
>> +            }
>> +         }
>>
>> -         if (inst->conditional_mod) {
>> -            fs_reg null(retype(ibld.null_reg_f(), inst->dst.type));
>> -            set_condmod(inst->conditional_mod,
>> -                        ibld.MOV(null, inst->dst));
>>           }
>> +
>
> Extra newline before the else.
>
>> +      } else {
>> +         continue;
>>        }
>
> There's a build-break here, probably a rebase problem in splitting
> this from the next patch. There seems to be an extra } that should be
> in the next patch.
>

Oops, good catch, I'll fix that.

> With that fixed, these five patches are
>
> Reviewed-by: Matt Turner <mattst88 at gmail.com>

Thanks.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 212 bytes
Desc: not available
URL: <http://lists.freedesktop.org/archives/mesa-dev/attachments/20150805/5eeabea2/attachment.sig>