[Mesa-dev] [PATCH 3/5] i965/fs: Indent the implementation of 32x32-bit MUL lowering by one level.
Francisco Jerez
currojerez at riseup.net
Wed Aug 5 11:42:54 PDT 2015
Matt Turner <mattst88 at gmail.com> writes:
> On Wed, Aug 5, 2015 at 10:52 AM, Francisco Jerez <currojerez at riseup.net> wrote:
>> In order to make room for the code that will lower the MULH virtual
>> instruction. Also move the hardware generation and execution type
>> checks into the same branch, they are going to have to be different
>> for MULH.
>> ---
>> src/mesa/drivers/dri/i965/brw_fs.cpp | 265 ++++++++++++++++++-----------------
>> 1 file changed, 136 insertions(+), 129 deletions(-)
>>
>> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
>> index fc9f007..911e32b 100644
>> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
>> @@ -3125,155 +3125,162 @@ fs_visitor::lower_integer_multiplication()
>> {
>> bool progress = false;
>>
>> - /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
>> - * directly, but CHV/BXT cannot.
>> - */
>> - if (devinfo->gen >= 8 && !devinfo->is_cherryview && !devinfo->is_broxton)
>> - return false;
>> -
>> foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
>> - if (inst->opcode != BRW_OPCODE_MUL ||
>> - inst->dst.is_accumulator() ||
>> - (inst->dst.type != BRW_REGISTER_TYPE_D &&
>> - inst->dst.type != BRW_REGISTER_TYPE_UD))
>> - continue;
>> -
>> const fs_builder ibld(this, block, inst);
>>
>> - /* The MUL instruction isn't commutative. On Gen <= 6, only the low
>> - * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
>> - * src1 are used.
>> - *
>> - * If multiplying by an immediate value that fits in 16-bits, do a
>> - * single MUL instruction with that value in the proper location.
>> - */
>> - if (inst->src[1].file == IMM &&
>> - inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
>> - if (devinfo->gen < 7) {
>> - fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
>> - inst->dst.type);
>> - ibld.MOV(imm, inst->src[1]);
>> - ibld.MUL(inst->dst, imm, inst->src[0]);
>> - } else {
>> - ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
>> - }
>> - } else {
>> - /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
>> - * do 32-bit integer multiplication in one instruction, but instead
>> - * must do a sequence (which actually calculates a 64-bit result):
>> - *
>> - * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
>> - * mach(8) null g3<8,8,1>D g4<8,8,1>D
>> - * mov(8) g2<1>D acc0<8,8,1>D
>> - *
>> - * But on Gen > 6, the ability to use second accumulator register
>> - * (acc1) for non-float data types was removed, preventing a simple
>> - * implementation in SIMD16. A 16-channel result can be calculated by
>> - * executing the three instructions twice in SIMD8, once with quarter
>> - * control of 1Q for the first eight channels and again with 2Q for
>> - * the second eight channels.
>> - *
>> - * Which accumulator register is implicitly accessed (by AccWrEnable
>> - * for instance) is determined by the quarter control. Unfortunately
>> - * Ivybridge (and presumably Baytrail) has a hardware bug in which an
>> - * implicit accumulator access by an instruction with 2Q will access
>> - * acc1 regardless of whether the data type is usable in acc1.
>> - *
>> - * Specifically, the 2Q mach(8) writes acc1 which does not exist for
>> - * integer data types.
>> - *
>> - * Since we only want the low 32-bits of the result, we can do two
>> - * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
>> - * adjust the high result and add them (like the mach is doing):
>> - *
>> - * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
>> - * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
>> - * shl(8) g9<1>D g8<8,8,1>D 16D
>> - * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D
>> - *
>> - * We avoid the shl instruction by realizing that we only want to add
>> - * the low 16-bits of the "high" result to the high 16-bits of the
>> - * "low" result and using proper regioning on the add:
>> - *
>> - * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
>> - * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
>> - * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
>> - *
>> - * Since it does not use the (single) accumulator register, we can
>> - * schedule multi-component multiplications much better.
>> + if (inst->opcode == BRW_OPCODE_MUL) {
>> + if (inst->dst.is_accumulator() ||
>> + (inst->dst.type != BRW_REGISTER_TYPE_D &&
>> + inst->dst.type != BRW_REGISTER_TYPE_UD))
>> + continue;
>> +
>> + /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit
>> + * operation directly, but CHV/BXT cannot.
>> */
>> + if (devinfo->gen >= 8 &&
>> + !devinfo->is_cherryview && !devinfo->is_broxton)
>> + continue;
>>
>> - if (inst->conditional_mod && inst->dst.is_null()) {
>> - inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
>> - inst->dst.type);
>> - }
>> - fs_reg low = inst->dst;
>> - fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
>> - inst->dst.type);
>> + if (inst->src[1].file == IMM &&
>> + inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
>> + /* The MUL instruction isn't commutative. On Gen <= 6, only the low
>> + * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
>> + * src1 are used.
>> + *
>> + * If multiplying by an immediate value that fits in 16-bits, do a
>> + * single MUL instruction with that value in the proper location.
>> + */
>> + if (devinfo->gen < 7) {
>> + fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
>> + inst->dst.type);
>> + ibld.MOV(imm, inst->src[1]);
>> + ibld.MUL(inst->dst, imm, inst->src[0]);
>> + } else {
>> + ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
>> + }
>> + } else {
>> + /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
>> + * do 32-bit integer multiplication in one instruction, but instead
>> + * must do a sequence (which actually calculates a 64-bit result):
>> + *
>> + * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
>> + * mach(8) null g3<8,8,1>D g4<8,8,1>D
>> + * mov(8) g2<1>D acc0<8,8,1>D
>> + *
>> + * But on Gen > 6, the ability to use second accumulator register
>> + * (acc1) for non-float data types was removed, preventing a simple
>> + * implementation in SIMD16. A 16-channel result can be calculated by
>> + * executing the three instructions twice in SIMD8, once with quarter
>> + * control of 1Q for the first eight channels and again with 2Q for
>> + * the second eight channels.
>> + *
>> + * Which accumulator register is implicitly accessed (by AccWrEnable
>> + * for instance) is determined by the quarter control. Unfortunately
>> + * Ivybridge (and presumably Baytrail) has a hardware bug in which an
>> + * implicit accumulator access by an instruction with 2Q will access
>> + * acc1 regardless of whether the data type is usable in acc1.
>> + *
>> + * Specifically, the 2Q mach(8) writes acc1 which does not exist for
>> + * integer data types.
>> + *
>> + * Since we only want the low 32-bits of the result, we can do two
>> + * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
>> + * adjust the high result and add them (like the mach is doing):
>> + *
>> + * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
>> + * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
>> + * shl(8) g9<1>D g8<8,8,1>D 16D
>> + * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D
>> + *
>> + * We avoid the shl instruction by realizing that we only want to add
>> + * the low 16-bits of the "high" result to the high 16-bits of the
>> + * "low" result and using proper regioning on the add:
>> + *
>> + * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
>> + * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
>> + * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
>> + *
>> + * Since it does not use the (single) accumulator register, we can
>> + * schedule multi-component multiplications much better.
>> + */
>> +
>> + if (inst->conditional_mod && inst->dst.is_null()) {
>> + inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
>> + inst->dst.type);
>> + }
>> + fs_reg low = inst->dst;
>> + fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
>> + inst->dst.type);
>>
>> - if (devinfo->gen >= 7) {
>> - fs_reg src1_0_w = inst->src[1];
>> - fs_reg src1_1_w = inst->src[1];
>> + if (devinfo->gen >= 7) {
>> + fs_reg src1_0_w = inst->src[1];
>> + fs_reg src1_1_w = inst->src[1];
>>
>> - if (inst->src[1].file == IMM) {
>> - src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
>> - src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
>> - } else {
>> - src1_0_w.type = BRW_REGISTER_TYPE_UW;
>> - if (src1_0_w.stride != 0) {
>> - assert(src1_0_w.stride == 1);
>> - src1_0_w.stride = 2;
>> + if (inst->src[1].file == IMM) {
>> + src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
>> + src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
>> + } else {
>> + src1_0_w.type = BRW_REGISTER_TYPE_UW;
>> + if (src1_0_w.stride != 0) {
>> + assert(src1_0_w.stride == 1);
>> + src1_0_w.stride = 2;
>> + }
>> +
>> + src1_1_w.type = BRW_REGISTER_TYPE_UW;
>> + if (src1_1_w.stride != 0) {
>> + assert(src1_1_w.stride == 1);
>> + src1_1_w.stride = 2;
>> + }
>> + src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
>> }
>> + ibld.MUL(low, inst->src[0], src1_0_w);
>> + ibld.MUL(high, inst->src[0], src1_1_w);
>> + } else {
>> + fs_reg src0_0_w = inst->src[0];
>> + fs_reg src0_1_w = inst->src[0];
>>
>> - src1_1_w.type = BRW_REGISTER_TYPE_UW;
>> - if (src1_1_w.stride != 0) {
>> - assert(src1_1_w.stride == 1);
>> - src1_1_w.stride = 2;
>> + src0_0_w.type = BRW_REGISTER_TYPE_UW;
>> + if (src0_0_w.stride != 0) {
>> + assert(src0_0_w.stride == 1);
>> + src0_0_w.stride = 2;
>> }
>> - src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
>> - }
>> - ibld.MUL(low, inst->src[0], src1_0_w);
>> - ibld.MUL(high, inst->src[0], src1_1_w);
>> - } else {
>> - fs_reg src0_0_w = inst->src[0];
>> - fs_reg src0_1_w = inst->src[0];
>>
>> - src0_0_w.type = BRW_REGISTER_TYPE_UW;
>> - if (src0_0_w.stride != 0) {
>> - assert(src0_0_w.stride == 1);
>> - src0_0_w.stride = 2;
>> - }
>> + src0_1_w.type = BRW_REGISTER_TYPE_UW;
>> + if (src0_1_w.stride != 0) {
>> + assert(src0_1_w.stride == 1);
>> + src0_1_w.stride = 2;
>> + }
>> + src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
>>
>> - src0_1_w.type = BRW_REGISTER_TYPE_UW;
>> - if (src0_1_w.stride != 0) {
>> - assert(src0_1_w.stride == 1);
>> - src0_1_w.stride = 2;
>> + ibld.MUL(low, src0_0_w, inst->src[1]);
>> + ibld.MUL(high, src0_1_w, inst->src[1]);
>> }
>> - src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
>>
>> - ibld.MUL(low, src0_0_w, inst->src[1]);
>> - ibld.MUL(high, src0_1_w, inst->src[1]);
>> - }
>> + fs_reg dst = inst->dst;
>> + dst.type = BRW_REGISTER_TYPE_UW;
>> + dst.subreg_offset = 2;
>> + dst.stride = 2;
>>
>> - fs_reg dst = inst->dst;
>> - dst.type = BRW_REGISTER_TYPE_UW;
>> - dst.subreg_offset = 2;
>> - dst.stride = 2;
>> + high.type = BRW_REGISTER_TYPE_UW;
>> + high.stride = 2;
>>
>> - high.type = BRW_REGISTER_TYPE_UW;
>> - high.stride = 2;
>> + low.type = BRW_REGISTER_TYPE_UW;
>> + low.subreg_offset = 2;
>> + low.stride = 2;
>>
>> - low.type = BRW_REGISTER_TYPE_UW;
>> - low.subreg_offset = 2;
>> - low.stride = 2;
>> + ibld.ADD(dst, low, high);
>>
>> - ibld.ADD(dst, low, high);
>> + if (inst->conditional_mod) {
>> + fs_reg null(retype(ibld.null_reg_f(), inst->dst.type));
>> + set_condmod(inst->conditional_mod,
>> + ibld.MOV(null, inst->dst));
>> + }
>> + }
>>
>> - if (inst->conditional_mod) {
>> - fs_reg null(retype(ibld.null_reg_f(), inst->dst.type));
>> - set_condmod(inst->conditional_mod,
>> - ibld.MOV(null, inst->dst));
>> }
>> +
>
> Extra newline before the else.
>
>> + } else {
>> + continue;
>> }
>
> There's a build-break here, probably a rebase problem in splitting
> this from the next patch. There seems to be an extra } that should be
> in the next patch.
>
Oops, good catch, I'll fix that.
> With that fixed, these five patches are
>
> Reviewed-by: Matt Turner <mattst88 at gmail.com>
Thanks.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 212 bytes
Desc: not available
URL: <http://lists.freedesktop.org/archives/mesa-dev/attachments/20150805/5eeabea2/attachment.sig>
More information about the mesa-dev
mailing list