[Mesa-dev] [PATCH 3/5] i965/fs: Indent the implementation of 32x32-bit MUL lowering by one level.
Matt Turner
mattst88 at gmail.com
Wed Aug 5 11:39:10 PDT 2015
On Wed, Aug 5, 2015 at 10:52 AM, Francisco Jerez <currojerez at riseup.net> wrote:
> In order to make room for the code that will lower the MULH virtual
> instruction. Also move the hardware generation and execution type
> checks into the same branch, they are going to have to be different
> for MULH.
> ---
> src/mesa/drivers/dri/i965/brw_fs.cpp | 265 ++++++++++++++++++-----------------
> 1 file changed, 136 insertions(+), 129 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
> index fc9f007..911e32b 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -3125,155 +3125,162 @@ fs_visitor::lower_integer_multiplication()
> {
> bool progress = false;
>
> - /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
> - * directly, but CHV/BXT cannot.
> - */
> - if (devinfo->gen >= 8 && !devinfo->is_cherryview && !devinfo->is_broxton)
> - return false;
> -
> foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
> - if (inst->opcode != BRW_OPCODE_MUL ||
> - inst->dst.is_accumulator() ||
> - (inst->dst.type != BRW_REGISTER_TYPE_D &&
> - inst->dst.type != BRW_REGISTER_TYPE_UD))
> - continue;
> -
> const fs_builder ibld(this, block, inst);
>
> - /* The MUL instruction isn't commutative. On Gen <= 6, only the low
> - * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
> - * src1 are used.
> - *
> - * If multiplying by an immediate value that fits in 16-bits, do a
> - * single MUL instruction with that value in the proper location.
> - */
> - if (inst->src[1].file == IMM &&
> - inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
> - if (devinfo->gen < 7) {
> - fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
> - inst->dst.type);
> - ibld.MOV(imm, inst->src[1]);
> - ibld.MUL(inst->dst, imm, inst->src[0]);
> - } else {
> - ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
> - }
> - } else {
> - /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
> - * do 32-bit integer multiplication in one instruction, but instead
> - * must do a sequence (which actually calculates a 64-bit result):
> - *
> - * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
> - * mach(8) null g3<8,8,1>D g4<8,8,1>D
> - * mov(8) g2<1>D acc0<8,8,1>D
> - *
> - * But on Gen > 6, the ability to use second accumulator register
> - * (acc1) for non-float data types was removed, preventing a simple
> - * implementation in SIMD16. A 16-channel result can be calculated by
> - * executing the three instructions twice in SIMD8, once with quarter
> - * control of 1Q for the first eight channels and again with 2Q for
> - * the second eight channels.
> - *
> - * Which accumulator register is implicitly accessed (by AccWrEnable
> - * for instance) is determined by the quarter control. Unfortunately
> - * Ivybridge (and presumably Baytrail) has a hardware bug in which an
> - * implicit accumulator access by an instruction with 2Q will access
> - * acc1 regardless of whether the data type is usable in acc1.
> - *
> - * Specifically, the 2Q mach(8) writes acc1 which does not exist for
> - * integer data types.
> - *
> - * Since we only want the low 32-bits of the result, we can do two
> - * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
> - * adjust the high result and add them (like the mach is doing):
> - *
> - * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
> - * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
> - * shl(8) g9<1>D g8<8,8,1>D 16D
> - * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D
> - *
> - * We avoid the shl instruction by realizing that we only want to add
> - * the low 16-bits of the "high" result to the high 16-bits of the
> - * "low" result and using proper regioning on the add:
> - *
> - * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
> - * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
> - * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
> - *
> - * Since it does not use the (single) accumulator register, we can
> - * schedule multi-component multiplications much better.
> + if (inst->opcode == BRW_OPCODE_MUL) {
> + if (inst->dst.is_accumulator() ||
> + (inst->dst.type != BRW_REGISTER_TYPE_D &&
> + inst->dst.type != BRW_REGISTER_TYPE_UD))
> + continue;
> +
> + /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit
> + * operation directly, but CHV/BXT cannot.
> */
> + if (devinfo->gen >= 8 &&
> + !devinfo->is_cherryview && !devinfo->is_broxton)
> + continue;
>
> - if (inst->conditional_mod && inst->dst.is_null()) {
> - inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
> - inst->dst.type);
> - }
> - fs_reg low = inst->dst;
> - fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
> - inst->dst.type);
> + if (inst->src[1].file == IMM &&
> + inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
> + /* The MUL instruction isn't commutative. On Gen <= 6, only the low
> + * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
> + * src1 are used.
> + *
> + * If multiplying by an immediate value that fits in 16-bits, do a
> + * single MUL instruction with that value in the proper location.
> + */
> + if (devinfo->gen < 7) {
> + fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
> + inst->dst.type);
> + ibld.MOV(imm, inst->src[1]);
> + ibld.MUL(inst->dst, imm, inst->src[0]);
> + } else {
> + ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
> + }
> + } else {
> + /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
> + * do 32-bit integer multiplication in one instruction, but instead
> + * must do a sequence (which actually calculates a 64-bit result):
> + *
> + * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
> + * mach(8) null g3<8,8,1>D g4<8,8,1>D
> + * mov(8) g2<1>D acc0<8,8,1>D
> + *
> + * But on Gen > 6, the ability to use second accumulator register
> + * (acc1) for non-float data types was removed, preventing a simple
> + * implementation in SIMD16. A 16-channel result can be calculated by
> + * executing the three instructions twice in SIMD8, once with quarter
> + * control of 1Q for the first eight channels and again with 2Q for
> + * the second eight channels.
> + *
> + * Which accumulator register is implicitly accessed (by AccWrEnable
> + * for instance) is determined by the quarter control. Unfortunately
> + * Ivybridge (and presumably Baytrail) has a hardware bug in which an
> + * implicit accumulator access by an instruction with 2Q will access
> + * acc1 regardless of whether the data type is usable in acc1.
> + *
> + * Specifically, the 2Q mach(8) writes acc1 which does not exist for
> + * integer data types.
> + *
> + * Since we only want the low 32-bits of the result, we can do two
> + * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
> + * adjust the high result and add them (like the mach is doing):
> + *
> + * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
> + * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
> + * shl(8) g9<1>D g8<8,8,1>D 16D
> + * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D
> + *
> + * We avoid the shl instruction by realizing that we only want to add
> + * the low 16-bits of the "high" result to the high 16-bits of the
> + * "low" result and using proper regioning on the add:
> + *
> + * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
> + * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
> + * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
> + *
> + * Since it does not use the (single) accumulator register, we can
> + * schedule multi-component multiplications much better.
> + */
> +
> + if (inst->conditional_mod && inst->dst.is_null()) {
> + inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
> + inst->dst.type);
> + }
> + fs_reg low = inst->dst;
> + fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
> + inst->dst.type);
>
> - if (devinfo->gen >= 7) {
> - fs_reg src1_0_w = inst->src[1];
> - fs_reg src1_1_w = inst->src[1];
> + if (devinfo->gen >= 7) {
> + fs_reg src1_0_w = inst->src[1];
> + fs_reg src1_1_w = inst->src[1];
>
> - if (inst->src[1].file == IMM) {
> - src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
> - src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
> - } else {
> - src1_0_w.type = BRW_REGISTER_TYPE_UW;
> - if (src1_0_w.stride != 0) {
> - assert(src1_0_w.stride == 1);
> - src1_0_w.stride = 2;
> + if (inst->src[1].file == IMM) {
> + src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
> + src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
> + } else {
> + src1_0_w.type = BRW_REGISTER_TYPE_UW;
> + if (src1_0_w.stride != 0) {
> + assert(src1_0_w.stride == 1);
> + src1_0_w.stride = 2;
> + }
> +
> + src1_1_w.type = BRW_REGISTER_TYPE_UW;
> + if (src1_1_w.stride != 0) {
> + assert(src1_1_w.stride == 1);
> + src1_1_w.stride = 2;
> + }
> + src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
> }
> + ibld.MUL(low, inst->src[0], src1_0_w);
> + ibld.MUL(high, inst->src[0], src1_1_w);
> + } else {
> + fs_reg src0_0_w = inst->src[0];
> + fs_reg src0_1_w = inst->src[0];
>
> - src1_1_w.type = BRW_REGISTER_TYPE_UW;
> - if (src1_1_w.stride != 0) {
> - assert(src1_1_w.stride == 1);
> - src1_1_w.stride = 2;
> + src0_0_w.type = BRW_REGISTER_TYPE_UW;
> + if (src0_0_w.stride != 0) {
> + assert(src0_0_w.stride == 1);
> + src0_0_w.stride = 2;
> }
> - src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
> - }
> - ibld.MUL(low, inst->src[0], src1_0_w);
> - ibld.MUL(high, inst->src[0], src1_1_w);
> - } else {
> - fs_reg src0_0_w = inst->src[0];
> - fs_reg src0_1_w = inst->src[0];
>
> - src0_0_w.type = BRW_REGISTER_TYPE_UW;
> - if (src0_0_w.stride != 0) {
> - assert(src0_0_w.stride == 1);
> - src0_0_w.stride = 2;
> - }
> + src0_1_w.type = BRW_REGISTER_TYPE_UW;
> + if (src0_1_w.stride != 0) {
> + assert(src0_1_w.stride == 1);
> + src0_1_w.stride = 2;
> + }
> + src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
>
> - src0_1_w.type = BRW_REGISTER_TYPE_UW;
> - if (src0_1_w.stride != 0) {
> - assert(src0_1_w.stride == 1);
> - src0_1_w.stride = 2;
> + ibld.MUL(low, src0_0_w, inst->src[1]);
> + ibld.MUL(high, src0_1_w, inst->src[1]);
> }
> - src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
>
> - ibld.MUL(low, src0_0_w, inst->src[1]);
> - ibld.MUL(high, src0_1_w, inst->src[1]);
> - }
> + fs_reg dst = inst->dst;
> + dst.type = BRW_REGISTER_TYPE_UW;
> + dst.subreg_offset = 2;
> + dst.stride = 2;
>
> - fs_reg dst = inst->dst;
> - dst.type = BRW_REGISTER_TYPE_UW;
> - dst.subreg_offset = 2;
> - dst.stride = 2;
> + high.type = BRW_REGISTER_TYPE_UW;
> + high.stride = 2;
>
> - high.type = BRW_REGISTER_TYPE_UW;
> - high.stride = 2;
> + low.type = BRW_REGISTER_TYPE_UW;
> + low.subreg_offset = 2;
> + low.stride = 2;
>
> - low.type = BRW_REGISTER_TYPE_UW;
> - low.subreg_offset = 2;
> - low.stride = 2;
> + ibld.ADD(dst, low, high);
>
> - ibld.ADD(dst, low, high);
> + if (inst->conditional_mod) {
> + fs_reg null(retype(ibld.null_reg_f(), inst->dst.type));
> + set_condmod(inst->conditional_mod,
> + ibld.MOV(null, inst->dst));
> + }
> + }
>
> - if (inst->conditional_mod) {
> - fs_reg null(retype(ibld.null_reg_f(), inst->dst.type));
> - set_condmod(inst->conditional_mod,
> - ibld.MOV(null, inst->dst));
> }
> +
Extra newline before the else.
> + } else {
> + continue;
> }
There's a build-break here, probably a rebase problem in splitting
this from the next patch. There seems to be an extra } that should be
in the next patch.
With that fixed, these five patches are
Reviewed-by: Matt Turner <mattst88 at gmail.com>
More information about the mesa-dev
mailing list