[Beignet] [PATCH V2 3/3] enable byte gather for vload2/3/4/8(offset, char*) on SKL, BXT and BDW
Song, Ruiling
ruiling.song at intel.com
Wed Feb 24 05:49:14 UTC 2016
The patchset LGTM.
Thanks!
Ruiling
> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> Guo Yejun
> Sent: Wednesday, February 24, 2016 4:15 AM
> To: beignet at lists.freedesktop.org
> Cc: Guo, Yejun <yejun.guo at intel.com>
> Subject: [Beignet] [PATCH V2 3/3] enable byte gather for vload2/3/4/8(offset,
> char*) on SKL, BXT and BDW
>
> Currently, the unaligned byte gather is implemented with readByteAsDWord
> for good performance. Change back to the native byte scattered read
> for new GPU generations, where the native read performs better.
>
> As for vload16(1/2/.../15, global char*), the native byte scattered
> read on SKL/BXT performs poorly, so keep the original method for vload16.
>
> As for vload8(1/2/.../7, global char*), the native byte scattered
> read on BDW performs poorly, so keep the original method for vload8.
>
> V2: use UNPACK_BYTE to refine code
> disable vload8
> enable BDW support
>
> Signed-off-by: Guo Yejun <yejun.guo at intel.com>
> ---
> backend/src/backend/gen_insn_selection.cpp | 127 +++++++++++++++++++-----
> -----
> 1 file changed, 85 insertions(+), 42 deletions(-)
>
> diff --git a/backend/src/backend/gen_insn_selection.cpp
> b/backend/src/backend/gen_insn_selection.cpp
> index 9225294..d157009 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -2263,7 +2263,7 @@ extern bool OCL_DEBUGINFO; // first defined by
> calling BVAR in program.cpp
> this->opaque->setHas32X32Mul(true);
> this->opaque->setHasLongType(true);
> this->opaque->setHasDoubleType(true);
> - this->opaque->setSlowByteGather(true);
> + this->opaque->setSlowByteGather(false);
> this->opaque->setHasHalfType(true);
> opt_features = SIOF_LOGICAL_SRCMOD;
> }
> @@ -2283,7 +2283,7 @@ extern bool OCL_DEBUGINFO; // first defined by
> calling BVAR in program.cpp
> this->opaque->setHasLongType(true);
> this->opaque->setHasDoubleType(true);
> this->opaque->setLdMsgOrder(LD_MSG_ORDER_SKL);
> - this->opaque->setSlowByteGather(true);
> + this->opaque->setSlowByteGather(false);
> this->opaque->setHasHalfType(true);
> opt_features = SIOF_LOGICAL_SRCMOD;
> }
> @@ -2294,7 +2294,7 @@ extern bool OCL_DEBUGINFO; // first defined by
> calling BVAR in program.cpp
> this->opaque->setLongRegRestrict(true);
> this->opaque->setHasDoubleType(true);
> this->opaque->setLdMsgOrder(LD_MSG_ORDER_SKL);
> - this->opaque->setSlowByteGather(true);
> + this->opaque->setSlowByteGather(false);
> this->opaque->setHasHalfType(true);
> opt_features = SIOF_LOGICAL_SRCMOD |
> SIOF_OP_MOV_LONG_REG_RESTRICT;
> }
> @@ -3772,53 +3772,96 @@ extern bool OCL_DEBUGINFO; // first defined by
> calling BVAR in program.cpp
>
> if(valueNum > 1) {
> GBE_ASSERT(!isUniform && "vector load should not be uniform. Something
> went wrong.");
> - vector<GenRegister> dst(valueNum);
> - const uint32_t typeSize = getFamilySize(family);
> + //need to investigate the case of GEN_BYTE_SCATTER_WORD later
> + //for GEN_BYTE_SCATTER_BYTE, if the pointer is not aligned to 4, using
> byte gather,
> + // on BDW, vec8 and vec16 are worse. on SKL/BXT, vec16 is
> worse.
> + if(sel.getSlowByteGather() || elemSize == GEN_BYTE_SCATTER_WORD
> + || (elemSize == GEN_BYTE_SCATTER_BYTE && (valueNum == 16 ||
> valueNum == 8))) {
> + vector<GenRegister> dst(valueNum);
> + const uint32_t typeSize = getFamilySize(family);
>
> - for(uint32_t i = 0; i < valueNum; i++)
> - dst[i] = sel.selReg(insn.getValue(i), getType(family));
> + for(uint32_t i = 0; i < valueNum; i++)
> + dst[i] = sel.selReg(insn.getValue(i), getType(family));
>
> - uint32_t effectDataNum = (typeSize*valueNum + 3) / 4;
> - vector<GenRegister> tmp(effectDataNum + 1);
> - vector<GenRegister> tmp2(effectDataNum + 1);
> - vector<GenRegister> effectData(effectDataNum);
> - for(uint32_t i = 0; i < effectDataNum + 1; i++)
> - tmp2[i] = tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD, isUniform),
> ir::TYPE_U32);
> + uint32_t effectDataNum = (typeSize*valueNum + 3) / 4;
> + vector<GenRegister> tmp(effectDataNum + 1);
> + vector<GenRegister> tmp2(effectDataNum + 1);
> + vector<GenRegister> effectData(effectDataNum);
> + for(uint32_t i = 0; i < effectDataNum + 1; i++)
> + tmp2[i] = tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD, isUniform),
> ir::TYPE_U32);
>
> - GenRegister alignedAddr = sel.selReg(sel.reg(FAMILY_DWORD, isUniform),
> ir::TYPE_U32);
> - sel.push();
> - if (isUniform)
> - sel.curr.noMask = 1;
> - sel.AND(alignedAddr, GenRegister::retype(address, GEN_TYPE_UD),
> GenRegister::immud(~0x3));
> - sel.pop();
> + GenRegister alignedAddr = sel.selReg(sel.reg(FAMILY_DWORD, isUniform),
> ir::TYPE_U32);
> + sel.push();
> + if (isUniform)
> + sel.curr.noMask = 1;
> + sel.AND(alignedAddr, GenRegister::retype(address, GEN_TYPE_UD),
> GenRegister::immud(~0x3));
> + sel.pop();
>
> - uint32_t remainedReg = effectDataNum + 1;
> - uint32_t pos = 0;
> - do {
> - uint32_t width = remainedReg > 4 ? 4 : remainedReg;
> - vector<GenRegister> t1(tmp.begin() + pos, tmp.begin() + pos + width);
> - vector<GenRegister> t2(tmp2.begin() + pos, tmp2.begin() + pos + width);
> - if (pos != 0) {
> - sel.push();
> - if (isUniform)
> - sel.curr.noMask = 1;
> - sel.ADD(alignedAddr, alignedAddr, GenRegister::immud(pos * 4));
> - sel.pop();
> + uint32_t remainedReg = effectDataNum + 1;
> + uint32_t pos = 0;
> + do {
> + uint32_t width = remainedReg > 4 ? 4 : remainedReg;
> + vector<GenRegister> t1(tmp.begin() + pos, tmp.begin() + pos + width);
> + vector<GenRegister> t2(tmp2.begin() + pos, tmp2.begin() + pos + width);
> + if (pos != 0) {
> + sel.push();
> + if (isUniform)
> + sel.curr.noMask = 1;
> + sel.ADD(alignedAddr, alignedAddr, GenRegister::immud(pos * 4));
> + sel.pop();
> + }
> + readDWord(sel, t1, alignedAddr, width, bti);
> + remainedReg -= width;
> + pos += width;
> + } while(remainedReg);
> +
> + for(uint32_t i = 0; i < effectDataNum; i++)
> + effectData[i] = sel.selReg(sel.reg(FAMILY_DWORD, isUniform),
> ir::TYPE_U32);
> +
> + getEffectByteData(sel, effectData, tmp, effectDataNum, address,
> isUniform);
> +
> + for(uint32_t i = 0; i < effectDataNum; i++) {
> + unsigned int elemNum = (valueNum - i * (4 / typeSize)) > 4/typeSize ?
> + 4/typeSize : (valueNum - i * (4 / typeSize));
> + sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, effectData[i], typeSize,
> elemNum);
> + }
> + } else {
> + GBE_ASSERT(elemSize == GEN_BYTE_SCATTER_BYTE);
> + GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) :
> sel.selReg(bti.reg, ir::TYPE_U32);
> + vector<GenRegister> dst(valueNum);
> + for(uint32_t i = 0; i < valueNum; i++)
> + dst[i] = sel.selReg(insn.getValue(i), getType(family));
> +
> + GenRegister readDst = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
> + uint32_t valueIndex = 0;
> + uint32_t loopCount = (valueNum + 3) / 4;
> + GenRegister addressForLoop = address;
> +
> + sel.push();
> + if (loopCount > 1) {
> + addressForLoop = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
> + sel.MOV(addressForLoop, address);
> }
> - readDWord(sel, t1, alignedAddr, width, bti);
> - remainedReg -= width;
> - pos += width;
> - } while(remainedReg);
>
> - for(uint32_t i = 0; i < effectDataNum; i++)
> - effectData[i] = sel.selReg(sel.reg(FAMILY_DWORD, isUniform),
> ir::TYPE_U32);
> + for (uint32_t i = 0; i < loopCount; ++i) {
> + uint32_t valueLeft = valueNum - valueIndex;
> + GBE_ASSERT(valueLeft > 1);
> + uint32_t dataSize = 0;
> + if (valueLeft == 2)
> + dataSize = GEN_BYTE_SCATTER_WORD;
> + else
> + dataSize = GEN_BYTE_SCATTER_DWORD;
> + sel.BYTE_GATHER(readDst, addressForLoop, dataSize, b,
> sel.getBTITemps(bti));
>
> - getEffectByteData(sel, effectData, tmp, effectDataNum, address,
> isUniform);
> + // only 4 bytes is gathered even if valueLeft >= 4
> + sel.UNPACK_BYTE(dst.data(), readDst, getFamilySize(FAMILY_BYTE),
> (valueLeft < 4 ? valueLeft : 4));
> + valueIndex += 4;
>
> - for(uint32_t i = 0; i < effectDataNum; i++) {
> - unsigned int elemNum = (valueNum - i * (4 / typeSize)) > 4/typeSize ?
> - 4/typeSize : (valueNum - i * (4 / typeSize));
> - sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, effectData[i], typeSize,
> elemNum);
> + //calculate the new address to read
> + if (valueIndex < valueNum)
> + sel.ADD(addressForLoop, addressForLoop, GenRegister::immud(4));
> + }
> + sel.pop();
> }
> } else {
> GBE_ASSERT(insn.getValueNum() == 1);
> --
> 1.9.1
>
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/beignet
More information about the Beignet
mailing list