[Beignet] [PATCH 2/2] GBE: optimize unaligned char and short data vector loads.
Zhigang Gong
zhigang.gong at linux.intel.com
Mon Sep 1 21:36:59 PDT 2014
On Wed, Aug 27, 2014 at 12:12:44PM +0800, Zhigang Gong wrote:
> Gathering contiguous short/char loads into a single load instruction
> gives us a good opportunity to use untyped loads to optimize them.
>
> This patch enables short/char load gathering in the load/store
> optimization pass. At the backend, it then loads the corresponding
> DWORDs and converts them to short/char values by applying shift and
> bitwise operations.
>
> Benchmarks show that for vload4/8/16 of char, or vload2/4/8/16 of
> short, this patch brings about an 80%-100% improvement.
>
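To make the DWORD-based conversion concrete, here is a minimal scalar C++
sketch of the realignment the backend performs with SHR/SHL/OR. This is an
illustration only, not the patch's code; all names are invented:

    #include <cstdint>
    #include <cstring>

    // Load n shorts (n even) from a possibly unaligned byte offset `addr`
    // in `mem`, using only dword-aligned reads, as the backend does.
    void unalignedVloadShort(const uint8_t *mem, uintptr_t addr,
                             uint16_t *out, uint32_t n) {
      uintptr_t aligned = addr & ~uintptr_t(0x3); // round down to dword
      uint32_t shift = (addr & 0x3) * 8;          // bit offset inside dword
      uint32_t numDW = 2 * n / 4;                 // dwords of payload
      for (uint32_t i = 0; i < numDW; i++) {
        uint32_t lo, hi = 0;
        std::memcpy(&lo, mem + aligned + 4 * i, 4);        // aligned read
        if (shift != 0)                                    // tail spills into next dword
          std::memcpy(&hi, mem + aligned + 4 * (i + 1), 4);
        uint32_t dw = shift ? (lo >> shift) | (hi << (32 - shift)) : lo;
        out[2 * i]     = uint16_t(dw);         // unpack two shorts per dword
        out[2 * i + 1] = uint16_t(dw >> 16);
      }
    }
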
> Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
> ---
> backend/src/backend/gen_insn_selection.cpp | 154 ++++++++++++++++++++---
> backend/src/llvm/llvm_gen_backend.cpp | 14 ++-
> backend/src/llvm/llvm_loadstore_optimization.cpp | 56 +++++----
> 3 files changed, 178 insertions(+), 46 deletions(-)
>
> diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
> index b7a39af..8478616 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -2843,11 +2843,97 @@ namespace gbe
> sel.pop();
> }
>
> - void emitByteGather(Selection::Opaque &sel,
> - const ir::LoadInstruction &insn,
> - const uint32_t elemSize,
> - GenRegister address,
> - ir::BTI bti) const
> + // The address is dw aligned.
> + void emitAlignedByteGather(Selection::Opaque &sel,
> + const ir::LoadInstruction &insn,
> + const uint32_t elemSize,
> + GenRegister address,
> + ir::BTI bti) const
> + {
> + using namespace ir;
> + const uint32_t valueNum = insn.getValueNum();
> + const uint32_t simdWidth = sel.isScalarReg(insn.getValue(0)) ?
> + 1 : sel.ctx.getSimdWidth();
> + RegisterFamily family = getFamily(insn.getValueType());
> +
> + vector<GenRegister> dst(valueNum);
> + const uint32_t typeSize = getFamilySize(family);
> +
> + for(uint32_t i = 0; i < valueNum; i++)
> + dst[i] = sel.selReg(insn.getValue(i), getType(family));
> +
> + uint32_t tmpRegNum = typeSize*valueNum / 4;
> + if (tmpRegNum == 0)
> + tmpRegNum = 1;
> + vector<GenRegister> tmp(tmpRegNum);
> + vector<GenRegister> tmp2(tmpRegNum);
> + vector<Register> tmpReg(tmpRegNum);
> + for(uint32_t i = 0; i < tmpRegNum; i++) {
> + tmpReg[i] = sel.reg(FAMILY_DWORD);
> + tmp2[i] = tmp[i] = GenRegister::udxgrf(simdWidth, tmpReg[i]);
> + }
> +
> + readDWord(sel, tmp, tmp2, address, tmpRegNum, insn.getAddressSpace(), bti);
> +
> + if (valueNum > 1) {
> + for(uint32_t i = 0; i < tmpRegNum; i++)
> + sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, tmp[i], 4/typeSize);
> + }
> + else {
> + if (elemSize == GEN_BYTE_SCATTER_WORD)
> + sel.MOV(GenRegister::retype(dst[0], GEN_TYPE_UW), sel.unpacked_uw(tmpReg[0]));
> + else if (elemSize == GEN_BYTE_SCATTER_BYTE)
> + sel.MOV(GenRegister::retype(dst[0], GEN_TYPE_UB), sel.unpacked_ub(tmpReg[0]));
> + }
> + }
> +
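Note that the aligned path needs no shifting at all: each dword read is
simply unpacked into its 4/typeSize char or short values, or, for the
single-value case, MOVed out through an unpacked word/byte region.
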
> + // Gather effect data to the effectData vector from the tmp vector.
> + // x x d0 d1 | d2 d3 d4 d5 | ... ==> d0 d1 d2 d3 | d4 d5 ...
> + void getEffectByteData(Selection::Opaque &sel,
> + vector<GenRegister> &effectData,
> + vector<GenRegister> &tmp,
> + uint32_t effectDataNum,
> + GenRegister addr,
> + uint32_t simdWidth) const
> + {
> + using namespace ir;
> + GBE_ASSERT(effectData.size() == effectDataNum);
> + GBE_ASSERT(tmp.size() == effectDataNum + 1);
> + sel.push();
> + sel.curr.noMask = 1;
> + for(uint32_t i = 0; i < effectDataNum; i++) {
> + GenRegister tmpH = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
> + GenRegister tmpL = effectData[i];
> + GenRegister shift = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
> + Register shift1Reg = sel.reg(FAMILY_DWORD);
> + GenRegister shift1 = GenRegister::udxgrf(simdWidth, shift1Reg);
> + GenRegister factor = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
> + sel.AND(shift, GenRegister::retype(addr, GEN_TYPE_UD), GenRegister::immud(0x3));
> + sel.SHL(shift, shift, GenRegister::immud(0x3));
> + sel.SHR(tmpL, tmp[i], shift);
> + sel.ADD(shift1, GenRegister::negate(shift), GenRegister::immud(32));
> + sel.push();
> + // Only need to consider the tmpH when the shift is not 32.
> + Register flag = sel.reg(FAMILY_BOOL);
> + sel.curr.physicalFlag = 0;
> + sel.curr.modFlag = 1;
> + sel.curr.predicate = GEN_PREDICATE_NONE;
> + sel.curr.flagIndex = (uint16_t)flag;
> + sel.CMP(GEN_CONDITIONAL_NEQ, GenRegister::unpacked_uw(shift1Reg), GenRegister::immuw(32), factor);
> + sel.curr.modFlag = 0;
> + sel.curr.predicate = GEN_PREDICATE_NORMAL;
> + sel.SHL(tmpH, tmp[i + 1], shift1);
> + sel.OR(effectData[i], tmpL, tmpH);
> + sel.pop();
> + }
> + sel.pop();
> + }
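One subtlety worth spelling out here: effectData[i] aliases tmpL, so the
unpredicated SHR already produces the final value for lanes whose address
happens to be dword aligned (shift == 0); the predicated SHL/OR pair only
contributes for genuinely misaligned lanes. Because each SIMD lane can carry
a different address, this is done with a flag and predication rather than a
branch, and the compare against 32 is needed because the hardware shifter
honors only the low bits of the shift count, so tmp[i + 1] << 32 would not
yield 0. A per-lane scalar model (my sketch, not the patch's code):

    #include <cstdint>

    // Per-lane scalar model of getEffectByteData (illustration only):
    // combine two covering dwords into one realigned payload dword.
    uint32_t effectDword(uint32_t lower, uint32_t upper, uintptr_t addr) {
      uint32_t shift  = (addr & 0x3) * 8;   // 0, 8, 16 or 24
      uint32_t shift1 = 32 - shift;
      uint32_t lo = lower >> shift;         // unpredicated SHR in the patch
      if (shift1 == 32)                     // dword-aligned lane:
        return lo;                          //   the SHR alone is already correct
      uint32_t hi = upper << shift1;        // predicated SHL ...
      return lo | hi;                       // ... and predicated OR
    }
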
> +
> + void emitUnalignedByteGather(Selection::Opaque &sel,
> + const ir::LoadInstruction &insn,
> + const uint32_t elemSize,
> + GenRegister address,
> + ir::BTI bti) const
> {
> using namespace ir;
> const uint32_t valueNum = insn.getValueNum();
> @@ -2862,17 +2948,45 @@ namespace gbe
> for(uint32_t i = 0; i < valueNum; i++)
> dst[i] = sel.selReg(insn.getValue(i), getType(family));
>
> - uint32_t tmpRegNum = typeSize*valueNum / 4;
> - vector<GenRegister> tmp(tmpRegNum);
> - vector<GenRegister> tmp2(tmpRegNum);
> - for(uint32_t i = 0; i < tmpRegNum; i++) {
> + uint32_t effectDataNum = typeSize*valueNum / 4;
> + vector<GenRegister> tmp(effectDataNum + 1);
> + vector<GenRegister> tmp2(effectDataNum + 1);
> + vector<GenRegister> effectData(effectDataNum);
> + for(uint32_t i = 0; i < effectDataNum + 1; i++)
> tmp2[i] = tmp[i] = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
> - }
>
> - readDWord(sel, tmp, tmp2, address, tmpRegNum, insn.getAddressSpace(), bti);
> + GenRegister alignedAddr = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
> + sel.push();
> + if (simdWidth == 1)
> + sel.curr.noMask = 1;
> + sel.AND(alignedAddr, GenRegister::retype(address, GEN_TYPE_UD), GenRegister::immud(~0x3));
> + sel.pop();
>
> - for(uint32_t i = 0; i < tmpRegNum; i++) {
> - sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, tmp[i], 4/typeSize);
> + uint32_t remainedReg = effectDataNum + 1;
> + uint32_t pos = 0;
> + do {
> + uint32_t width = remainedReg > 4 ? 4 : remainedReg;
> + vector<GenRegister> t1(tmp.begin() + pos, tmp.begin() + pos + width);
> + vector<GenRegister> t2(tmp2.begin() + pos, tmp2.begin() + pos + width);
> + if (pos != 0) {
> + sel.push();
> + if (simdWidth == 1)
> + sel.curr.noMask = 1;
> + sel.ADD(alignedAddr, alignedAddr, GenRegister::immud(pos * 4));
> + sel.pop();
> + }
> + readDWord(sel, t1, t2, alignedAddr, width, insn.getAddressSpace(), bti);
> + remainedReg -= width;
> + pos += width;
> + } while(remainedReg);
> +
> + for(uint32_t i = 0; i < effectDataNum; i++)
> + effectData[i] = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
> +
> + getEffectByteData(sel, effectData, tmp, effectDataNum, address, simdWidth);
> +
> + for(uint32_t i = 0; i < effectDataNum; i++) {
> + sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, effectData[i], 4/typeSize);
> }
> } else {
> GBE_ASSERT(insn.getValueNum() == 1);
> @@ -2954,17 +3068,19 @@ namespace gbe
> this->emitRead64(sel, insn, address, bti);
> else if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
> this->emitDWordGather(sel, insn, address, bti);
> - else {
> - this->emitByteGather(sel, insn, elemSize, address, bti);
> - }
> + else if (insn.isAligned() == true)
> + this->emitAlignedByteGather(sel, insn, elemSize, address, bti);
> + else
> + this->emitUnalignedByteGather(sel, insn, elemSize, address, bti);
> } else {
> if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
> this->emitRead64(sel, insn, address, bti);
> else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
> this->emitUntypedRead(sel, insn, address, bti);
> - else {
> - this->emitByteGather(sel, insn, elemSize, address, bti);
> - }
> + else if (insn.isAligned())
> + this->emitAlignedByteGather(sel, insn, elemSize, address, bti);
> + else
> + this->emitUnalignedByteGather(sel, insn, elemSize, address, bti);
> }
> return true;
> }
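On the read loop in emitUnalignedByteGather above: the unaligned path needs
effectDataNum + 1 dwords (one extra to cover tail bytes that spill past the
last payload dword), and, assuming the usual Gen limit of 4 dwords per
untyped read message, the do/while splits the reads into groups of at most
4 registers. A worked example (my illustration, not the patch's code):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // Illustration only: message split for an unaligned vload16 of char.
    int main() {
      uint32_t typeSize = 1, valueNum = 16;
      uint32_t effectDataNum = typeSize * valueNum / 4; // 4 payload dwords
      uint32_t remained = effectDataNum + 1;            // +1 for the tail
      uint32_t pos = 0;
      while (remained) {
        uint32_t width = std::min(remained, 4u);        // <= 4 dwords/message
        std::printf("untyped read of %u dword(s) at alignedAddr + %u\n",
                    width, pos * 4);
        remained -= width;
        pos += width;
      }
      return 0; // prints a 4-dword read at +0, then a 1-dword read at +16
    }
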
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
> index 3a46951..b956bc6 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -614,7 +614,8 @@ namespace gbe
> // batch vec4/8/16 load/store
> INLINE void emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum,
> Value *llvmValue, const ir::Register ptr,
> - const ir::AddressSpace addrSpace, Type * elemType, bool isLoad, ir::BTI bti);
> + const ir::AddressSpace addrSpace, Type * elemType, bool isLoad, ir::BTI bti,
> + bool dwAligned);
> void visitInstruction(Instruction &I) {NOT_SUPPORTED;}
> private:
> ir::ImmediateIndex processConstantImmIndexImpl(Constant *CPV, int32_t index = 0u);
> @@ -3290,7 +3291,8 @@ handle_write_image:
> void GenWriter::emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum,
> Value *llvmValues, const ir::Register ptr,
> const ir::AddressSpace addrSpace,
> - Type * elemType, bool isLoad, ir::BTI bti) {
> + Type * elemType, bool isLoad, ir::BTI bti,
> + bool dwAligned) {
> const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
> uint32_t totalSize = elemNum * getFamilySize(getFamily(type));
> uint32_t msgNum = totalSize > 16 ? totalSize / 16 : 1;
> @@ -3336,9 +3338,9 @@ handle_write_image:
>
> // Emit the instruction
> if (isLoad)
> - ctx.LOAD(type, tuple, addr, addrSpace, perMsgNum, true, bti);
> + ctx.LOAD(type, tuple, addr, addrSpace, perMsgNum, dwAligned, bti);
> else
> - ctx.STORE(type, tuple, addr, addrSpace, perMsgNum, true, bti);
> + ctx.STORE(type, tuple, addr, addrSpace, perMsgNum, dwAligned, bti);
> }
> }
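The only functional change in this hunk is threading the caller's dwAligned
flag through instead of hardcoding true, so batched char/short loads can
reach the new unaligned gather path. For reference, the splitting arithmetic
it feeds (my worked example; perMsgNum's computation is elided from the
hunk, so its formula below is an assumption):

    #include <cassert>
    #include <cstdint>

    // Illustration only: the 16-byte-per-message split for vload16 short.
    int main() {
      uint32_t elemNum = 16, elemSize = 2;                    // 16 shorts
      uint32_t totalSize = elemNum * elemSize;                // 32 bytes
      uint32_t msgNum = totalSize > 16 ? totalSize / 16 : 1;  // 2 messages
      uint32_t perMsgNum = elemNum / msgNum;  // assumption: 8 shorts each
      assert(msgNum == 2 && perMsgNum == 8);
      // Each LOAD/STORE now carries dwAligned from the caller, so the
      // backend can route unaligned cases to emitUnalignedByteGather.
      return 0;
    }
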
>
> @@ -3510,11 +3512,11 @@ handle_write_image:
> // Not supported by the hardware. So, we split the message and we use
> // strided loads and stores
> else {
> - emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding);
> + emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding, dwAligned);
> }
> }
> else if((dataFamily==ir::FAMILY_WORD && elemNum%2==0) || (dataFamily == ir::FAMILY_BYTE && elemNum%4 == 0)) {
> - emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding);
> + emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding, dwAligned);
> } else {
> for (uint32_t elemID = 0; elemID < elemNum; elemID++) {
> if(regTranslator.isUndefConst(llvmValues, elemID))
> diff --git a/backend/src/llvm/llvm_loadstore_optimization.cpp b/backend/src/llvm/llvm_loadstore_optimization.cpp
> index 4bfc7f6..19726b0 100644
> --- a/backend/src/llvm/llvm_loadstore_optimization.cpp
> +++ b/backend/src/llvm/llvm_loadstore_optimization.cpp
> @@ -87,12 +87,12 @@ namespace gbe {
> bool optimizeLoadStore(BasicBlock &BB);
>
> bool isLoadStoreCompatible(Value *A, Value *B);
> - void mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 4> &merged);
> - void mergeStore(BasicBlock &BB, SmallVector<Instruction*, 4> &merged);
> + void mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 16> &merged);
> + void mergeStore(BasicBlock &BB, SmallVector<Instruction*, 16> &merged);
> BasicBlock::iterator findConsecutiveAccess(BasicBlock &BB,
> - SmallVector<Instruction*, 4> &merged,
> + SmallVector<Instruction*, 16> &merged,
> BasicBlock::iterator &start,
> - unsigned maxLimit,
> + unsigned maxVecSize,
> bool isLoad);
>
> virtual const char *getPassName() const {
> @@ -154,11 +154,11 @@ namespace gbe {
> return ((-offset) == sz);
> }
>
> - void GenLoadStoreOptimization::mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 4> &merged) {
> + void GenLoadStoreOptimization::mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 16> &merged) {
> IRBuilder<> Builder(&BB);
>
> unsigned size = merged.size();
> - SmallVector<Value *, 4> values;
> + SmallVector<Value *, 16> values;
> for(unsigned i = 0; i < size; i++) {
> values.push_back(merged[i]);
> }
> @@ -169,7 +169,7 @@ namespace gbe {
> Builder.SetInsertPoint(ld);
> VectorType *vecTy = VectorType::get(ld->getType(), size);
> Value *vecPtr = Builder.CreateBitCast(ld->getPointerOperand(),
> - PointerType::get(vecTy, addrSpace));
> + PointerType::get(vecTy, addrSpace));
> LoadInst *vecValue = Builder.CreateLoad(vecPtr);
> vecValue->setAlignment(align);
>
> @@ -181,9 +181,9 @@ namespace gbe {
>
> BasicBlock::iterator
> GenLoadStoreOptimization::findConsecutiveAccess(BasicBlock &BB,
> - SmallVector<Instruction*, 4> &merged,
> + SmallVector<Instruction*, 16> &merged,
> BasicBlock::iterator &start,
> - unsigned maxLimit,
> + unsigned maxVecSize,
> bool isLoad) {
>
> BasicBlock::iterator stepForward = start;
> @@ -194,6 +194,8 @@ namespace gbe {
> BasicBlock::iterator E = BB.end();
> BasicBlock::iterator J = ++start;
>
> + unsigned maxLimit = maxVecSize * 3;
> +
> for(unsigned ss = 0; J != E && ss <= maxLimit; ++ss, ++J) {
> if((isLoad && isa<LoadInst>(*J)) || (!isLoad && isa<StoreInst>(*J))) {
> if(isLoadStoreCompatible(merged[merged.size()-1], J)) {
> @@ -205,12 +207,12 @@ namespace gbe {
> break;
> }
>
> - if(merged.size() >= 4) break;
> + if(merged.size() > maxVecSize) break;
Tony pointed out this should be merged.size() >= maxVecSize; I have already fixed it locally. Thanks, Tony.
> }
> return stepForward;
> }
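A note on maxLimit above: the old code scanned a fixed window of 10
instructions, which is too small to ever collect 16 byte loads; scaling the
window to maxVecSize * 3 keeps the scan bounded while still leaving room for
a full vload16 run to be found with some unrelated instructions interleaved.
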
>
> - void GenLoadStoreOptimization::mergeStore(BasicBlock &BB, SmallVector<Instruction*, 4> &merged) {
> + void GenLoadStoreOptimization::mergeStore(BasicBlock &BB, SmallVector<Instruction*, 16> &merged) {
> IRBuilder<> Builder(&BB);
>
> unsigned size = merged.size();
> @@ -239,25 +241,37 @@ namespace gbe {
>
> bool GenLoadStoreOptimization::optimizeLoadStore(BasicBlock &BB) {
> bool changed = false;
> - SmallVector<Instruction*, 4> merged;
> + SmallVector<Instruction*, 16> merged;
> for (BasicBlock::iterator BBI = BB.begin(), E = BB.end(); BBI != E;++BBI) {
> if(isa<LoadInst>(*BBI) || isa<StoreInst>(*BBI)) {
> bool isLoad = isa<LoadInst>(*BBI) ? true: false;
> Type *ty = getValueType(BBI);
> if(ty->isVectorTy()) continue;
> - // we only support DWORD data type merge
> - if(!ty->isFloatTy() && !ty->isIntegerTy(32)) continue;
> - BBI = findConsecutiveAccess(BB, merged, BBI, 10, isLoad);
> - if(merged.size() > 1) {
> + // TODO Support DWORD/WORD/BYTE LOAD for store support DWORD only now.
> + if (!(ty->isFloatTy() || ty->isIntegerTy(32) ||
> + ((ty->isIntegerTy(8) || ty->isIntegerTy(16)) && isLoad)))
> + continue;
> + unsigned maxVecSize = (ty->isFloatTy() || ty->isIntegerTy(32)) ? 4 :
> + (ty->isIntegerTy(16) ? 8 : 16);
> + BBI = findConsecutiveAccess(BB, merged, BBI, maxVecSize, isLoad);
> + uint32_t size = merged.size();
> + uint32_t pos = 0;
> + while(size > 1) {
> + unsigned vecSize = (size >= 16) ? 16 :
> + (size >= 8 ? 8 :
> + (size >= 4 ? 4 :
> + (size >= 2 ? 2 : size)));
> + SmallVector<Instruction*, 16> mergedVec(merged.begin() + pos, merged.begin() + pos + vecSize);
> if(isLoad)
> - mergeLoad(BB, merged);
> + mergeLoad(BB, mergedVec);
> else
> - mergeStore(BB, merged);
> + mergeStore(BB, mergedVec);
> // remove merged insn
> - int size = merged.size();
> - for(int i = 0; i < size; i++)
> - merged[i]->eraseFromParent();
> + for(uint32_t i = 0; i < mergedVec.size(); i++)
> + mergedVec[i]->eraseFromParent();
> changed = true;
> + pos += vecSize;
> + size -= vecSize;
> }
> merged.clear();
> }
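To make the splitting loop above concrete (my illustration, not the patch's
code): a run of merged scalar loads is greedily emitted as the largest legal
vector loads first, and a final leftover single load stays scalar.

    #include <cstdint>
    #include <cstdio>

    // Illustration only: greedy vector-size split used by optimizeLoadStore.
    int main() {
      uint32_t size = 11, pos = 0;   // e.g. 11 consecutive char loads found
      while (size > 1) {
        uint32_t vecSize = size >= 16 ? 16 : size >= 8 ? 8 :
                           size >= 4  ? 4  : 2;
        std::printf("emit vec%u load for elements [%u, %u)\n",
                    vecSize, pos, pos + vecSize);
        pos += vecSize;
        size -= vecSize;
      }
      return 0; // prints vec8 [0,8) and vec2 [8,10); element 10 stays scalar
    }
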
> --
> 1.8.3.2
>