[Beignet] [PATCH 2/2] GBE: optimize unaligned char and short data vector loads.
Zhigang Gong
zhigang.gong at intel.com
Tue Aug 26 21:12:44 PDT 2014
Gathering contiguous short/char loads into a single load instruction gives us a
good opportunity to optimize them with an untyped load.
This patch enables short/char load gathering in the load/store optimization
pass. The backend then loads the corresponding DWORDs and converts them to
short/char accordingly by applying shift and bitwise operations.
Benchmarks show that for vload4/8/16 of char, or vload2/4/8/16 of short, this patch brings
about an 80%-100% improvement.
Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
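For reference, below is a minimal host-side C++ sketch (not part of the patch) of the
reconstruction the selection pass performs for the unaligned case: read the
DWORD-aligned words covering the access, then rebuild each effective DWORD with
shifts and ORs before unpacking it into bytes/shorts. The helper name
effectiveDword and the plain-pointer interface are illustrative assumptions only.

  #include <cstdint>

  // base: the DWORD-aligned backing data; addr: the original, possibly
  // unaligned, byte address of the load.
  static uint32_t effectiveDword(const uint32_t *base, uint32_t addr) {
    uint32_t idx   = addr / 4;          // aligned DWORD the access starts in
    uint32_t shift = (addr & 0x3) * 8;  // misalignment, in bits
    uint32_t lo = base[idx] >> shift;
    // Only mix in the next DWORD when the shift is non-zero; shifting a
    // 32-bit value by 32 is undefined, which is why the hardware path
    // predicates the SHL on shift1 != 32.
    uint32_t hi = shift ? (base[idx + 1] << (32 - shift)) : 0;
    return lo | hi;                     // 4 contiguous bytes starting at addr
  }
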
backend/src/backend/gen_insn_selection.cpp | 154 ++++++++++++++++++++---
backend/src/llvm/llvm_gen_backend.cpp | 14 ++-
backend/src/llvm/llvm_loadstore_optimization.cpp | 56 +++++----
3 files changed, 178 insertions(+), 46 deletions(-)
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index b7a39af..8478616 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -2843,11 +2843,97 @@ namespace gbe
sel.pop();
}
- void emitByteGather(Selection::Opaque &sel,
- const ir::LoadInstruction &insn,
- const uint32_t elemSize,
- GenRegister address,
- ir::BTI bti) const
+ // The address is dw aligned.
+ void emitAlignedByteGather(Selection::Opaque &sel,
+ const ir::LoadInstruction &insn,
+ const uint32_t elemSize,
+ GenRegister address,
+ ir::BTI bti) const
+ {
+ using namespace ir;
+ const uint32_t valueNum = insn.getValueNum();
+ const uint32_t simdWidth = sel.isScalarReg(insn.getValue(0)) ?
+ 1 : sel.ctx.getSimdWidth();
+ RegisterFamily family = getFamily(insn.getValueType());
+
+ vector<GenRegister> dst(valueNum);
+ const uint32_t typeSize = getFamilySize(family);
+
+ for(uint32_t i = 0; i < valueNum; i++)
+ dst[i] = sel.selReg(insn.getValue(i), getType(family));
+
+ uint32_t tmpRegNum = typeSize*valueNum / 4;
+ if (tmpRegNum == 0)
+ tmpRegNum = 1;
+ vector<GenRegister> tmp(tmpRegNum);
+ vector<GenRegister> tmp2(tmpRegNum);
+ vector<Register> tmpReg(tmpRegNum);
+ for(uint32_t i = 0; i < tmpRegNum; i++) {
+ tmpReg[i] = sel.reg(FAMILY_DWORD);
+ tmp2[i] = tmp[i] = GenRegister::udxgrf(simdWidth, tmpReg[i]);
+ }
+
+ readDWord(sel, tmp, tmp2, address, tmpRegNum, insn.getAddressSpace(), bti);
+
+ if (valueNum > 1) {
+ for(uint32_t i = 0; i < tmpRegNum; i++)
+ sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, tmp[i], 4/typeSize);
+ }
+ else {
+ if (elemSize == GEN_BYTE_SCATTER_WORD)
+ sel.MOV(GenRegister::retype(dst[0], GEN_TYPE_UW), sel.unpacked_uw(tmpReg[0]));
+ else if (elemSize == GEN_BYTE_SCATTER_BYTE)
+ sel.MOV(GenRegister::retype(dst[0], GEN_TYPE_UB), sel.unpacked_ub(tmpReg[0]));
+ }
+ }
+
+ // Gather the effective data from the tmp vector into the effectData vector.
+ // x x d0 d1 | d2 d3 d4 d5 | ... ==> d0 d1 d2 d3 | d4 d5 ...
+ void getEffectByteData(Selection::Opaque &sel,
+ vector<GenRegister> &effectData,
+ vector<GenRegister> &tmp,
+ uint32_t effectDataNum,
+ GenRegister addr,
+ uint32_t simdWidth) const
+ {
+ using namespace ir;
+ GBE_ASSERT(effectData.size() == effectDataNum);
+ GBE_ASSERT(tmp.size() == effectDataNum + 1);
+ sel.push();
+ sel.curr.noMask = 1;
+ for(uint32_t i = 0; i < effectDataNum; i++) {
+ GenRegister tmpH = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+ GenRegister tmpL = effectData[i];
+ GenRegister shift = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+ Register shift1Reg = sel.reg(FAMILY_DWORD);
+ GenRegister shift1 = GenRegister::udxgrf(simdWidth, shift1Reg);
+ GenRegister factor = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+ sel.AND(shift, GenRegister::retype(addr, GEN_TYPE_UD), GenRegister::immud(0x3));
+ sel.SHL(shift, shift, GenRegister::immud(0x3));
+ sel.SHR(tmpL, tmp[i], shift);
+ sel.ADD(shift1, GenRegister::negate(shift), GenRegister::immud(32));
+ sel.push();
+ // Only need to consider the tmpH when the shift is not 32.
+ Register flag = sel.reg(FAMILY_BOOL);
+ sel.curr.physicalFlag = 0;
+ sel.curr.modFlag = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.flagIndex = (uint16_t)flag;
+ sel.CMP(GEN_CONDITIONAL_NEQ, GenRegister::unpacked_uw(shift1Reg), GenRegister::immuw(32), factor);
+ sel.curr.modFlag = 0;
+ sel.curr.predicate = GEN_PREDICATE_NORMAL;
+ sel.SHL(tmpH, tmp[i + 1], shift1);
+ sel.OR(effectData[i], tmpL, tmpH);
+ sel.pop();
+ }
+ sel.pop();
+ }
+
+ void emitUnalignedByteGather(Selection::Opaque &sel,
+ const ir::LoadInstruction &insn,
+ const uint32_t elemSize,
+ GenRegister address,
+ ir::BTI bti) const
{
using namespace ir;
const uint32_t valueNum = insn.getValueNum();
@@ -2862,17 +2948,45 @@ namespace gbe
for(uint32_t i = 0; i < valueNum; i++)
dst[i] = sel.selReg(insn.getValue(i), getType(family));
- uint32_t tmpRegNum = typeSize*valueNum / 4;
- vector<GenRegister> tmp(tmpRegNum);
- vector<GenRegister> tmp2(tmpRegNum);
- for(uint32_t i = 0; i < tmpRegNum; i++) {
+ uint32_t effectDataNum = typeSize*valueNum / 4;
+ vector<GenRegister> tmp(effectDataNum + 1);
+ vector<GenRegister> tmp2(effectDataNum + 1);
+ vector<GenRegister> effectData(effectDataNum);
+ for(uint32_t i = 0; i < effectDataNum + 1; i++)
tmp2[i] = tmp[i] = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
- }
- readDWord(sel, tmp, tmp2, address, tmpRegNum, insn.getAddressSpace(), bti);
+ GenRegister alignedAddr = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+ sel.push();
+ if (simdWidth == 1)
+ sel.curr.noMask = 1;
+ sel.AND(alignedAddr, GenRegister::retype(address, GEN_TYPE_UD), GenRegister::immud(~0x3));
+ sel.pop();
- for(uint32_t i = 0; i < tmpRegNum; i++) {
- sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, tmp[i], 4/typeSize);
+ uint32_t remainedReg = effectDataNum + 1;
+ uint32_t pos = 0;
+ do {
+ uint32_t width = remainedReg > 4 ? 4 : remainedReg;
+ vector<GenRegister> t1(tmp.begin() + pos, tmp.begin() + pos + width);
+ vector<GenRegister> t2(tmp2.begin() + pos, tmp2.begin() + pos + width);
+ if (pos != 0) {
+ sel.push();
+ if (simdWidth == 1)
+ sel.curr.noMask = 1;
+ sel.ADD(alignedAddr, alignedAddr, GenRegister::immud(pos * 4));
+ sel.pop();
+ }
+ readDWord(sel, t1, t2, alignedAddr, width, insn.getAddressSpace(), bti);
+ remainedReg -= width;
+ pos += width;
+ } while(remainedReg);
+
+ for(uint32_t i = 0; i < effectDataNum; i++)
+ effectData[i] = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+
+ getEffectByteData(sel, effectData, tmp, effectDataNum, address, simdWidth);
+
+ for(uint32_t i = 0; i < effectDataNum; i++) {
+ sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, effectData[i], 4/typeSize);
}
} else {
GBE_ASSERT(insn.getValueNum() == 1);
@@ -2954,17 +3068,19 @@ namespace gbe
this->emitRead64(sel, insn, address, bti);
else if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
this->emitDWordGather(sel, insn, address, bti);
- else {
- this->emitByteGather(sel, insn, elemSize, address, bti);
- }
+ else if (insn.isAligned() == true)
+ this->emitAlignedByteGather(sel, insn, elemSize, address, bti);
+ else
+ this->emitUnalignedByteGather(sel, insn, elemSize, address, bti);
} else {
if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
this->emitRead64(sel, insn, address, bti);
else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
this->emitUntypedRead(sel, insn, address, bti);
- else {
- this->emitByteGather(sel, insn, elemSize, address, bti);
- }
+ else if (insn.isAligned())
+ this->emitAlignedByteGather(sel, insn, elemSize, address, bti);
+ else
+ this->emitUnalignedByteGather(sel, insn, elemSize, address, bti);
}
return true;
}
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 3a46951..b956bc6 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -614,7 +614,8 @@ namespace gbe
// batch vec4/8/16 load/store
INLINE void emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum,
Value *llvmValue, const ir::Register ptr,
- const ir::AddressSpace addrSpace, Type * elemType, bool isLoad, ir::BTI bti);
+ const ir::AddressSpace addrSpace, Type * elemType, bool isLoad, ir::BTI bti,
+ bool dwAligned);
void visitInstruction(Instruction &I) {NOT_SUPPORTED;}
private:
ir::ImmediateIndex processConstantImmIndexImpl(Constant *CPV, int32_t index = 0u);
@@ -3290,7 +3291,8 @@ handle_write_image:
void GenWriter::emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum,
Value *llvmValues, const ir::Register ptr,
const ir::AddressSpace addrSpace,
- Type * elemType, bool isLoad, ir::BTI bti) {
+ Type * elemType, bool isLoad, ir::BTI bti,
+ bool dwAligned) {
const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
uint32_t totalSize = elemNum * getFamilySize(getFamily(type));
uint32_t msgNum = totalSize > 16 ? totalSize / 16 : 1;
@@ -3336,9 +3338,9 @@ handle_write_image:
// Emit the instruction
if (isLoad)
- ctx.LOAD(type, tuple, addr, addrSpace, perMsgNum, true, bti);
+ ctx.LOAD(type, tuple, addr, addrSpace, perMsgNum, dwAligned, bti);
else
- ctx.STORE(type, tuple, addr, addrSpace, perMsgNum, true, bti);
+ ctx.STORE(type, tuple, addr, addrSpace, perMsgNum, dwAligned, bti);
}
}
@@ -3510,11 +3512,11 @@ handle_write_image:
// Not supported by the hardware. So, we split the message and we use
// strided loads and stores
else {
- emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding);
+ emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding, dwAligned);
}
}
else if((dataFamily==ir::FAMILY_WORD && elemNum%2==0) || (dataFamily == ir::FAMILY_BYTE && elemNum%4 == 0)) {
- emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding);
+ emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding, dwAligned);
} else {
for (uint32_t elemID = 0; elemID < elemNum; elemID++) {
if(regTranslator.isUndefConst(llvmValues, elemID))
diff --git a/backend/src/llvm/llvm_loadstore_optimization.cpp b/backend/src/llvm/llvm_loadstore_optimization.cpp
index 4bfc7f6..19726b0 100644
--- a/backend/src/llvm/llvm_loadstore_optimization.cpp
+++ b/backend/src/llvm/llvm_loadstore_optimization.cpp
@@ -87,12 +87,12 @@ namespace gbe {
bool optimizeLoadStore(BasicBlock &BB);
bool isLoadStoreCompatible(Value *A, Value *B);
- void mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 4> &merged);
- void mergeStore(BasicBlock &BB, SmallVector<Instruction*, 4> &merged);
+ void mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 16> &merged);
+ void mergeStore(BasicBlock &BB, SmallVector<Instruction*, 16> &merged);
BasicBlock::iterator findConsecutiveAccess(BasicBlock &BB,
- SmallVector<Instruction*, 4> &merged,
+ SmallVector<Instruction*, 16> &merged,
BasicBlock::iterator &start,
- unsigned maxLimit,
+ unsigned maxVecSize,
bool isLoad);
virtual const char *getPassName() const {
@@ -154,11 +154,11 @@ namespace gbe {
return ((-offset) == sz);
}
- void GenLoadStoreOptimization::mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 4> &merged) {
+ void GenLoadStoreOptimization::mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 16> &merged) {
IRBuilder<> Builder(&BB);
unsigned size = merged.size();
- SmallVector<Value *, 4> values;
+ SmallVector<Value *, 16> values;
for(unsigned i = 0; i < size; i++) {
values.push_back(merged[i]);
}
@@ -169,7 +169,7 @@ namespace gbe {
Builder.SetInsertPoint(ld);
VectorType *vecTy = VectorType::get(ld->getType(), size);
Value *vecPtr = Builder.CreateBitCast(ld->getPointerOperand(),
- PointerType::get(vecTy, addrSpace));
+ PointerType::get(vecTy, addrSpace));
LoadInst *vecValue = Builder.CreateLoad(vecPtr);
vecValue->setAlignment(align);
@@ -181,9 +181,9 @@ namespace gbe {
BasicBlock::iterator
GenLoadStoreOptimization::findConsecutiveAccess(BasicBlock &BB,
- SmallVector<Instruction*, 4> &merged,
+ SmallVector<Instruction*, 16> &merged,
BasicBlock::iterator &start,
- unsigned maxLimit,
+ unsigned maxVecSize,
bool isLoad) {
BasicBlock::iterator stepForward = start;
@@ -194,6 +194,8 @@ namespace gbe {
BasicBlock::iterator E = BB.end();
BasicBlock::iterator J = ++start;
+ unsigned maxLimit = maxVecSize * 3;
+
for(unsigned ss = 0; J != E && ss <= maxLimit; ++ss, ++J) {
if((isLoad && isa<LoadInst>(*J)) || (!isLoad && isa<StoreInst>(*J))) {
if(isLoadStoreCompatible(merged[merged.size()-1], J)) {
@@ -205,12 +207,12 @@ namespace gbe {
break;
}
- if(merged.size() >= 4) break;
+ if(merged.size() > maxVecSize) break;
}
return stepForward;
}
- void GenLoadStoreOptimization::mergeStore(BasicBlock &BB, SmallVector<Instruction*, 4> &merged) {
+ void GenLoadStoreOptimization::mergeStore(BasicBlock &BB, SmallVector<Instruction*, 16> &merged) {
IRBuilder<> Builder(&BB);
unsigned size = merged.size();
@@ -239,25 +241,37 @@ namespace gbe {
bool GenLoadStoreOptimization::optimizeLoadStore(BasicBlock &BB) {
bool changed = false;
- SmallVector<Instruction*, 4> merged;
+ SmallVector<Instruction*, 16> merged;
for (BasicBlock::iterator BBI = BB.begin(), E = BB.end(); BBI != E;++BBI) {
if(isa<LoadInst>(*BBI) || isa<StoreInst>(*BBI)) {
bool isLoad = isa<LoadInst>(*BBI) ? true: false;
Type *ty = getValueType(BBI);
if(ty->isVectorTy()) continue;
- // we only support DWORD data type merge
- if(!ty->isFloatTy() && !ty->isIntegerTy(32)) continue;
- BBI = findConsecutiveAccess(BB, merged, BBI, 10, isLoad);
- if(merged.size() > 1) {
+ // TODO: support DWORD/WORD/BYTE loads; stores only support DWORD for now.
+ if (!(ty->isFloatTy() || ty->isIntegerTy(32) ||
+ ((ty->isIntegerTy(8) || ty->isIntegerTy(16)) && isLoad)))
+ continue;
+ unsigned maxVecSize = (ty->isFloatTy() || ty->isIntegerTy(32)) ? 4 :
+ (ty->isIntegerTy(16) ? 8 : 16);
+ BBI = findConsecutiveAccess(BB, merged, BBI, maxVecSize, isLoad);
+ uint32_t size = merged.size();
+ uint32_t pos = 0;
+ while(size > 1) {
+ unsigned vecSize = (size >= 16) ? 16 :
+ (size >= 8 ? 8 :
+ (size >= 4 ? 4 :
+ (size >= 2 ? 2 : size)));
+ SmallVector<Instruction*, 16> mergedVec(merged.begin() + pos, merged.begin() + pos + vecSize);
if(isLoad)
- mergeLoad(BB, merged);
+ mergeLoad(BB, mergedVec);
else
- mergeStore(BB, merged);
+ mergeStore(BB, mergedVec);
// remove merged insn
- int size = merged.size();
- for(int i = 0; i < size; i++)
- merged[i]->eraseFromParent();
+ for(uint32_t i = 0; i < mergedVec.size(); i++)
+ mergedVec[i]->eraseFromParent();
changed = true;
+ pos += vecSize;
+ size -= vecSize;
}
merged.clear();
}
--
1.8.3.2