[Beignet] [PATCH] GBE: Do more aggressive load/store merging.
Ruiling Song
ruiling.song at intel.com
Tue Feb 16 05:43:23 UTC 2016
Previously, we keep strict load/store order as before transformation.
But for load/store from/into different address spaces, the instruction
order can be changed. And this gave us an chance to merge more
load/store instructions.
Also change the processing window a little larger to (maxVecSize * 5).
This change would make significant performance benefit for SKL platform:
A typical case is:
./opencv_perf_core --gtest_filter=OCL_LUTFixture_LUT.LUT/15
Signed-off-by: Ruiling Song <ruiling.song at intel.com>
---
backend/src/llvm/llvm_loadstore_optimization.cpp | 87 +++++++++++++++++++-----
1 file changed, 70 insertions(+), 17 deletions(-)
diff --git a/backend/src/llvm/llvm_loadstore_optimization.cpp b/backend/src/llvm/llvm_loadstore_optimization.cpp
index 121f53d..1a2e46b 100644
--- a/backend/src/llvm/llvm_loadstore_optimization.cpp
+++ b/backend/src/llvm/llvm_loadstore_optimization.cpp
@@ -70,11 +70,11 @@ namespace gbe {
bool isLoadStoreCompatible(Value *A, Value *B);
void mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 16> &merged);
void mergeStore(BasicBlock &BB, SmallVector<Instruction*, 16> &merged);
- BasicBlock::iterator findConsecutiveAccess(BasicBlock &BB,
- SmallVector<Instruction*, 16> &merged,
- BasicBlock::iterator &start,
- unsigned maxVecSize,
- bool isLoad);
+ bool findConsecutiveAccess(BasicBlock &BB,
+ SmallVector<Instruction*, 16> &merged,
+ const BasicBlock::iterator &start,
+ unsigned maxVecSize,
+ bool isLoad);
virtual const char *getPassName() const {
return "Merge compatible Load/stores for Gen";
@@ -159,38 +159,58 @@ namespace gbe {
values[i]->replaceAllUsesWith(S);
}
}
-
- BasicBlock::iterator
+ // When searching for consecutive memory access, we do it in a small window,
+ // if the window is too large, it would take up too much compiling time.
+ // An Important rule we have followed is don't try to change load/store order.
+ // But an exeption is 'load& store that are from different address spaces. The
+ // return value will indicate wheter such kind of reorder happens.
+ bool
GenLoadStoreOptimization::findConsecutiveAccess(BasicBlock &BB,
SmallVector<Instruction*, 16> &merged,
- BasicBlock::iterator &start,
+ const BasicBlock::iterator &start,
unsigned maxVecSize,
bool isLoad) {
- BasicBlock::iterator stepForward = start;
- if(!isSimpleLoadStore(&*start)) return stepForward;
+ if(!isSimpleLoadStore(&*start)) return false;
merged.push_back(&*start);
+ unsigned targetAddrSpace = getAddressSpace(&*start);
BasicBlock::iterator E = BB.end();
- BasicBlock::iterator J = ++start;
+ BasicBlock::iterator J = start;
+ ++J;
- unsigned maxLimit = maxVecSize * 3;
+ unsigned maxLimit = maxVecSize * 5;
+ bool reordered = false;
for(unsigned ss = 0; J != E && ss <= maxLimit; ++ss, ++J) {
if((isLoad && isa<LoadInst>(*J)) || (!isLoad && isa<StoreInst>(*J))) {
if(isLoadStoreCompatible(merged[merged.size()-1], &*J)) {
merged.push_back(&*J);
- stepForward = ++J;
}
- } else if((isLoad && isa<StoreInst>(*J)) || (!isLoad && isa<LoadInst>(*J))) {
+ } else if((isLoad && isa<StoreInst>(*J))) {
// simple stop to keep read/write order
- break;
+ StoreInst *st = cast<StoreInst>(&*J);
+ unsigned addrSpace = st->getPointerAddressSpace();
+ if (addrSpace != targetAddrSpace) {
+ reordered = true;
+ } else {
+ break;
+ }
+ } else if ((!isLoad && isa<LoadInst>(*J))) {
+ LoadInst *ld = cast<LoadInst>(&*J);
+ unsigned addrSpace = ld->getPointerAddressSpace();
+ if (addrSpace != targetAddrSpace) {
+ reordered = true;
+ } else {
+ break;
+ }
}
if(merged.size() >= maxVecSize) break;
}
- return stepForward;
+
+ return reordered;
}
void GenLoadStoreOptimization::mergeStore(BasicBlock &BB, SmallVector<Instruction*, 16> &merged) {
@@ -220,6 +240,28 @@ namespace gbe {
newST->setAlignment(align);
}
+ // Find the safe iterator we can point to. If reorder happens, we need to
+ // point to the instruction after the first of toBeDeleted. If no reorder,
+ // we are safe to point to the instruction after the last of toBeDeleted
+ static BasicBlock::iterator
+ findSafeInstruction(SmallVector<Instruction*, 16> &toBeDeleted,
+ const BasicBlock::iterator ¤t,
+ bool reorder) {
+ BasicBlock::iterator safe = current;
+ unsigned size = toBeDeleted.size();
+ if (reorder) {
+ unsigned i = 0;
+ while (toBeDeleted[i] == &*safe && i < size) {
+ ++i;
+ ++safe;
+ }
+ } else {
+ safe = BasicBlock::iterator(toBeDeleted[size - 1]);
+ ++safe;
+ }
+ return safe;
+ }
+
bool GenLoadStoreOptimization::optimizeLoadStore(BasicBlock &BB) {
bool changed = false;
SmallVector<Instruction*, 16> merged;
@@ -232,11 +274,18 @@ namespace gbe {
if (!(ty->isFloatTy() || ty->isIntegerTy(32) ||
((ty->isIntegerTy(8) || ty->isIntegerTy(16)) && isLoad)))
continue;
+
unsigned maxVecSize = (ty->isFloatTy() || ty->isIntegerTy(32)) ? 4 :
(ty->isIntegerTy(16) ? 8 : 16);
- BBI = findConsecutiveAccess(BB, merged, BBI, maxVecSize, isLoad);
+ bool reorder = findConsecutiveAccess(BB, merged, BBI, maxVecSize, isLoad);
uint32_t size = merged.size();
uint32_t pos = 0;
+ bool doDeleting = size > 1;
+ if (doDeleting) {
+ // choose next undeleted instruction
+ BBI = findSafeInstruction(merged, BBI, reorder);
+ }
+
while(size > 1) {
unsigned vecSize = (size >= 16) ? 16 :
(size >= 8 ? 8 :
@@ -253,6 +302,10 @@ namespace gbe {
pos += vecSize;
size -= vecSize;
}
+ if (doDeleting) {
+ //adjust the BBI back by one, as we would increase it in for loop
+ --BBI;
+ }
merged.clear();
}
}
--
2.4.1
More information about the Beignet
mailing list