[Beignet] [PATCH V3] GBE: Do more aggressive load/store merging.

Ruiling Song ruiling.song at intel.com
Thu Apr 28 08:39:21 UTC 2016


Previously, we kept the load/store order strictly as it was before the
transformation. But loads/stores that access different address spaces can
be reordered with respect to each other, and this gives us a chance to
merge more load/store instructions.
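
As an illustration, here is a hypothetical kernel with the pattern this
patch targets (the kernel name and arguments are invented for this
example, not taken from the benchmark mentioned below):

  __kernel void gather_copy(__global const float *src,
                            __global float *dst,
                            __local float *tmp)
  {
    int i = get_global_id(0) * 4;
    float a = src[i];
    /* store to __local: a different address space than the loads */
    tmp[get_local_id(0)] = a;
    float b = src[i + 1];
    float c = src[i + 2];
    float d = src[i + 3];
    dst[i] = a;
    dst[i + 1] = b;
    dst[i + 2] = c;
    dst[i + 3] = d;
  }

Before this patch the scan stopped at the __local store, so src[i] could
not be merged with the following loads from src. With this patch the scan
steps over the store because it targets a different address space, and
the four __global loads can be merged into one wider load.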

Also enlarge the scanning window to (maxVecSize * 8).
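With the factor of 8, for example, the pass scans up to 32 instructions
ahead of a float/i32 access (maxVecSize = 4) and up to 128 ahead of an
i8 access (maxVecSize = 16).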

This change brings a significant performance benefit on the SKL platform.
A typical case is:
./opencv_perf_core --gtest_filter=OCL_LUTFixture_LUT.LUT/15

v2:
  should not move BBI backward if it points to the first instruction
  in the basic block.

v3:
 Increase the scanning window a little more: change the factor from 5 to 8,
 so that this patch does not bring too much regression to the gemm OpenCL
 sample.

Signed-off-by: Ruiling Song <ruiling.song at intel.com>
---
 backend/src/llvm/llvm_loadstore_optimization.cpp | 89 +++++++++++++++++++-----
 1 file changed, 72 insertions(+), 17 deletions(-)

diff --git a/backend/src/llvm/llvm_loadstore_optimization.cpp b/backend/src/llvm/llvm_loadstore_optimization.cpp
index 121f53d..c90730f 100644
--- a/backend/src/llvm/llvm_loadstore_optimization.cpp
+++ b/backend/src/llvm/llvm_loadstore_optimization.cpp
@@ -70,11 +70,11 @@ namespace gbe {
     bool     isLoadStoreCompatible(Value *A, Value *B);
     void     mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 16> &merged);
     void     mergeStore(BasicBlock &BB, SmallVector<Instruction*, 16> &merged);
-    BasicBlock::iterator findConsecutiveAccess(BasicBlock &BB,
-                                               SmallVector<Instruction*, 16> &merged,
-                                               BasicBlock::iterator &start,
-                                               unsigned maxVecSize,
-                                               bool isLoad);
+    bool     findConsecutiveAccess(BasicBlock &BB,
+                                  SmallVector<Instruction*, 16> &merged,
+                                  const BasicBlock::iterator &start,
+                                  unsigned maxVecSize,
+                                  bool isLoad);
 
     virtual const char *getPassName() const {
       return "Merge compatible Load/stores for Gen";
@@ -159,38 +159,58 @@ namespace gbe {
       values[i]->replaceAllUsesWith(S);
     }
   }
-
-  BasicBlock::iterator
+  // When searching for consecutive memory accesses, we do it in a small window;
+  // if the window is too large, it would take up too much compile time.
+  // An important rule we follow is: do not change the load/store order.
+  // The exception is a load and a store that access different address spaces.
+  // The return value indicates whether such a reorder happens.
+  bool
   GenLoadStoreOptimization::findConsecutiveAccess(BasicBlock &BB,
                             SmallVector<Instruction*, 16> &merged,
-                            BasicBlock::iterator &start,
+                            const BasicBlock::iterator &start,
                             unsigned maxVecSize,
                             bool isLoad) {
 
-    BasicBlock::iterator stepForward = start;
-    if(!isSimpleLoadStore(&*start)) return stepForward;
+    if(!isSimpleLoadStore(&*start)) return false;
 
     merged.push_back(&*start);
+    unsigned targetAddrSpace = getAddressSpace(&*start);
 
     BasicBlock::iterator E = BB.end();
-    BasicBlock::iterator J = ++start;
+    BasicBlock::iterator J = start;
+    ++J;
 
-    unsigned maxLimit = maxVecSize * 3;
+    unsigned maxLimit = maxVecSize * 8;
+    bool reordered = false;
 
     for(unsigned ss = 0; J != E && ss <= maxLimit; ++ss, ++J) {
       if((isLoad && isa<LoadInst>(*J)) || (!isLoad && isa<StoreInst>(*J))) {
         if(isLoadStoreCompatible(merged[merged.size()-1], &*J)) {
           merged.push_back(&*J);
-          stepForward = ++J;
         }
-      } else if((isLoad && isa<StoreInst>(*J)) || (!isLoad && isa<LoadInst>(*J))) {
+      } else if(isLoad && isa<StoreInst>(*J)) {
         // simple stop to keep read/write order
-        break;
+        StoreInst *st = cast<StoreInst>(&*J);
+        unsigned addrSpace = st->getPointerAddressSpace();
+        if (addrSpace != targetAddrSpace) {
+          reordered = true;
+        } else {
+          break;
+        }
+      } else if (!isLoad && isa<LoadInst>(*J)) {
+        LoadInst *ld = cast<LoadInst>(&*J);
+        unsigned addrSpace = ld->getPointerAddressSpace();
+        if (addrSpace != targetAddrSpace) {
+          reordered = true;
+        } else {
+          break;
+        }
       }
 
       if(merged.size() >= maxVecSize) break;
     }
-    return stepForward;
+
+    return reordered;
   }
 
   void GenLoadStoreOptimization::mergeStore(BasicBlock &BB, SmallVector<Instruction*, 16> &merged) {
@@ -220,6 +240,28 @@ namespace gbe {
     newST->setAlignment(align);
   }
 
+  // Find the safe iterator we can point to. If a reorder happened, point to
+  // the instruction right after the first element of toBeDeleted. If no
+  // reorder happened, it is safe to point past the last element of toBeDeleted.
+  static BasicBlock::iterator
+  findSafeInstruction(SmallVector<Instruction*, 16> &toBeDeleted,
+                           const BasicBlock::iterator &current,
+                           bool reorder) {
+    BasicBlock::iterator safe = current;
+    unsigned size = toBeDeleted.size();
+    if (reorder) {
+      unsigned i = 0;
+      while (i < size && toBeDeleted[i] == &*safe) {
+        ++i;
+        ++safe;
+      }
+    } else {
+      safe = BasicBlock::iterator(toBeDeleted[size - 1]);
+      ++safe;
+    }
+    return safe;
+  }
+
   bool GenLoadStoreOptimization::optimizeLoadStore(BasicBlock &BB) {
     bool changed = false;
     SmallVector<Instruction*, 16> merged;
@@ -232,11 +274,18 @@ namespace gbe {
         if (!(ty->isFloatTy() || ty->isIntegerTy(32) ||
              ((ty->isIntegerTy(8) || ty->isIntegerTy(16)) && isLoad)))
           continue;
+
         unsigned maxVecSize = (ty->isFloatTy() || ty->isIntegerTy(32)) ? 4 :
                               (ty->isIntegerTy(16) ? 8 : 16);
-        BBI = findConsecutiveAccess(BB, merged, BBI, maxVecSize, isLoad);
+        bool reorder = findConsecutiveAccess(BB, merged, BBI, maxVecSize, isLoad);
         uint32_t size = merged.size();
         uint32_t pos = 0;
+        bool doDeleting = size > 1;
+        if (doDeleting) {
+          // choose next undeleted instruction
+          BBI = findSafeInstruction(merged, BBI, reorder);
+        }
+
         while(size > 1) {
           unsigned vecSize = (size >= 16) ? 16 :
                              (size >= 8 ? 8 :
@@ -253,6 +302,12 @@ namespace gbe {
           pos += vecSize;
           size -= vecSize;
         }
+        if (doDeleting) {
+          // Adjust BBI back by one, as the for loop will increment it again.
+          // Don't do this if BBI points to the very first instruction.
+          if (BBI != BB.begin())
+            --BBI;
+        }
         merged.clear();
       }
     }
-- 
2.4.1


