[Beignet] [PATCH] GBE: Merge successive load/store together for better performance.

Ruiling Song ruiling.song at intel.com
Wed May 7 19:18:22 PDT 2014


Gen supports at most a 4-DWORD read/write in a single instruction,
so we merge successive reads/writes into fewer instructions for better performance.
This improves the LuxMark medium scene by about 10%.
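
For example, in a kernel fragment like the one below (the function and
variable names are illustrative only), the four scalar loads each cost one
read message; after this pass they are rewritten at the LLVM IR level as a
single <4 x float> load followed by four extractelement instructions, and
the four stores are likewise packed into a single <4 x float> store:

  /* before the pass: four scalar DWORD loads and four scalar stores */
  void copy4(const float *src, float *dst) {
    float x0 = src[0];
    float x1 = src[1];
    float x2 = src[2];
    float x3 = src[3];
    dst[0] = x0;
    dst[1] = x1;
    dst[2] = x2;
    dst[3] = x3;
  }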

Signed-off-by: Ruiling Song <ruiling.song at intel.com>
---
 backend/src/CMakeLists.txt                       |    1 +
 backend/src/llvm/llvm_gen_backend.hpp            |    3 +
 backend/src/llvm/llvm_loadstore_optimization.cpp |  269 ++++++++++++++++++++++
 backend/src/llvm/llvm_to_gen.cpp                 |    3 +-
 4 files changed, 275 insertions(+), 1 deletion(-)
 create mode 100644 backend/src/llvm/llvm_loadstore_optimization.cpp

diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
index 2d59644..3bb31e5 100644
--- a/backend/src/CMakeLists.txt
+++ b/backend/src/CMakeLists.txt
@@ -146,6 +146,7 @@ else (GBE_USE_BLOB)
     llvm/llvm_intrinsic_lowering.cpp
     llvm/llvm_barrier_nodup.cpp
     llvm/llvm_to_gen.cpp
+    llvm/llvm_loadstore_optimization.cpp
     llvm/llvm_gen_backend.hpp
     llvm/llvm_gen_ocl_function.hxx
     llvm/llvm_to_gen.hpp
diff --git a/backend/src/llvm/llvm_gen_backend.hpp b/backend/src/llvm/llvm_gen_backend.hpp
index 56dd27f..26323a3 100644
--- a/backend/src/llvm/llvm_gen_backend.hpp
+++ b/backend/src/llvm/llvm_gen_backend.hpp
@@ -84,6 +84,9 @@ namespace gbe
   /*! Remove the GEP instructions */
   llvm::BasicBlockPass *createRemoveGEPPass(const ir::Unit &unit);
 
+  /*! Merge successive loads/stores when possible */
+  llvm::BasicBlockPass *createLoadStoreOptimizationPass();
+
   /*! Scalarize all vector op instructions */
   llvm::FunctionPass* createScalarizePass();
   /*! Remove/add NoDuplicate function attribute for barrier functions. */
diff --git a/backend/src/llvm/llvm_loadstore_optimization.cpp b/backend/src/llvm/llvm_loadstore_optimization.cpp
new file mode 100644
index 0000000..a597927
--- /dev/null
+++ b/backend/src/llvm/llvm_loadstore_optimization.cpp
@@ -0,0 +1,269 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Ruiling, Song <ruiling.song at intel.com>
+ *
+ * The idea is that, as Gen supports at most a 4-DWORD load/store in a
+ * single instruction, merging successive loads/stores that are compatible
+ * is beneficial. The method of checking whether two loads/stores are
+ * compatible is borrowed from the vectorization passes in LLVM.
+ */
+
+#include "llvm/IR/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/PassManager.h"
+
+#include "llvm/Config/config.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
+#include "llvm/Function.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
+#include "llvm/Pass.h"
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 1
+#include "llvm/Support/IRBuilder.h"
+#elif LLVM_VERSION_MINOR == 2
+#include "llvm/IRBuilder.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#endif /* LLVM_VERSION_MINOR <= 1 */
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+
+using namespace llvm;
+namespace gbe {
+  class GenLoadStoreOptimization : public BasicBlockPass {
+
+  public:
+    static char ID;
+    ScalarEvolution *SE;
+    DataLayout *TD;
+    GenLoadStoreOptimization() : BasicBlockPass(ID) {}
+
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<ScalarEvolution>();
+      AU.addPreserved<ScalarEvolution>();
+      AU.setPreservesCFG();
+    }
+
+    virtual bool runOnBasicBlock(BasicBlock &BB) {
+      SE = &getAnalysis<ScalarEvolution>();
+      TD = getAnalysisIfAvailable<DataLayout>();
+      return optimizeLoadStore(BB);
+    }
+    Type    *getValueType(Value *insn);
+    Value   *getPointerOperand(Value *I);
+    unsigned getAddressSpace(Value *I);
+    bool     isSimpleLoadStore(Value *I);
+    bool     optimizeLoadStore(BasicBlock &BB);
+
+    bool     isLoadStoreCompatible(Value *A, Value *B);
+    void     mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 4> &merged);
+    void     mergeStore(BasicBlock &BB, SmallVector<Instruction*, 4> &merged);
+    BasicBlock::iterator findConsecutiveAccess(BasicBlock &BB,
+                                               SmallVector<Instruction*, 4> &merged,
+                                               BasicBlock::iterator &start,
+                                               unsigned maxLimit,
+                                               bool isLoad);
+
+    virtual const char *getPassName() const {
+      return "Merge compatible Load/stores for Gen";
+    }
+  };
+
+  char GenLoadStoreOptimization::ID = 0;
+
+  Value *GenLoadStoreOptimization::getPointerOperand(Value *I) {
+    if (LoadInst *LI = dyn_cast<LoadInst>(I)) return LI->getPointerOperand();
+    if (StoreInst *SI = dyn_cast<StoreInst>(I)) return SI->getPointerOperand();
+    return NULL;
+  }
+  unsigned GenLoadStoreOptimization::getAddressSpace(Value *I) {
+    if (LoadInst *L=dyn_cast<LoadInst>(I)) return L->getPointerAddressSpace();
+    if (StoreInst *S=dyn_cast<StoreInst>(I)) return S->getPointerAddressSpace();
+    return -1;
+  }
+  bool GenLoadStoreOptimization::isSimpleLoadStore(Value *I) {
+    if (LoadInst *L=dyn_cast<LoadInst>(I)) return L->isSimple();
+    if (StoreInst *S=dyn_cast<StoreInst>(I)) return S->isSimple();
+    return false;
+  }
+  Type *GenLoadStoreOptimization::getValueType(Value *insn) {
+    if(LoadInst *ld = dyn_cast<LoadInst>(insn)) return ld->getType();
+    if(StoreInst *st = dyn_cast<StoreInst>(insn)) return st->getValueOperand()->getType();
+
+    return NULL;
+  }
+
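+  // Returns true when B accesses the element immediately after A. With
+  // 4-byte elements, e.g. ptrA = base+4 and ptrB = base+8 (illustrative
+  // values), offset = A - B = -4 and sz = 4, so -offset == sz holds.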
+  bool GenLoadStoreOptimization::isLoadStoreCompatible(Value *A, Value *B) {
+    Value *ptrA = getPointerOperand(A);
+    Value *ptrB = getPointerOperand(B);
+    unsigned ASA = getAddressSpace(A);
+    unsigned ASB = getAddressSpace(B);
+
+    // Check that the address spaces match and that the pointers are valid.
+    if (!ptrA || !ptrB || (ASA != ASB)) return false;
+
+    if(!isSimpleLoadStore(A) || !isSimpleLoadStore(B)) return false;
+    // Check that A and B are of the same type.
+    if (ptrA->getType() != ptrB->getType()) return false;
+
+    // Calculate the distance.
+    const SCEV *ptrSCEVA = SE->getSCEV(ptrA);
+    const SCEV *ptrSCEVB = SE->getSCEV(ptrB);
+    const SCEV *offsetSCEV = SE->getMinusSCEV(ptrSCEVA, ptrSCEVB);
+    const SCEVConstant *constOffSCEV = dyn_cast<SCEVConstant>(offsetSCEV);
+
+    // Non-constant distance.
+    if (!constOffSCEV) return false;
+
+    int64_t offset = constOffSCEV->getValue()->getSExtValue();
+    Type *Ty = cast<PointerType>(ptrA->getType())->getElementType();
+    // The accesses are consecutive if B starts exactly where A ends, i.e.
+    // the negated (A - B) offset equals the element's store size.
+    int64_t sz = TD->getTypeStoreSize(Ty);
+    return ((-offset) == sz);
+  }
+
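+  // Replaces the collected scalar loads with a single vector load of
+  // <size x ty>, inserted at the position of the first load; each scalar
+  // load's uses are then redirected to an extractelement of the vector.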
+  void GenLoadStoreOptimization::mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 4> &merged) {
+    IRBuilder<> Builder(&BB);
+
+    unsigned size = merged.size();
+    SmallVector<Value *, 4> values;
+    for(unsigned i = 0; i < size; i++) {
+      values.push_back(merged[i]);
+    }
+    LoadInst *ld = cast<LoadInst>(merged[0]);
+    unsigned align = ld->getAlignment();
+    unsigned addrSpace = ld->getPointerAddressSpace();
+    // insert before the first load
+    Builder.SetInsertPoint(ld);
+    VectorType *vecTy = VectorType::get(ld->getType(), size);
+    Value *vecPtr = Builder.CreateBitCast(ld->getPointerOperand(),
+                                          PointerType::get(vecTy, addrSpace));
+    LoadInst *vecValue = Builder.CreateLoad(vecPtr);
+    vecValue->setAlignment(align);
+
+    for (unsigned i = 0; i < size; ++i) {
+      Value *S = Builder.CreateExtractElement(vecValue, Builder.getInt32(i));
+      values[i]->replaceAllUsesWith(S);
+    }
+  }
+
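+  // Scans forward from 'start' within a bounded window, collecting at most
+  // 4 compatible accesses into 'merged'. Scanning stops at the first
+  // opposing access (a store while merging loads, or a load while merging
+  // stores) so the block's read/write order is preserved. Returns the
+  // point from which the caller should resume iteration.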
+  BasicBlock::iterator
+  GenLoadStoreOptimization::findConsecutiveAccess(BasicBlock &BB,
+                            SmallVector<Instruction*, 4> &merged,
+                            BasicBlock::iterator &start,
+                            unsigned maxLimit,
+                            bool isLoad) {
+
+    BasicBlock::iterator stepForward = start;
+    if(!isSimpleLoadStore(start)) return stepForward;
+
+    merged.push_back(start);
+
+    BasicBlock::iterator E = BB.end();
+    BasicBlock::iterator J = ++start;
+
+    for(unsigned ss = 0; J != E && ss <= maxLimit; ++ss, ++J) {
+      if((isLoad && isa<LoadInst>(*J)) || (!isLoad && isa<StoreInst>(*J))) {
+        if(isLoadStoreCompatible(merged[merged.size()-1], J)) {
+          merged.push_back(J);
+          // resume after the merged access, but leave J in place so that
+          // the next iteration still examines the following instruction
+          stepForward = J;
+          ++stepForward;
+        }
+      } else if((isLoad && isa<StoreInst>(*J)) || (!isLoad && isa<LoadInst>(*J))) {
+        // stop at the first opposing access to preserve read/write order
+        break;
+      }
+
+      if(merged.size() >= 4) break;
+    }
+    return stepForward;
+  }
+
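+  // Replaces the collected scalar stores with a single vector store: the
+  // scalar values are packed into a <size x ty> vector with insertelement,
+  // and one store is emitted at the position of the last scalar store,
+  // where all of the stored values are available.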
+  void GenLoadStoreOptimization::mergeStore(BasicBlock &BB, SmallVector<Instruction*, 4> &merged) {
+    IRBuilder<> Builder(&BB);
+
+    unsigned size = merged.size();
+    SmallVector<Value *, 4> values;
+    for(unsigned i = 0; i < size; i++) {
+      values.push_back(cast<StoreInst>(merged[i])->getValueOperand());
+    }
+    StoreInst *st = cast<StoreInst>(merged[0]);
+    unsigned addrSpace = st->getPointerAddressSpace();
+
+    unsigned align = st->getAlignment();
+    // insert before the last store
+    Builder.SetInsertPoint(merged[size-1]);
+
+    Type *dataTy = st->getValueOperand()->getType();
+    VectorType *vecTy = VectorType::get(dataTy, size);
+    Value * parent = UndefValue::get(vecTy);
+    for(unsigned i = 0; i < size; i++) {
+      parent = Builder.CreateInsertElement(parent, values[i], ConstantInt::get(IntegerType::get(st->getContext(), 32), i));
+    }
+
+    Value *newPtr = Builder.CreateBitCast(st->getPointerOperand(), PointerType::get(vecTy, addrSpace));
+    StoreInst *newST = Builder.CreateStore(parent, newPtr);
+    newST->setAlignment(align);
+  }
+
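+  // Walks the block looking for scalar DWORD-sized (i32/float) loads and
+  // stores; each run of compatible accesses collected by
+  // findConsecutiveAccess is merged into one vector access and the
+  // original scalar instructions are erased.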
+  bool GenLoadStoreOptimization::optimizeLoadStore(BasicBlock &BB) {
+    bool changed = false;
+    SmallVector<Instruction*, 4> merged;
+    for (BasicBlock::iterator BBI = BB.begin(), E = BB.end(); BBI != E;++BBI) {
+      if(isa<LoadInst>(*BBI) || isa<StoreInst>(*BBI)) {
+        bool isLoad = isa<LoadInst>(*BBI);
+        Type *ty = getValueType(BBI);
+        if(ty->isVectorTy()) continue;
+        // we only support merging DWORD-sized data types
+        if(!ty->isFloatTy() && !ty->isIntegerTy(32)) continue;
+        BBI = findConsecutiveAccess(BB, merged, BBI, 10, isLoad);
+        if(merged.size() > 1) {
+          if(isLoad)
+            mergeLoad(BB, merged);
+          else
+            mergeStore(BB, merged);
+          // remove merged insn
+          int size = merged.size();
+          for(int i = 0; i < size; i++)
+            merged[i]->eraseFromParent();
+          changed = true;
+        }
+        merged.clear();
+      }
+    }
+    return changed;
+  }
+
+  BasicBlockPass *createLoadStoreOptimizationPass() {
+    return new GenLoadStoreOptimization();
+  }
+};
+
diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
index 37a5b2b..9282b3f 100644
--- a/backend/src/llvm/llvm_to_gen.cpp
+++ b/backend/src/llvm/llvm_to_gen.cpp
@@ -187,7 +187,7 @@ namespace gbe
     runModulePass(mod, libraryInfo, optLevel);
 
     llvm::PassManager passes;
-
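+    // the load/store merge pass added below queries DataLayout for type
+    // store sizes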
+    passes.add(new DataLayout(&mod));
     // Print the code before further optimizations
     if (OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS)
 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
@@ -198,6 +198,7 @@ namespace gbe
     passes.add(createIntrinsicLoweringPass());
     passes.add(createFunctionInliningPass(200000));
     passes.add(createScalarReplAggregatesPass()); // Break up allocas
+    passes.add(createLoadStoreOptimizationPass());
     passes.add(createRemoveGEPPass(unit));
     passes.add(createConstantPropagationPass());
     passes.add(createLowerSwitchPass());
-- 
1.7.10.4


