[Beignet] [PATCH] GBE: Merge successive load/store together for better performance.
Ruiling Song
ruiling.song at intel.com
Wed May 7 19:18:22 PDT 2014
Gen supports at most a 4-DWORD read/write in a single instruction,
so we merge successive reads/writes to use fewer instructions and get
better performance. This improves the LuxMark medium scene by about 10%.
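For example (illustrative LLVM IR in the pre-3.7 typed-pointer syntax;
the value and pointer names are hypothetical), four consecutive float
loads

  %v0 = load float addrspace(1)* %p0, align 4
  %v1 = load float addrspace(1)* %p1, align 4  ; %p1 = %p0 + 1
  %v2 = load float addrspace(1)* %p2, align 4  ; %p2 = %p0 + 2
  %v3 = load float addrspace(1)* %p3, align 4  ; %p3 = %p0 + 3

are rewritten into one vector load plus lane extracts:

  %vp  = bitcast float addrspace(1)* %p0 to <4 x float> addrspace(1)*
  %vec = load <4 x float> addrspace(1)* %vp, align 4
  %v0  = extractelement <4 x float> %vec, i32 0
  ; ...and likewise for lanes 1 to 3.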
Signed-off-by: Ruiling Song <ruiling.song at intel.com>
---
backend/src/CMakeLists.txt | 1 +
backend/src/llvm/llvm_gen_backend.hpp | 3 +
backend/src/llvm/llvm_loadstore_optimization.cpp | 269 ++++++++++++++++++++++
backend/src/llvm/llvm_to_gen.cpp | 3 +-
4 files changed, 275 insertions(+), 1 deletion(-)
create mode 100644 backend/src/llvm/llvm_loadstore_optimization.cpp
diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
index 2d59644..3bb31e5 100644
--- a/backend/src/CMakeLists.txt
+++ b/backend/src/CMakeLists.txt
@@ -146,6 +146,7 @@ else (GBE_USE_BLOB)
llvm/llvm_intrinsic_lowering.cpp
llvm/llvm_barrier_nodup.cpp
llvm/llvm_to_gen.cpp
+ llvm/llvm_loadstore_optimization.cpp
llvm/llvm_gen_backend.hpp
llvm/llvm_gen_ocl_function.hxx
llvm/llvm_to_gen.hpp
diff --git a/backend/src/llvm/llvm_gen_backend.hpp b/backend/src/llvm/llvm_gen_backend.hpp
index 56dd27f..26323a3 100644
--- a/backend/src/llvm/llvm_gen_backend.hpp
+++ b/backend/src/llvm/llvm_gen_backend.hpp
@@ -84,6 +84,9 @@ namespace gbe
/*! Remove the GEP instructions */
llvm::BasicBlockPass *createRemoveGEPPass(const ir::Unit &unit);
+ /*! Merge load/store if possible */
+ llvm::BasicBlockPass *createLoadStoreOptimizationPass();
+
/*! Scalarize all vector op instructions */
llvm::FunctionPass* createScalarizePass();
/*! Remove/add NoDuplicate function attribute for barrier functions. */
diff --git a/backend/src/llvm/llvm_loadstore_optimization.cpp b/backend/src/llvm/llvm_loadstore_optimization.cpp
new file mode 100644
index 0000000..a597927
--- /dev/null
+++ b/backend/src/llvm/llvm_loadstore_optimization.cpp
@@ -0,0 +1,269 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Ruiling, Song <ruiling.song at intel.com>
+ *
+ * The idea: since Gen supports at most 4 successive DWORD loads/stores
+ * in a single instruction, merging compatible successive loads/stores
+ * is beneficial. The method for checking whether two loads/stores are
+ * compatible is borrowed from the vectorization passes in LLVM.
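+ *
+ * A worked example of the distance check (illustrative values): for
+ * two float loads A at %p and B at %p plus 4 bytes, getMinusSCEV(A, B)
+ * yields the constant -4; as -(-4) equals the 4-byte store size of
+ * float, A and B are considered consecutive and thus mergeable.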
+ */
+
+#include "llvm/IR/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/PassManager.h"
+
+#include "llvm/Config/config.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
+#include "llvm/Function.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#endif /* LLVM_VERSION_MINOR <= 2 */
+#include "llvm/Pass.h"
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 1
+#include "llvm/Support/IRBuilder.h"
+#elif LLVM_VERSION_MINOR == 2
+#include "llvm/IRBuilder.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#endif /* LLVM_VERSION_MINOR <= 1 */
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+
+using namespace llvm;
+namespace gbe {
+ class GenLoadStoreOptimization : public BasicBlockPass {
+
+ public:
+ static char ID;
+ ScalarEvolution *SE;
+ DataLayout *TD;
+ GenLoadStoreOptimization() : BasicBlockPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<ScalarEvolution>();
+ AU.addPreserved<ScalarEvolution>();
+ AU.setPreservesCFG();
+ }
+
+ virtual bool runOnBasicBlock(BasicBlock &BB) {
+ SE = &getAnalysis<ScalarEvolution>();
+ TD = getAnalysisIfAvailable<DataLayout>();
+ return optimizeLoadStore(BB);
+ }
+ Type *getValueType(Value *insn);
+ Value *getPointerOperand(Value *I);
+ unsigned getAddressSpace(Value *I);
+ bool isSimpleLoadStore(Value *I);
+ bool optimizeLoadStore(BasicBlock &BB);
+
+ bool isLoadStoreCompatible(Value *A, Value *B);
+ void mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 4> &merged);
+ void mergeStore(BasicBlock &BB, SmallVector<Instruction*, 4> &merged);
+ BasicBlock::iterator findConsecutiveAccess(BasicBlock &BB,
+ SmallVector<Instruction*, 4> &merged,
+ BasicBlock::iterator &start,
+ unsigned maxLimit,
+ bool isLoad);
+
+ virtual const char *getPassName() const {
+ return "Merge compatible Load/stores for Gen";
+ }
+ };
+
+ char GenLoadStoreOptimization::ID = 0;
+
+ Value *GenLoadStoreOptimization::getPointerOperand(Value *I) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) return LI->getPointerOperand();
+ if (StoreInst *SI = dyn_cast<StoreInst>(I)) return SI->getPointerOperand();
+ return NULL;
+ }
+ unsigned GenLoadStoreOptimization::getAddressSpace(Value *I) {
+ if (LoadInst *L=dyn_cast<LoadInst>(I)) return L->getPointerAddressSpace();
+ if (StoreInst *S=dyn_cast<StoreInst>(I)) return S->getPointerAddressSpace();
+ return -1;
+ }
+ bool GenLoadStoreOptimization::isSimpleLoadStore(Value *I) {
+ if (LoadInst *L=dyn_cast<LoadInst>(I)) return L->isSimple();
+ if (StoreInst *S=dyn_cast<StoreInst>(I)) return S->isSimple();
+ return false;
+ }
+ Type *GenLoadStoreOptimization::getValueType(Value *insn) {
+ if(LoadInst *ld = dyn_cast<LoadInst>(insn)) return ld->getType();
+ if(StoreInst *st = dyn_cast<StoreInst>(insn)) return st->getValueOperand()->getType();
+
+ return NULL;
+ }
+
+ bool GenLoadStoreOptimization::isLoadStoreCompatible(Value *A, Value *B) {
+ Value *ptrA = getPointerOperand(A);
+ Value *ptrB = getPointerOperand(B);
+ unsigned ASA = getAddressSpace(A);
+ unsigned ASB = getAddressSpace(B);
+
+ // Check that the address spaces match and that the pointers are valid.
+ if (!ptrA || !ptrB || (ASA != ASB)) return false;
+
+ if(!isSimpleLoadStore(A) || !isSimpleLoadStore(B)) return false;
+ // Check that A and B are of the same type.
+ if (ptrA->getType() != ptrB->getType()) return false;
+
+ // Calculate the distance.
+ const SCEV *ptrSCEVA = SE->getSCEV(ptrA);
+ const SCEV *ptrSCEVB = SE->getSCEV(ptrB);
+ const SCEV *offsetSCEV = SE->getMinusSCEV(ptrSCEVA, ptrSCEVB);
+ const SCEVConstant *constOffSCEV = dyn_cast<SCEVConstant>(offsetSCEV);
+
+ // Non-constant distance.
+ if (!constOffSCEV) return false;
+
+ int64_t offset = constOffSCEV->getValue()->getSExtValue();
+ Type *Ty = cast<PointerType>(ptrA->getType())->getElementType();
+ // The instructions are consecutive if the offset between the two
+ // pointers equals the store size of the first load/store.
+ int64_t sz = TD->getTypeStoreSize(Ty);
+ return ((-offset) == sz);
+ }
+
+ void GenLoadStoreOptimization::mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 4> &merged) {
+ IRBuilder<> Builder(&BB);
+
+ unsigned size = merged.size();
+ SmallVector<Value *, 4> values;
+ for(unsigned i = 0; i < size; i++) {
+ values.push_back(merged[i]);
+ }
+ LoadInst *ld = cast<LoadInst>(merged[0]);
+ unsigned align = ld->getAlignment();
+ unsigned addrSpace = ld->getPointerAddressSpace();
+ // insert before first load
+ Builder.SetInsertPoint(ld);
+ VectorType *vecTy = VectorType::get(ld->getType(), size);
+ Value *vecPtr = Builder.CreateBitCast(ld->getPointerOperand(),
+ PointerType::get(vecTy, addrSpace));
+ LoadInst *vecValue = Builder.CreateLoad(vecPtr);
+ vecValue->setAlignment(align);
+
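+ // Replace every original scalar load with its lane extracted from the vector load.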
+ for (unsigned i = 0; i < size; ++i) {
+ Value *S = Builder.CreateExtractElement(vecValue, Builder.getInt32(i));
+ values[i]->replaceAllUsesWith(S);
+ }
+ }
+
+ BasicBlock::iterator
+ GenLoadStoreOptimization::findConsecutiveAccess(BasicBlock &BB,
+ SmallVector<Instruction*, 4> &merged,
+ BasicBlock::iterator &start,
+ unsigned maxLimit,
+ bool isLoad) {
+
+ BasicBlock::iterator stepForward = start;
+ if(!isSimpleLoadStore(start)) return stepForward;
+
+ merged.push_back(start);
+
+ BasicBlock::iterator E = BB.end();
+ BasicBlock::iterator J = ++start;
+
+ for(unsigned ss = 0; J != E && ss <= maxLimit; ++ss, ++J) {
+ if((isLoad && isa<LoadInst>(*J)) || (!isLoad && isa<StoreInst>(*J))) {
+ if(isLoadStoreCompatible(merged[merged.size()-1], J)) {
+ merged.push_back(J);
+ // Record the position just past the last merged access. Do not
+ // advance J itself: the loop increment already steps the scan
+ // forward, and advancing twice would skip the next candidate.
+ stepForward = J;
+ ++stepForward;
+ }
+ } else if((isLoad && isa<StoreInst>(*J)) || (!isLoad && isa<LoadInst>(*J))) {
+ // stop at the first opposite access to preserve read/write ordering
+ break;
+ }
+
+ if(merged.size() >= 4) break;
+ }
+ return stepForward;
+ }
+
+ void GenLoadStoreOptimization::mergeStore(BasicBlock &BB, SmallVector<Instruction*, 4> &merged) {
+ IRBuilder<> Builder(&BB);
+
+ unsigned size = merged.size();
+ SmallVector<Value *, 4> values;
+ for(unsigned i = 0; i < size; i++) {
+ values.push_back(cast<StoreInst>(merged[i])->getValueOperand());
+ }
+ StoreInst *st = cast<StoreInst>(merged[0]);
+ unsigned addrSpace = st->getPointerAddressSpace();
+
+ unsigned align = st->getAlignment();
+ // insert before the last store
+ Builder.SetInsertPoint(merged[size-1]);
+
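+ // Pack the scalar values into one vector and emit a single wide store.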
+ Type *dataTy = st->getValueOperand()->getType();
+ VectorType *vecTy = VectorType::get(dataTy, size);
+ Value * parent = UndefValue::get(vecTy);
+ for(unsigned i = 0; i < size; i++) {
+ parent = Builder.CreateInsertElement(parent, values[i], ConstantInt::get(IntegerType::get(st->getContext(), 32), i));
+ }
+
+ Value *newPtr = Builder.CreateBitCast(st->getPointerOperand(), PointerType::get(vecTy, addrSpace));
+ StoreInst *newST = Builder.CreateStore(parent, newPtr);
+ newST->setAlignment(align);
+ }
+
+ bool GenLoadStoreOptimization::optimizeLoadStore(BasicBlock &BB) {
+ bool changed = false;
+ SmallVector<Instruction*, 4> merged;
+ for (BasicBlock::iterator BBI = BB.begin(), E = BB.end(); BBI != E;++BBI) {
+ if(isa<LoadInst>(*BBI) || isa<StoreInst>(*BBI)) {
+ bool isLoad = isa<LoadInst>(*BBI) ? true: false;
+ Type *ty = getValueType(BBI);
+ if(ty->isVectorTy()) continue;
+ // only DWORD-sized scalar types (i32/float) are merged
+ if(!ty->isFloatTy() && !ty->isIntegerTy(32)) continue;
+ BBI = findConsecutiveAccess(BB, merged, BBI, 10, isLoad);
+ if(merged.size() > 1) {
+ if(isLoad)
+ mergeLoad(BB, merged);
+ else
+ mergeStore(BB, merged);
+ // remove merged insn
+ int size = merged.size();
+ for(int i = 0; i < size; i++)
+ merged[i]->eraseFromParent();
+ changed = true;
+ }
+ merged.clear();
+ }
+ }
+ return changed;
+ }
+
+ BasicBlockPass *createLoadStoreOptimizationPass() {
+ return new GenLoadStoreOptimization();
+ }
+};
+
diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
index 37a5b2b..9282b3f 100644
--- a/backend/src/llvm/llvm_to_gen.cpp
+++ b/backend/src/llvm/llvm_to_gen.cpp
@@ -187,7 +187,7 @@ namespace gbe
runModulePass(mod, libraryInfo, optLevel);
llvm::PassManager passes;
-
+ passes.add(new DataLayout(&mod));
// Print the code before further optimizations
if (OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS)
#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
@@ -198,6 +198,7 @@ namespace gbe
passes.add(createIntrinsicLoweringPass());
passes.add(createFunctionInliningPass(200000));
passes.add(createScalarReplAggregatesPass()); // Break up allocas
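+ // Merge successive scalar DWORD loads/stores into wider vector accesses.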
+ passes.add(createLoadStoreOptimizationPass());
passes.add(createRemoveGEPPass(unit));
passes.add(createConstantPropagationPass());
passes.add(createLowerSwitchPass());
--
1.7.10.4