[Beignet] [PATCH] GBE: Merge successive load/store together for better performance.
Zhigang Gong
zhigang.gong at linux.intel.com
Thu May 8 19:08:17 PDT 2014
Good job, Ruiling! In general, this patch looks good to me.
And as we just discussed offline, after you have submitted a fix for the
SLM value allocation bug, I will run a full test on this patch.
If everything is OK, I will push it then.
Thanks.
On Thu, May 08, 2014 at 10:18:22AM +0800, Ruiling Song wrote:
> Gen can read/write at most 4 DWORDs in a single instruction,
> so we merge successive reads/writes to use fewer instructions and get
> better performance. This improves the LuxMark medium scene by about 10%.
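>
> To illustrate (a hypothetical kernel fragment, not taken from the
> benchmark): four successive DWORD loads such as
>
>     float a = p[0], b = p[1], c = p[2], d = p[3];
>
> can then be fetched with a single 4-DWORD message, emitted as one
> vector load plus per-element extracts.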
>
> Signed-off-by: Ruiling Song <ruiling.song at intel.com>
> ---
> backend/src/CMakeLists.txt | 1 +
> backend/src/llvm/llvm_gen_backend.hpp | 3 +
> backend/src/llvm/llvm_loadstore_optimization.cpp | 269 ++++++++++++++++++++++
> backend/src/llvm/llvm_to_gen.cpp | 3 +-
> 4 files changed, 275 insertions(+), 1 deletion(-)
> create mode 100644 backend/src/llvm/llvm_loadstore_optimization.cpp
>
> diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
> index 2d59644..3bb31e5 100644
> --- a/backend/src/CMakeLists.txt
> +++ b/backend/src/CMakeLists.txt
> @@ -146,6 +146,7 @@ else (GBE_USE_BLOB)
> llvm/llvm_intrinsic_lowering.cpp
> llvm/llvm_barrier_nodup.cpp
> llvm/llvm_to_gen.cpp
> + llvm/llvm_loadstore_optimization.cpp
> llvm/llvm_gen_backend.hpp
> llvm/llvm_gen_ocl_function.hxx
> llvm/llvm_to_gen.hpp
> diff --git a/backend/src/llvm/llvm_gen_backend.hpp b/backend/src/llvm/llvm_gen_backend.hpp
> index 56dd27f..26323a3 100644
> --- a/backend/src/llvm/llvm_gen_backend.hpp
> +++ b/backend/src/llvm/llvm_gen_backend.hpp
> @@ -84,6 +84,9 @@ namespace gbe
> /*! Remove the GEP instructions */
> llvm::BasicBlockPass *createRemoveGEPPass(const ir::Unit &unit);
>
> + /*! Merge load/store if possible */
> + llvm::BasicBlockPass *createLoadStoreOptimizationPass();
> +
> /*! Scalarize all vector op instructions */
> llvm::FunctionPass* createScalarizePass();
> /*! Remove/add NoDuplicate function attribute for barrier functions. */
> diff --git a/backend/src/llvm/llvm_loadstore_optimization.cpp b/backend/src/llvm/llvm_loadstore_optimization.cpp
> new file mode 100644
> index 0000000..a597927
> --- /dev/null
> +++ b/backend/src/llvm/llvm_loadstore_optimization.cpp
> @@ -0,0 +1,269 @@
> +/*
> + * Copyright © 2012 Intel Corporation
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library. If not, see <http://www.gnu.org/licenses/>.
> + *
> + * Author: Ruiling, Song <ruiling.song at intel.com>
> + *
> + * The idea: since Gen supports at most 4 successive DWORDs per
> + * load/store message, merging compatible successive loads/stores is
> + * beneficial. The method for checking whether two loads/stores are
> + * compatible is borrowed from the Vectorize passes in LLVM.
> + */
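> +
> +/* A rough sketch of the intended rewrite (illustrative LLVM 3.x IR
> + * shapes only; exact names and ordering will differ):
> + *
> + *   %a = load float* %p0           %vp = bitcast float* %p0 to <4 x float>*
> + *   %b = load float* %p1    ==>    %v  = load <4 x float>* %vp
> + *   %c = load float* %p2           %a  = extractelement <4 x float> %v, i32 0
> + *   %d = load float* %p3           ... one extractelement per original load
> + */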
> +
> +#include "llvm/IR/Instructions.h"
> +#include "llvm/Pass.h"
> +#include "llvm/PassManager.h"
> +
> +#include "llvm/Config/config.h"
> +#include "llvm/ADT/DenseMap.h"
> +#include "llvm/ADT/PostOrderIterator.h"
> +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
> +#include "llvm/Function.h"
> +#include "llvm/InstrTypes.h"
> +#include "llvm/Instructions.h"
> +#include "llvm/IntrinsicInst.h"
> +#include "llvm/Module.h"
> +#else
> +#include "llvm/IR/Function.h"
> +#include "llvm/IR/InstrTypes.h"
> +#include "llvm/IR/Instructions.h"
> +#include "llvm/IR/IntrinsicInst.h"
> +#include "llvm/IR/Module.h"
> +#endif /* LLVM_VERSION_MINOR <= 2 */
> +#include "llvm/Pass.h"
> +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 1
> +#include "llvm/Support/IRBuilder.h"
> +#elif LLVM_VERSION_MINOR == 2
> +#include "llvm/IRBuilder.h"
> +#else
> +#include "llvm/IR/IRBuilder.h"
> +#endif /* LLVM_VERSION_MINOR <= 1 */
> +#include "llvm/Support/CallSite.h"
> +#include "llvm/Support/CFG.h"
> +#include "llvm/Support/raw_ostream.h"
> +#include "llvm/Analysis/ScalarEvolution.h"
> +#include "llvm/Analysis/ScalarEvolutionExpressions.h"
> +
> +using namespace llvm;
> +namespace gbe {
> + class GenLoadStoreOptimization : public BasicBlockPass {
> +
> + public:
> + static char ID;
> + ScalarEvolution *SE;
> + DataLayout *TD;
> + GenLoadStoreOptimization() : BasicBlockPass(ID) {}
> +
> + void getAnalysisUsage(AnalysisUsage &AU) const {
> + AU.addRequired<ScalarEvolution>();
> + AU.addPreserved<ScalarEvolution>();
> + AU.setPreservesCFG();
> + }
> +
> + virtual bool runOnBasicBlock(BasicBlock &BB) {
> + SE = &getAnalysis<ScalarEvolution>();
> + TD = getAnalysisIfAvailable<DataLayout>();
> + return optimizeLoadStore(BB);
> + }
> + Type *getValueType(Value *insn);
> + Value *getPointerOperand(Value *I);
> + unsigned getAddressSpace(Value *I);
> + bool isSimpleLoadStore(Value *I);
> + bool optimizeLoadStore(BasicBlock &BB);
> +
> + bool isLoadStoreCompatible(Value *A, Value *B);
> + void mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 4> &merged);
> + void mergeStore(BasicBlock &BB, SmallVector<Instruction*, 4> &merged);
> + BasicBlock::iterator findConsecutiveAccess(BasicBlock &BB,
> + SmallVector<Instruction*, 4> &merged,
> + BasicBlock::iterator &start,
> + unsigned maxLimit,
> + bool isLoad);
> +
> + virtual const char *getPassName() const {
> + return "Merge compatible Load/stores for Gen";
> + }
> + };
> +
> + char GenLoadStoreOptimization::ID = 0;
> +
> + Value *GenLoadStoreOptimization::getPointerOperand(Value *I) {
> + if (LoadInst *LI = dyn_cast<LoadInst>(I)) return LI->getPointerOperand();
> + if (StoreInst *SI = dyn_cast<StoreInst>(I)) return SI->getPointerOperand();
> + return NULL;
> + }
> + unsigned GenLoadStoreOptimization::getAddressSpace(Value *I) {
> + if (LoadInst *L=dyn_cast<LoadInst>(I)) return L->getPointerAddressSpace();
> + if (StoreInst *S=dyn_cast<StoreInst>(I)) return S->getPointerAddressSpace();
> + return -1;
> + }
> + bool GenLoadStoreOptimization::isSimpleLoadStore(Value *I) {
> + if (LoadInst *L=dyn_cast<LoadInst>(I)) return L->isSimple();
> + if (StoreInst *S=dyn_cast<StoreInst>(I)) return S->isSimple();
> + return false;
> + }
> + Type *GenLoadStoreOptimization::getValueType(Value *insn) {
> + if(LoadInst *ld = dyn_cast<LoadInst>(insn)) return ld->getType();
> + if(StoreInst *st = dyn_cast<StoreInst>(insn)) return st->getValueOperand()->getType();
> +
> + return NULL;
> + }
> +
> + bool GenLoadStoreOptimization::isLoadStoreCompatible(Value *A, Value *B) {
> + Value *ptrA = getPointerOperand(A);
> + Value *ptrB = getPointerOperand(B);
> + unsigned ASA = getAddressSpace(A);
> + unsigned ASB = getAddressSpace(B);
> +
> + // Check that the address spaces match and that the pointers are valid.
> + if (!ptrA || !ptrB || (ASA != ASB)) return false;
> +
> + if(!isSimpleLoadStore(A) || !isSimpleLoadStore(B)) return false;
> + // Check that A and B are of the same type.
> + if (ptrA->getType() != ptrB->getType()) return false;
> +
> + // Calculate the distance.
> + const SCEV *ptrSCEVA = SE->getSCEV(ptrA);
> + const SCEV *ptrSCEVB = SE->getSCEV(ptrB);
> + const SCEV *offsetSCEV = SE->getMinusSCEV(ptrSCEVA, ptrSCEVB);
> + const SCEVConstant *constOffSCEV = dyn_cast<SCEVConstant>(offsetSCEV);
> +
> + // Non constant distance.
> + if (!constOffSCEV) return false;
> +
> + int64_t offset = constOffSCEV->getValue()->getSExtValue();
> + Type *Ty = cast<PointerType>(ptrA->getType())->getElementType();
> + // The instructions are consecutive if the size of the first
> + // load/store equals the byte distance between the two pointers.
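> + // (For example, with float loads where ptrB == ptrA + 4, the offset
> + // A - B is -4 and the store size is 4, so -offset == sz holds.)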
> + int64_t sz = TD->getTypeStoreSize(Ty);
> + return ((-offset) == sz);
> + }
> +
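> + // mergeLoad: replace the collected scalar loads with one <N x Ty>
> + // vector load inserted at the first load, then feed every original
> + // use from an extractelement of the vector value.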
> + void GenLoadStoreOptimization::mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 4> &merged) {
> + IRBuilder<> Builder(&BB);
> +
> + unsigned size = merged.size();
> + SmallVector<Value *, 4> values;
> + for(unsigned i = 0; i < size; i++) {
> + values.push_back(merged[i]);
> + }
> + LoadInst *ld = cast<LoadInst>(merged[0]);
> + unsigned align = ld->getAlignment();
> + unsigned addrSpace = ld->getPointerAddressSpace();
> + // insert before first load
> + Builder.SetInsertPoint(ld);
> + VectorType *vecTy = VectorType::get(ld->getType(), size);
> + Value *vecPtr = Builder.CreateBitCast(ld->getPointerOperand(),
> + PointerType::get(vecTy, addrSpace));
> + LoadInst *vecValue = Builder.CreateLoad(vecPtr);
> + vecValue->setAlignment(align);
> +
> + for (unsigned i = 0; i < size; ++i) {
> + Value *S = Builder.CreateExtractElement(vecValue, Builder.getInt32(i));
> + values[i]->replaceAllUsesWith(S);
> + }
> + }
> +
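> + // findConsecutiveAccess: starting from 'start', collect up to 4 loads
> + // (or stores) whose addresses are pairwise consecutive, scanning at
> + // most maxLimit instructions and stopping at the first opposing
> + // memory access to keep the original read/write order. Returns the
> + // iterator from which the caller should resume its scan.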
> + BasicBlock::iterator
> + GenLoadStoreOptimization::findConsecutiveAccess(BasicBlock &BB,
> + SmallVector<Instruction*, 4> &merged,
> + BasicBlock::iterator &start,
> + unsigned maxLimit,
> + bool isLoad) {
> +
> + BasicBlock::iterator stepForward = start;
> + if(!isSimpleLoadStore(start)) return stepForward;
> +
> + merged.push_back(start);
> +
> + BasicBlock::iterator E = BB.end();
> + BasicBlock::iterator J = ++start;
> +
> + for(unsigned ss = 0; J != E && ss <= maxLimit; ++ss, ++J) {
> + if((isLoad && isa<LoadInst>(*J)) || (!isLoad && isa<StoreInst>(*J))) {
> + if(isLoadStoreCompatible(merged[merged.size()-1], J)) {
> + merged.push_back(J);
> + stepForward = ++J;
> + }
> + } else if((isLoad && isa<StoreInst>(*J)) || (!isLoad && isa<LoadInst>(*J))) {
> + // simple stop to keep read/write order
> + break;
> + }
> +
> + if(merged.size() >= 4) break;
> + }
> + return stepForward;
> + }
> +
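> + // mergeStore: gather the scalar values into a vector with
> + // insertelement and emit a single vector store, placed at the last
> + // original store so that all stored values are already available.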
> + void GenLoadStoreOptimization::mergeStore(BasicBlock &BB, SmallVector<Instruction*, 4> &merged) {
> + IRBuilder<> Builder(&BB);
> +
> + unsigned size = merged.size();
> + SmallVector<Value *, 4> values;
> + for(unsigned i = 0; i < size; i++) {
> + values.push_back(cast<StoreInst>(merged[i])->getValueOperand());
> + }
> + StoreInst *st = cast<StoreInst>(merged[0]);
> + unsigned addrSpace = st->getPointerAddressSpace();
> +
> + unsigned align = st->getAlignment();
> + // insert before the last store
> + Builder.SetInsertPoint(merged[size-1]);
> +
> + Type *dataTy = st->getValueOperand()->getType();
> + VectorType *vecTy = VectorType::get(dataTy, size);
> + Value * parent = UndefValue::get(vecTy);
> + for(unsigned i = 0; i < size; i++) {
> + parent = Builder.CreateInsertElement(parent, values[i], Builder.getInt32(i));
> + }
> +
> + Value *newPtr = Builder.CreateBitCast(st->getPointerOperand(), PointerType::get(vecTy, addrSpace));
> + StoreInst *newST = Builder.CreateStore(parent, newPtr);
> + newST->setAlignment(align);
> + }
> +
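> + // optimizeLoadStore: walk the block, and for each scalar float/i32
> + // load or store try to grow a run of consecutive accesses; any run
> + // longer than one is rewritten as a single vector access.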
> + bool GenLoadStoreOptimization::optimizeLoadStore(BasicBlock &BB) {
> + bool changed = false;
> + SmallVector<Instruction*, 4> merged;
> + for (BasicBlock::iterator BBI = BB.begin(), E = BB.end(); BBI != E; ++BBI) {
> + if(isa<LoadInst>(*BBI) || isa<StoreInst>(*BBI)) {
> + bool isLoad = isa<LoadInst>(*BBI);
> + Type *ty = getValueType(BBI);
> + if(ty->isVectorTy()) continue;
> + // we only support DWORD data type merge
> + if(!ty->isFloatTy() && !ty->isIntegerTy(32)) continue;
> + BBI = findConsecutiveAccess(BB, merged, BBI, 10, isLoad);
> + if(merged.size() > 1) {
> + if(isLoad)
> + mergeLoad(BB, merged);
> + else
> + mergeStore(BB, merged);
> + // remove merged insn
> + int size = merged.size();
> + for(int i = 0; i < size; i++)
> + merged[i]->eraseFromParent();
> + changed = true;
> + }
> + merged.clear();
> + }
> + }
> + return changed;
> + }
> +
> + BasicBlockPass *createLoadStoreOptimizationPass() {
> + return new GenLoadStoreOptimization();
> + }
> +};
> +
> diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
> index 37a5b2b..9282b3f 100644
> --- a/backend/src/llvm/llvm_to_gen.cpp
> +++ b/backend/src/llvm/llvm_to_gen.cpp
> @@ -187,7 +187,7 @@ namespace gbe
> runModulePass(mod, libraryInfo, optLevel);
>
> llvm::PassManager passes;
> -
> + passes.add(new DataLayout(&mod));
> // Print the code before further optimizations
> if (OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS)
> #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
> @@ -198,6 +198,7 @@ namespace gbe
> passes.add(createIntrinsicLoweringPass());
> passes.add(createFunctionInliningPass(200000));
> passes.add(createScalarReplAggregatesPass()); // Break up allocas
> + passes.add(createLoadStoreOptimizationPass());
> passes.add(createRemoveGEPPass(unit));
> passes.add(createConstantPropagationPass());
> passes.add(createLowerSwitchPass());
> --
> 1.7.10.4
>
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet