[Beignet] [PATCH V2] backend: refine load/store merging algorithm
Song, Ruiling
ruiling.song at intel.com
Thu Jun 22 06:29:26 UTC 2017
LGTM
Ruiling
> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> rander.wang
> Sent: Friday, June 16, 2017 9:50 AM
> To: beignet at freedesktop.org
> Cc: Wang, Rander <rander.wang at intel.com>
> Subject: [Beignet] [PATCH V2] backend: refine load/store merging algorithm
>
> Currently the pass handles the sequence load(0), load(1), load(2),
> but it cannot handle load(2), load(0), load(1), because each new load
> is only compared with the last merged load instead of with all the
> loads collected so far.
>
> For the sequence load(0), load(1), load(2): load(0) is the start, and
> load(1) is found to be its immediate successor with no gap, so it is
> pushed into the merge FIFO. The head then moves to load(1), which is
> compared with load(2), so load(2) can be merged as well.
>
> For load(2), load(0), load(1): load(2) cannot be merged with load(0)
> because there is a gap between them, so load(0) is skipped and the
> search moves on to load(1), which can be merged. But the pass never
> goes back to merge load(0).
>
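A standalone toy model of that pre-patch behaviour, for illustration only
(plain integer offsets stand in for the loads; none of this is the driver
code itself):

    #include <cstdlib>
    #include <vector>

    // Pre-patch idea: each candidate is compared only against the last merged
    // access, so an access that was skipped earlier is never revisited.
    int main() {
      std::vector<int> merged = {2};              // start from load(2)
      for (int off : {0, 1}) {                    // remaining loads in program order
        if (std::abs(off - merged.back()) == 1)   // adjacency test against the tail only
          merged.push_back(off);                  // load(1) is accepted here ...
      }                                           // ... but load(0) is never reconsidered
      return merged.size() == 3 ? 0 : 1;          // exits with 1: the full run was missed
    }

Starting from load(2), only load(1) ever passes the tail check, which is
exactly the lost merge described above.
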
> Now change the algorithm:
> (1) Find all loads around the start that might be merged, based on their
> distance to the start. The distance depends on the data type; for 32-bit
> data it is 4 bytes. Put them in a list.
>
> (2) Sort the list by the distance from the start.
>
> (3) Search the sorted list for a continuous sequence that includes the
> start and merge it (a standalone sketch of these steps follows the V2
> notes below).
>
> V2: (1) Refine the sort-and-compare algorithm: first collect all the
> accesses whose offset from the start is small, then call std::sort.
> (2) Check the number of candidate accesses up front, which helps
> performance because in most cases there is nothing to merge.
>
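For reference, here is a minimal standalone sketch of the three steps above,
using plain byte offsets relative to the start instead of real LLVM
instructions (pickRun, and the use of bare ints, are illustrative only, not
the patch code itself):

    #include <algorithm>
    #include <cstdlib>
    #include <vector>

    // Step 1: keep only accesses whose distance to the start is small enough.
    // Step 2: sort the candidates by their offset from the start.
    // Step 3: return the first consecutive run that contains the start (offset 0).
    std::vector<int> pickRun(const std::vector<int> &offsets,
                             int elementSize, int maxVecSize) {
      std::vector<int> cand;
      for (int o : offsets)
        if (std::abs(o) < elementSize * maxVecSize)
          cand.push_back(o);
      std::sort(cand.begin(), cand.end());
      for (size_t i = 0; i < cand.size(); ++i) {
        size_t j = i;
        bool hasStart = (cand[i] == 0);
        while (j + 1 < cand.size() &&
               j - i + 1 < (size_t)maxVecSize &&
               cand[j + 1] - cand[j] == elementSize) {
          ++j;
          hasStart = hasStart || (cand[j] == 0);
        }
        if (j > i && hasStart)                    // a real run that includes the start
          return std::vector<int>(cand.begin() + i, cand.begin() + j + 1);
      }
      return {0};                                 // nothing mergeable, keep the start alone
    }

With load(2), load(0), load(1) and 32-bit data, the offsets relative to the
start load(2) are {0, -8, -4}; pickRun({0, -8, -4}, 4, 4) returns {-8, -4, 0},
so all three loads merge even though load(0) appears after load(2) in the IR.
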
> Signed-off-by: rander.wang <rander.wang at intel.com>
> ---
> backend/src/llvm/llvm_loadstore_optimization.cpp | 87 +++++++++++++++++++++---
> 1 file changed, 78 insertions(+), 9 deletions(-)
>
> diff --git a/backend/src/llvm/llvm_loadstore_optimization.cpp b/backend/src/llvm/llvm_loadstore_optimization.cpp
> index 5aa38be..c91c1a0 100644
> --- a/backend/src/llvm/llvm_loadstore_optimization.cpp
> +++ b/backend/src/llvm/llvm_loadstore_optimization.cpp
> @@ -67,7 +67,7 @@ namespace gbe {
> bool isSimpleLoadStore(Value *I);
> bool optimizeLoadStore(BasicBlock &BB);
>
> - bool isLoadStoreCompatible(Value *A, Value *B);
> + bool isLoadStoreCompatible(Value *A, Value *B, int *dist, int* elementSize, int maxVecSize);
> void mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 16> &merged);
> void mergeStore(BasicBlock &BB, SmallVector<Instruction*, 16> &merged);
> bool findConsecutiveAccess(BasicBlock &BB,
> @@ -109,7 +109,7 @@ namespace gbe {
> return NULL;
> }
>
> - bool GenLoadStoreOptimization::isLoadStoreCompatible(Value *A, Value *B) {
> + bool GenLoadStoreOptimization::isLoadStoreCompatible(Value *A, Value *B, int *dist, int* elementSize, int maxVecSize) {
> Value *ptrA = getPointerOperand(A);
> Value *ptrB = getPointerOperand(B);
> unsigned ASA = getAddressSpace(A);
> @@ -136,7 +136,11 @@ namespace gbe {
> // The Instructions are connsecutive if the size of the first load/store is
> // the same as the offset.
> int64_t sz = TD->getTypeStoreSize(Ty);
> - return ((-offset) == sz);
> + *dist = -offset;
> + *elementSize = sz;
> +
> + // an insn within a small distance of the searched load/store is a merge candidate
> + return (abs(-offset) < sz*maxVecSize);
> }
>
> void GenLoadStoreOptimization::mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 16> &merged) {
> @@ -163,6 +167,25 @@ namespace gbe {
> values[i]->replaceAllUsesWith(S);
> }
> }
> +
> + class mergedInfo{
> + public:
> + Instruction* mInsn;
> + int mOffset;
> +
> + void init(Instruction* insn, int offset)
> + {
> + mInsn = insn;
> + mOffset = offset;
> + }
> + };
> +
> + struct offsetSorter {
> + bool operator()(mergedInfo* m0, mergedInfo* m1) const {
> + return m0->mOffset < m1->mOffset;
> + }
> + };
> +
> // When searching for consecutive memory access, we do it in a small window,
> // if the window is too large, it would take up too much compiling time.
> // An Important rule we have followed is don't try to change load/store order.
> @@ -177,7 +200,6 @@ namespace gbe {
>
> if(!isSimpleLoadStore(&*start)) return false;
>
> - merged.push_back(&*start);
> unsigned targetAddrSpace = getAddressSpace(&*start);
>
> BasicBlock::iterator E = BB.end();
> @@ -187,11 +209,27 @@ namespace gbe {
> unsigned maxLimit = maxVecSize * 8;
> bool reordered = false;
>
> - for(unsigned ss = 0; J != E && ss <= maxLimit; ++ss, ++J) {
> - if((isLoad && isa<LoadInst>(*J)) || (!isLoad && isa<StoreInst>(*J))) {
> - if(isLoadStoreCompatible(merged[merged.size()-1], &*J)) {
> - merged.push_back(&*J);
> - }
> + bool ready = false;
> + int elementSize;
> +
> + SmallVector<mergedInfo *, 16> searchInsnArray;
> + mergedInfo meInfoArray[16];
> + int indx = 0;
> + meInfoArray[indx++].init(&*start, 0);
> +
> + searchInsnArray.push_back(&meInfoArray[0]);
> + BasicBlock::iterator I = start;
> + ++I;
> +
> + for(unsigned ss = 0; I!= E && ss <= maxLimit; ++ss, ++I) {
> + if((isLoad && isa<LoadInst>(*I)) || (!isLoad && isa<StoreInst>(*J))) {
> + int distance;
> + if(isLoadStoreCompatible(searchInsnArray[0]->mInsn, &*I, &distance, &elementSize, maxVecSize))
> + {
> + meInfoArray[indx].init(&*I, distance);
> + searchInsnArray.push_back(&meInfoArray[indx]);
> + indx++;
> + }
> } else if((isLoad && isa<StoreInst>(*J))) {
> // simple stop to keep read/write order
> StoreInst *st = cast<StoreInst>(&*J);
> @@ -214,6 +252,37 @@ namespace gbe {
> if(merged.size() >= maxVecSize) break;
> }
>
> + if(indx > 1)
> + {
> + //try to sort the load/store candidates by their offset from the start;
> + //the farthest one comes first, which makes it easy to find a
> + //continuous sequence
> + std::sort(searchInsnArray.begin(), searchInsnArray.end(), offsetSorter());
> +
> + // try to find continuous loadstore insn in the candidate array
> + for(unsigned i = 0; i < searchInsnArray.size(); i++)
> + {
> + unsigned j;
> + for(j = 0; j < maxVecSize-1 && (j+i+1) < searchInsnArray.size(); j++)
> + {
> + if(searchInsnArray[i+j+1]->mOffset - searchInsnArray[i+j]->mOffset != elementSize)
> + break;
> +
> + //this means the searched load/store, whose offset is 0, is in the sequence
> + if(searchInsnArray[i+j]->mOffset == 0 || searchInsnArray[i+j+1]->mOffset == 0)
> + ready = true;
> + }
> +
> + if(j > 0 && ready)
> + {
> + for(unsigned k = 0; k < j+1; k++)
> + merged.push_back(searchInsnArray[i+k]->mInsn);
> +
> + break;
> + }
> + }
> + }
> +
> return reordered;
> }
>
> --
> 2.7.4
>
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/beignet