[Beignet] [PATCH 2/2] GBE: optimize unaligned char and short data vector's load.

Mon Sep 1 21:36:59 PDT 2014

On Wed, Aug 27, 2014 at 12:12:44PM +0800, Zhigang Gong wrote:
> Gathering the contiguous short/char loads into a single load instruction
> gives us a good opportunity to use untyped loads to optimize them.
> 
> This patch enables short/char load gathering in the load/store optimization
> pass. Then, in the backend, it loads the corresponding DWORDs and converts them
> to short/char accordingly by applying shift and bitwise operations.
> 
> The benchmark shows that, for vload4/8/16 char or vload2/4/8/16 short, this patch
> brings about an 80%-100% improvement.
> 
> Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
> ---
>  backend/src/backend/gen_insn_selection.cpp       | 154 ++++++++++++++++++++---
>  backend/src/llvm/llvm_gen_backend.cpp            |  14 ++-
>  backend/src/llvm/llvm_loadstore_optimization.cpp |  56 +++++----
>  3 files changed, 178 insertions(+), 46 deletions(-)
> 
> diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
> index b7a39af..8478616 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -2843,11 +2843,97 @@ namespace gbe
>          sel.pop();
>      }
>  
> -    void emitByteGather(Selection::Opaque &sel,
> -                        const ir::LoadInstruction &insn,
> -                        const uint32_t elemSize,
> -                        GenRegister address,
> -                        ir::BTI bti) const
> +    // The address is dw aligned.
> +    void emitAlignedByteGather(Selection::Opaque &sel,
> +                               const ir::LoadInstruction &insn,
> +                               const uint32_t elemSize,
> +                               GenRegister address,
> +                               ir::BTI bti) const
> +    {
> +      using namespace ir;
> +      const uint32_t valueNum = insn.getValueNum();
> +      const uint32_t simdWidth = sel.isScalarReg(insn.getValue(0)) ?
> +                                 1 : sel.ctx.getSimdWidth();
> +      RegisterFamily family = getFamily(insn.getValueType());
> +
> +      vector<GenRegister> dst(valueNum);
> +      const uint32_t typeSize = getFamilySize(family);
> +
> +      for(uint32_t i = 0; i < valueNum; i++)
> +        dst[i] = sel.selReg(insn.getValue(i), getType(family));
> +
> +      uint32_t tmpRegNum = typeSize*valueNum / 4;
> +      if (tmpRegNum == 0)
> +        tmpRegNum = 1;
> +      vector<GenRegister> tmp(tmpRegNum);
> +      vector<GenRegister> tmp2(tmpRegNum);
> +      vector<Register> tmpReg(tmpRegNum);
> +      for(uint32_t i = 0; i < tmpRegNum; i++) {
> +        tmpReg[i] = sel.reg(FAMILY_DWORD);
> +        tmp2[i] = tmp[i] = GenRegister::udxgrf(simdWidth, tmpReg[i]);
> +      }
> +
> +      readDWord(sel, tmp, tmp2, address, tmpRegNum, insn.getAddressSpace(), bti);
> +
> +      if (valueNum > 1) {
> +        for(uint32_t i = 0; i < tmpRegNum; i++)
> +          sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, tmp[i], 4/typeSize);
> +      }
> +      else {
> +        if (elemSize == GEN_BYTE_SCATTER_WORD)
> +          sel.MOV(GenRegister::retype(dst[0], GEN_TYPE_UW), sel.unpacked_uw(tmpReg[0]));
> +        else if (elemSize == GEN_BYTE_SCATTER_BYTE)
> +          sel.MOV(GenRegister::retype(dst[0], GEN_TYPE_UB), sel.unpacked_ub(tmpReg[0]));
> +      }
> +    }
> +
> +    // Gather effect data to the effectData vector from the tmp vector.
> +    //  x x d0 d1 | d2 d3 d4 d5 | ... ==> d0 d1 d2 d3 | d4 d5 ...
> +    void getEffectByteData(Selection::Opaque &sel,
> +                           vector<GenRegister> &effectData,
> +                           vector<GenRegister> &tmp,
> +                           uint32_t effectDataNum,
> +                           GenRegister addr,
> +                           uint32_t simdWidth) const
> +    {
> +      using namespace ir;
> +      GBE_ASSERT(effectData.size() == effectDataNum);
> +      GBE_ASSERT(tmp.size() == effectDataNum + 1);
> +      sel.push();
> +        sel.curr.noMask = 1;
> +        for(uint32_t i = 0; i < effectDataNum; i++) {
> +          GenRegister tmpH = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
> +          GenRegister tmpL = effectData[i];
> +          GenRegister shift = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
> +          Register shift1Reg = sel.reg(FAMILY_DWORD);
> +          GenRegister shift1 = GenRegister::udxgrf(simdWidth, shift1Reg);
> +          GenRegister factor = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
> +          sel.AND(shift, GenRegister::retype(addr, GEN_TYPE_UD), GenRegister::immud(0x3));
> +          sel.SHL(shift, shift, GenRegister::immud(0x3));
> +          sel.SHR(tmpL, tmp[i], shift);
> +          sel.ADD(shift1, GenRegister::negate(shift), GenRegister::immud(32));
> +          sel.push();
> +            // Only need to consider the tmpH when the shift is not 32.
> +            Register flag = sel.reg(FAMILY_BOOL);
> +            sel.curr.physicalFlag = 0;
> +            sel.curr.modFlag = 1;
> +            sel.curr.predicate = GEN_PREDICATE_NONE;
> +            sel.curr.flagIndex = (uint16_t)flag;
> +            sel.CMP(GEN_CONDITIONAL_NEQ, GenRegister::unpacked_uw(shift1Reg), GenRegister::immuw(32), factor);
> +            sel.curr.modFlag = 0;
> +            sel.curr.predicate = GEN_PREDICATE_NORMAL;
> +            sel.SHL(tmpH, tmp[i + 1], shift1);
> +            sel.OR(effectData[i], tmpL, tmpH);
> +          sel.pop();
> +        }
> +      sel.pop();
> +    }
> +
> +    void emitUnalignedByteGather(Selection::Opaque &sel,
> +                                 const ir::LoadInstruction &insn,
> +                                 const uint32_t elemSize,
> +                                 GenRegister address,
> +                                 ir::BTI bti) const
>      {
>        using namespace ir;
>        const uint32_t valueNum = insn.getValueNum();
> @@ -2862,17 +2948,45 @@ namespace gbe
>          for(uint32_t i = 0; i < valueNum; i++)
>            dst[i] = sel.selReg(insn.getValue(i), getType(family));
>  
> -        uint32_t tmpRegNum = typeSize*valueNum / 4;
> -        vector<GenRegister> tmp(tmpRegNum);
> -        vector<GenRegister> tmp2(tmpRegNum);
> -        for(uint32_t i = 0; i < tmpRegNum; i++) {
> +        uint32_t effectDataNum = typeSize*valueNum / 4;
> +        vector<GenRegister> tmp(effectDataNum + 1);
> +        vector<GenRegister> tmp2(effectDataNum + 1);
> +        vector<GenRegister> effectData(effectDataNum);
> +        for(uint32_t i = 0; i < effectDataNum + 1; i++)
>            tmp2[i] = tmp[i] = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
> -        }
>  
> -        readDWord(sel, tmp, tmp2, address, tmpRegNum, insn.getAddressSpace(), bti);
> +        GenRegister alignedAddr = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
> +        sel.push();
> +          if (simdWidth == 1)
> +            sel.curr.noMask = 1;
> +          sel.AND(alignedAddr, GenRegister::retype(address, GEN_TYPE_UD), GenRegister::immud(~0x3));
> +        sel.pop();
>  
> -        for(uint32_t i = 0; i < tmpRegNum; i++) {
> -          sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, tmp[i], 4/typeSize);
> +        uint32_t remainedReg = effectDataNum + 1;
> +        uint32_t pos = 0;
> +        do {
> +          uint32_t width = remainedReg > 4 ? 4 : remainedReg;
> +          vector<GenRegister> t1(tmp.begin() + pos, tmp.begin() + pos + width);
> +          vector<GenRegister> t2(tmp2.begin() + pos, tmp2.begin() + pos + width);
> +          if (pos != 0) {
> +            sel.push();
> +              if (simdWidth == 1)
> +                sel.curr.noMask = 1;
> +              sel.ADD(alignedAddr, alignedAddr, GenRegister::immud(pos * 4));
> +            sel.pop();
> +          }
> +          readDWord(sel, t1, t2, alignedAddr, width, insn.getAddressSpace(), bti);
> +          remainedReg -= width;
> +          pos += width;
> +        } while(remainedReg);
> +
> +        for(uint32_t i = 0; i < effectDataNum; i++)
> +          effectData[i] = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
> +
> +        getEffectByteData(sel, effectData, tmp, effectDataNum, address, simdWidth);
> +
> +        for(uint32_t i = 0; i < effectDataNum; i++) {
> +          sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, effectData[i], 4/typeSize);
>          }
>        } else {
>          GBE_ASSERT(insn.getValueNum() == 1);
> @@ -2954,17 +3068,19 @@ namespace gbe
>            this->emitRead64(sel, insn, address, bti);
>          else if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
>            this->emitDWordGather(sel, insn, address, bti);
> -        else {
> -          this->emitByteGather(sel, insn, elemSize, address, bti);
> -        }
> +        else if (insn.isAligned() == true)
> +          this->emitAlignedByteGather(sel, insn, elemSize, address, bti);
> +        else
> +          this->emitUnalignedByteGather(sel, insn, elemSize, address, bti);
>        } else {
>          if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
>            this->emitRead64(sel, insn, address, bti);
>          else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
>            this->emitUntypedRead(sel, insn, address, bti);
> -        else {
> -          this->emitByteGather(sel, insn, elemSize, address, bti);
> -        }
> +        else if (insn.isAligned())
> +          this->emitAlignedByteGather(sel, insn, elemSize, address, bti);
> +        else
> +          this->emitUnalignedByteGather(sel, insn, elemSize, address, bti);
>        }
>        return true;
>      }
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
> index 3a46951..b956bc6 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -614,7 +614,8 @@ namespace gbe
>      // batch vec4/8/16 load/store
>      INLINE void emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum,
>                    Value *llvmValue, const ir::Register ptr,
> -                  const ir::AddressSpace addrSpace, Type * elemType, bool isLoad, ir::BTI bti);
> +                  const ir::AddressSpace addrSpace, Type * elemType, bool isLoad, ir::BTI bti,
> +                  bool dwAligned);
>      void visitInstruction(Instruction &I) {NOT_SUPPORTED;}
>      private:
>        ir::ImmediateIndex processConstantImmIndexImpl(Constant *CPV, int32_t index = 0u);
> @@ -3290,7 +3291,8 @@ handle_write_image:
>    void GenWriter::emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum,
>                                        Value *llvmValues, const ir::Register ptr,
>                                        const ir::AddressSpace addrSpace,
> -                                      Type * elemType, bool isLoad, ir::BTI bti) {
> +                                      Type * elemType, bool isLoad, ir::BTI bti,
> +                                      bool dwAligned) {
>      const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
>      uint32_t totalSize = elemNum * getFamilySize(getFamily(type));
>      uint32_t msgNum = totalSize > 16 ? totalSize / 16 : 1;
> @@ -3336,9 +3338,9 @@ handle_write_image:
>  
>        // Emit the instruction
>        if (isLoad)
> -        ctx.LOAD(type, tuple, addr, addrSpace, perMsgNum, true, bti);
> +        ctx.LOAD(type, tuple, addr, addrSpace, perMsgNum, dwAligned, bti);
>        else
> -        ctx.STORE(type, tuple, addr, addrSpace, perMsgNum, true, bti);
> +        ctx.STORE(type, tuple, addr, addrSpace, perMsgNum, dwAligned, bti);
>      }
>    }
>  
> @@ -3510,11 +3512,11 @@ handle_write_image:
>          // Not supported by the hardware. So, we split the message and we use
>          // strided loads and stores
>          else {
> -          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding);
> +          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding, dwAligned);
>          }
>        }
>        else if((dataFamily==ir::FAMILY_WORD && elemNum%2==0) || (dataFamily == ir::FAMILY_BYTE && elemNum%4 == 0)) {
> -          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding);
> +          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding, dwAligned);
>        } else {
>          for (uint32_t elemID = 0; elemID < elemNum; elemID++) {
>            if(regTranslator.isUndefConst(llvmValues, elemID))
> diff --git a/backend/src/llvm/llvm_loadstore_optimization.cpp b/backend/src/llvm/llvm_loadstore_optimization.cpp
> index 4bfc7f6..19726b0 100644
> --- a/backend/src/llvm/llvm_loadstore_optimization.cpp
> +++ b/backend/src/llvm/llvm_loadstore_optimization.cpp
> @@ -87,12 +87,12 @@ namespace gbe {
>      bool     optimizeLoadStore(BasicBlock &BB);
>  
>      bool     isLoadStoreCompatible(Value *A, Value *B);
> -    void     mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 4> &merged);
> -    void     mergeStore(BasicBlock &BB, SmallVector<Instruction*, 4> &merged);
> +    void     mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 16> &merged);
> +    void     mergeStore(BasicBlock &BB, SmallVector<Instruction*, 16> &merged);
>      BasicBlock::iterator findConsecutiveAccess(BasicBlock &BB,
> -                                               SmallVector<Instruction*, 4> &merged,
> +                                               SmallVector<Instruction*, 16> &merged,
>                                                 BasicBlock::iterator &start,
> -                                               unsigned maxLimit,
> +                                               unsigned maxVecSize,
>                                                 bool isLoad);
>  
>      virtual const char *getPassName() const {
> @@ -154,11 +154,11 @@ namespace gbe {
>      return ((-offset) == sz);
>    }
>  
> -  void GenLoadStoreOptimization::mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 4> &merged) {
> +  void GenLoadStoreOptimization::mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 16> &merged) {
>      IRBuilder<> Builder(&BB);
>  
>      unsigned size = merged.size();
> -    SmallVector<Value *, 4> values;
> +    SmallVector<Value *, 16> values;
>      for(unsigned i = 0; i < size; i++) {
>        values.push_back(merged[i]);
>      }
> @@ -169,7 +169,7 @@ namespace gbe {
>      Builder.SetInsertPoint(ld);
>      VectorType *vecTy = VectorType::get(ld->getType(), size);
>      Value *vecPtr = Builder.CreateBitCast(ld->getPointerOperand(),
> -                                          PointerType::get(vecTy, addrSpace));
> +                                        PointerType::get(vecTy, addrSpace));
>      LoadInst *vecValue = Builder.CreateLoad(vecPtr);
>      vecValue->setAlignment(align);
>  
> @@ -181,9 +181,9 @@ namespace gbe {
>  
>    BasicBlock::iterator
>    GenLoadStoreOptimization::findConsecutiveAccess(BasicBlock &BB,
> -                            SmallVector<Instruction*, 4> &merged,
> +                            SmallVector<Instruction*, 16> &merged,
>                              BasicBlock::iterator &start,
> -                            unsigned maxLimit,
> +                            unsigned maxVecSize,
>                              bool isLoad) {
>  
>      BasicBlock::iterator stepForward = start;
> @@ -194,6 +194,8 @@ namespace gbe {
>      BasicBlock::iterator E = BB.end();
>      BasicBlock::iterator J = ++start;
>  
> +    unsigned maxLimit = maxVecSize * 3;
> +
>      for(unsigned ss = 0; J != E && ss <= maxLimit; ++ss, ++J) {
>        if((isLoad && isa<LoadInst>(*J)) || (!isLoad && isa<StoreInst>(*J))) {
>          if(isLoadStoreCompatible(merged[merged.size()-1], J)) {
> @@ -205,12 +207,12 @@ namespace gbe {
>          break;
>        }
>  
> -      if(merged.size() >= 4) break;
> +      if(merged.size() > maxVecSize) break;
Tony pointed out that this should be merged.size() >= maxVecSize; I have already fixed it locally. Thanks, Tony.

>      }
>      return stepForward;
>    }
>  
> -  void GenLoadStoreOptimization::mergeStore(BasicBlock &BB, SmallVector<Instruction*, 4> &merged) {
> +  void GenLoadStoreOptimization::mergeStore(BasicBlock &BB, SmallVector<Instruction*, 16> &merged) {
>      IRBuilder<> Builder(&BB);
>  
>      unsigned size = merged.size();
> @@ -239,25 +241,37 @@ namespace gbe {
>  
>    bool GenLoadStoreOptimization::optimizeLoadStore(BasicBlock &BB) {
>      bool changed = false;
> -    SmallVector<Instruction*, 4> merged;
> +    SmallVector<Instruction*, 16> merged;
>      for (BasicBlock::iterator BBI = BB.begin(), E = BB.end(); BBI != E;++BBI) {
>        if(isa<LoadInst>(*BBI) || isa<StoreInst>(*BBI)) {
>          bool isLoad = isa<LoadInst>(*BBI) ? true: false;
>          Type *ty = getValueType(BBI);
>          if(ty->isVectorTy()) continue;
> -        // we only support DWORD data type merge
> -        if(!ty->isFloatTy() && !ty->isIntegerTy(32)) continue;
> -        BBI = findConsecutiveAccess(BB, merged, BBI, 10, isLoad);
> -        if(merged.size() > 1) {
> +        // TODO Support DWORD/WORD/BYTE LOAD for store support DWORD only now.
> +        if (!(ty->isFloatTy() || ty->isIntegerTy(32) ||
> +             ((ty->isIntegerTy(8) || ty->isIntegerTy(16)) && isLoad)))
> +          continue;
> +        unsigned maxVecSize = (ty->isFloatTy() || ty->isIntegerTy(32)) ? 4 :
> +                              (ty->isIntegerTy(16) ? 8 : 16);
> +        BBI = findConsecutiveAccess(BB, merged, BBI, maxVecSize, isLoad);
> +        uint32_t size = merged.size();
> +        uint32_t pos = 0;
> +        while(size > 1) {
> +          unsigned vecSize = (size >= 16) ? 16 :
> +                             (size >= 8 ? 8 :
> +                             (size >= 4 ? 4 :
> +                             (size >= 2 ? 2 : size)));
> +          SmallVector<Instruction*, 16> mergedVec(merged.begin() + pos, merged.begin() + pos + vecSize);
>            if(isLoad)
> -            mergeLoad(BB, merged);
> +            mergeLoad(BB, mergedVec);
>            else
> -            mergeStore(BB, merged);
> +            mergeStore(BB, mergedVec);
>            // remove merged insn
> -          int size = merged.size();
> -          for(int i = 0; i < size; i++)
> -            merged[i]->eraseFromParent();
> +          for(uint32_t i = 0; i < mergedVec.size(); i++)
> +            mergedVec[i]->eraseFromParent();
>            changed = true;
> +          pos += vecSize;
> +          size -= vecSize;
>          }
>          merged.clear();
>        }
> -- 
> 1.8.3.2
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet