[Beignet] [PATCH 2/2] GBE: optimize unaligned char and short data vector's load.

Zhigang Gong zhigang.gong at linux.intel.com
Wed Aug 27 17:16:56 PDT 2014


You may notice that vload2/3 of char and vload3 of short haven't been optimized yet.
The reason is that our loadOrStore only batch-loads vectors whose total size is
an integer multiple of a DWORD, and the vector byte-gather function at the
instruction selection stage relies on that assumption.

I'm working on lifting that restriction so that vload2/3 of char and vload3 of
short get the same benefit from this optimization. That will come in another patch.
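
For concreteness, here is the kind of kernel this targets. It is an
illustrative OpenCL sketch, not a test case from the patch:

__kernel void example(__global const uchar *src,
                      __global uchar8 *dst8,
                      __global uchar3 *dst3)
{
    int gid = get_global_id(0);
    /* Covered: 8 chars == 2 DWORDs, so this becomes untyped DWORD
     * loads plus shift/bitwise unpacking. */
    dst8[gid] = vload8(gid, src);
    /* Not covered yet: 3 chars is not a multiple of the DWORD size,
     * so this still goes through per-byte gathers. */
    dst3[gid] = vload3(gid, src);
}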

Thanks,
Zhigang Gong

On Wed, Aug 27, 2014 at 12:12:44PM +0800, Zhigang Gong wrote:
> Gathering contiguous short/char loads into a single load instruction gives
> us a good opportunity to optimize them with untyped loads.
> 
> This patch enables short/char load gathering in the load/store optimization
> pass. The backend then loads the corresponding DWORDs and converts them to
> shorts/chars by applying shift and bitwise operations.
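
In scalar C terms the conversion amounts to the sketch below; the backend
emits the SIMD equivalent (UNPACK_BYTE plus strided MOVs) rather than this
code, so take it as an illustration of the idea only:

#include <stdint.h>

/* Unpack one little-endian DWORD into four chars using shifts and
 * masks; shorts work the same way with two 16-bit halves. */
static void unpack_dword_to_chars(uint32_t dw, uint8_t out[4])
{
    for (int i = 0; i < 4; i++)
        out[i] = (uint8_t)((dw >> (8 * i)) & 0xffu);
}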
> 
> Benchmarks show that, for vload4/8/16 of char and vload2/4/8/16 of short,
> this patch brings about an 80%-100% improvement.
> 
> Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
> ---
>  backend/src/backend/gen_insn_selection.cpp       | 154 ++++++++++++++++++++---
>  backend/src/llvm/llvm_gen_backend.cpp            |  14 ++-
>  backend/src/llvm/llvm_loadstore_optimization.cpp |  56 +++++----
>  3 files changed, 178 insertions(+), 46 deletions(-)
> 
> diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
> index b7a39af..8478616 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -2843,11 +2843,97 @@ namespace gbe
>          sel.pop();
>      }
>  
> -    void emitByteGather(Selection::Opaque &sel,
> -                        const ir::LoadInstruction &insn,
> -                        const uint32_t elemSize,
> -                        GenRegister address,
> -                        ir::BTI bti) const
> +    // The address is dw aligned.
> +    void emitAlignedByteGather(Selection::Opaque &sel,
> +                               const ir::LoadInstruction &insn,
> +                               const uint32_t elemSize,
> +                               GenRegister address,
> +                               ir::BTI bti) const
> +    {
> +      using namespace ir;
> +      const uint32_t valueNum = insn.getValueNum();
> +      const uint32_t simdWidth = sel.isScalarReg(insn.getValue(0)) ?
> +                                 1 : sel.ctx.getSimdWidth();
> +      RegisterFamily family = getFamily(insn.getValueType());
> +
> +      vector<GenRegister> dst(valueNum);
> +      const uint32_t typeSize = getFamilySize(family);
> +
> +      for(uint32_t i = 0; i < valueNum; i++)
> +        dst[i] = sel.selReg(insn.getValue(i), getType(family));
> +
> +      uint32_t tmpRegNum = typeSize*valueNum / 4;
> +      if (tmpRegNum == 0)
> +        tmpRegNum = 1;
> +      vector<GenRegister> tmp(tmpRegNum);
> +      vector<GenRegister> tmp2(tmpRegNum);
> +      vector<Register> tmpReg(tmpRegNum);
> +      for(uint32_t i = 0; i < tmpRegNum; i++) {
> +        tmpReg[i] = sel.reg(FAMILY_DWORD);
> +        tmp2[i] = tmp[i] = GenRegister::udxgrf(simdWidth, tmpReg[i]);
> +      }
> +
> +      readDWord(sel, tmp, tmp2, address, tmpRegNum, insn.getAddressSpace(), bti);
> +
> +      if (valueNum > 1) {
> +        for(uint32_t i = 0; i < tmpRegNum; i++)
> +          sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, tmp[i], 4/typeSize);
> +      }
> +      else {
> +        if (elemSize == GEN_BYTE_SCATTER_WORD)
> +          sel.MOV(GenRegister::retype(dst[0], GEN_TYPE_UW), sel.unpacked_uw(tmpReg[0]));
> +        else if (elemSize == GEN_BYTE_SCATTER_BYTE)
> +          sel.MOV(GenRegister::retype(dst[0], GEN_TYPE_UB), sel.unpacked_ub(tmpReg[0]));
> +      }
> +    }
> +
> +    // Gather the effective data from the tmp vector into the effectData vector.
> +    //  x x d0 d1 | d2 d3 d4 d5 | ... ==> d0 d1 d2 d3 | d4 d5 ...
> +    void getEffectByteData(Selection::Opaque &sel,
> +                           vector<GenRegister> &effectData,
> +                           vector<GenRegister> &tmp,
> +                           uint32_t effectDataNum,
> +                           GenRegister addr,
> +                           uint32_t simdWidth) const
> +    {
> +      using namespace ir;
> +      GBE_ASSERT(effectData.size() == effectDataNum);
> +      GBE_ASSERT(tmp.size() == effectDataNum + 1);
> +      sel.push();
> +        sel.curr.noMask = 1;
> +        for(uint32_t i = 0; i < effectDataNum; i++) {
> +          GenRegister tmpH = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
> +          GenRegister tmpL = effectData[i];
> +          GenRegister shift = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
> +          Register shift1Reg = sel.reg(FAMILY_DWORD);
> +          GenRegister shift1 = GenRegister::udxgrf(simdWidth, shift1Reg);
> +          GenRegister factor = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
> +          sel.AND(shift, GenRegister::retype(addr, GEN_TYPE_UD), GenRegister::immud(0x3));
> +          sel.SHL(shift, shift, GenRegister::immud(0x3));
> +          sel.SHR(tmpL, tmp[i], shift);
> +          sel.ADD(shift1, GenRegister::negate(shift), GenRegister::immud(32));
> +          sel.push();
> +            // Only need to consider the tmpH when the shift is not 32.
> +            Register flag = sel.reg(FAMILY_BOOL);
> +            sel.curr.physicalFlag = 0;
> +            sel.curr.modFlag = 1;
> +            sel.curr.predicate = GEN_PREDICATE_NONE;
> +            sel.curr.flagIndex = (uint16_t)flag;
> +            sel.CMP(GEN_CONDITIONAL_NEQ, GenRegister::unpacked_uw(shift1Reg), GenRegister::immuw(32), factor);
> +            sel.curr.modFlag = 0;
> +            sel.curr.predicate = GEN_PREDICATE_NORMAL;
> +            sel.SHL(tmpH, tmp[i + 1], shift1);
> +            sel.OR(effectData[i], tmpL, tmpH);
> +          sel.pop();
> +        }
> +      sel.pop();
> +    }
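
In scalar terms, getEffectByteData computes roughly the following per
effective DWORD. This is only a sketch of the idea, not the emitted GEN
sequence; the real code predicates away the tmpH term when the address is
already aligned, because a logical shift by 32 is undefined:

#include <stdint.h>

/* Rebuild one naturally aligned DWORD from two DWORDs read at the
 * rounded-down address. addr & 3 is the misalignment in bytes. */
static uint32_t effect_dword(uint32_t lo, uint32_t hi, uint32_t addr)
{
    uint32_t shift = (addr & 3u) * 8u;
    if (shift == 0)
        return lo;                 /* aligned: avoid hi << 32 */
    return (lo >> shift) | (hi << (32u - shift));
}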
> +
> +    void emitUnalignedByteGather(Selection::Opaque &sel,
> +                                 const ir::LoadInstruction &insn,
> +                                 const uint32_t elemSize,
> +                                 GenRegister address,
> +                                 ir::BTI bti) const
>      {
>        using namespace ir;
>        const uint32_t valueNum = insn.getValueNum();
> @@ -2862,17 +2948,45 @@ namespace gbe
>          for(uint32_t i = 0; i < valueNum; i++)
>            dst[i] = sel.selReg(insn.getValue(i), getType(family));
>  
> -        uint32_t tmpRegNum = typeSize*valueNum / 4;
> -        vector<GenRegister> tmp(tmpRegNum);
> -        vector<GenRegister> tmp2(tmpRegNum);
> -        for(uint32_t i = 0; i < tmpRegNum; i++) {
> +        uint32_t effectDataNum = typeSize*valueNum / 4;
> +        vector<GenRegister> tmp(effectDataNum + 1);
> +        vector<GenRegister> tmp2(effectDataNum + 1);
> +        vector<GenRegister> effectData(effectDataNum);
> +        for(uint32_t i = 0; i < effectDataNum + 1; i++)
>            tmp2[i] = tmp[i] = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
> -        }
>  
> -        readDWord(sel, tmp, tmp2, address, tmpRegNum, insn.getAddressSpace(), bti);
> +        GenRegister alignedAddr = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
> +        sel.push();
> +          if (simdWidth == 1)
> +            sel.curr.noMask = 1;
> +          sel.AND(alignedAddr, GenRegister::retype(address, GEN_TYPE_UD), GenRegister::immud(~0x3));
> +        sel.pop();
>  
> -        for(uint32_t i = 0; i < tmpRegNum; i++) {
> -          sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, tmp[i], 4/typeSize);
> +        uint32_t remainedReg = effectDataNum + 1;
> +        uint32_t pos = 0;
> +        do {
> +          uint32_t width = remainedReg > 4 ? 4 : remainedReg;
> +          vector<GenRegister> t1(tmp.begin() + pos, tmp.begin() + pos + width);
> +          vector<GenRegister> t2(tmp2.begin() + pos, tmp2.begin() + pos + width);
> +          if (pos != 0) {
> +            sel.push();
> +              if (simdWidth == 1)
> +                sel.curr.noMask = 1;
> +              sel.ADD(alignedAddr, alignedAddr, GenRegister::immud(pos * 4));
> +            sel.pop();
> +          }
> +          readDWord(sel, t1, t2, alignedAddr, width, insn.getAddressSpace(), bti);
> +          remainedReg -= width;
> +          pos += width;
> +        } while(remainedReg);
> +
> +        for(uint32_t i = 0; i < effectDataNum; i++)
> +          effectData[i] = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
> +
> +        getEffectByteData(sel, effectData, tmp, effectDataNum, address, simdWidth);
> +
> +        for(uint32_t i = 0; i < effectDataNum; i++) {
> +          sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, effectData[i], 4/typeSize);
>          }
>        } else {
>          GBE_ASSERT(insn.getValueNum() == 1);
> @@ -2954,17 +3068,19 @@ namespace gbe
>            this->emitRead64(sel, insn, address, bti);
>          else if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
>            this->emitDWordGather(sel, insn, address, bti);
> -        else {
> -          this->emitByteGather(sel, insn, elemSize, address, bti);
> -        }
> +        else if (insn.isAligned() == true)
> +          this->emitAlignedByteGather(sel, insn, elemSize, address, bti);
> +        else
> +          this->emitUnalignedByteGather(sel, insn, elemSize, address, bti);
>        } else {
>          if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
>            this->emitRead64(sel, insn, address, bti);
>          else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
>            this->emitUntypedRead(sel, insn, address, bti);
> -        else {
> -          this->emitByteGather(sel, insn, elemSize, address, bti);
> -        }
> +        else if (insn.isAligned())
> +          this->emitAlignedByteGather(sel, insn, elemSize, address, bti);
> +        else
> +          this->emitUnalignedByteGather(sel, insn, elemSize, address, bti);
>        }
>        return true;
>      }
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
> index 3a46951..b956bc6 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -614,7 +614,8 @@ namespace gbe
>      // batch vec4/8/16 load/store
>      INLINE void emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum,
>                    Value *llvmValue, const ir::Register ptr,
> -                  const ir::AddressSpace addrSpace, Type * elemType, bool isLoad, ir::BTI bti);
> +                  const ir::AddressSpace addrSpace, Type * elemType, bool isLoad, ir::BTI bti,
> +                  bool dwAligned);
>      void visitInstruction(Instruction &I) {NOT_SUPPORTED;}
>      private:
>        ir::ImmediateIndex processConstantImmIndexImpl(Constant *CPV, int32_t index = 0u);
> @@ -3290,7 +3291,8 @@ handle_write_image:
>    void GenWriter::emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum,
>                                        Value *llvmValues, const ir::Register ptr,
>                                        const ir::AddressSpace addrSpace,
> -                                      Type * elemType, bool isLoad, ir::BTI bti) {
> +                                      Type * elemType, bool isLoad, ir::BTI bti,
> +                                      bool dwAligned) {
>      const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
>      uint32_t totalSize = elemNum * getFamilySize(getFamily(type));
>      uint32_t msgNum = totalSize > 16 ? totalSize / 16 : 1;
> @@ -3336,9 +3338,9 @@ handle_write_image:
>  
>        // Emit the instruction
>        if (isLoad)
> -        ctx.LOAD(type, tuple, addr, addrSpace, perMsgNum, true, bti);
> +        ctx.LOAD(type, tuple, addr, addrSpace, perMsgNum, dwAligned, bti);
>        else
> -        ctx.STORE(type, tuple, addr, addrSpace, perMsgNum, true, bti);
> +        ctx.STORE(type, tuple, addr, addrSpace, perMsgNum, dwAligned, bti);
>      }
>    }
>  
> @@ -3510,11 +3512,11 @@ handle_write_image:
>          // Not supported by the hardware. So, we split the message and we use
>          // strided loads and stores
>          else {
> -          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding);
> +          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding, dwAligned);
>          }
>        }
>        else if((dataFamily==ir::FAMILY_WORD && elemNum%2==0) || (dataFamily == ir::FAMILY_BYTE && elemNum%4 == 0)) {
> -          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding);
> +          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding, dwAligned);
>        } else {
>          for (uint32_t elemID = 0; elemID < elemNum; elemID++) {
>            if(regTranslator.isUndefConst(llvmValues, elemID))
> diff --git a/backend/src/llvm/llvm_loadstore_optimization.cpp b/backend/src/llvm/llvm_loadstore_optimization.cpp
> index 4bfc7f6..19726b0 100644
> --- a/backend/src/llvm/llvm_loadstore_optimization.cpp
> +++ b/backend/src/llvm/llvm_loadstore_optimization.cpp
> @@ -87,12 +87,12 @@ namespace gbe {
>      bool     optimizeLoadStore(BasicBlock &BB);
>  
>      bool     isLoadStoreCompatible(Value *A, Value *B);
> -    void     mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 4> &merged);
> -    void     mergeStore(BasicBlock &BB, SmallVector<Instruction*, 4> &merged);
> +    void     mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 16> &merged);
> +    void     mergeStore(BasicBlock &BB, SmallVector<Instruction*, 16> &merged);
>      BasicBlock::iterator findConsecutiveAccess(BasicBlock &BB,
> -                                               SmallVector<Instruction*, 4> &merged,
> +                                               SmallVector<Instruction*, 16> &merged,
>                                                 BasicBlock::iterator &start,
> -                                               unsigned maxLimit,
> +                                               unsigned maxVecSize,
>                                                 bool isLoad);
>  
>      virtual const char *getPassName() const {
> @@ -154,11 +154,11 @@ namespace gbe {
>      return ((-offset) == sz);
>    }
>  
> -  void GenLoadStoreOptimization::mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 4> &merged) {
> +  void GenLoadStoreOptimization::mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 16> &merged) {
>      IRBuilder<> Builder(&BB);
>  
>      unsigned size = merged.size();
> -    SmallVector<Value *, 4> values;
> +    SmallVector<Value *, 16> values;
>      for(unsigned i = 0; i < size; i++) {
>        values.push_back(merged[i]);
>      }
> @@ -169,7 +169,7 @@ namespace gbe {
>      Builder.SetInsertPoint(ld);
>      VectorType *vecTy = VectorType::get(ld->getType(), size);
>      Value *vecPtr = Builder.CreateBitCast(ld->getPointerOperand(),
> -                                          PointerType::get(vecTy, addrSpace));
> +                                        PointerType::get(vecTy, addrSpace));
>      LoadInst *vecValue = Builder.CreateLoad(vecPtr);
>      vecValue->setAlignment(align);
>  
> @@ -181,9 +181,9 @@ namespace gbe {
>  
>    BasicBlock::iterator
>    GenLoadStoreOptimization::findConsecutiveAccess(BasicBlock &BB,
> -                            SmallVector<Instruction*, 4> &merged,
> +                            SmallVector<Instruction*, 16> &merged,
>                              BasicBlock::iterator &start,
> -                            unsigned maxLimit,
> +                            unsigned maxVecSize,
>                              bool isLoad) {
>  
>      BasicBlock::iterator stepForward = start;
> @@ -194,6 +194,8 @@ namespace gbe {
>      BasicBlock::iterator E = BB.end();
>      BasicBlock::iterator J = ++start;
>  
> +    unsigned maxLimit = maxVecSize * 3;
> +
>      for(unsigned ss = 0; J != E && ss <= maxLimit; ++ss, ++J) {
>        if((isLoad && isa<LoadInst>(*J)) || (!isLoad && isa<StoreInst>(*J))) {
>          if(isLoadStoreCompatible(merged[merged.size()-1], J)) {
> @@ -205,12 +207,12 @@ namespace gbe {
>          break;
>        }
>  
> -      if(merged.size() >= 4) break;
> +      if(merged.size() > maxVecSize) break;
>      }
>      return stepForward;
>    }
>  
> -  void GenLoadStoreOptimization::mergeStore(BasicBlock &BB, SmallVector<Instruction*, 4> &merged) {
> +  void GenLoadStoreOptimization::mergeStore(BasicBlock &BB, SmallVector<Instruction*, 16> &merged) {
>      IRBuilder<> Builder(&BB);
>  
>      unsigned size = merged.size();
> @@ -239,25 +241,37 @@ namespace gbe {
>  
>    bool GenLoadStoreOptimization::optimizeLoadStore(BasicBlock &BB) {
>      bool changed = false;
> -    SmallVector<Instruction*, 4> merged;
> +    SmallVector<Instruction*, 16> merged;
>      for (BasicBlock::iterator BBI = BB.begin(), E = BB.end(); BBI != E;++BBI) {
>        if(isa<LoadInst>(*BBI) || isa<StoreInst>(*BBI)) {
>          bool isLoad = isa<LoadInst>(*BBI) ? true: false;
>          Type *ty = getValueType(BBI);
>          if(ty->isVectorTy()) continue;
> -        // we only support DWORD data type merge
> -        if(!ty->isFloatTy() && !ty->isIntegerTy(32)) continue;
> -        BBI = findConsecutiveAccess(BB, merged, BBI, 10, isLoad);
> -        if(merged.size() > 1) {
> +        // TODO: support DWORD/WORD/BYTE loads; stores only support DWORD for now.
> +        if (!(ty->isFloatTy() || ty->isIntegerTy(32) ||
> +             ((ty->isIntegerTy(8) || ty->isIntegerTy(16)) && isLoad)))
> +          continue;
> +        unsigned maxVecSize = (ty->isFloatTy() || ty->isIntegerTy(32)) ? 4 :
> +                              (ty->isIntegerTy(16) ? 8 : 16);
> +        BBI = findConsecutiveAccess(BB, merged, BBI, maxVecSize, isLoad);
> +        uint32_t size = merged.size();
> +        uint32_t pos = 0;
> +        while(size > 1) {
> +          unsigned vecSize = (size >= 16) ? 16 :
> +                             (size >= 8 ? 8 :
> +                             (size >= 4 ? 4 :
> +                             (size >= 2 ? 2 : size)));
> +          SmallVector<Instruction*, 16> mergedVec(merged.begin() + pos, merged.begin() + pos + vecSize);
>            if(isLoad)
> -            mergeLoad(BB, merged);
> +            mergeLoad(BB, mergedVec);
>            else
> -            mergeStore(BB, merged);
> +            mergeStore(BB, mergedVec);
>            // remove merged insn
> -          int size = merged.size();
> -          for(int i = 0; i < size; i++)
> -            merged[i]->eraseFromParent();
> +          for(uint32_t i = 0; i < mergedVec.size(); i++)
> +            mergedVec[i]->eraseFromParent();
>            changed = true;
> +          pos += vecSize;
> +          size -= vecSize;
>          }
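
For example, eleven compatible consecutive char loads are emitted here as one
8-wide vector load followed by one 2-wide vector load, and the final access is
left as a plain scalar load since a one-element vector buys nothing.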
>          merged.clear();
>        }
> -- 
> 1.8.3.2
> 

