[Beignet] [PATCH] GBE: Support unaligned load/store of dword/qword in GenIR.

Thu Feb 26 22:55:56 PST 2015

LGTM, pushed, thanks.

On Fri, Feb 27, 2015 at 02:33:08PM +0800, Ruiling Song wrote:
> Although OpenCL does not allow unaligned loads/stores of dwords/qwords,
> LLVM may still generate such instructions, especially when a
> large-integer load/store is legalized into qword loads/stores at a
> possibly unaligned address. The implementation is simple:
> for a store, bitcast the d/q word to a vector of bytes before writing it out;
> for a load, load a vector of bytes and then bitcast them to a d/q word.
> 
> Signed-off-by: Ruiling Song <ruiling.song at intel.com>
> ---
>  backend/src/llvm/llvm_gen_backend.cpp |   76 +++++++++++++++++++++++++++++++++
>  1 file changed, 76 insertions(+)
> 
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
> index de846cb..c7bf153 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -651,6 +651,8 @@ namespace gbe
>                    Value *llvmValue, const ir::Register ptr,
>                    const ir::AddressSpace addrSpace, Type * elemType, bool isLoad, ir::BTI bti,
>                    bool dwAligned);
> +    // handle load/store of dword/qword with unaligned address
> +    void emitUnalignedDQLoadStore(Value *llvmPtr, Value *llvmValues, ir::AddressSpace addrSpace, ir::BTI &binding, bool isLoad, bool dwAligned);
>      void visitInstruction(Instruction &I) {NOT_SUPPORTED;}
>      private:
>        ir::ImmediateIndex processConstantImmIndexImpl(Constant *CPV, int32_t index = 0u);
> @@ -3931,6 +3933,67 @@ error:
>      }
>      GBE_ASSERT(bti.count <= MAX_MIXED_POINTER);
>    }
> +  // handle load/store of dword/qword with unaligned address
> +  void GenWriter::emitUnalignedDQLoadStore(Value *llvmPtr, Value *llvmValues, ir::AddressSpace addrSpace, ir::BTI &binding, bool isLoad, bool dwAligned)
> +  {
> +    Type *llvmType = llvmValues->getType();
> +    const ir::Type type = getType(ctx, llvmType);
> +    unsigned byteSize = getTypeByteSize(unit, llvmType);
> +    const ir::Register ptr = this->getRegister(llvmPtr);
> +
> +    Type *elemType = llvmType;
> +    unsigned elemNum = 1;
> +    if (!isScalarType(llvmType)) {
> +      VectorType *vectorType = cast<VectorType>(llvmType);
> +      elemType = vectorType->getElementType();
> +      elemNum = vectorType->getNumElements();
> +    }
> +
> +    vector<ir::Register> tupleData;
> +    for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
> +      ir::Register reg;
> +      if(regTranslator.isUndefConst(llvmValues, elemID)) {
> +        Value *v = Constant::getNullValue(elemType);
> +        reg = this->getRegister(v);
> +      } else
> +        reg = this->getRegister(llvmValues, elemID);
> +
> +      tupleData.push_back(reg);
> +    }
> +    const ir::Tuple tuple = ctx.arrayTuple(&tupleData[0], elemNum);
> +
> +    vector<ir::Register> byteTupleData;
> +    for (uint32_t elemID = 0; elemID < byteSize; ++elemID) {
> +      byteTupleData.push_back(ctx.reg(ir::FAMILY_BYTE));
> +    }
> +    const ir::Tuple byteTuple = ctx.arrayTuple(&byteTupleData[0], byteSize);
> +
> +    if (isLoad) {
> +      ctx.LOAD(ir::TYPE_U8, byteTuple, ptr, addrSpace, byteSize, dwAligned, binding);
> +      ctx.BITCAST(type, ir::TYPE_U8, tuple, byteTuple, elemNum, byteSize);
> +    } else {
> +      ctx.BITCAST(ir::TYPE_U8, type, byteTuple, tuple, byteSize, elemNum);
> +      // FIXME: byte scatter does not correctly handle vector stores; once that is fixed,
> +      //        we can directly use one store instruction like:
> +      //        ctx.STORE(ir::TYPE_U8, byteTuple, ptr, addrSpace, byteSize, dwAligned, binding);
> +      const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
> +      for (uint32_t elemID = 0; elemID < byteSize; elemID++) {
> +        const ir::Register reg = byteTupleData[elemID];
> +        ir::Register addr;
> +        if (elemID == 0)
> +          addr = ptr;
> +        else {
> +          const ir::Register offset = ctx.reg(pointerFamily);
> +          ir::ImmediateIndex immIndex;
> +          immIndex = ctx.newImmediate(int32_t(elemID));
> +          addr = ctx.reg(pointerFamily);
> +          ctx.LOADI(ir::TYPE_S32, offset, immIndex);
> +          ctx.ADD(ir::TYPE_S32, addr, ptr, offset);
> +        }
> +       ctx.STORE(type, addr, addrSpace, dwAligned, binding, reg);
> +      }
> +    }
> +  }
>  
>    extern int OCL_SIMD_WIDTH;
>    template <bool isLoad, typename T>
> @@ -3946,6 +4009,19 @@ error:
>      ir::BTI binding;
>      gatherBTI(&I, binding);
>  
> +    Type *scalarType = llvmType;
> +    if (!isScalarType(llvmType)) {
> +      VectorType *vectorType = cast<VectorType>(llvmType);
> +      scalarType = vectorType->getElementType();
> +    }
> +
> +    if (!dwAligned
> +       && (scalarType == IntegerType::get(I.getContext(), 64)
> +          || scalarType == IntegerType::get(I.getContext(), 32))
> +       ) {
> +      emitUnalignedDQLoadStore(llvmPtr, llvmValues, addrSpace, binding, isLoad, dwAligned);
> +      return;
> +    }
>      // Scalar is easy. We neednot build register tuples
>      if (isScalarType(llvmType) == true) {
>        const ir::Type type = getType(ctx, llvmType);
> -- 
> 1.7.10.4
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet