[Beignet] [patch v3 2/3] libocl: reimplement clz with lzd instruction instead of fbh.
Zhigang Gong
zhigang.gong at linux.intel.com
Sun Jan 25 22:34:02 PST 2015
As we discussed offline, please use llvm.ctlz directly in the clz()
builtin function. That way we don't need to implement __gen_ocl_lzd(),
which is a non-standard intrinsic, and we can avoid duplicating a lot
of code.
On Mon, Jan 26, 2015 at 02:57:46PM +0800, xionghu.luo at intel.com wrote:
> From: Luo Xionghu <xionghu.luo at intel.com>
>
> the fbh style is inefficient.
>
> Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
> ---
> backend/src/libocl/tmpl/ocl_integer.tmpl.cl | 86 +++++++----------------------
> backend/src/llvm/llvm_gen_backend.cpp | 76 +++++++++++++++++++++++++
> backend/src/llvm/llvm_gen_ocl_function.hxx | 1 +
> 3 files changed, 98 insertions(+), 65 deletions(-)
>
> diff --git a/backend/src/libocl/tmpl/ocl_integer.tmpl.cl b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
> index 6da0bab..36da959 100644
> --- a/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
> +++ b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
> @@ -19,6 +19,16 @@
>
> PURE CONST uint __gen_ocl_fbh(uint);
> PURE CONST uint __gen_ocl_fbl(uint);
> +
> +PURE CONST OVERLOADABLE ulong __gen_ocl_lzd(ulong);
> +PURE CONST OVERLOADABLE long __gen_ocl_lzd(long);
> +PURE CONST OVERLOADABLE uint __gen_ocl_lzd(uint);
> +PURE CONST OVERLOADABLE int __gen_ocl_lzd(int);
> +PURE CONST OVERLOADABLE ushort __gen_ocl_lzd(ushort);
> +PURE CONST OVERLOADABLE short __gen_ocl_lzd(short);
> +PURE CONST OVERLOADABLE uchar __gen_ocl_lzd(uchar);
> +PURE CONST OVERLOADABLE char __gen_ocl_lzd(char);
> +
> PURE CONST OVERLOADABLE uint __gen_ocl_cbit(uint);
> PURE CONST OVERLOADABLE uint __gen_ocl_cbit(int);
> PURE CONST OVERLOADABLE uint __gen_ocl_cbit(ushort);
> @@ -26,71 +36,17 @@ PURE CONST OVERLOADABLE uint __gen_ocl_cbit(short);
> PURE CONST OVERLOADABLE uint __gen_ocl_cbit(uchar);
> PURE CONST OVERLOADABLE uint __gen_ocl_cbit(char);
>
> -OVERLOADABLE char clz(char x) {
> - if (x < 0)
> - return 0;
> - if (x == 0)
> - return 8;
> - return __gen_ocl_fbh(x) - 24;
> -}
> -
> -OVERLOADABLE uchar clz(uchar x) {
> - if (x == 0)
> - return 8;
> - return __gen_ocl_fbh(x) - 24;
> -}
> -
> -OVERLOADABLE short clz(short x) {
> - if (x < 0)
> - return 0;
> - if (x == 0)
> - return 16;
> - return __gen_ocl_fbh(x) - 16;
> -}
> -
> -OVERLOADABLE ushort clz(ushort x) {
> - if (x == 0)
> - return 16;
> - return __gen_ocl_fbh(x) - 16;
> -}
> -
> -OVERLOADABLE int clz(int x) {
> - if (x < 0)
> - return 0;
> - if (x == 0)
> - return 32;
> - return __gen_ocl_fbh(x);
> -}
> -
> -OVERLOADABLE uint clz(uint x) {
> - if (x == 0)
> - return 32;
> - return __gen_ocl_fbh(x);
> -}
> -
> -OVERLOADABLE long clz(long x) {
> - union { int i[2]; long x; } u;
> - u.x = x;
> - if (u.i[1] & 0x80000000u)
> - return 0;
> - if (u.i[1] == 0 && u.i[0] == 0)
> - return 64;
> - uint v = clz(u.i[1]);
> - if(v == 32)
> - v += clz(u.i[0]);
> - return v;
> -}
> -
> -OVERLOADABLE ulong clz(ulong x) {
> - if (x == 0)
> - return 64;
> - union { uint i[2]; ulong x; } u;
> - u.x = x;
> - uint v = clz(u.i[1]);
> - if(v == 32)
> - v += clz(u.i[0]);
> - return v;
> -}
> +#define SDEF(TYPE) \
> +OVERLOADABLE TYPE clz(TYPE x){ return __gen_ocl_lzd(x);}
> +SDEF(char);
> +SDEF(uchar);
> +SDEF(short);
> +SDEF(ushort);
> +SDEF(int);
> +SDEF(uint);
> +SDEF(long);
> +SDEF(ulong);
> +#undef SDEF
>
> #define SDEF(TYPE) \
> OVERLOADABLE TYPE popcount(TYPE x){ return __gen_ocl_cbit(x);}
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
> index 7922ddb..7948c26 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -2902,6 +2902,7 @@ error:
> regTranslator.newScalarProxy(ir::ocl::workdim, dst); break;
> case GEN_OCL_FBH:
> case GEN_OCL_FBL:
> + case GEN_OCL_LZD:
> case GEN_OCL_CBIT:
> case GEN_OCL_COS:
> case GEN_OCL_SIN:
> @@ -3463,6 +3464,81 @@ error:
> }
> case GEN_OCL_FBH: this->emitUnaryCallInst(I,CS,ir::OP_FBH); break;
> case GEN_OCL_FBL: this->emitUnaryCallInst(I,CS,ir::OP_FBL); break;
> + case GEN_OCL_LZD:
> + {
> + Type *llvmDstType = I.getType();
> + ir::Type dstType = getType(ctx, llvmDstType);
> + Type *llvmSrcType = I.getOperand(0)->getType();
> + ir::Type srcType = getUnsignedType(ctx, llvmSrcType);
> +
> + const ir::Register dst = this->getRegister(&I);
> + const ir::Register src = this->getRegister(I.getOperand(0));
> + int imm_value = 0;
> + if(srcType == ir::TYPE_U16) {
> + imm_value = 16;
> + }else if(srcType == ir::TYPE_U8) {
> + imm_value = 24;
> + }else if(srcType == ir::TYPE_U64) {
> + imm_value = 32;
> + }
> +
> + if(srcType == ir::TYPE_U16 || srcType == ir::TYPE_U8) {
> + ir::ImmediateIndex imm;
> + ir::Type tmpType = ir::TYPE_S32;
> + imm = ctx.newIntegerImmediate(imm_value, tmpType);
> + const ir::RegisterFamily family = getFamily(tmpType);
> + const ir::Register immReg = ctx.reg(family);
> + ctx.LOADI(ir::TYPE_S32, immReg, imm);
> +
> + ir::Register tmp0 = ctx.reg(getFamily(tmpType));
> + ir::Register tmp1 = ctx.reg(getFamily(tmpType));
> + ir::Register tmp2 = ctx.reg(getFamily(tmpType));
> + ctx.CVT(tmpType, srcType, tmp0, src);
> + ctx.ALU1(ir::OP_LZD, tmpType, tmp1, tmp0);
> + ctx.SUB(tmpType, tmp2, tmp1, immReg);
> + ctx.CVT(dstType, tmpType, dst, tmp2);
> + }
> + else if(srcType == ir::TYPE_U64) {
> + ir::ImmediateIndex imm;
> + ir::Type tmpType = ir::TYPE_U32;
> + imm = ctx.newIntegerImmediate(imm_value, srcType);
> + const ir::RegisterFamily family = getFamily(srcType);
> + const ir::Register immReg = ctx.reg(family);
> + ctx.LOADI(ir::TYPE_S64, immReg, imm);
> +
> + const ir::RegisterFamily tmpFamily = getFamily(tmpType);
> + const ir::ImmediateIndex imm32 = ctx.newIntegerImmediate(32, tmpType);
> + const ir::Register imm32Reg = ctx.reg(tmpFamily);
> + ctx.LOADI(ir::TYPE_S32, imm32Reg, imm32);
> +
> + ir::Register tmp0 = ctx.reg(getFamily(srcType));
> + ir::Register tmp1 = ctx.reg(getFamily(tmpType));
> + ir::Register tmp2 = ctx.reg(getFamily(tmpType));
> + ir::Register tmp3 = ctx.reg(getFamily(tmpType));
> + ir::Register tmp4 = ctx.reg(getFamily(tmpType));
> + ir::Register tmp5 = ctx.reg(getFamily(tmpType));
> + ir::Register tmp6 = ctx.reg(getFamily(tmpType));
> + ir::Register cmp = ctx.reg(ir::FAMILY_BOOL);
> +
> + ctx.SHR(srcType, tmp0, src, immReg);
> + ctx.CVT(tmpType, srcType, tmp1, tmp0);
> +
> + ctx.ALU1(ir::OP_LZD, tmpType, tmp2, tmp1);
> + ctx.LT(tmpType, cmp, tmp2, imm32Reg);
> +
> + ctx.CVT(tmpType, srcType, tmp3, src);
> + ctx.ALU1(ir::OP_LZD, tmpType, tmp4, tmp3);
> + ctx.ADD(tmpType, tmp5, tmp4, imm32Reg);
> +
> + ctx.SEL(tmpType, tmp6, cmp, tmp2, tmp5);
> + ctx.CVT(dstType, tmpType, dst, tmp6);
> + }
> + else
> + {
> + ctx.ALU1(ir::OP_LZD, dstType, dst, src);
> + }
> + }
> + break;
> case GEN_OCL_CBIT: this->emitUnaryCallInst(I,CS,ir::OP_CBIT, getUnsignedType(ctx, (*AI)->getType())); break;
> case GEN_OCL_ABS:
> {
> diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
> index 8ec8336..5a9b377 100644
> --- a/backend/src/llvm/llvm_gen_ocl_function.hxx
> +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
> @@ -127,6 +127,7 @@ DECL_LLVM_GEN_FUNCTION(I64RHADD, _Z15__gen_ocl_rhaddmm)
> DECL_LLVM_GEN_FUNCTION(UPSAMPLE_SHORT, _Z18__gen_ocl_upsampless)
> DECL_LLVM_GEN_FUNCTION(UPSAMPLE_INT, _Z18__gen_ocl_upsampleii)
> DECL_LLVM_GEN_FUNCTION(UPSAMPLE_LONG, _Z18__gen_ocl_upsamplell)
> +DECL_LLVM_GEN_FUNCTION(LZD, __gen_ocl_lzd)
> DECL_LLVM_GEN_FUNCTION(CBIT, __gen_ocl_cbit)
>
> // saturate convert
> --
> 1.9.1
>
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet
More information about the Beignet
mailing list