[Beignet] [patch v3 2/3] libocl: reimplement clz with lzd instruction instead of fbh.

Zhigang Gong zhigang.gong at linux.intel.com
Sun Jan 25 22:34:02 PST 2015


As we discussed offline, please use llvm.ctlz directly in the clz()
builtin function, so that we don't need to implement __gen_ocl_lzd(),
which is a non-standard intrinsic. That way we can also avoid
duplicating a lot of code.

On Mon, Jan 26, 2015 at 02:57:46PM +0800, xionghu.luo at intel.com wrote:
> From: Luo Xionghu <xionghu.luo at intel.com>
> 
> the fbh style is inefficient.
> 
> Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
> ---
>  backend/src/libocl/tmpl/ocl_integer.tmpl.cl | 86 +++++++----------------------
>  backend/src/llvm/llvm_gen_backend.cpp       | 76 +++++++++++++++++++++++++
>  backend/src/llvm/llvm_gen_ocl_function.hxx  |  1 +
>  3 files changed, 98 insertions(+), 65 deletions(-)
> 
> diff --git a/backend/src/libocl/tmpl/ocl_integer.tmpl.cl b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
> index 6da0bab..36da959 100644
> --- a/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
> +++ b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
> @@ -19,6 +19,16 @@
>  
>  PURE CONST uint __gen_ocl_fbh(uint);
>  PURE CONST uint __gen_ocl_fbl(uint);
> +
> +PURE CONST OVERLOADABLE ulong  __gen_ocl_lzd(ulong);
> +PURE CONST OVERLOADABLE long   __gen_ocl_lzd(long);
> +PURE CONST OVERLOADABLE uint   __gen_ocl_lzd(uint);
> +PURE CONST OVERLOADABLE int    __gen_ocl_lzd(int);
> +PURE CONST OVERLOADABLE ushort __gen_ocl_lzd(ushort);
> +PURE CONST OVERLOADABLE short  __gen_ocl_lzd(short);
> +PURE CONST OVERLOADABLE uchar  __gen_ocl_lzd(uchar);
> +PURE CONST OVERLOADABLE char   __gen_ocl_lzd(char);
> +
>  PURE CONST OVERLOADABLE uint __gen_ocl_cbit(uint);
>  PURE CONST OVERLOADABLE uint __gen_ocl_cbit(int);
>  PURE CONST OVERLOADABLE uint __gen_ocl_cbit(ushort);
> @@ -26,71 +36,17 @@ PURE CONST OVERLOADABLE uint __gen_ocl_cbit(short);
>  PURE CONST OVERLOADABLE uint __gen_ocl_cbit(uchar);
>  PURE CONST OVERLOADABLE uint __gen_ocl_cbit(char);
>  
> -OVERLOADABLE char clz(char x) {
> -  if (x < 0)
> -    return 0;
> -  if (x == 0)
> -    return 8;
> -  return __gen_ocl_fbh(x) - 24;
> -}
> -
> -OVERLOADABLE uchar clz(uchar x) {
> -  if (x == 0)
> -    return 8;
> -  return __gen_ocl_fbh(x) - 24;
> -}
> -
> -OVERLOADABLE short clz(short x) {
> -  if (x < 0)
> -    return 0;
> -  if (x == 0)
> -    return 16;
> -  return __gen_ocl_fbh(x) - 16;
> -}
> -
> -OVERLOADABLE ushort clz(ushort x) {
> -  if (x == 0)
> -    return 16;
> -  return __gen_ocl_fbh(x) - 16;
> -}
> -
> -OVERLOADABLE int clz(int x) {
> -  if (x < 0)
> -    return 0;
> -  if (x == 0)
> -    return 32;
> -  return __gen_ocl_fbh(x);
> -}
> -
> -OVERLOADABLE uint clz(uint x) {
> -  if (x == 0)
> -    return 32;
> -  return __gen_ocl_fbh(x);
> -}
> -
> -OVERLOADABLE long clz(long x) {
> -  union { int i[2]; long x; } u;
> -  u.x = x;
> -  if (u.i[1] & 0x80000000u)
> -    return 0;
> -  if (u.i[1] == 0 && u.i[0] == 0)
> -    return 64;
> -  uint v = clz(u.i[1]);
> -  if(v == 32)
> -    v += clz(u.i[0]);
> -  return v;
> -}
> -
> -OVERLOADABLE ulong clz(ulong x) {
> -  if (x == 0)
> -    return 64;
> -  union { uint i[2]; ulong x; } u;
> -  u.x = x;
> -  uint v = clz(u.i[1]);
> -  if(v == 32)
> -    v += clz(u.i[0]);
> -  return v;
> -}
> +#define SDEF(TYPE)        \
> +OVERLOADABLE TYPE clz(TYPE x){ return __gen_ocl_lzd(x);}
> +SDEF(char);
> +SDEF(uchar);
> +SDEF(short);
> +SDEF(ushort);
> +SDEF(int);
> +SDEF(uint);
> +SDEF(long);
> +SDEF(ulong);
> +#undef SDEF
>  
>  #define SDEF(TYPE)        \
>  OVERLOADABLE TYPE popcount(TYPE x){ return __gen_ocl_cbit(x);}
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
> index 7922ddb..7948c26 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -2902,6 +2902,7 @@ error:
>          regTranslator.newScalarProxy(ir::ocl::workdim, dst); break;
>        case GEN_OCL_FBH:
>        case GEN_OCL_FBL:
> +      case GEN_OCL_LZD:
>        case GEN_OCL_CBIT:
>        case GEN_OCL_COS:
>        case GEN_OCL_SIN:
> @@ -3463,6 +3464,81 @@ error:
>            }
>            case GEN_OCL_FBH: this->emitUnaryCallInst(I,CS,ir::OP_FBH); break;
>            case GEN_OCL_FBL: this->emitUnaryCallInst(I,CS,ir::OP_FBL); break;
> +          case GEN_OCL_LZD:
> +          {
> +            Type *llvmDstType = I.getType();
> +            ir::Type dstType = getType(ctx, llvmDstType);
> +            Type *llvmSrcType = I.getOperand(0)->getType();
> +            ir::Type srcType = getUnsignedType(ctx, llvmSrcType);
> +
> +            const ir::Register dst = this->getRegister(&I);
> +            const ir::Register src = this->getRegister(I.getOperand(0));
> +            int imm_value = 0;
> +            if(srcType == ir::TYPE_U16) {
> +              imm_value = 16;
> +            }else if(srcType == ir::TYPE_U8) {
> +              imm_value = 24;
> +            }else if(srcType == ir::TYPE_U64) {
> +              imm_value = 32;
> +            }
> +
> +            if(srcType == ir::TYPE_U16 || srcType == ir::TYPE_U8) {
> +              ir::ImmediateIndex imm;
> +              ir::Type tmpType = ir::TYPE_S32;
> +              imm = ctx.newIntegerImmediate(imm_value, tmpType);
> +              const ir::RegisterFamily family = getFamily(tmpType);
> +              const ir::Register immReg = ctx.reg(family);
> +              ctx.LOADI(ir::TYPE_S32, immReg, imm);
> +
> +              ir::Register tmp0 = ctx.reg(getFamily(tmpType));
> +              ir::Register tmp1 = ctx.reg(getFamily(tmpType));
> +              ir::Register tmp2 = ctx.reg(getFamily(tmpType));
> +              ctx.CVT(tmpType, srcType, tmp0, src);
> +              ctx.ALU1(ir::OP_LZD, tmpType, tmp1, tmp0);
> +              ctx.SUB(tmpType, tmp2, tmp1, immReg);
> +              ctx.CVT(dstType, tmpType, dst, tmp2);
> +            }
> +            else if(srcType == ir::TYPE_U64) {
> +              ir::ImmediateIndex imm;
> +              ir::Type tmpType = ir::TYPE_U32;
> +              imm = ctx.newIntegerImmediate(imm_value, srcType);
> +              const ir::RegisterFamily family = getFamily(srcType);
> +              const ir::Register immReg = ctx.reg(family);
> +              ctx.LOADI(ir::TYPE_S64, immReg, imm);
> +
> +              const ir::RegisterFamily tmpFamily = getFamily(tmpType);
> +              const ir::ImmediateIndex imm32 = ctx.newIntegerImmediate(32, tmpType);
> +              const ir::Register imm32Reg = ctx.reg(tmpFamily);
> +              ctx.LOADI(ir::TYPE_S32, imm32Reg, imm32);
> +
> +              ir::Register tmp0 = ctx.reg(getFamily(srcType));
> +              ir::Register tmp1 = ctx.reg(getFamily(tmpType));
> +              ir::Register tmp2 = ctx.reg(getFamily(tmpType));
> +              ir::Register tmp3 = ctx.reg(getFamily(tmpType));
> +              ir::Register tmp4 = ctx.reg(getFamily(tmpType));
> +              ir::Register tmp5 = ctx.reg(getFamily(tmpType));
> +              ir::Register tmp6 = ctx.reg(getFamily(tmpType));
> +              ir::Register cmp = ctx.reg(ir::FAMILY_BOOL);
> +
> +              ctx.SHR(srcType, tmp0, src, immReg);
> +              ctx.CVT(tmpType, srcType, tmp1, tmp0);
> +
> +              ctx.ALU1(ir::OP_LZD, tmpType, tmp2, tmp1);
> +              ctx.LT(tmpType, cmp, tmp2, imm32Reg);
> +
> +              ctx.CVT(tmpType, srcType, tmp3, src);
> +              ctx.ALU1(ir::OP_LZD, tmpType, tmp4, tmp3);
> +              ctx.ADD(tmpType, tmp5, tmp4, imm32Reg);
> +
> +              ctx.SEL(tmpType, tmp6, cmp, tmp2, tmp5);
> +              ctx.CVT(dstType, tmpType, dst, tmp6);
> +            }
> +            else
> +            {
> +              ctx.ALU1(ir::OP_LZD, dstType, dst, src);
> +            }
> +          }
> +          break;
>            case GEN_OCL_CBIT: this->emitUnaryCallInst(I,CS,ir::OP_CBIT, getUnsignedType(ctx, (*AI)->getType())); break;
>            case GEN_OCL_ABS:
>            {
> diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
> index 8ec8336..5a9b377 100644
> --- a/backend/src/llvm/llvm_gen_ocl_function.hxx
> +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
> @@ -127,6 +127,7 @@ DECL_LLVM_GEN_FUNCTION(I64RHADD, _Z15__gen_ocl_rhaddmm)
>  DECL_LLVM_GEN_FUNCTION(UPSAMPLE_SHORT, _Z18__gen_ocl_upsampless)
>  DECL_LLVM_GEN_FUNCTION(UPSAMPLE_INT, _Z18__gen_ocl_upsampleii)
>  DECL_LLVM_GEN_FUNCTION(UPSAMPLE_LONG, _Z18__gen_ocl_upsamplell)
> +DECL_LLVM_GEN_FUNCTION(LZD, __gen_ocl_lzd)
>  DECL_LLVM_GEN_FUNCTION(CBIT, __gen_ocl_cbit)
>  
>  // saturate convert
> -- 
> 1.9.1
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet


More information about the Beignet mailing list