[Beignet] [patch v3 2/3] libocl: reimplement clz with lzd instruction instead of fbh.
xionghu.luo at intel.com
xionghu.luo at intel.com
Sun Jan 25 22:57:46 PST 2015
From: Luo Xionghu <xionghu.luo at intel.com>
The fbh-based implementation of clz is inefficient; reimplement it with the lzd instruction instead.
Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
backend/src/libocl/tmpl/ocl_integer.tmpl.cl | 86 +++++++----------------------
backend/src/llvm/llvm_gen_backend.cpp | 76 +++++++++++++++++++++++++
backend/src/llvm/llvm_gen_ocl_function.hxx | 1 +
3 files changed, 98 insertions(+), 65 deletions(-)
diff --git a/backend/src/libocl/tmpl/ocl_integer.tmpl.cl b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
index 6da0bab..36da959 100644
--- a/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
@@ -19,6 +19,16 @@
PURE CONST uint __gen_ocl_fbh(uint);
PURE CONST uint __gen_ocl_fbl(uint);
+
+PURE CONST OVERLOADABLE ulong __gen_ocl_lzd(ulong);
+PURE CONST OVERLOADABLE long __gen_ocl_lzd(long);
+PURE CONST OVERLOADABLE uint __gen_ocl_lzd(uint);
+PURE CONST OVERLOADABLE int __gen_ocl_lzd(int);
+PURE CONST OVERLOADABLE ushort __gen_ocl_lzd(ushort);
+PURE CONST OVERLOADABLE short __gen_ocl_lzd(short);
+PURE CONST OVERLOADABLE uchar __gen_ocl_lzd(uchar);
+PURE CONST OVERLOADABLE char __gen_ocl_lzd(char);
+
PURE CONST OVERLOADABLE uint __gen_ocl_cbit(uint);
PURE CONST OVERLOADABLE uint __gen_ocl_cbit(int);
PURE CONST OVERLOADABLE uint __gen_ocl_cbit(ushort);
@@ -26,71 +36,17 @@ PURE CONST OVERLOADABLE uint __gen_ocl_cbit(short);
PURE CONST OVERLOADABLE uint __gen_ocl_cbit(uchar);
PURE CONST OVERLOADABLE uint __gen_ocl_cbit(char);
-OVERLOADABLE char clz(char x) {
- if (x < 0)
- return 0;
- if (x == 0)
- return 8;
- return __gen_ocl_fbh(x) - 24;
-}
-
-OVERLOADABLE uchar clz(uchar x) {
- if (x == 0)
- return 8;
- return __gen_ocl_fbh(x) - 24;
-}
-
-OVERLOADABLE short clz(short x) {
- if (x < 0)
- return 0;
- if (x == 0)
- return 16;
- return __gen_ocl_fbh(x) - 16;
-}
-
-OVERLOADABLE ushort clz(ushort x) {
- if (x == 0)
- return 16;
- return __gen_ocl_fbh(x) - 16;
-}
-
-OVERLOADABLE int clz(int x) {
- if (x < 0)
- return 0;
- if (x == 0)
- return 32;
- return __gen_ocl_fbh(x);
-}
-
-OVERLOADABLE uint clz(uint x) {
- if (x == 0)
- return 32;
- return __gen_ocl_fbh(x);
-}
-
-OVERLOADABLE long clz(long x) {
- union { int i[2]; long x; } u;
- u.x = x;
- if (u.i[1] & 0x80000000u)
- return 0;
- if (u.i[1] == 0 && u.i[0] == 0)
- return 64;
- uint v = clz(u.i[1]);
- if(v == 32)
- v += clz(u.i[0]);
- return v;
-}
-
-OVERLOADABLE ulong clz(ulong x) {
- if (x == 0)
- return 64;
- union { uint i[2]; ulong x; } u;
- u.x = x;
- uint v = clz(u.i[1]);
- if(v == 32)
- v += clz(u.i[0]);
- return v;
-}
+#define SDEF(TYPE) \
+OVERLOADABLE TYPE clz(TYPE x){ return __gen_ocl_lzd(x);}
+SDEF(char);
+SDEF(uchar);
+SDEF(short);
+SDEF(ushort);
+SDEF(int);
+SDEF(uint);
+SDEF(long);
+SDEF(ulong);
+#undef SDEF
#define SDEF(TYPE) \
OVERLOADABLE TYPE popcount(TYPE x){ return __gen_ocl_cbit(x);}
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 7922ddb..7948c26 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2902,6 +2902,7 @@ error:
regTranslator.newScalarProxy(ir::ocl::workdim, dst); break;
case GEN_OCL_FBH:
case GEN_OCL_FBL:
+ case GEN_OCL_LZD:
case GEN_OCL_CBIT:
case GEN_OCL_COS:
case GEN_OCL_SIN:
@@ -3463,6 +3464,81 @@ error:
}
case GEN_OCL_FBH: this->emitUnaryCallInst(I,CS,ir::OP_FBH); break;
case GEN_OCL_FBL: this->emitUnaryCallInst(I,CS,ir::OP_FBL); break;
+ case GEN_OCL_LZD:
+ {
+ Type *llvmDstType = I.getType();
+ ir::Type dstType = getType(ctx, llvmDstType);
+ Type *llvmSrcType = I.getOperand(0)->getType();
+ ir::Type srcType = getUnsignedType(ctx, llvmSrcType);
+
+ const ir::Register dst = this->getRegister(&I);
+ const ir::Register src = this->getRegister(I.getOperand(0));
+ int imm_value = 0;
+ if(srcType == ir::TYPE_U16) {
+ imm_value = 16;
+ }else if(srcType == ir::TYPE_U8) {
+ imm_value = 24;
+ }else if(srcType == ir::TYPE_U64) {
+ imm_value = 32;
+ }
+
+ if(srcType == ir::TYPE_U16 || srcType == ir::TYPE_U8) {
+ ir::ImmediateIndex imm;
+ ir::Type tmpType = ir::TYPE_S32;
+ imm = ctx.newIntegerImmediate(imm_value, tmpType);
+ const ir::RegisterFamily family = getFamily(tmpType);
+ const ir::Register immReg = ctx.reg(family);
+ ctx.LOADI(ir::TYPE_S32, immReg, imm);
+
+ ir::Register tmp0 = ctx.reg(getFamily(tmpType));
+ ir::Register tmp1 = ctx.reg(getFamily(tmpType));
+ ir::Register tmp2 = ctx.reg(getFamily(tmpType));
+ ctx.CVT(tmpType, srcType, tmp0, src);
+ ctx.ALU1(ir::OP_LZD, tmpType, tmp1, tmp0);
+ ctx.SUB(tmpType, tmp2, tmp1, immReg);
+ ctx.CVT(dstType, tmpType, dst, tmp2);
+ }
+ else if(srcType == ir::TYPE_U64) {
+ ir::ImmediateIndex imm;
+ ir::Type tmpType = ir::TYPE_U32;
+ imm = ctx.newIntegerImmediate(imm_value, srcType);
+ const ir::RegisterFamily family = getFamily(srcType);
+ const ir::Register immReg = ctx.reg(family);
+ ctx.LOADI(ir::TYPE_S64, immReg, imm);
+
+ const ir::RegisterFamily tmpFamily = getFamily(tmpType);
+ const ir::ImmediateIndex imm32 = ctx.newIntegerImmediate(32, tmpType);
+ const ir::Register imm32Reg = ctx.reg(tmpFamily);
+ ctx.LOADI(ir::TYPE_S32, imm32Reg, imm32);
+
+ ir::Register tmp0 = ctx.reg(getFamily(srcType));
+ ir::Register tmp1 = ctx.reg(getFamily(tmpType));
+ ir::Register tmp2 = ctx.reg(getFamily(tmpType));
+ ir::Register tmp3 = ctx.reg(getFamily(tmpType));
+ ir::Register tmp4 = ctx.reg(getFamily(tmpType));
+ ir::Register tmp5 = ctx.reg(getFamily(tmpType));
+ ir::Register tmp6 = ctx.reg(getFamily(tmpType));
+ ir::Register cmp = ctx.reg(ir::FAMILY_BOOL);
+
+ ctx.SHR(srcType, tmp0, src, immReg);
+ ctx.CVT(tmpType, srcType, tmp1, tmp0);
+
+ ctx.ALU1(ir::OP_LZD, tmpType, tmp2, tmp1);
+ ctx.LT(tmpType, cmp, tmp2, imm32Reg);
+
+ ctx.CVT(tmpType, srcType, tmp3, src);
+ ctx.ALU1(ir::OP_LZD, tmpType, tmp4, tmp3);
+ ctx.ADD(tmpType, tmp5, tmp4, imm32Reg);
+
+ ctx.SEL(tmpType, tmp6, cmp, tmp2, tmp5);
+ ctx.CVT(dstType, tmpType, dst, tmp6);
+ }
+ else
+ {
+ ctx.ALU1(ir::OP_LZD, dstType, dst, src);
+ }
+ }
+ break;
case GEN_OCL_CBIT: this->emitUnaryCallInst(I,CS,ir::OP_CBIT, getUnsignedType(ctx, (*AI)->getType())); break;
case GEN_OCL_ABS:
{
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 8ec8336..5a9b377 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -127,6 +127,7 @@ DECL_LLVM_GEN_FUNCTION(I64RHADD, _Z15__gen_ocl_rhaddmm)
DECL_LLVM_GEN_FUNCTION(UPSAMPLE_SHORT, _Z18__gen_ocl_upsampless)
DECL_LLVM_GEN_FUNCTION(UPSAMPLE_INT, _Z18__gen_ocl_upsampleii)
DECL_LLVM_GEN_FUNCTION(UPSAMPLE_LONG, _Z18__gen_ocl_upsamplell)
+DECL_LLVM_GEN_FUNCTION(LZD, __gen_ocl_lzd)
DECL_LLVM_GEN_FUNCTION(CBIT, __gen_ocl_cbit)
// saturate convert
--
1.9.1
More information about the Beignet
mailing list