[Beignet] [patch v3 2/3] libocl: reimplement clz with lzd instruction instead of fbh.

xionghu.luo at intel.com xionghu.luo at intel.com
Sun Jan 25 22:57:46 PST 2015


From: Luo Xionghu <xionghu.luo at intel.com>

The fbh-based implementation of clz is inefficient; reimplement clz with the lzd instruction instead.

Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
 backend/src/libocl/tmpl/ocl_integer.tmpl.cl | 86 +++++++----------------------
 backend/src/llvm/llvm_gen_backend.cpp       | 76 +++++++++++++++++++++++++
 backend/src/llvm/llvm_gen_ocl_function.hxx  |  1 +
 3 files changed, 98 insertions(+), 65 deletions(-)

diff --git a/backend/src/libocl/tmpl/ocl_integer.tmpl.cl b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
index 6da0bab..36da959 100644
--- a/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
@@ -19,6 +19,16 @@
 
 PURE CONST uint __gen_ocl_fbh(uint);
 PURE CONST uint __gen_ocl_fbl(uint);
+
+PURE CONST OVERLOADABLE ulong  __gen_ocl_lzd(ulong);
+PURE CONST OVERLOADABLE long   __gen_ocl_lzd(long);
+PURE CONST OVERLOADABLE uint   __gen_ocl_lzd(uint);
+PURE CONST OVERLOADABLE int    __gen_ocl_lzd(int);
+PURE CONST OVERLOADABLE ushort __gen_ocl_lzd(ushort);
+PURE CONST OVERLOADABLE short  __gen_ocl_lzd(short);
+PURE CONST OVERLOADABLE uchar  __gen_ocl_lzd(uchar);
+PURE CONST OVERLOADABLE char   __gen_ocl_lzd(char);
+
 PURE CONST OVERLOADABLE uint __gen_ocl_cbit(uint);
 PURE CONST OVERLOADABLE uint __gen_ocl_cbit(int);
 PURE CONST OVERLOADABLE uint __gen_ocl_cbit(ushort);
@@ -26,71 +36,17 @@ PURE CONST OVERLOADABLE uint __gen_ocl_cbit(short);
 PURE CONST OVERLOADABLE uint __gen_ocl_cbit(uchar);
 PURE CONST OVERLOADABLE uint __gen_ocl_cbit(char);
 
-OVERLOADABLE char clz(char x) {
-  if (x < 0)
-    return 0;
-  if (x == 0)
-    return 8;
-  return __gen_ocl_fbh(x) - 24;
-}
-
-OVERLOADABLE uchar clz(uchar x) {
-  if (x == 0)
-    return 8;
-  return __gen_ocl_fbh(x) - 24;
-}
-
-OVERLOADABLE short clz(short x) {
-  if (x < 0)
-    return 0;
-  if (x == 0)
-    return 16;
-  return __gen_ocl_fbh(x) - 16;
-}
-
-OVERLOADABLE ushort clz(ushort x) {
-  if (x == 0)
-    return 16;
-  return __gen_ocl_fbh(x) - 16;
-}
-
-OVERLOADABLE int clz(int x) {
-  if (x < 0)
-    return 0;
-  if (x == 0)
-    return 32;
-  return __gen_ocl_fbh(x);
-}
-
-OVERLOADABLE uint clz(uint x) {
-  if (x == 0)
-    return 32;
-  return __gen_ocl_fbh(x);
-}
-
-OVERLOADABLE long clz(long x) {
-  union { int i[2]; long x; } u;
-  u.x = x;
-  if (u.i[1] & 0x80000000u)
-    return 0;
-  if (u.i[1] == 0 && u.i[0] == 0)
-    return 64;
-  uint v = clz(u.i[1]);
-  if(v == 32)
-    v += clz(u.i[0]);
-  return v;
-}
-
-OVERLOADABLE ulong clz(ulong x) {
-  if (x == 0)
-    return 64;
-  union { uint i[2]; ulong x; } u;
-  u.x = x;
-  uint v = clz(u.i[1]);
-  if(v == 32)
-    v += clz(u.i[0]);
-  return v;
-}
+#define SDEF(TYPE)        \
+OVERLOADABLE TYPE clz(TYPE x){ return __gen_ocl_lzd(x);}
+SDEF(char);
+SDEF(uchar);
+SDEF(short);
+SDEF(ushort);
+SDEF(int);
+SDEF(uint);
+SDEF(long);
+SDEF(ulong);
+#undef SDEF
 
 #define SDEF(TYPE)        \
 OVERLOADABLE TYPE popcount(TYPE x){ return __gen_ocl_cbit(x);}
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 7922ddb..7948c26 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2902,6 +2902,7 @@ error:
         regTranslator.newScalarProxy(ir::ocl::workdim, dst); break;
       case GEN_OCL_FBH:
       case GEN_OCL_FBL:
+      case GEN_OCL_LZD:
       case GEN_OCL_CBIT:
       case GEN_OCL_COS:
       case GEN_OCL_SIN:
@@ -3463,6 +3464,81 @@ error:
           }
           case GEN_OCL_FBH: this->emitUnaryCallInst(I,CS,ir::OP_FBH); break;
           case GEN_OCL_FBL: this->emitUnaryCallInst(I,CS,ir::OP_FBL); break;
+          case GEN_OCL_LZD:
+          {
+            Type *llvmDstType = I.getType();
+            ir::Type dstType = getType(ctx, llvmDstType);
+            Type *llvmSrcType = I.getOperand(0)->getType();
+            ir::Type srcType = getUnsignedType(ctx, llvmSrcType);
+
+            const ir::Register dst = this->getRegister(&I);
+            const ir::Register src = this->getRegister(I.getOperand(0));
+            int imm_value = 0;
+            if(srcType == ir::TYPE_U16) {
+              imm_value = 16;
+            }else if(srcType == ir::TYPE_U8) {
+              imm_value = 24;
+            }else if(srcType == ir::TYPE_U64) {
+              imm_value = 32;
+            }
+
+            if(srcType == ir::TYPE_U16 || srcType == ir::TYPE_U8) {
+              ir::ImmediateIndex imm;
+              ir::Type tmpType = ir::TYPE_S32;
+              imm = ctx.newIntegerImmediate(imm_value, tmpType);
+              const ir::RegisterFamily family = getFamily(tmpType);
+              const ir::Register immReg = ctx.reg(family);
+              ctx.LOADI(ir::TYPE_S32, immReg, imm);
+
+              ir::Register tmp0 = ctx.reg(getFamily(tmpType));
+              ir::Register tmp1 = ctx.reg(getFamily(tmpType));
+              ir::Register tmp2 = ctx.reg(getFamily(tmpType));
+              ctx.CVT(tmpType, srcType, tmp0, src);
+              ctx.ALU1(ir::OP_LZD, tmpType, tmp1, tmp0);
+              ctx.SUB(tmpType, tmp2, tmp1, immReg);
+              ctx.CVT(dstType, tmpType, dst, tmp2);
+            }
+            else if(srcType == ir::TYPE_U64) {
+              ir::ImmediateIndex imm;
+              ir::Type tmpType = ir::TYPE_U32;
+              imm = ctx.newIntegerImmediate(imm_value, srcType);
+              const ir::RegisterFamily family = getFamily(srcType);
+              const ir::Register immReg = ctx.reg(family);
+              ctx.LOADI(ir::TYPE_S64, immReg, imm);
+
+              const ir::RegisterFamily tmpFamily = getFamily(tmpType);
+              const ir::ImmediateIndex imm32 = ctx.newIntegerImmediate(32, tmpType);
+              const ir::Register imm32Reg = ctx.reg(tmpFamily);
+              ctx.LOADI(ir::TYPE_S32, imm32Reg, imm32);
+
+              ir::Register tmp0 = ctx.reg(getFamily(srcType));
+              ir::Register tmp1 = ctx.reg(getFamily(tmpType));
+              ir::Register tmp2 = ctx.reg(getFamily(tmpType));
+              ir::Register tmp3 = ctx.reg(getFamily(tmpType));
+              ir::Register tmp4 = ctx.reg(getFamily(tmpType));
+              ir::Register tmp5 = ctx.reg(getFamily(tmpType));
+              ir::Register tmp6 = ctx.reg(getFamily(tmpType));
+              ir::Register cmp = ctx.reg(ir::FAMILY_BOOL);
+
+              ctx.SHR(srcType, tmp0, src, immReg);
+              ctx.CVT(tmpType, srcType, tmp1, tmp0);
+
+              ctx.ALU1(ir::OP_LZD, tmpType, tmp2, tmp1);
+              ctx.LT(tmpType, cmp, tmp2, imm32Reg);
+
+              ctx.CVT(tmpType, srcType, tmp3, src);
+              ctx.ALU1(ir::OP_LZD, tmpType, tmp4, tmp3);
+              ctx.ADD(tmpType, tmp5, tmp4, imm32Reg);
+
+              ctx.SEL(tmpType, tmp6, cmp, tmp2, tmp5);
+              ctx.CVT(dstType, tmpType, dst, tmp6);
+            }
+            else
+            {
+              ctx.ALU1(ir::OP_LZD, dstType, dst, src);
+            }
+          }
+          break;
           case GEN_OCL_CBIT: this->emitUnaryCallInst(I,CS,ir::OP_CBIT, getUnsignedType(ctx, (*AI)->getType())); break;
           case GEN_OCL_ABS:
           {
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 8ec8336..5a9b377 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -127,6 +127,7 @@ DECL_LLVM_GEN_FUNCTION(I64RHADD, _Z15__gen_ocl_rhaddmm)
 DECL_LLVM_GEN_FUNCTION(UPSAMPLE_SHORT, _Z18__gen_ocl_upsampless)
 DECL_LLVM_GEN_FUNCTION(UPSAMPLE_INT, _Z18__gen_ocl_upsampleii)
 DECL_LLVM_GEN_FUNCTION(UPSAMPLE_LONG, _Z18__gen_ocl_upsamplell)
+DECL_LLVM_GEN_FUNCTION(LZD, __gen_ocl_lzd)
 DECL_LLVM_GEN_FUNCTION(CBIT, __gen_ocl_cbit)
 
 // saturate convert
-- 
1.9.1



More information about the Beignet mailing list