[Beignet] [PATCH OCL2.0 v4 1/2] Backend: Add built-in ctz function

Wed Feb 24 02:19:28 UTC 2016

From: Pan Xiuli <xiuli.pan at intel.com>

Gen doesn't have a trailing zero detection function. Use bit field
reverse to reverse the integer first and then leading zero detection
to get the number of trailing zeros. Also add a workaround for the
unsupported short and char types to get the expected result.

V2: Add missing file ocl_ctz.ll
V3: Add utests
V4: Update SPIR target
Signed-off-by: Pan Xiuli <xiuli.pan at intel.com>
---
 backend/src/backend/gen/gen_mesa_disasm.c   |  1 +
 backend/src/backend/gen_context.cpp         |  1 +
 backend/src/backend/gen_defs.hpp            |  1 +
 backend/src/backend/gen_encoder.cpp         |  1 +
 backend/src/backend/gen_encoder.hpp         |  1 +
 backend/src/backend/gen_insn_selection.cpp  |  4 +-
 backend/src/backend/gen_insn_selection.hxx  |  1 +
 backend/src/ir/instruction.cpp              |  1 +
 backend/src/ir/instruction.hpp              |  2 +
 backend/src/ir/instruction.hxx              |  1 +
 backend/src/libocl/CMakeLists.txt           |  2 +-
 backend/src/libocl/script/ocl_integer.def   |  1 +
 backend/src/libocl/src/ocl_ctz.ll           | 65 +++++++++++++++++++++++++++++
 backend/src/libocl/tmpl/ocl_integer.tmpl.cl | 12 ++++++
 backend/src/libocl/tmpl/ocl_integer.tmpl.h  | 18 ++++++++
 backend/src/llvm/llvm_gen_backend.cpp       | 50 ++++++++++++++++++++++
 16 files changed, 160 insertions(+), 2 deletions(-)
 create mode 100644 backend/src/libocl/src/ocl_ctz.ll

diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c
index 82a7524..2515b1b 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.c
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -70,6 +70,7 @@ static const struct {
   [GEN_OPCODE_CBIT] = { .name = "cbit", .nsrc = 1, .ndst = 1 },
   [GEN_OPCODE_F16TO32] = { .name = "f16to32", .nsrc = 1, .ndst = 1 },
   [GEN_OPCODE_F32TO16] = { .name = "f32to16", .nsrc = 1, .ndst = 1 },
+  [GEN_OPCODE_BFREV] = { .name = "bfrev", .nsrc = 1, .ndst = 1 },
 
   [GEN_OPCODE_MUL] = { .name = "mul", .nsrc = 2, .ndst = 1 },
   [GEN_OPCODE_MAC] = { .name = "mac", .nsrc = 2, .ndst = 1 },
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 0acf00c..57adaf1 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -285,6 +285,7 @@ namespace gbe
       case SEL_OP_F16TO32: p->F16TO32(dst, src); break;
       case SEL_OP_F32TO16: p->F32TO16(dst, src); break;
       case SEL_OP_LOAD_INT64_IMM: p->LOAD_INT64_IMM(dst, src); break;
+      case SEL_OP_BFREV: p->BFREV(dst, src); break;
       case SEL_OP_CONVI64_TO_I:
        {
         p->MOV(dst, src.bottom_half());
diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
index 586c9a1..60ebdd6 100644
--- a/backend/src/backend/gen_defs.hpp
+++ b/backend/src/backend/gen_defs.hpp
@@ -129,6 +129,7 @@ enum opcode {
   GEN_OPCODE_CMPN = 17,
   GEN_OPCODE_F32TO16 = 19,
   GEN_OPCODE_F16TO32 = 20,
+  GEN_OPCODE_BFREV = 23,
   GEN_OPCODE_JMPI = 32,
   GEN_OPCODE_BRD = 33,
   GEN_OPCODE_IF = 34,
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index 564f207..2e3a2f5 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -862,6 +862,7 @@ namespace gbe
   ALU2(PLN)
   ALU2(MACH)
   ALU3(MAD)
+  ALU1(BFREV)
  // ALU2(BRC)
  // ALU1(ENDIF)
  //  ALU1(IF)
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index 5b4f4c2..65263fe 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -128,6 +128,7 @@ namespace gbe
     //ALU2(MOV_DF);
     ALU2(BRC)
     ALU1(BRD)
+    ALU1(BFREV)
 #undef ALU1
 #undef ALU2
 #undef ALU2_MOD
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index cc0ace0..5535851 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -571,6 +571,7 @@ namespace gbe
     I64Shift(I64SHL)
     I64Shift(I64SHR)
     I64Shift(I64ASR)
+    ALU1(BFREV)
 #undef ALU1
 #undef ALU1WithTemp
 #undef ALU2
@@ -2639,7 +2640,7 @@ namespace gbe
         return isSrc ? insnType : ir::TYPE_U32;
       if (insnType == ir::TYPE_S64 || insnType == ir::TYPE_U64 || insnType == ir::TYPE_S8 || insnType == ir::TYPE_U8)
         return insnType;
-      if (opcode == ir::OP_FBH || opcode == ir::OP_FBL || opcode == ir::OP_LZD)
+      if (opcode == ir::OP_FBH || opcode == ir::OP_FBL || opcode == ir::OP_LZD || opcode == ir::OP_BFREV)
         return ir::TYPE_U32;
       if (opcode == ir::OP_SIMD_ANY || opcode == ir::OP_SIMD_ALL)
         return ir::TYPE_S32;
@@ -2694,6 +2695,7 @@ namespace gbe
           case ir::OP_FBL: sel.FBL(dst, src); break;
           case ir::OP_CBIT: sel.CBIT(dst, src); break;
           case ir::OP_LZD: sel.LZD(dst, src); break;
+          case ir::OP_BFREV: sel.BFREV(dst, src); break;
           case ir::OP_COS: sel.MATH(dst, GEN_MATH_FUNCTION_COS, src); break;
           case ir::OP_SIN: sel.MATH(dst, GEN_MATH_FUNCTION_SIN, src); break;
           case ir::OP_LOG: sel.MATH(dst, GEN_MATH_FUNCTION_LOG, src); break;
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index 7c4991a..c594e58 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -99,3 +99,4 @@ DECL_SELECTION_IR(READ_ARF, UnaryInstruction)
 DECL_SELECTION_IR(WHILE, UnaryInstruction)
 DECL_SELECTION_IR(F64DIV, F64DIVInstruction)
 DECL_SELECTION_IR(WORKGROUP_OP, WorkGroupOpInstruction)
+DECL_SELECTION_IR(BFREV, UnaryInstruction)
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 464e483..5e26584 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -1985,6 +1985,7 @@ DECL_MEM_FN(MemInstruction, void,     setBtiReg(Register reg), setBtiReg(reg))
   DECL_EMIT_FUNCTION(RNDE)
   DECL_EMIT_FUNCTION(RNDU)
   DECL_EMIT_FUNCTION(RNDZ)
+  DECL_EMIT_FUNCTION(BFREV)
 
 #undef DECL_EMIT_FUNCTION
 
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index 67ce833..1017c19 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -686,6 +686,8 @@ namespace ir {
   Instruction RNDZ(Type type, Register dst, Register src);
   /*! bswap.type dst src */
   Instruction BSWAP(Type type, Register dst, Register src);
+  /*! bfrev.type dst src */
+  Instruction BFREV(Type type, Register dst, Register src);
   /*! pow.type dst src0 src1 */
   Instruction POW(Type type, Register dst, Register src0, Register src1);
   /*! mul.type dst src0 src1 */
diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
index 080647d..3474c9d 100644
--- a/backend/src/ir/instruction.hxx
+++ b/backend/src/ir/instruction.hxx
@@ -107,3 +107,4 @@ DECL_INSN(ENDIF, BranchInstruction)
 DECL_INSN(ELSE, BranchInstruction)
 DECL_INSN(WHILE, BranchInstruction)
 DECL_INSN(WORKGROUP, WorkGroupInstruction)
+DECL_INSN(BFREV, UnaryInstruction)
diff --git a/backend/src/libocl/CMakeLists.txt b/backend/src/libocl/CMakeLists.txt
index 8bb4c1e..53183b8 100644
--- a/backend/src/libocl/CMakeLists.txt
+++ b/backend/src/libocl/CMakeLists.txt
@@ -182,7 +182,7 @@ MACRO(ADD_LL_TO_BC_TARGET M)
 	)
 ENDMACRO(ADD_LL_TO_BC_TARGET)
 
-SET (OCL_LL_MODULES ocl_barrier ocl_clz)
+SET (OCL_LL_MODULES ocl_barrier ocl_clz ocl_ctz)
 FOREACH(f ${OCL_LL_MODULES})
     COPY_THE_LL(${f})
     ADD_LL_TO_BC_TARGET(${f})
diff --git a/backend/src/libocl/script/ocl_integer.def b/backend/src/libocl/script/ocl_integer.def
index c35c242..5e41c34 100644
--- a/backend/src/libocl/script/ocl_integer.def
+++ b/backend/src/libocl/script/ocl_integer.def
@@ -7,6 +7,7 @@ gentype rhadd (gentype x, gentype y)
 gentype clamp (gentype x, gentype minval, gentype maxval)
 gentype clamp (gentype x, sgentype minval, sgentype maxval)
 gentype clz (gentype x)
+gentype ctz (gentype x)
 gentype mad_hi (gentype a, gentype b, gentype c)
 gentype mad_sat (gentype a, gentype b, gentype c)
 gentype max (gentype x,  gentype y)
diff --git a/backend/src/libocl/src/ocl_ctz.ll b/backend/src/libocl/src/ocl_ctz.ll
new file mode 100644
index 0000000..0a79b26
--- /dev/null
+++ b/backend/src/libocl/src/ocl_ctz.ll
@@ -0,0 +1,65 @@
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64"
+
+declare i8 @llvm.cttz.i8(i8, i1)
+declare i16 @llvm.cttz.i16(i16, i1)
+declare i32 @llvm.cttz.i32(i32, i1)
+declare i64 @llvm.cttz.i64(i64, i1)
+
+define i8 @ctz_s8(i8 %x) nounwind readnone alwaysinline { ; ctz helper for the signed 8-bit overload
+  %tz = call i8 @llvm.cttz.i8(i8 %x, i1 false) ; second operand false: a zero input is not undefined
+  ret i8 %tz
+}
+
+define i8 @ctz_u8(i8 %x) nounwind readnone alwaysinline { ; ctz helper for the unsigned 8-bit overload
+  %tz = call i8 @llvm.cttz.i8(i8 %x, i1 false) ; second operand false: a zero input is not undefined
+  ret i8 %tz
+}
+
+define i16 @ctz_s16(i16 %x) nounwind readnone alwaysinline { ; ctz helper for the signed 16-bit overload
+  %tz = call i16 @llvm.cttz.i16(i16 %x, i1 false) ; second operand false: a zero input is not undefined
+  ret i16 %tz
+}
+
+define i16 @ctz_u16(i16 %x) nounwind readnone alwaysinline { ; ctz helper for the unsigned 16-bit overload
+  %tz = call i16 @llvm.cttz.i16(i16 %x, i1 false) ; second operand false: a zero input is not undefined
+  ret i16 %tz
+}
+
+define i32 @ctz_s32(i32 %x) nounwind readnone alwaysinline { ; ctz helper for the signed 32-bit overload
+  %tz = call i32 @llvm.cttz.i32(i32 %x, i1 false) ; second operand false: a zero input is not undefined
+  ret i32 %tz
+}
+
+define i32 @ctz_u32(i32 %x) nounwind readnone alwaysinline { ; ctz helper for the unsigned 32-bit overload
+  %tz = call i32 @llvm.cttz.i32(i32 %x, i1 false) ; second operand false: a zero input is not undefined
+  ret i32 %tz
+}
+
+define i64 @ctz_s64(i64 %x) nounwind readnone alwaysinline { ; 64-bit ctz built from two 32-bit cttz calls
+  %1 = bitcast i64 %x to <2 x i32>
+  %2 = extractelement <2 x i32> %1, i32 0 ; low dword (datalayout is little-endian)
+  %3 = extractelement <2 x i32> %1, i32 1 ; high dword
+  %call1 = call i32 @llvm.cttz.i32(i32 %2, i1 0)
+  %call2 = call i32 @llvm.cttz.i32(i32 %3, i1 0)
+  %cmp = icmp ult i32 %call1, 32 ; true when the low dword contains a set bit
+  %4 = add i32 %call2, 32 ; otherwise the answer is 32 + ctz(high dword)
+  %5 = select i1 %cmp, i32 %call1, i32 %4
+  %6 = insertelement <2 x i32> zeroinitializer, i32 %5, i32 0 ; zero the high dword; 'undef' left the upper 32 result bits undefined
+  %call = bitcast <2 x i32> %6 to i64
+  ret i64 %call
+}
+
+define i64 @ctz_u64(i64 %x) nounwind readnone alwaysinline { ; 64-bit ctz built from two 32-bit cttz calls
+  %1 = bitcast i64 %x to <2 x i32>
+  %2 = extractelement <2 x i32> %1, i32 0 ; low dword (datalayout is little-endian)
+  %3 = extractelement <2 x i32> %1, i32 1 ; high dword
+  %call1 = call i32 @llvm.cttz.i32(i32 %2, i1 0)
+  %call2 = call i32 @llvm.cttz.i32(i32 %3, i1 0)
+  %cmp = icmp ult i32 %call1, 32 ; true when the low dword contains a set bit
+  %4 = add i32 %call2, 32 ; otherwise the answer is 32 + ctz(high dword)
+  %5 = select i1 %cmp, i32 %call1, i32 %4
+  %6 = insertelement <2 x i32> zeroinitializer, i32 %5, i32 0 ; zero the high dword; 'undef' left the upper 32 result bits undefined
+  %call = bitcast <2 x i32> %6 to i64
+  ret i64 %call
+}
diff --git a/backend/src/libocl/tmpl/ocl_integer.tmpl.cl b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
index 12408eb..292cc63 100644
--- a/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
@@ -40,6 +40,18 @@ SDEF(long, s, 64);
 SDEF(ulong, u, 64);
 #undef SDEF
 
+#define SDEF(T, SIGN, BITS)        \
+OVERLOADABLE T ctz(T x){ return ctz_##SIGN##BITS(x);} /* forward to the sign/width-specific ctz_<SIGN><BITS> helper */
+SDEF(char, s, 8);
+SDEF(uchar, u, 8);
+SDEF(short, s, 16);
+SDEF(ushort, u, 16);
+SDEF(int, s, 32);
+SDEF(uint, u, 32);
+SDEF(long, s, 64);
+SDEF(ulong, u, 64);
+#undef SDEF
+
 #define SDEF(TYPE)        \
 OVERLOADABLE TYPE popcount(TYPE x){ return __gen_ocl_cbit(x);}
 SDEF(char);
diff --git a/backend/src/libocl/tmpl/ocl_integer.tmpl.h b/backend/src/libocl/tmpl/ocl_integer.tmpl.h
index 4b3b5ae..ac1800b 100644
--- a/backend/src/libocl/tmpl/ocl_integer.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_integer.tmpl.h
@@ -54,6 +54,24 @@ uint   clz_u32(uint);
 long   clz_s64(long);
 ulong  clz_u64(ulong);
 
+OVERLOADABLE char ctz(char x); /* scalar ctz overloads, one per integer type */
+OVERLOADABLE uchar ctz(uchar x);
+OVERLOADABLE short ctz(short x);
+OVERLOADABLE ushort ctz(ushort x);
+OVERLOADABLE int ctz(int x);
+OVERLOADABLE uint ctz(uint x);
+OVERLOADABLE long ctz(long x);
+OVERLOADABLE ulong ctz(ulong x);
+
+char   ctz_s8(char); /* sign/width-specific helpers; their bodies are in ocl_ctz.ll */
+uchar  ctz_u8(uchar);
+short  ctz_s16(short);
+ushort ctz_u16(ushort);
+int    ctz_s32(int);
+uint   ctz_u32(uint);
+long   ctz_s64(long);
+ulong  ctz_u64(ulong);
+
 OVERLOADABLE char popcount(char x);
 OVERLOADABLE uchar popcount(uchar x);
 OVERLOADABLE short popcount(short x);
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index d5d02f5..6b81275 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -3409,6 +3409,7 @@ namespace gbe
             this->newRegister(&I);
           break;
           case Intrinsic::ctlz:
+          case Intrinsic::cttz:
           case Intrinsic::bswap:
             this->newRegister(&I);
           break;
@@ -3917,6 +3918,55 @@ namespace gbe
             }
           }
           break;
+          case Intrinsic::cttz:
+          {
+            Type *llvmDstType = I.getType();
+            ir::Type dstType = getType(ctx, llvmDstType);
+            Type *llvmSrcType = I.getOperand(0)->getType();
+            ir::Type srcType = getUnsignedType(ctx, llvmSrcType);
+
+            //the llvm.cttz.i64 is lowered to two llvm.cttz.i32 calls in ocl_ctz.ll
+            GBE_ASSERT(srcType != ir::TYPE_U64);
+
+            const ir::Register dst = this->getRegister(&I);
+            const ir::Register src = this->getRegister(I.getOperand(0));
+
+            uint32_t imm_value = 0; //mask of the bits above a narrow (8/16-bit) operand
+            if(srcType == ir::TYPE_U16) {
+              imm_value = 0xFFFF0000;
+            }else if(srcType == ir::TYPE_U8) {
+              imm_value = 0xFFFFFF00;
+            }
+            if(srcType == ir::TYPE_U16 || srcType == ir::TYPE_U8) { //narrow types: widen to 32 bits, compute there, convert back
+              ir::ImmediateIndex imm;
+              ir::Type tmpType = ir::TYPE_S32;
+              ir::Type revType = ir::TYPE_U32;
+              imm = ctx.newIntegerImmediate(imm_value, revType);
+              const ir::RegisterFamily family = getFamily(revType);
+              const ir::Register immReg = ctx.reg(family);
+              ctx.LOADI(ir::TYPE_U32, immReg, imm);
+
+              ir::Register tmp0 = ctx.reg(getFamily(tmpType));
+              ir::Register tmp1 = ctx.reg(getFamily(revType));
+              ir::Register tmp2 = ctx.reg(getFamily(revType));
+              ir::Register revTmp = ctx.reg(getFamily(revType));
+
+              ctx.CVT(tmpType, srcType, tmp0, src);
+              //gen has no 'tzd' (trailing zero detect): set the bits above the operand, bit-reverse, then count leading zeros
+              ctx.ADD(revType, tmp1, tmp0, immReg); //NOTE(review): presumably acts as an OR because the CVT leaves the upper bits zero - confirm
+              ctx.ALU1(ir::OP_BFREV, revType, revTmp, tmp1);
+              ctx.ALU1(ir::OP_LZD, tmpType, tmp2, revTmp);
+              ctx.CVT(dstType, tmpType, dst, tmp2);
+            }
+            else
+            {
+              ir::Type revType = ir::TYPE_U32; //32-bit source: reverse the bits, then leading-zero count gives the trailing-zero count
+              ir::Register revTmp = ctx.reg(getFamily(revType));
+              ctx.ALU1(ir::OP_BFREV, revType, revTmp, src);
+              ctx.ALU1(ir::OP_LZD, dstType, dst, revTmp);
+            }
+          }
+          break;
           case Intrinsic::fma:
           {
             ir::Type srcType = getType(ctx, I.getType());
-- 
2.5.0