[Beignet] [patch v4 1/3] libocl: reimplement clz with lzd instruction instead of fbh.

xionghu.luo at intel.com xionghu.luo at intel.com
Tue Jan 27 19:49:49 PST 2015


From: Luo Xionghu <xionghu.luo at intel.com>

the fbh style is inefficient.

v2: use llvm.ctlz to call llvm intrinsic instead of beignet non-standard
intrinsic call style; remove the non-standard clz call path.
v3: lower the qword call to two dword call as the gen platform not
support native qword lzd instruction.

Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
 backend/src/libocl/CMakeLists.txt           |  2 +-
 backend/src/libocl/src/ocl_clz.ll           | 62 +++++++++++++++++++++++
 backend/src/libocl/tmpl/ocl_integer.tmpl.cl | 78 +++++------------------------
 backend/src/libocl/tmpl/ocl_integer.tmpl.h  |  9 ++++
 4 files changed, 85 insertions(+), 66 deletions(-)
 create mode 100644 backend/src/libocl/src/ocl_clz.ll

diff --git a/backend/src/libocl/CMakeLists.txt b/backend/src/libocl/CMakeLists.txt
index 314d373..16f00ee 100644
--- a/backend/src/libocl/CMakeLists.txt
+++ b/backend/src/libocl/CMakeLists.txt
@@ -181,7 +181,7 @@ MACRO(ADD_LL_TO_BC_TARGET M)
 	)
 ENDMACRO(ADD_LL_TO_BC_TARGET)
 
-SET (OCL_LL_MODULES ocl_barrier ocl_memcpy ocl_memset)
+SET (OCL_LL_MODULES ocl_barrier ocl_memcpy ocl_memset ocl_clz)
 FOREACH(f ${OCL_LL_MODULES})
     COPY_THE_LL(${f})
     ADD_LL_TO_BC_TARGET(${f})
diff --git a/backend/src/libocl/src/ocl_clz.ll b/backend/src/libocl/src/ocl_clz.ll
new file mode 100644
index 0000000..a274cde
--- /dev/null
+++ b/backend/src/libocl/src/ocl_clz.ll
@@ -0,0 +1,62 @@
+declare i8 @llvm.ctlz.i8(i8, i1)
+declare i16 @llvm.ctlz.i16(i16, i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare i64 @llvm.ctlz.i64(i64, i1)
+
+define i8 @clz_s8(i8 %x) nounwind readnone alwaysinline {
+  %call = call i8 @llvm.ctlz.i8(i8 %x, i1 0)
+  ret i8 %call
+}
+
+define i8 @clz_u8(i8 %x) nounwind readnone alwaysinline {
+  %call = call i8 @llvm.ctlz.i8(i8 %x, i1 0)
+  ret i8 %call
+}
+
+define i16 @clz_s16(i16 %x) nounwind readnone alwaysinline {
+  %call = call i16 @llvm.ctlz.i16(i16 %x, i1 0)
+  ret i16 %call
+}
+
+define i16 @clz_u16(i16 %x) nounwind readnone alwaysinline {
+  %call = call i16 @llvm.ctlz.i16(i16 %x, i1 0)
+  ret i16 %call
+}
+
+define i32 @clz_s32(i32 %x) nounwind readnone alwaysinline {
+  %call = call i32 @llvm.ctlz.i32(i32 %x, i1 0)
+  ret i32 %call
+}
+
+define i32 @clz_u32(i32 %x) nounwind readnone alwaysinline {
+  %call = call i32 @llvm.ctlz.i32(i32 %x, i1 0)
+  ret i32 %call
+}
+
+define i64 @clz_s64(i64 %x) nounwind readnone alwaysinline {
+  %1 = bitcast i64 %x to <2 x i32>
+  %2 = extractelement <2 x i32> %1, i32 0
+  %3 = extractelement <2 x i32> %1, i32 1
+  %call1 = call i32 @llvm.ctlz.i32(i32 %2, i1 0)
+  %call2 = call i32 @llvm.ctlz.i32(i32 %3, i1 0)
+  %cmp = icmp ult i32 %call2, 32
+  %4 = add i32 %call1, 32
+  %5 = select i1 %cmp, i32 %call2, i32 %4
+  %6 = insertelement <2 x i32> undef, i32 %5, i32 0
+  %call = bitcast <2 x i32> %6 to i64
+  ret i64 %call
+}
+
+define i64 @clz_u64(i64 %x) nounwind readnone alwaysinline {
+  %1 = bitcast i64 %x to <2 x i32>
+  %2 = extractelement <2 x i32> %1, i32 0
+  %3 = extractelement <2 x i32> %1, i32 1
+  %call1 = call i32 @llvm.ctlz.i32(i32 %2, i1 0)
+  %call2 = call i32 @llvm.ctlz.i32(i32 %3, i1 0)
+  %cmp = icmp ult i32 %call2, 32
+  %4 = add i32 %call1, 32
+  %5 = select i1 %cmp, i32 %call2, i32 %4
+  %6 = insertelement <2 x i32> undef, i32 %5, i32 0
+  %call = bitcast <2 x i32> %6 to i64
+  ret i64 %call
+}
diff --git a/backend/src/libocl/tmpl/ocl_integer.tmpl.cl b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
index 6da0bab..a5e1dbc 100644
--- a/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
@@ -19,6 +19,8 @@
 
 PURE CONST uint __gen_ocl_fbh(uint);
 PURE CONST uint __gen_ocl_fbl(uint);
+
+
 PURE CONST OVERLOADABLE uint __gen_ocl_cbit(uint);
 PURE CONST OVERLOADABLE uint __gen_ocl_cbit(int);
 PURE CONST OVERLOADABLE uint __gen_ocl_cbit(ushort);
@@ -26,71 +28,17 @@ PURE CONST OVERLOADABLE uint __gen_ocl_cbit(short);
 PURE CONST OVERLOADABLE uint __gen_ocl_cbit(uchar);
 PURE CONST OVERLOADABLE uint __gen_ocl_cbit(char);
 
-OVERLOADABLE char clz(char x) {
-  if (x < 0)
-    return 0;
-  if (x == 0)
-    return 8;
-  return __gen_ocl_fbh(x) - 24;
-}
-
-OVERLOADABLE uchar clz(uchar x) {
-  if (x == 0)
-    return 8;
-  return __gen_ocl_fbh(x) - 24;
-}
-
-OVERLOADABLE short clz(short x) {
-  if (x < 0)
-    return 0;
-  if (x == 0)
-    return 16;
-  return __gen_ocl_fbh(x) - 16;
-}
-
-OVERLOADABLE ushort clz(ushort x) {
-  if (x == 0)
-    return 16;
-  return __gen_ocl_fbh(x) - 16;
-}
-
-OVERLOADABLE int clz(int x) {
-  if (x < 0)
-    return 0;
-  if (x == 0)
-    return 32;
-  return __gen_ocl_fbh(x);
-}
-
-OVERLOADABLE uint clz(uint x) {
-  if (x == 0)
-    return 32;
-  return __gen_ocl_fbh(x);
-}
-
-OVERLOADABLE long clz(long x) {
-  union { int i[2]; long x; } u;
-  u.x = x;
-  if (u.i[1] & 0x80000000u)
-    return 0;
-  if (u.i[1] == 0 && u.i[0] == 0)
-    return 64;
-  uint v = clz(u.i[1]);
-  if(v == 32)
-    v += clz(u.i[0]);
-  return v;
-}
-
-OVERLOADABLE ulong clz(ulong x) {
-  if (x == 0)
-    return 64;
-  union { uint i[2]; ulong x; } u;
-  u.x = x;
-  uint v = clz(u.i[1]);
-  if(v == 32)
-    v += clz(u.i[0]);
-  return v;
-}
+#define SDEF(TYPE, TYPE_NAME, SIZE)        \
+OVERLOADABLE TYPE clz(TYPE x){ return clz_##TYPE_NAME##SIZE(x);}
+SDEF(char, s, 8);
+SDEF(uchar, u, 8);
+SDEF(short, s, 16);
+SDEF(ushort, u, 16);
+SDEF(int, s, 32);
+SDEF(uint, u, 32);
+SDEF(long, s, 64);
+SDEF(ulong, u, 64);
+#undef SDEF
 
 #define SDEF(TYPE)        \
 OVERLOADABLE TYPE popcount(TYPE x){ return __gen_ocl_cbit(x);}
diff --git a/backend/src/libocl/tmpl/ocl_integer.tmpl.h b/backend/src/libocl/tmpl/ocl_integer.tmpl.h
index f067b8d..4b3b5ae 100644
--- a/backend/src/libocl/tmpl/ocl_integer.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_integer.tmpl.h
@@ -45,6 +45,15 @@ OVERLOADABLE uint clz(uint x);
 OVERLOADABLE long clz(long x);
 OVERLOADABLE ulong clz(ulong x);
 
+char   clz_s8(char);
+uchar  clz_u8(uchar);
+short  clz_s16(short);
+ushort clz_u16(ushort);
+int    clz_s32(int);
+uint   clz_u32(uint);
+long   clz_s64(long);
+ulong  clz_u64(ulong);
+
 OVERLOADABLE char popcount(char x);
 OVERLOADABLE uchar popcount(uchar x);
 OVERLOADABLE short popcount(short x);
-- 
1.9.1



More information about the Beignet mailing list