[Beignet] [patch v4] libocl: reimplement clz with lzd instruction instead of fbh.
xionghu.luo at intel.com
xionghu.luo at intel.com
Mon Jan 26 19:39:21 PST 2015
From: Luo Xionghu <xionghu.luo at intel.com>
The fbh-based implementation of clz is inefficient.
v2: call the standard LLVM intrinsic llvm.ctlz instead of using Beignet's
non-standard intrinsic call style; remove the non-standard clz call path.
Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
backend/src/libocl/CMakeLists.txt | 2 +-
backend/src/libocl/src/ocl_clz.ll | 44 ++++++++++++++++
backend/src/libocl/tmpl/ocl_integer.tmpl.cl | 78 +++++------------------------
backend/src/libocl/tmpl/ocl_integer.tmpl.h | 9 ++++
4 files changed, 67 insertions(+), 66 deletions(-)
create mode 100644 backend/src/libocl/src/ocl_clz.ll
diff --git a/backend/src/libocl/CMakeLists.txt b/backend/src/libocl/CMakeLists.txt
index 314d373..16f00ee 100644
--- a/backend/src/libocl/CMakeLists.txt
+++ b/backend/src/libocl/CMakeLists.txt
@@ -181,7 +181,7 @@ MACRO(ADD_LL_TO_BC_TARGET M)
)
ENDMACRO(ADD_LL_TO_BC_TARGET)
-SET (OCL_LL_MODULES ocl_barrier ocl_memcpy ocl_memset)
+SET (OCL_LL_MODULES ocl_barrier ocl_memcpy ocl_memset ocl_clz)
FOREACH(f ${OCL_LL_MODULES})
COPY_THE_LL(${f})
ADD_LL_TO_BC_TARGET(${f})
diff --git a/backend/src/libocl/src/ocl_clz.ll b/backend/src/libocl/src/ocl_clz.ll
new file mode 100644
index 0000000..0863b6f
--- /dev/null
+++ b/backend/src/libocl/src/ocl_clz.ll
@@ -0,0 +1,44 @@
+declare i8 @llvm.ctlz.i8(i8, i1)
+declare i16 @llvm.ctlz.i16(i16, i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare i64 @llvm.ctlz.i64(i64, i1)
+
+define i8 @clz_s8(i8 %x) nounwind readnone alwaysinline {
+ %call = call i8 @llvm.ctlz.i8(i8 %x, i1 0)
+ ret i8 %call
+}
+
+define i8 @clz_u8(i8 %x) nounwind readnone alwaysinline {
+ %call = call i8 @llvm.ctlz.i8(i8 %x, i1 0)
+ ret i8 %call
+}
+
+define i16 @clz_s16(i16 %x) nounwind readnone alwaysinline {
+ %call = call i16 @llvm.ctlz.i16(i16 %x, i1 0)
+ ret i16 %call
+}
+
+define i16 @clz_u16(i16 %x) nounwind readnone alwaysinline {
+ %call = call i16 @llvm.ctlz.i16(i16 %x, i1 0)
+ ret i16 %call
+}
+
+define i32 @clz_s32(i32 %x) nounwind readnone alwaysinline {
+ %call = call i32 @llvm.ctlz.i32(i32 %x, i1 0)
+ ret i32 %call
+}
+
+define i32 @clz_u32(i32 %x) nounwind readnone alwaysinline {
+ %call = call i32 @llvm.ctlz.i32(i32 %x, i1 0)
+ ret i32 %call
+}
+
+define i64 @clz_s64(i64 %x) nounwind readnone alwaysinline {
+ %call = call i64 @llvm.ctlz.i64(i64 %x, i1 0)
+ ret i64 %call
+}
+
+define i64 @clz_u64(i64 %x) nounwind readnone alwaysinline {
+ %call = call i64 @llvm.ctlz.i64(i64 %x, i1 0)
+ ret i64 %call
+}
diff --git a/backend/src/libocl/tmpl/ocl_integer.tmpl.cl b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
index 6da0bab..a5e1dbc 100644
--- a/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
@@ -19,6 +19,8 @@
PURE CONST uint __gen_ocl_fbh(uint);
PURE CONST uint __gen_ocl_fbl(uint);
+
+
PURE CONST OVERLOADABLE uint __gen_ocl_cbit(uint);
PURE CONST OVERLOADABLE uint __gen_ocl_cbit(int);
PURE CONST OVERLOADABLE uint __gen_ocl_cbit(ushort);
@@ -26,71 +28,17 @@ PURE CONST OVERLOADABLE uint __gen_ocl_cbit(short);
PURE CONST OVERLOADABLE uint __gen_ocl_cbit(uchar);
PURE CONST OVERLOADABLE uint __gen_ocl_cbit(char);
-OVERLOADABLE char clz(char x) {
- if (x < 0)
- return 0;
- if (x == 0)
- return 8;
- return __gen_ocl_fbh(x) - 24;
-}
-
-OVERLOADABLE uchar clz(uchar x) {
- if (x == 0)
- return 8;
- return __gen_ocl_fbh(x) - 24;
-}
-
-OVERLOADABLE short clz(short x) {
- if (x < 0)
- return 0;
- if (x == 0)
- return 16;
- return __gen_ocl_fbh(x) - 16;
-}
-
-OVERLOADABLE ushort clz(ushort x) {
- if (x == 0)
- return 16;
- return __gen_ocl_fbh(x) - 16;
-}
-
-OVERLOADABLE int clz(int x) {
- if (x < 0)
- return 0;
- if (x == 0)
- return 32;
- return __gen_ocl_fbh(x);
-}
-
-OVERLOADABLE uint clz(uint x) {
- if (x == 0)
- return 32;
- return __gen_ocl_fbh(x);
-}
-
-OVERLOADABLE long clz(long x) {
- union { int i[2]; long x; } u;
- u.x = x;
- if (u.i[1] & 0x80000000u)
- return 0;
- if (u.i[1] == 0 && u.i[0] == 0)
- return 64;
- uint v = clz(u.i[1]);
- if(v == 32)
- v += clz(u.i[0]);
- return v;
-}
-
-OVERLOADABLE ulong clz(ulong x) {
- if (x == 0)
- return 64;
- union { uint i[2]; ulong x; } u;
- u.x = x;
- uint v = clz(u.i[1]);
- if(v == 32)
- v += clz(u.i[0]);
- return v;
-}
+#define SDEF(TYPE, TYPE_NAME, SIZE) \
+OVERLOADABLE TYPE clz(TYPE x){ return clz_##TYPE_NAME##SIZE(x);}
+SDEF(char, s, 8);
+SDEF(uchar, u, 8);
+SDEF(short, s, 16);
+SDEF(ushort, u, 16);
+SDEF(int, s, 32);
+SDEF(uint, u, 32);
+SDEF(long, s, 64);
+SDEF(ulong, u, 64);
+#undef SDEF
#define SDEF(TYPE) \
OVERLOADABLE TYPE popcount(TYPE x){ return __gen_ocl_cbit(x);}
diff --git a/backend/src/libocl/tmpl/ocl_integer.tmpl.h b/backend/src/libocl/tmpl/ocl_integer.tmpl.h
index f067b8d..4b3b5ae 100644
--- a/backend/src/libocl/tmpl/ocl_integer.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_integer.tmpl.h
@@ -45,6 +45,15 @@ OVERLOADABLE uint clz(uint x);
OVERLOADABLE long clz(long x);
OVERLOADABLE ulong clz(ulong x);
+char clz_s8(char);
+uchar clz_u8(uchar);
+short clz_s16(short);
+ushort clz_u16(ushort);
+int clz_s32(int);
+uint clz_u32(uint);
+long clz_s64(long);
+ulong clz_u64(ulong);
+
OVERLOADABLE char popcount(char x);
OVERLOADABLE uchar popcount(uchar x);
OVERLOADABLE short popcount(short x);
--
1.9.1
More information about the Beignet mailing list