[Beignet] [patch v4] libocl: reimplement clz with lzd instruction instead of fbh.
Zhigang Gong
zhigang.gong at linux.intel.com
Tue Jan 27 00:16:50 PST 2015
This patch LGTM, will push later, thanks.
On Tue, Jan 27, 2015 at 11:39:21AM +0800, xionghu.luo at intel.com wrote:
> From: Luo Xionghu <xionghu.luo at intel.com>
>
> the fbh style is inefficient.
>
> v2: use llvm.ctlz to call llvm intrinsic instead of beignet non-standard
> intrinsic call style; remove the non-standard clz call path.
>
> Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
> ---
> backend/src/libocl/CMakeLists.txt | 2 +-
> backend/src/libocl/src/ocl_clz.ll | 44 ++++++++++++++++
> backend/src/libocl/tmpl/ocl_integer.tmpl.cl | 78 +++++------------------------
> backend/src/libocl/tmpl/ocl_integer.tmpl.h | 9 ++++
> 4 files changed, 67 insertions(+), 66 deletions(-)
> create mode 100644 backend/src/libocl/src/ocl_clz.ll
>
> diff --git a/backend/src/libocl/CMakeLists.txt b/backend/src/libocl/CMakeLists.txt
> index 314d373..16f00ee 100644
> --- a/backend/src/libocl/CMakeLists.txt
> +++ b/backend/src/libocl/CMakeLists.txt
> @@ -181,7 +181,7 @@ MACRO(ADD_LL_TO_BC_TARGET M)
> )
> ENDMACRO(ADD_LL_TO_BC_TARGET)
>
> -SET (OCL_LL_MODULES ocl_barrier ocl_memcpy ocl_memset)
> +SET (OCL_LL_MODULES ocl_barrier ocl_memcpy ocl_memset ocl_clz)
> FOREACH(f ${OCL_LL_MODULES})
> COPY_THE_LL(${f})
> ADD_LL_TO_BC_TARGET(${f})
> diff --git a/backend/src/libocl/src/ocl_clz.ll b/backend/src/libocl/src/ocl_clz.ll
> new file mode 100644
> index 0000000..0863b6f
> --- /dev/null
> +++ b/backend/src/libocl/src/ocl_clz.ll
> @@ -0,0 +1,44 @@
> +declare i8 @llvm.ctlz.i8(i8, i1)
> +declare i16 @llvm.ctlz.i16(i16, i1)
> +declare i32 @llvm.ctlz.i32(i32, i1)
> +declare i64 @llvm.ctlz.i64(i64, i1)
> +
> +define i8 @clz_s8(i8 %x) nounwind readnone alwaysinline {
> + %call = call i8 @llvm.ctlz.i8(i8 %x, i1 0)
> + ret i8 %call
> +}
> +
> +define i8 @clz_u8(i8 %x) nounwind readnone alwaysinline {
> + %call = call i8 @llvm.ctlz.i8(i8 %x, i1 0)
> + ret i8 %call
> +}
> +
> +define i16 @clz_s16(i16 %x) nounwind readnone alwaysinline {
> + %call = call i16 @llvm.ctlz.i16(i16 %x, i1 0)
> + ret i16 %call
> +}
> +
> +define i16 @clz_u16(i16 %x) nounwind readnone alwaysinline {
> + %call = call i16 @llvm.ctlz.i16(i16 %x, i1 0)
> + ret i16 %call
> +}
> +
> +define i32 @clz_s32(i32 %x) nounwind readnone alwaysinline {
> + %call = call i32 @llvm.ctlz.i32(i32 %x, i1 0)
> + ret i32 %call
> +}
> +
> +define i32 @clz_u32(i32 %x) nounwind readnone alwaysinline {
> + %call = call i32 @llvm.ctlz.i32(i32 %x, i1 0)
> + ret i32 %call
> +}
> +
> +define i64 @clz_s64(i64 %x) nounwind readnone alwaysinline {
> + %call = call i64 @llvm.ctlz.i64(i64 %x, i1 0)
> + ret i64 %call
> +}
> +
> +define i64 @clz_u64(i64 %x) nounwind readnone alwaysinline {
> + %call = call i64 @llvm.ctlz.i64(i64 %x, i1 0)
> + ret i64 %call
> +}
> diff --git a/backend/src/libocl/tmpl/ocl_integer.tmpl.cl b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
> index 6da0bab..a5e1dbc 100644
> --- a/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
> +++ b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
> @@ -19,6 +19,8 @@
>
> PURE CONST uint __gen_ocl_fbh(uint);
> PURE CONST uint __gen_ocl_fbl(uint);
> +
> +
> PURE CONST OVERLOADABLE uint __gen_ocl_cbit(uint);
> PURE CONST OVERLOADABLE uint __gen_ocl_cbit(int);
> PURE CONST OVERLOADABLE uint __gen_ocl_cbit(ushort);
> @@ -26,71 +28,17 @@ PURE CONST OVERLOADABLE uint __gen_ocl_cbit(short);
> PURE CONST OVERLOADABLE uint __gen_ocl_cbit(uchar);
> PURE CONST OVERLOADABLE uint __gen_ocl_cbit(char);
>
> -OVERLOADABLE char clz(char x) {
> - if (x < 0)
> - return 0;
> - if (x == 0)
> - return 8;
> - return __gen_ocl_fbh(x) - 24;
> -}
> -
> -OVERLOADABLE uchar clz(uchar x) {
> - if (x == 0)
> - return 8;
> - return __gen_ocl_fbh(x) - 24;
> -}
> -
> -OVERLOADABLE short clz(short x) {
> - if (x < 0)
> - return 0;
> - if (x == 0)
> - return 16;
> - return __gen_ocl_fbh(x) - 16;
> -}
> -
> -OVERLOADABLE ushort clz(ushort x) {
> - if (x == 0)
> - return 16;
> - return __gen_ocl_fbh(x) - 16;
> -}
> -
> -OVERLOADABLE int clz(int x) {
> - if (x < 0)
> - return 0;
> - if (x == 0)
> - return 32;
> - return __gen_ocl_fbh(x);
> -}
> -
> -OVERLOADABLE uint clz(uint x) {
> - if (x == 0)
> - return 32;
> - return __gen_ocl_fbh(x);
> -}
> -
> -OVERLOADABLE long clz(long x) {
> - union { int i[2]; long x; } u;
> - u.x = x;
> - if (u.i[1] & 0x80000000u)
> - return 0;
> - if (u.i[1] == 0 && u.i[0] == 0)
> - return 64;
> - uint v = clz(u.i[1]);
> - if(v == 32)
> - v += clz(u.i[0]);
> - return v;
> -}
> -
> -OVERLOADABLE ulong clz(ulong x) {
> - if (x == 0)
> - return 64;
> - union { uint i[2]; ulong x; } u;
> - u.x = x;
> - uint v = clz(u.i[1]);
> - if(v == 32)
> - v += clz(u.i[0]);
> - return v;
> -}
> +#define SDEF(TYPE, TYPE_NAME, SIZE) \
> +OVERLOADABLE TYPE clz(TYPE x){ return clz_##TYPE_NAME##SIZE(x);}
> +SDEF(char, s, 8);
> +SDEF(uchar, u, 8);
> +SDEF(short, s, 16);
> +SDEF(ushort, u, 16);
> +SDEF(int, s, 32);
> +SDEF(uint, u, 32);
> +SDEF(long, s, 64);
> +SDEF(ulong, u, 64);
> +#undef SDEF
>
> #define SDEF(TYPE) \
> OVERLOADABLE TYPE popcount(TYPE x){ return __gen_ocl_cbit(x);}
> diff --git a/backend/src/libocl/tmpl/ocl_integer.tmpl.h b/backend/src/libocl/tmpl/ocl_integer.tmpl.h
> index f067b8d..4b3b5ae 100644
> --- a/backend/src/libocl/tmpl/ocl_integer.tmpl.h
> +++ b/backend/src/libocl/tmpl/ocl_integer.tmpl.h
> @@ -45,6 +45,15 @@ OVERLOADABLE uint clz(uint x);
> OVERLOADABLE long clz(long x);
> OVERLOADABLE ulong clz(ulong x);
>
> +char clz_s8(char);
> +uchar clz_u8(uchar);
> +short clz_s16(short);
> +ushort clz_u16(ushort);
> +int clz_s32(int);
> +uint clz_u32(uint);
> +long clz_s64(long);
> +ulong clz_u64(ulong);
> +
> OVERLOADABLE char popcount(char x);
> OVERLOADABLE uchar popcount(uchar x);
> OVERLOADABLE short popcount(short x);
> --
> 1.9.1
>
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet
More information about the Beignet mailing list