[Beignet] [PATCH] add llvm intrinsic call usub_with_overflow funtion.
Zhigang Gong
zhigang.gong at linux.intel.com
Thu Oct 30 20:59:26 PDT 2014
On Thu, Oct 30, 2014 at 10:59:37AM +0800, xionghu.luo at intel.com wrote:
> From: Luo Xionghu <xionghu.luo at intel.com>
>
> as llvm couldn't recognize the pattern of usub overflow, this usub with
> is generated by calling the intrinsic function __builtin_usub_overflow;
> also this type of uadd intrinsic funtion couldn't support short/byte type
> overflow, we choose another way for the uadd kernel to generate
> short/byte overflow.
> will send patch to llvm later to fix the 2 issues.
>
> Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
> ---
> backend/src/llvm/llvm_gen_backend.cpp | 15 ++++-
> kernels/compiler_overflow.cl | 55 ++++++++++++++-----
> utests/compiler_overflow.cpp | 97 ++++++++++++++++++++++++++-------
> 3 files changed, 131 insertions(+), 36 deletions(-)
>
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
> index bb2c1dd..5e1b9d5 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -2805,9 +2805,22 @@ namespace gbe
> ctx.LT(dst0Type, overflow, dst0, src1);
> }
> break;
> + case Intrinsic::usub_with_overflow:
> + {
> + Type *llvmDstType = I.getType();
> + GBE_ASSERT(llvmDstType->isStructTy());
> + ir::Type dst0Type = getType(ctx, llvmDstType->getStructElementType(0));
> + const ir::Register dst0 = this->getRegister(&I, 0);
> + const ir::Register src0 = this->getRegister(I.getOperand(0));
> + const ir::Register src1 = this->getRegister(I.getOperand(1));
> + ctx.SUB(dst0Type, dst0, src0, src1);
> +
> + ir::Register overflow = this->getRegister(&I, 1);
> + ctx.LT(dst0Type, overflow, dst0, src1);
The above implementation is incorrect. The correct one should be:
ctx.GT(dst0Type, overflow, dst0, src0);
Right?
> + }
> + break;
> case Intrinsic::sadd_with_overflow:
> case Intrinsic::ssub_with_overflow:
> - case Intrinsic::usub_with_overflow:
> case Intrinsic::smul_with_overflow:
> case Intrinsic::umul_with_overflow:
> NOT_IMPLEMENTED;
The other part should be splited to another patch.
Thanks,
Zhigang Gong.
> diff --git a/kernels/compiler_overflow.cl b/kernels/compiler_overflow.cl
> index 75ed5ce..af751b7 100644
> --- a/kernels/compiler_overflow.cl
> +++ b/kernels/compiler_overflow.cl
> @@ -1,20 +1,45 @@
> -#define COMPILER_OVERFLOW(TYPE) \
> - kernel void compiler_overflow_##TYPE (global TYPE* src, global TYPE* dst) \
> +#define COMPILER_OVERFLOW_ADD(TYPE, FUNC) \
> + kernel void compiler_overflow_##TYPE##_##FUNC (global TYPE* src0, global TYPE* src1, global TYPE* dst) \
> { \
> - __global TYPE* A = &src[get_global_id(0)]; \
> - TYPE B = 1; \
> - *A += B; \
> - TYPE carry = -convert_##TYPE((*A) < B); \
> + __global TYPE* A = &src0[get_global_id(0)]; \
> + __global TYPE* B = &src1[get_global_id(0)]; \
> + __global TYPE* C = &dst[get_global_id(0)]; \
> + *C = *A + *B; \
> + TYPE carry = -convert_##TYPE(*C < *B); \
> \
> - (*A).y += carry.x; \
> - carry.y += ((*A).y < carry.x); \
> - (*A).z += carry.y; \
> + (*C).y += carry.x; \
> + carry.y += ((*C).y < carry.x); \
> + (*C).z += carry.y; \
> \
> - carry.z += ((*A).z < carry.y); \
> - (*A).w += carry.z; \
> - dst[get_global_id(0)] = src[get_global_id(0)]; \
> + carry.z += ((*C).z < carry.y); \
> + (*C).w += carry.z; \
> + carry.w += ((*C).w < carry.z); \
> }
>
> -COMPILER_OVERFLOW(uint4)
> -COMPILER_OVERFLOW(ushort4)
> -COMPILER_OVERFLOW(uchar4)
> +
> +COMPILER_OVERFLOW_ADD(ulong4, add)
> +COMPILER_OVERFLOW_ADD(uint4, add)
> +COMPILER_OVERFLOW_ADD(ushort4, add)
> +COMPILER_OVERFLOW_ADD(uchar4, add)
> +
> +#define COMPILER_OVERFLOW_SUB(TYPE, FUNC) \
> + kernel void compiler_overflow_##TYPE##_##FUNC (global TYPE* src0, global TYPE* src1, global TYPE* dst) \
> +{ \
> + __global TYPE* A = &src0[get_global_id(0)]; \
> + __global TYPE* B = &src1[get_global_id(0)]; \
> + __global TYPE* C = &dst[get_global_id(0)]; \
> + TYPE borrow; \
> + unsigned result; \
> + size_t num = sizeof(*A)/sizeof((*A)[0]); \
> + for (uint i = 0; i < num; i++ ) {\
> + borrow[i] = __builtin_usub_overflow((*A)[i], (*B)[i], &result); \
> + (*C)[i] = result; \
> + }\
> +\
> + for (uint i = 0; i < num-1; i++ ) {\
> + borrow[i+1] += (*C)[i+1] < borrow[i];(*C)[i+1] -= borrow[i]; \
> + }\
> +\
> +}
> +
> +COMPILER_OVERFLOW_SUB(uint4, sub)
> diff --git a/utests/compiler_overflow.cpp b/utests/compiler_overflow.cpp
> index 1d3f53d..1404cfe 100644
> --- a/utests/compiler_overflow.cpp
> +++ b/utests/compiler_overflow.cpp
> @@ -3,6 +3,13 @@
> namespace {
>
> typedef struct {
> + unsigned long x;
> + unsigned long y;
> + unsigned long z;
> + unsigned long w;
> +}ulong4;
> +
> +typedef struct {
> uint32_t x;
> uint32_t y;
> uint32_t z;
> @@ -23,8 +30,18 @@ typedef struct {
> uint8_t w;
> } uchar4;
>
> -template<typename T>
> -void test(const char *kernel_name, int s_type)
> +template <typename U>
> +U get_max()
> +{
> + int shift_bit = sizeof(U)*8;
> + U u_max = 0;
> + for (int i = 0; i < shift_bit; i++)
> + u_max |= 1<<(shift_bit-i-1);
> + return u_max;
> +}
> +
> +template<typename T, typename U>
> +void test(const char *kernel_name, int func_type)
> {
> const size_t n = 16;
>
> @@ -32,41 +49,81 @@ void test(const char *kernel_name, int s_type)
> OCL_CREATE_KERNEL_FROM_FILE("compiler_overflow", kernel_name);
> OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(T), NULL);
> OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
> + OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(T), NULL);
> OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
> OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
> + OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
> +
> + U max = get_max<U>();
>
> OCL_MAP_BUFFER(0);
> for (uint32_t i = 0; i < n; ++i) {
> - ((T*)buf_data[0])[i].x = s_type?CL_INT_MAX:CL_UINT_MAX;
> - ((T*)buf_data[0])[i].y = s_type?CL_INT_MAX:CL_UINT_MAX;
> - ((T*)buf_data[0])[i].z = s_type?CL_INT_MAX:CL_UINT_MAX;
> - ((T*)buf_data[0])[i].w = i;
> + if(func_type == 0) {
> + ((T*)buf_data[0])[i].x = max;
> + ((T*)buf_data[0])[i].y = max;
> + ((T*)buf_data[0])[i].z = max;
> + ((T*)buf_data[0])[i].w = i;
> + }else if(func_type == 1) {
> + ((T*)buf_data[0])[i].x = 0;
> + ((T*)buf_data[0])[i].y = 0;
> + ((T*)buf_data[0])[i].z = 0;
> + ((T*)buf_data[0])[i].w = n+2-i;
> + }else
> + OCL_ASSERT(0);
> }
> OCL_UNMAP_BUFFER(0);
> + OCL_MAP_BUFFER(1);
> + for (uint32_t i = 0; i < n; ++i) {
> + ((T*)buf_data[1])[i].x = 1;
> + ((T*)buf_data[1])[i].y = 1;
> + ((T*)buf_data[1])[i].z = 1;
> + ((T*)buf_data[1])[i].w = 1;
> + }
> + OCL_UNMAP_BUFFER(1);
>
> globals[0] = n;
> locals[0] = 16;
> OCL_NDRANGE(1);
> -
> - OCL_MAP_BUFFER(1);
> + OCL_MAP_BUFFER(2);
> for (uint32_t i = 0; i < 16; ++i) {
> - OCL_ASSERT(((T*)buf_data[1])[i].x == 0);
> - OCL_ASSERT(((T*)buf_data[1])[i].y == 1);
> - OCL_ASSERT(((T*)buf_data[1])[i].z == 1);
> - OCL_ASSERT(((T*)buf_data[1])[i].w == i+2);
> + // printf("%u,%u,%u,%u\n", ((T*)buf_data[2])[i].x,((T*)buf_data[2])[i].y, ((T*)buf_data[2])[i].z, ((T*)buf_data[2])[i].w );
> + if(func_type == 0) {
> + OCL_ASSERT(((T*)buf_data[2])[i].x == 0);
> + OCL_ASSERT(((T*)buf_data[2])[i].y == 1);
> + OCL_ASSERT(((T*)buf_data[2])[i].z == 1);
> + OCL_ASSERT(((T*)buf_data[2])[i].w == i+2);
> + }else if(func_type == 1) {
> + OCL_ASSERT(((T*)buf_data[2])[i].x == max);
> + OCL_ASSERT(((T*)buf_data[2])[i].y == max-1);
> + OCL_ASSERT(((T*)buf_data[2])[i].z == max-1);
> + OCL_ASSERT(((T*)buf_data[2])[i].w == n-i);
> + }else
> + OCL_ASSERT(0);
> }
> - OCL_UNMAP_BUFFER(1);
> + OCL_UNMAP_BUFFER(2);
> }
>
> }
>
> -#define compiler_overflow(type, kernel, s_type) \
> -static void compiler_overflow_ ##type(void)\
> +#define compiler_overflow_add(type, subtype, kernel, func_type) \
> +static void compiler_overflow_add_ ##type(void)\
> {\
> - test<type>(# kernel, s_type);\
> + test<type, subtype>(# kernel, func_type);\
> }\
> -MAKE_UTEST_FROM_FUNCTION(compiler_overflow_ ## type);
> +MAKE_UTEST_FROM_FUNCTION(compiler_overflow_add_ ## type);
> +
> +#define compiler_overflow_sub(type, subtype, kernel, func_type) \
> +static void compiler_overflow_sub_ ##type(void)\
> +{\
> + test<type, subtype>(# kernel, func_type);\
> +}\
> +MAKE_UTEST_FROM_FUNCTION(compiler_overflow_sub_ ## type);
> +
> +compiler_overflow_add(ulong4, unsigned long, compiler_overflow_ulong4_add, 0)
> +compiler_overflow_add(uint4, uint32_t, compiler_overflow_uint4_add, 0)
> +compiler_overflow_add(ushort4, uint16_t, compiler_overflow_ushort4_add, 0)
> +compiler_overflow_add(uchar4, uint8_t, compiler_overflow_uchar4_add, 0)
>
> -compiler_overflow(uint4, compiler_overflow_uint4, 0)
> -compiler_overflow(ushort4, compiler_overflow_ushort4, 0)
> -compiler_overflow(uchar4, compiler_overflow_uchar4, 0)
> +// as llvm intrincs function doesn't support byte/short overflow,
> +// we just test uint overflow here.
> +compiler_overflow_sub(uint4, uint32_t, compiler_overflow_uint4_sub, 1)
> --
> 1.7.9.5
>
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet
More information about the Beignet
mailing list