[Beignet] [PATCH] add llvm intrinsic call usub_with_overflow funtion.

Thu Oct 30 20:59:26 PDT 2014

On Thu, Oct 30, 2014 at 10:59:37AM +0800, xionghu.luo at intel.com wrote:
> From: Luo Xionghu <xionghu.luo at intel.com>
> 
> as llvm couldn't recognize the pattern of usub overflow, this usub with
> is generated by calling the intrinsic function __builtin_usub_overflow;
> also this type of uadd intrinsic funtion couldn't support short/byte type
> overflow, we choose another way for the uadd kernel to generate
> short/byte overflow.
> will send patch to llvm later to fix the 2 issues.
> 
> Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
> ---
>  backend/src/llvm/llvm_gen_backend.cpp |   15 ++++-
>  kernels/compiler_overflow.cl          |   55 ++++++++++++++-----
>  utests/compiler_overflow.cpp          |   97 ++++++++++++++++++++++++++-------
>  3 files changed, 131 insertions(+), 36 deletions(-)
> 
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
> index bb2c1dd..5e1b9d5 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -2805,9 +2805,22 @@ namespace gbe
>              ctx.LT(dst0Type, overflow, dst0, src1);
>            }
>            break;
> +          case Intrinsic::usub_with_overflow:
> +          {
> +            Type *llvmDstType = I.getType();
> +            GBE_ASSERT(llvmDstType->isStructTy());
> +            ir::Type dst0Type = getType(ctx, llvmDstType->getStructElementType(0));
> +            const ir::Register dst0  = this->getRegister(&I, 0);
> +            const ir::Register src0 = this->getRegister(I.getOperand(0));
> +            const ir::Register src1 = this->getRegister(I.getOperand(1));
> +            ctx.SUB(dst0Type, dst0, src0, src1);
> +
> +            ir::Register overflow = this->getRegister(&I, 1);
> +            ctx.LT(dst0Type, overflow, dst0, src1);
The above implementation is incorrect. The correct one should be:
               ctx.GT(dst0Type, overflow, dst0, src0);
Right?

> +          }
> +          break;
>            case Intrinsic::sadd_with_overflow:
>            case Intrinsic::ssub_with_overflow:
> -          case Intrinsic::usub_with_overflow:
>            case Intrinsic::smul_with_overflow:
>            case Intrinsic::umul_with_overflow:
>            NOT_IMPLEMENTED;

The other part should be splited to another patch.

Thanks,
Zhigang Gong.

> diff --git a/kernels/compiler_overflow.cl b/kernels/compiler_overflow.cl
> index 75ed5ce..af751b7 100644
> --- a/kernels/compiler_overflow.cl
> +++ b/kernels/compiler_overflow.cl
> @@ -1,20 +1,45 @@
> -#define COMPILER_OVERFLOW(TYPE) \
> -    kernel void compiler_overflow_##TYPE (global TYPE* src, global TYPE* dst)   \
> +#define COMPILER_OVERFLOW_ADD(TYPE, FUNC) \
> +    kernel void compiler_overflow_##TYPE##_##FUNC (global TYPE* src0, global TYPE* src1, global TYPE* dst)   \
>  {                                                \
> -  __global TYPE* A = &src[get_global_id(0)];    \
> -  TYPE B = 1; \
> -  *A += B;    \
> -  TYPE carry = -convert_##TYPE((*A) < B); \
> +  __global TYPE* A = &src0[get_global_id(0)];    \
> +  __global TYPE* B = &src1[get_global_id(0)];    \
> +  __global TYPE* C = &dst[get_global_id(0)];    \
> +  *C = *A + *B;    \
> +  TYPE carry = -convert_##TYPE(*C < *B); \
>                 \
> -  (*A).y += carry.x;  \
> -  carry.y += ((*A).y < carry.x); \
> -  (*A).z += carry.y;   \
> +  (*C).y += carry.x;  \
> +  carry.y += ((*C).y < carry.x); \
> +  (*C).z += carry.y;   \
>      \
> -  carry.z += ((*A).z < carry.y); \
> -  (*A).w += carry.z; \
> -   dst[get_global_id(0)] = src[get_global_id(0)]; \
> +  carry.z += ((*C).z < carry.y); \
> +  (*C).w += carry.z; \
> +  carry.w += ((*C).w < carry.z); \
>  }
>  
> -COMPILER_OVERFLOW(uint4)
> -COMPILER_OVERFLOW(ushort4)
> -COMPILER_OVERFLOW(uchar4)
> +
> +COMPILER_OVERFLOW_ADD(ulong4, add)
> +COMPILER_OVERFLOW_ADD(uint4, add)
> +COMPILER_OVERFLOW_ADD(ushort4, add)
> +COMPILER_OVERFLOW_ADD(uchar4, add)
> +
> +#define COMPILER_OVERFLOW_SUB(TYPE, FUNC) \
> +    kernel void compiler_overflow_##TYPE##_##FUNC (global TYPE* src0, global TYPE* src1, global TYPE* dst)   \
> +{                                                \
> +  __global TYPE* A = &src0[get_global_id(0)];    \
> +  __global TYPE* B = &src1[get_global_id(0)];    \
> +  __global TYPE* C = &dst[get_global_id(0)];    \
> +  TYPE borrow; \
> +  unsigned result; \
> +  size_t num = sizeof(*A)/sizeof((*A)[0]); \
> +  for (uint i = 0; i < num; i++ ) {\
> +     borrow[i] = __builtin_usub_overflow((*A)[i], (*B)[i], &result); \
> +     (*C)[i] = result;  \
> +   }\
> +\
> +  for (uint i = 0; i < num-1; i++ ) {\
> +    borrow[i+1] += (*C)[i+1] < borrow[i];(*C)[i+1] -= borrow[i]; \
> +  }\
> +\
> +}
> +
> +COMPILER_OVERFLOW_SUB(uint4, sub)
> diff --git a/utests/compiler_overflow.cpp b/utests/compiler_overflow.cpp
> index 1d3f53d..1404cfe 100644
> --- a/utests/compiler_overflow.cpp
> +++ b/utests/compiler_overflow.cpp
> @@ -3,6 +3,13 @@
>  namespace {
>  
>  typedef struct {
> +  unsigned long x;
> +  unsigned long y;
> +  unsigned long z;
> +  unsigned long w;
> +}ulong4;
> +
> +typedef struct {
>    uint32_t x;
>    uint32_t y;
>    uint32_t z;
> @@ -23,8 +30,18 @@ typedef struct {
>    uint8_t w;
>  } uchar4;
>  
> -template<typename T>
> -void test(const char *kernel_name, int s_type)
> +template <typename U>
> +U get_max()
> +{
> +  int shift_bit = sizeof(U)*8;
> +  U u_max = 0;
> +  for (int i = 0; i < shift_bit; i++)
> +    u_max |= 1<<(shift_bit-i-1);
> +  return u_max;
> +}
> +
> +template<typename T, typename U>
> +void test(const char *kernel_name, int func_type)
>  {
>    const size_t n = 16;
>  
> @@ -32,41 +49,81 @@ void test(const char *kernel_name, int s_type)
>    OCL_CREATE_KERNEL_FROM_FILE("compiler_overflow", kernel_name);
>    OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(T), NULL);
>    OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
> +  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(T), NULL);
>    OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
>    OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
> +  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
> +
> +  U max = get_max<U>();
>  
>    OCL_MAP_BUFFER(0);
>    for (uint32_t i = 0; i < n; ++i) {
> -    ((T*)buf_data[0])[i].x = s_type?CL_INT_MAX:CL_UINT_MAX;
> -    ((T*)buf_data[0])[i].y = s_type?CL_INT_MAX:CL_UINT_MAX;
> -    ((T*)buf_data[0])[i].z = s_type?CL_INT_MAX:CL_UINT_MAX;
> -    ((T*)buf_data[0])[i].w = i;
> +    if(func_type == 0) {
> +      ((T*)buf_data[0])[i].x = max;
> +      ((T*)buf_data[0])[i].y = max;
> +      ((T*)buf_data[0])[i].z = max;
> +      ((T*)buf_data[0])[i].w = i;
> +    }else if(func_type == 1) {
> +      ((T*)buf_data[0])[i].x = 0;
> +      ((T*)buf_data[0])[i].y = 0;
> +      ((T*)buf_data[0])[i].z = 0;
> +      ((T*)buf_data[0])[i].w = n+2-i;
> +    }else
> +      OCL_ASSERT(0);
>    }
>    OCL_UNMAP_BUFFER(0);
> +  OCL_MAP_BUFFER(1);
> +  for (uint32_t i = 0; i < n; ++i) {
> +      ((T*)buf_data[1])[i].x = 1;
> +      ((T*)buf_data[1])[i].y = 1;
> +      ((T*)buf_data[1])[i].z = 1;
> +      ((T*)buf_data[1])[i].w = 1;
> +  }
> +  OCL_UNMAP_BUFFER(1);
>  
>    globals[0] = n;
>    locals[0] = 16;
>    OCL_NDRANGE(1);
> -
> -  OCL_MAP_BUFFER(1);
> +  OCL_MAP_BUFFER(2);
>    for (uint32_t i = 0; i < 16; ++i) {
> -    OCL_ASSERT(((T*)buf_data[1])[i].x == 0);
> -    OCL_ASSERT(((T*)buf_data[1])[i].y == 1);
> -    OCL_ASSERT(((T*)buf_data[1])[i].z == 1);
> -    OCL_ASSERT(((T*)buf_data[1])[i].w == i+2);
> +   // printf("%u,%u,%u,%u\n", ((T*)buf_data[2])[i].x,((T*)buf_data[2])[i].y, ((T*)buf_data[2])[i].z, ((T*)buf_data[2])[i].w  );
> +    if(func_type == 0) {
> +      OCL_ASSERT(((T*)buf_data[2])[i].x == 0);
> +      OCL_ASSERT(((T*)buf_data[2])[i].y == 1);
> +      OCL_ASSERT(((T*)buf_data[2])[i].z == 1);
> +      OCL_ASSERT(((T*)buf_data[2])[i].w == i+2);
> +    }else if(func_type == 1) {
> +      OCL_ASSERT(((T*)buf_data[2])[i].x == max);
> +      OCL_ASSERT(((T*)buf_data[2])[i].y == max-1);
> +      OCL_ASSERT(((T*)buf_data[2])[i].z == max-1);
> +      OCL_ASSERT(((T*)buf_data[2])[i].w == n-i);
> +    }else
> +      OCL_ASSERT(0);
>    }
> -  OCL_UNMAP_BUFFER(1);
> +  OCL_UNMAP_BUFFER(2);
>  }
>  
>  }
>  
> -#define compiler_overflow(type, kernel, s_type) \
> -static void compiler_overflow_ ##type(void)\
> +#define compiler_overflow_add(type, subtype, kernel, func_type) \
> +static void compiler_overflow_add_ ##type(void)\
>  {\
> -  test<type>(# kernel, s_type);\
> +  test<type, subtype>(# kernel, func_type);\
>  }\
> -MAKE_UTEST_FROM_FUNCTION(compiler_overflow_ ## type);
> +MAKE_UTEST_FROM_FUNCTION(compiler_overflow_add_ ## type);
> +
> +#define compiler_overflow_sub(type, subtype, kernel, func_type) \
> +static void compiler_overflow_sub_ ##type(void)\
> +{\
> +  test<type, subtype>(# kernel, func_type);\
> +}\
> +MAKE_UTEST_FROM_FUNCTION(compiler_overflow_sub_ ## type);
> +
> +compiler_overflow_add(ulong4, unsigned long, compiler_overflow_ulong4_add, 0)
> +compiler_overflow_add(uint4, uint32_t, compiler_overflow_uint4_add, 0)
> +compiler_overflow_add(ushort4, uint16_t, compiler_overflow_ushort4_add, 0)
> +compiler_overflow_add(uchar4, uint8_t, compiler_overflow_uchar4_add, 0)
>  
> -compiler_overflow(uint4, compiler_overflow_uint4, 0)
> -compiler_overflow(ushort4, compiler_overflow_ushort4, 0)
> -compiler_overflow(uchar4, compiler_overflow_uchar4, 0)
> +// as llvm intrincs function doesn't support byte/short overflow,
> +// we just test uint overflow here.
> +compiler_overflow_sub(uint4, uint32_t, compiler_overflow_uint4_sub, 1)
> -- 
> 1.7.9.5
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet