[Beignet] [PATCH] add llvm intrinsic call usub_with_overflow function.

xionghu.luo at intel.com xionghu.luo at intel.com
Wed Oct 29 19:59:37 PDT 2014


From: Luo Xionghu <xionghu.luo at intel.com>

As LLVM couldn't recognize the pattern of usub overflow, the usub with
overflow is generated by calling the intrinsic function
__builtin_usub_overflow; also, since this type of uadd intrinsic function
doesn't support short/byte type overflow, we choose another way for the
uadd kernel to generate short/byte overflow.
We will send a patch to LLVM later to fix these 2 issues.

Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
 backend/src/llvm/llvm_gen_backend.cpp |   15 ++++-
 kernels/compiler_overflow.cl          |   55 ++++++++++++++-----
 utests/compiler_overflow.cpp          |   97 ++++++++++++++++++++++++++-------
 3 files changed, 131 insertions(+), 36 deletions(-)

diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index bb2c1dd..5e1b9d5 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2805,9 +2805,22 @@ namespace gbe
             ctx.LT(dst0Type, overflow, dst0, src1);
           }
           break;
+          case Intrinsic::usub_with_overflow:
+          {
+            Type *llvmDstType = I.getType();
+            GBE_ASSERT(llvmDstType->isStructTy());
+            ir::Type dst0Type = getType(ctx, llvmDstType->getStructElementType(0));
+            const ir::Register dst0  = this->getRegister(&I, 0);
+            const ir::Register src0 = this->getRegister(I.getOperand(0));
+            const ir::Register src1 = this->getRegister(I.getOperand(1));
+            ctx.SUB(dst0Type, dst0, src0, src1);
+
+            ir::Register overflow = this->getRegister(&I, 1);
+            ctx.LT(dst0Type, overflow, dst0, src1);
+          }
+          break;
           case Intrinsic::sadd_with_overflow:
           case Intrinsic::ssub_with_overflow:
-          case Intrinsic::usub_with_overflow:
           case Intrinsic::smul_with_overflow:
           case Intrinsic::umul_with_overflow:
           NOT_IMPLEMENTED;
diff --git a/kernels/compiler_overflow.cl b/kernels/compiler_overflow.cl
index 75ed5ce..af751b7 100644
--- a/kernels/compiler_overflow.cl
+++ b/kernels/compiler_overflow.cl
@@ -1,20 +1,45 @@
-#define COMPILER_OVERFLOW(TYPE) \
-    kernel void compiler_overflow_##TYPE (global TYPE* src, global TYPE* dst)   \
+#define COMPILER_OVERFLOW_ADD(TYPE, FUNC) \
+    kernel void compiler_overflow_##TYPE##_##FUNC (global TYPE* src0, global TYPE* src1, global TYPE* dst)   \
 {                                                \
-  __global TYPE* A = &src[get_global_id(0)];    \
-  TYPE B = 1; \
-  *A += B;    \
-  TYPE carry = -convert_##TYPE((*A) < B); \
+  __global TYPE* A = &src0[get_global_id(0)];    \
+  __global TYPE* B = &src1[get_global_id(0)];    \
+  __global TYPE* C = &dst[get_global_id(0)];    \
+  *C = *A + *B;    \
+  TYPE carry = -convert_##TYPE(*C < *B); \
                \
-  (*A).y += carry.x;  \
-  carry.y += ((*A).y < carry.x); \
-  (*A).z += carry.y;   \
+  (*C).y += carry.x;  \
+  carry.y += ((*C).y < carry.x); \
+  (*C).z += carry.y;   \
     \
-  carry.z += ((*A).z < carry.y); \
-  (*A).w += carry.z; \
-   dst[get_global_id(0)] = src[get_global_id(0)]; \
+  carry.z += ((*C).z < carry.y); \
+  (*C).w += carry.z; \
+  carry.w += ((*C).w < carry.z); \
 }
 
-COMPILER_OVERFLOW(uint4)
-COMPILER_OVERFLOW(ushort4)
-COMPILER_OVERFLOW(uchar4)
+
+COMPILER_OVERFLOW_ADD(ulong4, add)
+COMPILER_OVERFLOW_ADD(uint4, add)
+COMPILER_OVERFLOW_ADD(ushort4, add)
+COMPILER_OVERFLOW_ADD(uchar4, add)
+
+#define COMPILER_OVERFLOW_SUB(TYPE, FUNC) \
+    kernel void compiler_overflow_##TYPE##_##FUNC (global TYPE* src0, global TYPE* src1, global TYPE* dst)   \
+{                                                \
+  __global TYPE* A = &src0[get_global_id(0)];    \
+  __global TYPE* B = &src1[get_global_id(0)];    \
+  __global TYPE* C = &dst[get_global_id(0)];    \
+  TYPE borrow; \
+  unsigned result; \
+  size_t num = sizeof(*A)/sizeof((*A)[0]); \
+  for (uint i = 0; i < num; i++ ) {\
+     borrow[i] = __builtin_usub_overflow((*A)[i], (*B)[i], &result); \
+     (*C)[i] = result;  \
+   }\
+\
+  for (uint i = 0; i < num-1; i++ ) {\
+    borrow[i+1] += (*C)[i+1] < borrow[i];(*C)[i+1] -= borrow[i]; \
+  }\
+\
+}
+
+COMPILER_OVERFLOW_SUB(uint4, sub)
diff --git a/utests/compiler_overflow.cpp b/utests/compiler_overflow.cpp
index 1d3f53d..1404cfe 100644
--- a/utests/compiler_overflow.cpp
+++ b/utests/compiler_overflow.cpp
@@ -3,6 +3,13 @@
 namespace {
 
 typedef struct {
+  unsigned long x;
+  unsigned long y;
+  unsigned long z;
+  unsigned long w;
+}ulong4;
+
+typedef struct {
   uint32_t x;
   uint32_t y;
   uint32_t z;
@@ -23,8 +30,18 @@ typedef struct {
   uint8_t w;
 } uchar4;
 
-template<typename T>
-void test(const char *kernel_name, int s_type)
+template <typename U>
+U get_max()
+{
+  int shift_bit = sizeof(U)*8;
+  U u_max = 0;
+  for (int i = 0; i < shift_bit; i++)
+    u_max |= 1<<(shift_bit-i-1);
+  return u_max;
+}
+
+template<typename T, typename U>
+void test(const char *kernel_name, int func_type)
 {
   const size_t n = 16;
 
@@ -32,41 +49,81 @@ void test(const char *kernel_name, int s_type)
   OCL_CREATE_KERNEL_FROM_FILE("compiler_overflow", kernel_name);
   OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(T), NULL);
   OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(T), NULL);
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
   OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+
+  U max = get_max<U>();
 
   OCL_MAP_BUFFER(0);
   for (uint32_t i = 0; i < n; ++i) {
-    ((T*)buf_data[0])[i].x = s_type?CL_INT_MAX:CL_UINT_MAX;
-    ((T*)buf_data[0])[i].y = s_type?CL_INT_MAX:CL_UINT_MAX;
-    ((T*)buf_data[0])[i].z = s_type?CL_INT_MAX:CL_UINT_MAX;
-    ((T*)buf_data[0])[i].w = i;
+    if(func_type == 0) {
+      ((T*)buf_data[0])[i].x = max;
+      ((T*)buf_data[0])[i].y = max;
+      ((T*)buf_data[0])[i].z = max;
+      ((T*)buf_data[0])[i].w = i;
+    }else if(func_type == 1) {
+      ((T*)buf_data[0])[i].x = 0;
+      ((T*)buf_data[0])[i].y = 0;
+      ((T*)buf_data[0])[i].z = 0;
+      ((T*)buf_data[0])[i].w = n+2-i;
+    }else
+      OCL_ASSERT(0);
   }
   OCL_UNMAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < n; ++i) {
+      ((T*)buf_data[1])[i].x = 1;
+      ((T*)buf_data[1])[i].y = 1;
+      ((T*)buf_data[1])[i].z = 1;
+      ((T*)buf_data[1])[i].w = 1;
+  }
+  OCL_UNMAP_BUFFER(1);
 
   globals[0] = n;
   locals[0] = 16;
   OCL_NDRANGE(1);
-
-  OCL_MAP_BUFFER(1);
+  OCL_MAP_BUFFER(2);
   for (uint32_t i = 0; i < 16; ++i) {
-    OCL_ASSERT(((T*)buf_data[1])[i].x == 0);
-    OCL_ASSERT(((T*)buf_data[1])[i].y == 1);
-    OCL_ASSERT(((T*)buf_data[1])[i].z == 1);
-    OCL_ASSERT(((T*)buf_data[1])[i].w == i+2);
+   // printf("%u,%u,%u,%u\n", ((T*)buf_data[2])[i].x,((T*)buf_data[2])[i].y, ((T*)buf_data[2])[i].z, ((T*)buf_data[2])[i].w  );
+    if(func_type == 0) {
+      OCL_ASSERT(((T*)buf_data[2])[i].x == 0);
+      OCL_ASSERT(((T*)buf_data[2])[i].y == 1);
+      OCL_ASSERT(((T*)buf_data[2])[i].z == 1);
+      OCL_ASSERT(((T*)buf_data[2])[i].w == i+2);
+    }else if(func_type == 1) {
+      OCL_ASSERT(((T*)buf_data[2])[i].x == max);
+      OCL_ASSERT(((T*)buf_data[2])[i].y == max-1);
+      OCL_ASSERT(((T*)buf_data[2])[i].z == max-1);
+      OCL_ASSERT(((T*)buf_data[2])[i].w == n-i);
+    }else
+      OCL_ASSERT(0);
   }
-  OCL_UNMAP_BUFFER(1);
+  OCL_UNMAP_BUFFER(2);
 }
 
 }
 
-#define compiler_overflow(type, kernel, s_type) \
-static void compiler_overflow_ ##type(void)\
+#define compiler_overflow_add(type, subtype, kernel, func_type) \
+static void compiler_overflow_add_ ##type(void)\
 {\
-  test<type>(# kernel, s_type);\
+  test<type, subtype>(# kernel, func_type);\
 }\
-MAKE_UTEST_FROM_FUNCTION(compiler_overflow_ ## type);
+MAKE_UTEST_FROM_FUNCTION(compiler_overflow_add_ ## type);
+
+#define compiler_overflow_sub(type, subtype, kernel, func_type) \
+static void compiler_overflow_sub_ ##type(void)\
+{\
+  test<type, subtype>(# kernel, func_type);\
+}\
+MAKE_UTEST_FROM_FUNCTION(compiler_overflow_sub_ ## type);
+
+compiler_overflow_add(ulong4, unsigned long, compiler_overflow_ulong4_add, 0)
+compiler_overflow_add(uint4, uint32_t, compiler_overflow_uint4_add, 0)
+compiler_overflow_add(ushort4, uint16_t, compiler_overflow_ushort4_add, 0)
+compiler_overflow_add(uchar4, uint8_t, compiler_overflow_uchar4_add, 0)
 
-compiler_overflow(uint4, compiler_overflow_uint4, 0)
-compiler_overflow(ushort4, compiler_overflow_ushort4, 0)
-compiler_overflow(uchar4, compiler_overflow_uchar4, 0)
+// as the llvm intrinsic function doesn't support byte/short overflow,
+// we just test uint overflow here.
+compiler_overflow_sub(uint4, uint32_t, compiler_overflow_uint4_sub, 1)
-- 
1.7.9.5



More information about the Beignet mailing list