[Beignet] [PATCH] add llvm intrinsic call usub_with_overflow funtion.
xionghu.luo at intel.com
xionghu.luo at intel.com
Wed Oct 29 19:59:37 PDT 2014
From: Luo Xionghu <xionghu.luo at intel.com>
as llvm couldn't recognize the pattern of usub overflow, this usub with
is generated by calling the intrinsic function __builtin_usub_overflow;
also this type of uadd intrinsic funtion couldn't support short/byte type
overflow, we choose another way for the uadd kernel to generate
short/byte overflow.
will send patch to llvm later to fix the 2 issues.
Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
backend/src/llvm/llvm_gen_backend.cpp | 15 ++++-
kernels/compiler_overflow.cl | 55 ++++++++++++++-----
utests/compiler_overflow.cpp | 97 ++++++++++++++++++++++++++-------
3 files changed, 131 insertions(+), 36 deletions(-)
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index bb2c1dd..5e1b9d5 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2805,9 +2805,22 @@ namespace gbe
ctx.LT(dst0Type, overflow, dst0, src1);
}
break;
+ case Intrinsic::usub_with_overflow:
+ {
+ Type *llvmDstType = I.getType();
+ GBE_ASSERT(llvmDstType->isStructTy());
+ ir::Type dst0Type = getType(ctx, llvmDstType->getStructElementType(0));
+ const ir::Register dst0 = this->getRegister(&I, 0);
+ const ir::Register src0 = this->getRegister(I.getOperand(0));
+ const ir::Register src1 = this->getRegister(I.getOperand(1));
+ ctx.SUB(dst0Type, dst0, src0, src1);
+
+ ir::Register overflow = this->getRegister(&I, 1);
+ ctx.LT(dst0Type, overflow, dst0, src1);
+ }
+ break;
case Intrinsic::sadd_with_overflow:
case Intrinsic::ssub_with_overflow:
- case Intrinsic::usub_with_overflow:
case Intrinsic::smul_with_overflow:
case Intrinsic::umul_with_overflow:
NOT_IMPLEMENTED;
diff --git a/kernels/compiler_overflow.cl b/kernels/compiler_overflow.cl
index 75ed5ce..af751b7 100644
--- a/kernels/compiler_overflow.cl
+++ b/kernels/compiler_overflow.cl
@@ -1,20 +1,45 @@
-#define COMPILER_OVERFLOW(TYPE) \
- kernel void compiler_overflow_##TYPE (global TYPE* src, global TYPE* dst) \
+#define COMPILER_OVERFLOW_ADD(TYPE, FUNC) \
+ kernel void compiler_overflow_##TYPE##_##FUNC (global TYPE* src0, global TYPE* src1, global TYPE* dst) \
{ \
- __global TYPE* A = &src[get_global_id(0)]; \
- TYPE B = 1; \
- *A += B; \
- TYPE carry = -convert_##TYPE((*A) < B); \
+ __global TYPE* A = &src0[get_global_id(0)]; \
+ __global TYPE* B = &src1[get_global_id(0)]; \
+ __global TYPE* C = &dst[get_global_id(0)]; \
+ *C = *A + *B; \
+ TYPE carry = -convert_##TYPE(*C < *B); \
\
- (*A).y += carry.x; \
- carry.y += ((*A).y < carry.x); \
- (*A).z += carry.y; \
+ (*C).y += carry.x; \
+ carry.y += ((*C).y < carry.x); \
+ (*C).z += carry.y; \
\
- carry.z += ((*A).z < carry.y); \
- (*A).w += carry.z; \
- dst[get_global_id(0)] = src[get_global_id(0)]; \
+ carry.z += ((*C).z < carry.y); \
+ (*C).w += carry.z; \
+ carry.w += ((*C).w < carry.z); \
}
-COMPILER_OVERFLOW(uint4)
-COMPILER_OVERFLOW(ushort4)
-COMPILER_OVERFLOW(uchar4)
+
+COMPILER_OVERFLOW_ADD(ulong4, add)
+COMPILER_OVERFLOW_ADD(uint4, add)
+COMPILER_OVERFLOW_ADD(ushort4, add)
+COMPILER_OVERFLOW_ADD(uchar4, add)
+
+#define COMPILER_OVERFLOW_SUB(TYPE, FUNC) \
+ kernel void compiler_overflow_##TYPE##_##FUNC (global TYPE* src0, global TYPE* src1, global TYPE* dst) \
+{ \
+ __global TYPE* A = &src0[get_global_id(0)]; \
+ __global TYPE* B = &src1[get_global_id(0)]; \
+ __global TYPE* C = &dst[get_global_id(0)]; \
+ TYPE borrow; \
+ unsigned result; \
+ size_t num = sizeof(*A)/sizeof((*A)[0]); \
+ for (uint i = 0; i < num; i++ ) {\
+ borrow[i] = __builtin_usub_overflow((*A)[i], (*B)[i], &result); \
+ (*C)[i] = result; \
+ }\
+\
+ for (uint i = 0; i < num-1; i++ ) {\
+ borrow[i+1] += (*C)[i+1] < borrow[i];(*C)[i+1] -= borrow[i]; \
+ }\
+\
+}
+
+COMPILER_OVERFLOW_SUB(uint4, sub)
diff --git a/utests/compiler_overflow.cpp b/utests/compiler_overflow.cpp
index 1d3f53d..1404cfe 100644
--- a/utests/compiler_overflow.cpp
+++ b/utests/compiler_overflow.cpp
@@ -3,6 +3,13 @@
namespace {
typedef struct {
+ unsigned long x;
+ unsigned long y;
+ unsigned long z;
+ unsigned long w;
+}ulong4;
+
+typedef struct {
uint32_t x;
uint32_t y;
uint32_t z;
@@ -23,8 +30,18 @@ typedef struct {
uint8_t w;
} uchar4;
-template<typename T>
-void test(const char *kernel_name, int s_type)
+template <typename U>
+U get_max()
+{
+ int shift_bit = sizeof(U)*8;
+ U u_max = 0;
+ for (int i = 0; i < shift_bit; i++)
+ u_max |= 1<<(shift_bit-i-1);
+ return u_max;
+}
+
+template<typename T, typename U>
+void test(const char *kernel_name, int func_type)
{
const size_t n = 16;
@@ -32,41 +49,81 @@ void test(const char *kernel_name, int s_type)
OCL_CREATE_KERNEL_FROM_FILE("compiler_overflow", kernel_name);
OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(T), NULL);
OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(T), NULL);
OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+
+ U max = get_max<U>();
OCL_MAP_BUFFER(0);
for (uint32_t i = 0; i < n; ++i) {
- ((T*)buf_data[0])[i].x = s_type?CL_INT_MAX:CL_UINT_MAX;
- ((T*)buf_data[0])[i].y = s_type?CL_INT_MAX:CL_UINT_MAX;
- ((T*)buf_data[0])[i].z = s_type?CL_INT_MAX:CL_UINT_MAX;
- ((T*)buf_data[0])[i].w = i;
+ if(func_type == 0) {
+ ((T*)buf_data[0])[i].x = max;
+ ((T*)buf_data[0])[i].y = max;
+ ((T*)buf_data[0])[i].z = max;
+ ((T*)buf_data[0])[i].w = i;
+ }else if(func_type == 1) {
+ ((T*)buf_data[0])[i].x = 0;
+ ((T*)buf_data[0])[i].y = 0;
+ ((T*)buf_data[0])[i].z = 0;
+ ((T*)buf_data[0])[i].w = n+2-i;
+ }else
+ OCL_ASSERT(0);
}
OCL_UNMAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < n; ++i) {
+ ((T*)buf_data[1])[i].x = 1;
+ ((T*)buf_data[1])[i].y = 1;
+ ((T*)buf_data[1])[i].z = 1;
+ ((T*)buf_data[1])[i].w = 1;
+ }
+ OCL_UNMAP_BUFFER(1);
globals[0] = n;
locals[0] = 16;
OCL_NDRANGE(1);
-
- OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
for (uint32_t i = 0; i < 16; ++i) {
- OCL_ASSERT(((T*)buf_data[1])[i].x == 0);
- OCL_ASSERT(((T*)buf_data[1])[i].y == 1);
- OCL_ASSERT(((T*)buf_data[1])[i].z == 1);
- OCL_ASSERT(((T*)buf_data[1])[i].w == i+2);
+ // printf("%u,%u,%u,%u\n", ((T*)buf_data[2])[i].x,((T*)buf_data[2])[i].y, ((T*)buf_data[2])[i].z, ((T*)buf_data[2])[i].w );
+ if(func_type == 0) {
+ OCL_ASSERT(((T*)buf_data[2])[i].x == 0);
+ OCL_ASSERT(((T*)buf_data[2])[i].y == 1);
+ OCL_ASSERT(((T*)buf_data[2])[i].z == 1);
+ OCL_ASSERT(((T*)buf_data[2])[i].w == i+2);
+ }else if(func_type == 1) {
+ OCL_ASSERT(((T*)buf_data[2])[i].x == max);
+ OCL_ASSERT(((T*)buf_data[2])[i].y == max-1);
+ OCL_ASSERT(((T*)buf_data[2])[i].z == max-1);
+ OCL_ASSERT(((T*)buf_data[2])[i].w == n-i);
+ }else
+ OCL_ASSERT(0);
}
- OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
}
}
-#define compiler_overflow(type, kernel, s_type) \
-static void compiler_overflow_ ##type(void)\
+#define compiler_overflow_add(type, subtype, kernel, func_type) \
+static void compiler_overflow_add_ ##type(void)\
{\
- test<type>(# kernel, s_type);\
+ test<type, subtype>(# kernel, func_type);\
}\
-MAKE_UTEST_FROM_FUNCTION(compiler_overflow_ ## type);
+MAKE_UTEST_FROM_FUNCTION(compiler_overflow_add_ ## type);
+
+#define compiler_overflow_sub(type, subtype, kernel, func_type) \
+static void compiler_overflow_sub_ ##type(void)\
+{\
+ test<type, subtype>(# kernel, func_type);\
+}\
+MAKE_UTEST_FROM_FUNCTION(compiler_overflow_sub_ ## type);
+
+compiler_overflow_add(ulong4, unsigned long, compiler_overflow_ulong4_add, 0)
+compiler_overflow_add(uint4, uint32_t, compiler_overflow_uint4_add, 0)
+compiler_overflow_add(ushort4, uint16_t, compiler_overflow_ushort4_add, 0)
+compiler_overflow_add(uchar4, uint8_t, compiler_overflow_uchar4_add, 0)
-compiler_overflow(uint4, compiler_overflow_uint4, 0)
-compiler_overflow(ushort4, compiler_overflow_ushort4, 0)
-compiler_overflow(uchar4, compiler_overflow_uchar4, 0)
+// as llvm intrincs function doesn't support byte/short overflow,
+// we just test uint overflow here.
+compiler_overflow_sub(uint4, uint32_t, compiler_overflow_uint4_sub, 1)
--
1.7.9.5
More information about the Beignet
mailing list