[Beignet] [PATCH 4/8] [OCL20] add opencl builtin atomic functions implementation.
xionghu.luo at intel.com
Tue Mar 1 11:26:52 UTC 2016
From: Luo Xionghu <xionghu.luo at intel.com>
Add 'atomic_int, atomic_uint' type support for the operations
fetch_add, fetch_sub, fetch_or, fetch_xor, fetch_and, exchange,
fetch_min and fetch_max.
Add 'atomic_int, atomic_uint, atomic_long, atomic_ulong' type support
for the operations load, store, init, compare_exchange_strong and
compare_exchange_weak.
These builtins are implemented as LLVM bitcode using the native
atomicrmw and cmpxchg LLVM IR instructions, so the optimization passes
can recognize all the atomic instructions and avoid mis-optimizing them.
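For illustration only, a minimal kernel sketch (not part of this patch;
the kernel and argument names are made up) showing how the new builtins
are expected to be used and lowered through the bitcode library:

    /* hypothetical test kernel; increments a device-scope global counter */
    __kernel void inc_counter(volatile __global atomic_int *counter,
                              __global int *old_vals)
    {
      size_t gid = get_global_id(0);
      /* resolves to __gen_ocl_atomic_fetch_add32, which ocl_atomic.ll
         implements with a native atomicrmw add instruction */
      old_vals[gid] = atomic_fetch_add_explicit(counter, 1,
                                                memory_order_seq_cst,
                                                memory_scope_device);
    }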
Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
backend/src/ir/instruction.cpp | 5 +-
backend/src/libocl/CMakeLists.txt | 2 +-
backend/src/libocl/include/ocl_atom.h | 82 ++++++++++++-
backend/src/libocl/include/ocl_types.h | 2 +
backend/src/libocl/src/ocl_atom.cl | 208 +++++++++++++++++++++++++++++++++
backend/src/libocl/src/ocl_atomic.ll | 153 ++++++++++++++++++++++++
backend/src/llvm/llvm_gen_backend.cpp | 102 ++++++++++++++++
backend/src/llvm/llvm_to_gen.cpp | 2 +-
8 files changed, 551 insertions(+), 5 deletions(-)
create mode 100644 backend/src/libocl/src/ocl_atomic.ll
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 464e483..d8640f9 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -1088,10 +1088,11 @@ namespace ir {
{
if (UNLIKELY(checkSpecialRegForWrite(dst[0], fn, whyNot) == false))
return false;
- if (UNLIKELY(checkRegisterData(FAMILY_DWORD, dst[0], fn, whyNot) == false))
+ const RegisterFamily family = getFamily(this->type);
+ if (UNLIKELY(checkRegisterData(family, dst[0], fn, whyNot) == false))
return false;
for (uint32_t srcID = 0; srcID < srcNum-1u; ++srcID)
- if (UNLIKELY(checkRegisterData(FAMILY_DWORD, getSrc(fn, srcID+1u), fn, whyNot) == false))
+ if (UNLIKELY(checkRegisterData(family, getSrc(fn, srcID+1u), fn, whyNot) == false))
return false;
return true;
diff --git a/backend/src/libocl/CMakeLists.txt b/backend/src/libocl/CMakeLists.txt
index 8bb4c1e..5f0b2e2 100644
--- a/backend/src/libocl/CMakeLists.txt
+++ b/backend/src/libocl/CMakeLists.txt
@@ -182,7 +182,7 @@ MACRO(ADD_LL_TO_BC_TARGET M)
)
ENDMACRO(ADD_LL_TO_BC_TARGET)
-SET (OCL_LL_MODULES ocl_barrier ocl_clz)
+SET (OCL_LL_MODULES ocl_barrier ocl_clz ocl_atomic)
FOREACH(f ${OCL_LL_MODULES})
COPY_THE_LL(${f})
ADD_LL_TO_BC_TARGET(${f})
diff --git a/backend/src/libocl/include/ocl_atom.h b/backend/src/libocl/include/ocl_atom.h
index d0f6b10..f83e3c0 100644
--- a/backend/src/libocl/include/ocl_atom.h
+++ b/backend/src/libocl/include/ocl_atom.h
@@ -98,5 +98,85 @@ OVERLOADABLE int atomic_cmpxchg (volatile __local int *p, int cmp, int val);
#define atom_dec atomic_dec
#define atom_cmpxchg atomic_cmpxchg
-
+//OpenCL 2.0 features
+#define ATOMIC_GEN_FUNCTIONS(ATYPE, CTYPE, POSTFIX) \
+CTYPE __gen_ocl_atomic_exchange##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \
+CTYPE __gen_ocl_atomic_fetch_add##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \
+CTYPE __gen_ocl_atomic_fetch_sub##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \
+CTYPE __gen_ocl_atomic_fetch_or##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \
+CTYPE __gen_ocl_atomic_fetch_xor##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \
+CTYPE __gen_ocl_atomic_fetch_and##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \
+CTYPE __gen_ocl_atomic_fetch_imin##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \
+CTYPE __gen_ocl_atomic_fetch_umin##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \
+CTYPE __gen_ocl_atomic_fetch_imax##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \
+CTYPE __gen_ocl_atomic_fetch_umax##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope);\
+CTYPE __gen_ocl_atomic_compare_exchange_strong##POSTFIX(volatile ATYPE* object, CTYPE expected, CTYPE desired, int success, int failure, int scope); \
+CTYPE __gen_ocl_atomic_compare_exchange_weak##POSTFIX(volatile ATYPE* object, CTYPE expected, CTYPE desired, int success, int failure, int scope);
+
+ATOMIC_GEN_FUNCTIONS(atomic_int, int, 32)
+ATOMIC_GEN_FUNCTIONS(atomic_long, long, 64)
+
+#undef ATOMIC_GEN_FUNCTIONS
+
+/* only used to initialize global address space */
+//#define ATOMIC_VAR_INIT(C value)
+#define ATOMIC_VAR_INIT
+#define ATOMIC_FLAG_INIT 0
+
+//store
+#define ATOMIC_FUNCTIONS(ATYPE, CTYPE, MTYPE1, MTYPE2) \
+OVERLOADABLE void atomic_init(volatile ATYPE *object, CTYPE desired); \
+OVERLOADABLE void atomic_store(volatile ATYPE *object, CTYPE desired); \
+OVERLOADABLE void atomic_store_explicit(volatile ATYPE *object, CTYPE desired, memory_order order); \
+OVERLOADABLE void atomic_store_explicit(volatile ATYPE *object, CTYPE desired, memory_order order, memory_scope scope); \
+OVERLOADABLE CTYPE atomic_load(volatile ATYPE *object); \
+OVERLOADABLE CTYPE atomic_load_explicit(volatile ATYPE *object, memory_order order); \
+OVERLOADABLE CTYPE atomic_load_explicit(volatile ATYPE *object, memory_order order, memory_scope scope); \
+OVERLOADABLE CTYPE atomic_exchange(volatile ATYPE *object, CTYPE desired); \
+OVERLOADABLE CTYPE atomic_exchange_explicit(volatile ATYPE *object, CTYPE desired, memory_order order); \
+OVERLOADABLE CTYPE atomic_exchange_explicit(volatile ATYPE *object, CTYPE desired, memory_order order, memory_scope scope); \
+OVERLOADABLE bool atomic_compare_exchange_strong(volatile ATYPE *object, CTYPE *expected, CTYPE desired); \
+OVERLOADABLE bool atomic_compare_exchange_strong_explicit(volatile ATYPE *object, CTYPE *expected, CTYPE desired, memory_order success, memory_order failure); \
+OVERLOADABLE bool atomic_compare_exchange_strong_explicit(volatile ATYPE *object, CTYPE *expected, CTYPE desired, memory_order success, memory_order failure, memory_scope scope); \
+OVERLOADABLE bool atomic_compare_exchange_weak(volatile ATYPE *object, CTYPE *expected, CTYPE desired); \
+OVERLOADABLE bool atomic_compare_exchange_weak_explicit(volatile ATYPE *object, CTYPE *expected, CTYPE desired, memory_order success, memory_order failure); \
+OVERLOADABLE bool atomic_compare_exchange_weak_explicit(volatile ATYPE *object, CTYPE *expected, CTYPE desired, memory_order success, memory_order failure, memory_scope scope); \
+OVERLOADABLE CTYPE atomic_fetch_add(volatile ATYPE *object, MTYPE1 desired); \
+OVERLOADABLE CTYPE atomic_fetch_add_explicit(volatile ATYPE *object, MTYPE1 desired, memory_order order); \
+OVERLOADABLE CTYPE atomic_fetch_add_explicit(volatile ATYPE *object, MTYPE1 desired, memory_order order, memory_scope scope); \
+OVERLOADABLE CTYPE atomic_fetch_sub(volatile ATYPE *object, MTYPE1 desired); \
+OVERLOADABLE CTYPE atomic_fetch_sub_explicit(volatile ATYPE *object, MTYPE1 desired, memory_order order); \
+OVERLOADABLE CTYPE atomic_fetch_sub_explicit(volatile ATYPE *object, MTYPE1 desired, memory_order order, memory_scope scope); \
+OVERLOADABLE CTYPE atomic_fetch_or(volatile ATYPE *object, MTYPE2 desired); \
+OVERLOADABLE CTYPE atomic_fetch_or_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order); \
+OVERLOADABLE CTYPE atomic_fetch_or_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order, memory_scope scope); \
+OVERLOADABLE CTYPE atomic_fetch_xor(volatile ATYPE *object, MTYPE2 desired); \
+OVERLOADABLE CTYPE atomic_fetch_xor_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order); \
+OVERLOADABLE CTYPE atomic_fetch_xor_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order, memory_scope scope); \
+OVERLOADABLE CTYPE atomic_fetch_and(volatile ATYPE *object, MTYPE2 desired); \
+OVERLOADABLE CTYPE atomic_fetch_and_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order); \
+OVERLOADABLE CTYPE atomic_fetch_and_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order, memory_scope scope); \
+OVERLOADABLE CTYPE atomic_fetch_min(volatile ATYPE *object, MTYPE2 desired); \
+OVERLOADABLE CTYPE atomic_fetch_min_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order); \
+OVERLOADABLE CTYPE atomic_fetch_min_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order, memory_scope scope); \
+OVERLOADABLE CTYPE atomic_fetch_max(volatile ATYPE *object, MTYPE2 desired); \
+OVERLOADABLE CTYPE atomic_fetch_max_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order); \
+OVERLOADABLE CTYPE atomic_fetch_max_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order, memory_scope scope);
+
+ATOMIC_FUNCTIONS(atomic_int, int, int, int)
+ATOMIC_FUNCTIONS(atomic_uint, uint, uint, uint)
+ATOMIC_FUNCTIONS(atomic_long, long, long, long)
+ATOMIC_FUNCTIONS(atomic_ulong, ulong, ulong, ulong)
+ATOMIC_FUNCTIONS(atomic_float, float, float, float)
+#undef ATOMIC_FUNCTIONS
+
+
+OVERLOADABLE bool atomic_flag_test_and_set(volatile atomic_flag *object);
+OVERLOADABLE bool atomic_flag_test_and_set_explicit(volatile atomic_flag *object, memory_order order);
+OVERLOADABLE bool atomic_flag_test_and_set_explicit(volatile atomic_flag *object, memory_order order, memory_scope scope);
+OVERLOADABLE void atomic_flag_clear(volatile atomic_flag *object);
+OVERLOADABLE void atomic_flag_clear_explicit(volatile atomic_flag *object, memory_order order);
+OVERLOADABLE void atomic_flag_clear_explicit(volatile atomic_flag *object, memory_order order, memory_scope scope);
+
+OVERLOADABLE void atomic_work_item_fence(cl_mem_fence_flags flags, memory_order order, memory_scope scope);
#endif /* __OCL_ATOM_H__ */
diff --git a/backend/src/libocl/include/ocl_types.h b/backend/src/libocl/include/ocl_types.h
index 736e4ce..2ff02c5 100644
--- a/backend/src/libocl/include/ocl_types.h
+++ b/backend/src/libocl/include/ocl_types.h
@@ -20,6 +20,8 @@
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
#include "ocl_defines.h"
#define NULL 0
diff --git a/backend/src/libocl/src/ocl_atom.cl b/backend/src/libocl/src/ocl_atom.cl
index 0b6c671..0f8377b 100644
--- a/backend/src/libocl/src/ocl_atom.cl
+++ b/backend/src/libocl/src/ocl_atom.cl
@@ -17,6 +17,7 @@
*/
#include "ocl_atom.h"
#include "ocl_as.h"
+#include "ocl_sync.h"
OVERLOADABLE uint __gen_ocl_atomic_add(__global uint *p, uint val);
OVERLOADABLE uint __gen_ocl_atomic_add(__local uint *p, uint val);
@@ -135,3 +136,210 @@ DECL_ATOMIC_OP(cmpxchg)
#define atom_inc atomic_inc
#define atom_dec atomic_dec
#define atom_cmpxchg atomic_cmpxchg
+
+// OpenCL 2.0 features.
+#define DECL_ATOMIC_OP_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE CTYPE atomic_##NAME (volatile ATYPE *p, CTYPE val) { \
+ return (CTYPE)__gen_ocl_atomic_##PREFIX((STYPE*)p, val, memory_order_seq_cst, memory_scope_device); \
+ }
+
+#define DECL_ATOMIC_COMPARE_EXCHANGE_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE bool atomic_##NAME (volatile ATYPE *p, CTYPE* expected, CTYPE val) { \
+ CTYPE oldValue = __gen_ocl_atomic_##PREFIX((STYPE*)p, *expected, val, memory_order_seq_cst, memory_order_seq_cst, memory_scope_device); \
+ bool ret = oldValue == *expected; \
+ *expected = oldValue; \
+ return ret; \
+ }
+
+#define DECL_ATOMIC_LOAD_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE CTYPE atomic_##NAME (volatile ATYPE *p) { \
+ return (CTYPE)__gen_ocl_atomic_##PREFIX((STYPE*)p, 0, memory_order_seq_cst, memory_scope_device); \
+ }
+
+#define DECL_ATOMIC_NO_RET_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE void atomic_##NAME (volatile ATYPE *p, CTYPE val) { \
+ __gen_ocl_atomic_##PREFIX((STYPE*)p, val, memory_order_seq_cst, memory_scope_device); \
+ }
+
+#define DECL_ATOMIC_OP(NAME, PREFIX) \
+ DECL_ATOMIC_OP_TYPE(NAME, PREFIX##32, atomic_uint, atomic_int, uint) \
+ DECL_ATOMIC_OP_TYPE(NAME, PREFIX##32, atomic_int, atomic_int, int) \
+ DECL_ATOMIC_OP_TYPE(NAME, PREFIX##64, atomic_ulong, atomic_long, ulong) \
+ DECL_ATOMIC_OP_TYPE(NAME, PREFIX##64, atomic_long, atomic_long, long) \
+
+#define DECL_ATOMIC_COMPARE_EXCHANGE_OP(NAME, PREFIX) \
+ DECL_ATOMIC_COMPARE_EXCHANGE_TYPE(NAME, PREFIX##32, atomic_uint, atomic_int, uint) \
+ DECL_ATOMIC_COMPARE_EXCHANGE_TYPE(NAME, PREFIX##32, atomic_int, atomic_int, int) \
+ DECL_ATOMIC_COMPARE_EXCHANGE_TYPE(NAME, PREFIX##64, atomic_ulong, atomic_long, ulong) \
+ DECL_ATOMIC_COMPARE_EXCHANGE_TYPE(NAME, PREFIX##64, atomic_long, atomic_long, long) \
+
+#define DECL_ATOMIC_LOAD_OP(NAME, PREFIX) \
+ DECL_ATOMIC_LOAD_TYPE(NAME, PREFIX##32, atomic_uint, atomic_int, uint) \
+ DECL_ATOMIC_LOAD_TYPE(NAME, PREFIX##32, atomic_int, atomic_int, int) \
+ DECL_ATOMIC_LOAD_TYPE(NAME, PREFIX##64, atomic_ulong, atomic_long, ulong) \
+ DECL_ATOMIC_LOAD_TYPE(NAME, PREFIX##64, atomic_long, atomic_long, long) \
+
+#define DECL_ATOMIC_NO_RET_OP(NAME, PREFIX) \
+ DECL_ATOMIC_NO_RET_TYPE(NAME, PREFIX##32, atomic_uint, atomic_int, uint) \
+ DECL_ATOMIC_NO_RET_TYPE(NAME, PREFIX##32, atomic_int, atomic_int, int) \
+ DECL_ATOMIC_NO_RET_TYPE(NAME, PREFIX##64, atomic_ulong, atomic_long, ulong) \
+ DECL_ATOMIC_NO_RET_TYPE(NAME, PREFIX##64, atomic_long, atomic_long, long) \
+
+DECL_ATOMIC_OP(exchange, exchange)
+DECL_ATOMIC_OP(fetch_add, fetch_add)
+DECL_ATOMIC_OP(fetch_sub, fetch_sub)
+DECL_ATOMIC_OP(fetch_and, fetch_and)
+DECL_ATOMIC_OP(fetch_or, fetch_or)
+DECL_ATOMIC_OP(fetch_xor, fetch_xor)
+DECL_ATOMIC_LOAD_OP(load, fetch_add)
+DECL_ATOMIC_NO_RET_OP(init, exchange)
+DECL_ATOMIC_NO_RET_OP(store, exchange)
+DECL_ATOMIC_COMPARE_EXCHANGE_OP(compare_exchange_strong, compare_exchange_strong)
+DECL_ATOMIC_COMPARE_EXCHANGE_OP(compare_exchange_weak, compare_exchange_weak)
+DECL_ATOMIC_OP_TYPE(fetch_min, fetch_imin32, atomic_int, atomic_int, int)
+DECL_ATOMIC_OP_TYPE(fetch_min, fetch_umin32, atomic_uint, atomic_int, uint)
+DECL_ATOMIC_OP_TYPE(fetch_max, fetch_imax32, atomic_int, atomic_int, int)
+DECL_ATOMIC_OP_TYPE(fetch_max, fetch_umax32, atomic_uint, atomic_int, uint)
+DECL_ATOMIC_OP_TYPE(fetch_min, fetch_imin64, atomic_long, atomic_long, long)
+DECL_ATOMIC_OP_TYPE(fetch_min, fetch_umin64, atomic_ulong, atomic_long, ulong)
+DECL_ATOMIC_OP_TYPE(fetch_max, fetch_imax64, atomic_long, atomic_long, long)
+DECL_ATOMIC_OP_TYPE(fetch_max, fetch_umax64, atomic_ulong, atomic_long, ulong)
+
+#undef DECL_ATOMIC_OP_TYPE
+#undef DECL_ATOMIC_LOAD_TYPE
+#undef DECL_ATOMIC_NO_RET_TYPE
+#undef DECL_ATOMIC_COMPARE_EXCHANGE_TYPE
+
+// with memory_order.
+
+#define DECL_ATOMIC_OP_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE CTYPE atomic_##NAME (volatile ATYPE *p, CTYPE val, memory_order order) { \
+ return (CTYPE)__gen_ocl_atomic_##PREFIX((STYPE*)p, val, order, memory_scope_device); \
+ }
+
+#define DECL_ATOMIC_COMPARE_EXCHANGE_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE bool atomic_##NAME (volatile ATYPE *p, CTYPE* expected, CTYPE val, memory_order success, memory_order failure) { \
+ CTYPE oldValue = __gen_ocl_atomic_##PREFIX((STYPE*)p, *expected, val, success, failure, memory_scope_device); \
+ bool ret = oldValue == *expected; \
+ *expected = oldValue; \
+ return ret; \
+ }
+
+#define DECL_ATOMIC_LOAD_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE CTYPE atomic_##NAME (volatile ATYPE *p, memory_order order) { \
+ return (CTYPE)__gen_ocl_atomic_##PREFIX((STYPE*)p, 0, order, memory_scope_device); \
+ }
+
+#define DECL_ATOMIC_NO_RET_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE void atomic_##NAME (volatile ATYPE *p, CTYPE val, memory_order order) { \
+ __gen_ocl_atomic_##PREFIX((STYPE*)p, val, order, memory_scope_device); \
+ }
+
+DECL_ATOMIC_OP(exchange_explicit, exchange)
+DECL_ATOMIC_OP(fetch_add_explicit, fetch_add)
+DECL_ATOMIC_OP(fetch_sub_explicit, fetch_sub)
+DECL_ATOMIC_OP(fetch_and_explicit, fetch_and)
+DECL_ATOMIC_OP(fetch_or_explicit, fetch_or)
+DECL_ATOMIC_OP(fetch_xor_explicit, fetch_xor)
+DECL_ATOMIC_LOAD_OP(load_explicit, fetch_add)
+DECL_ATOMIC_NO_RET_OP(store_explicit, exchange)
+DECL_ATOMIC_COMPARE_EXCHANGE_OP(compare_exchange_strong_explicit, compare_exchange_strong)
+DECL_ATOMIC_COMPARE_EXCHANGE_OP(compare_exchange_weak_explicit, compare_exchange_weak)
+DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_imin32, atomic_int, atomic_int, int)
+DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_umin32, atomic_uint, atomic_int, uint)
+DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_imax32, atomic_int, atomic_int, int)
+DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_umax32, atomic_uint, atomic_int, uint)
+DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_imin64, atomic_long, atomic_long, long)
+DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_umin64, atomic_ulong, atomic_long, ulong)
+DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_imax64, atomic_long, atomic_long, long)
+DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_umax64, atomic_ulong, atomic_long, ulong)
+
+#undef DECL_ATOMIC_OP_TYPE
+#undef DECL_ATOMIC_LOAD_TYPE
+#undef DECL_ATOMIC_NO_RET_TYPE
+#undef DECL_ATOMIC_COMPARE_EXCHANGE_TYPE
+
+// with memory_order and memory_scope
+#define DECL_ATOMIC_OP_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE CTYPE atomic_##NAME (volatile ATYPE *p, CTYPE val, memory_order order, memory_scope scope) { \
+ return (CTYPE)__gen_ocl_atomic_##PREFIX((STYPE*)p, val, order, scope); \
+ }
+
+#define DECL_ATOMIC_COMPARE_EXCHANGE_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE bool atomic_##NAME (volatile ATYPE *p, CTYPE* expected, CTYPE val, memory_order success, memory_order failure, memory_scope scope) { \
+ CTYPE oldValue = __gen_ocl_atomic_##PREFIX((STYPE*)p, *expected, val, success, failure, scope); \
+ bool ret = oldValue == *expected; \
+ *expected = oldValue; \
+ return ret; \
+ }
+
+#define DECL_ATOMIC_LOAD_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE CTYPE atomic_##NAME (volatile ATYPE *p, memory_order order, memory_scope scope) { \
+ return (CTYPE)__gen_ocl_atomic_##PREFIX((STYPE*)p, 0, order, scope); \
+ }
+
+#define DECL_ATOMIC_NO_RET_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE void atomic_##NAME (volatile ATYPE *p, CTYPE val, memory_order order, memory_scope scope) { \
+ __gen_ocl_atomic_##PREFIX((STYPE*)p, val, order, scope); \
+ }
+
+DECL_ATOMIC_OP(exchange_explicit, exchange)
+DECL_ATOMIC_OP(fetch_add_explicit, fetch_add)
+DECL_ATOMIC_OP(fetch_sub_explicit, fetch_sub)
+DECL_ATOMIC_OP(fetch_and_explicit, fetch_and)
+DECL_ATOMIC_OP(fetch_or_explicit, fetch_or)
+DECL_ATOMIC_OP(fetch_xor_explicit, fetch_xor)
+DECL_ATOMIC_LOAD_OP(load_explicit, fetch_add)
+DECL_ATOMIC_NO_RET_OP(store_explicit, exchange)
+DECL_ATOMIC_COMPARE_EXCHANGE_OP(compare_exchange_strong_explicit, compare_exchange_strong)
+DECL_ATOMIC_COMPARE_EXCHANGE_OP(compare_exchange_weak_explicit, compare_exchange_weak)
+DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_imin32, atomic_int, atomic_int, int)
+DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_umin32, atomic_uint, atomic_int, uint)
+DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_imax32, atomic_int, atomic_int, int)
+DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_umax32, atomic_uint, atomic_int, uint)
+DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_imin64, atomic_long, atomic_long, long)
+DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_umin64, atomic_ulong, atomic_long, ulong)
+DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_imax64, atomic_long, atomic_long, long)
+DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_umax64, atomic_ulong, atomic_long, ulong)
+
+#undef DECL_ATOMIC_OP_TYPE
+#undef DECL_ATOMIC_LOAD_TYPE
+#undef DECL_ATOMIC_NO_RET_TYPE
+#undef DECL_ATOMIC_COMPARE_EXCHANGE_TYPE
+#undef DECL_ATOMIC_OP
+#undef DECL_ATOMIC_LOAD_OP
+#undef DECL_ATOMIC_NO_RET_OP
+#undef DECL_ATOMIC_COMPARE_EXCHANGE_OP
+
+OVERLOADABLE bool atomic_flag_test_and_set(volatile atomic_flag *object) {
+ atomic_int * temp = (atomic_int*)object;
+ return (bool)__gen_ocl_atomic_compare_exchange_strong32(temp, 0, 1, memory_order_seq_cst, memory_order_seq_cst, memory_scope_device);
+}
+
+OVERLOADABLE bool atomic_flag_test_and_set_explicit(volatile atomic_flag *object, memory_order order) {
+ atomic_int * temp = (atomic_int*)object;
+ return (bool)__gen_ocl_atomic_compare_exchange_strong32(temp, 0, 1, memory_order_seq_cst, memory_order_seq_cst, memory_scope_device);
+}
+
+OVERLOADABLE bool atomic_flag_test_and_set_explicit(volatile atomic_flag *object, memory_order order, memory_scope scope){
+ atomic_int * temp = (atomic_int*)object;
+ return (bool)__gen_ocl_atomic_compare_exchange_strong32(temp, 0, 1, memory_order_seq_cst, memory_order_seq_cst, memory_scope_device);
+}
+
+OVERLOADABLE void atomic_flag_clear(volatile atomic_flag *object){
+ atomic_int * temp = (atomic_int*)object;
+ __gen_ocl_atomic_exchange32(temp, 0, memory_order_seq_cst, memory_scope_device);
+}
+
+OVERLOADABLE void atomic_flag_clear_explicit(volatile atomic_flag *object, memory_order order){
+ atomic_int * temp = (atomic_int*)object;
+ __gen_ocl_atomic_exchange32(temp, 0, memory_order_seq_cst, memory_scope_device);
+}
+
+OVERLOADABLE void atomic_flag_clear_explicit(volatile atomic_flag *object, memory_order order, memory_scope scope){
+ atomic_int * temp = (atomic_int*)object;
+ __gen_ocl_atomic_exchange32(temp, 0, memory_order_seq_cst, memory_scope_device);
+}
+
+OVERLOADABLE void atomic_work_item_fence(cl_mem_fence_flags flags, memory_order order, memory_scope scope){
+}
diff --git a/backend/src/libocl/src/ocl_atomic.ll b/backend/src/libocl/src/ocl_atomic.ll
new file mode 100644
index 0000000..6b789b3
--- /dev/null
+++ b/backend/src/libocl/src/ocl_atomic.ll
@@ -0,0 +1,153 @@
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64"
+
+;32bit version.
+define i32 @__gen_ocl_atomic_exchange32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile xchg i32 addrspace(4)* %ptr, i32 %value seq_cst
+ ret i32 %0
+}
+
+define i32 @__gen_ocl_atomic_fetch_add32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile add i32 addrspace(4)* %ptr, i32 %value seq_cst
+ ret i32 %0
+}
+
+define i32 @__gen_ocl_atomic_fetch_sub32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile sub i32 addrspace(4)* %ptr, i32 %value seq_cst
+ ret i32 %0
+}
+
+define i32 @__gen_ocl_atomic_fetch_or32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile or i32 addrspace(4)* %ptr, i32 %value seq_cst
+ ret i32 %0
+}
+
+define i32 @__gen_ocl_atomic_fetch_xor32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile xor i32 addrspace(4)* %ptr, i32 %value seq_cst
+ ret i32 %0
+}
+
+define i32 @__gen_ocl_atomic_fetch_and32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile and i32 addrspace(4)* %ptr, i32 %value seq_cst
+ ret i32 %0
+}
+
+define i32 @__gen_ocl_atomic_fetch_imin32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile min i32 addrspace(4)* %ptr, i32 %value seq_cst
+ ret i32 %0
+}
+
+define i32 @__gen_ocl_atomic_fetch_imax32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile max i32 addrspace(4)* %ptr, i32 %value seq_cst
+ ret i32 %0
+}
+
+define i32 @__gen_ocl_atomic_fetch_umin32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile umin i32 addrspace(4)* %ptr, i32 %value seq_cst
+ ret i32 %0
+}
+
+define i32 @__gen_ocl_atomic_fetch_umax32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile umax i32 addrspace(4)* %ptr, i32 %value seq_cst
+ ret i32 %0
+}
+
+define i32 @__gen_ocl_atomic_compare_exchange_strong32(i32 addrspace(4)* nocapture %ptr,i32 %compare, i32 %value, i32 %success, i32 %failure, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = cmpxchg volatile i32 addrspace(4)* %ptr, i32 %compare, i32 %value seq_cst seq_cst
+ %1 = extractvalue { i32, i1 } %0, 0
+ ret i32 %1
+}
+
+define i32 @__gen_ocl_atomic_compare_exchange_weak32(i32 addrspace(4)* nocapture %ptr, i32 %compare, i32 %value, i32 %success, i32 %failure, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = cmpxchg weak volatile i32 addrspace(4)* %ptr, i32 %compare, i32 %value seq_cst seq_cst
+ %1 = extractvalue { i32, i1 } %0, 0
+ ret i32 %1
+}
+
+;64bit version
+
+define i64 @__gen_ocl_atomic_exchange64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile xchg i64 addrspace(4)* %ptr, i64 %value seq_cst
+ ret i64 %0
+}
+
+define i64 @__gen_ocl_atomic_fetch_add64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile add i64 addrspace(4)* %ptr, i64 %value seq_cst
+ ret i64 %0
+}
+
+define i64 @__gen_ocl_atomic_fetch_sub64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile sub i64 addrspace(4)* %ptr, i64 %value seq_cst
+ ret i64 %0
+}
+
+define i64 @__gen_ocl_atomic_fetch_or64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile or i64 addrspace(4)* %ptr, i64 %value seq_cst
+ ret i64 %0
+}
+
+define i64 @__gen_ocl_atomic_fetch_xor64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile xor i64 addrspace(4)* %ptr, i64 %value seq_cst
+ ret i64 %0
+}
+
+define i64 @__gen_ocl_atomic_fetch_and64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile and i64 addrspace(4)* %ptr, i64 %value seq_cst
+ ret i64 %0
+}
+
+define i64 @__gen_ocl_atomic_fetch_imin64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile min i64 addrspace(4)* %ptr, i64 %value seq_cst
+ ret i64 %0
+}
+
+define i64 @__gen_ocl_atomic_fetch_imax64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile max i64 addrspace(4)* %ptr, i64 %value seq_cst
+ ret i64 %0
+}
+
+define i64 @__gen_ocl_atomic_fetch_umin64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile umin i64 addrspace(4)* %ptr, i64 %value seq_cst
+ ret i64 %0
+}
+
+define i64 @__gen_ocl_atomic_fetch_umax64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile umax i64 addrspace(4)* %ptr, i64 %value seq_cst
+ ret i64 %0
+}
+
+define i64 @__gen_ocl_atomic_compare_exchange_strong64(i64 addrspace(4)* nocapture %ptr, i64 %compare, i64 %value, i32 %success, i32 %failure, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = cmpxchg volatile i64 addrspace(4)* %ptr, i64 %compare, i64 %value seq_cst seq_cst
+ %1 = extractvalue { i64, i1 } %0, 0
+ ret i64 %1
+}
+
+define i64 @__gen_ocl_atomic_compare_exchange_weak64(i64 addrspace(4)* nocapture %ptr, i64 %compare, i64 %value, i32 %success, i32 %failure, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = cmpxchg weak volatile i64 addrspace(4)* %ptr, i64 %compare, i64 %value seq_cst seq_cst
+ %1 = extractvalue { i64, i1 } %0, 0
+ ret i64 %1
+}
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index d5d02f5..fb0a72c 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -639,6 +639,8 @@ namespace gbe
DECL_VISIT_FN(BranchInst, BranchInst);
DECL_VISIT_FN(PHINode, PHINode);
DECL_VISIT_FN(AllocaInst, AllocaInst);
+ DECL_VISIT_FN(AtomicRMWInst, AtomicRMWInst);
+ DECL_VISIT_FN(AtomicCmpXchgInst, AtomicCmpXchgInst);
#undef DECL_VISIT_FN
// Emit unary instructions from gen native function
@@ -675,6 +677,7 @@ namespace gbe
// handle load of dword/qword with unaligned address
void emitUnalignedDQLoadStore(ir::Register ptr, Value *llvmValues, ir::AddressSpace addrSpace, ir::Register bti, bool isLoad, bool dwAligned, bool fixedBTI);
void visitInstruction(Instruction &I) {NOT_SUPPORTED;}
+ void emitAtomicInstHelper(const ir::AtomicOps opcode,const ir::Type type, const ir::Register dst, llvm::Value* llvmPtr, const ir::Tuple payloadTuple);
private:
ir::ImmediateIndex processConstantImmIndexImpl(Constant *CPV, int32_t index = 0u);
template <typename T, typename P = T>
@@ -3644,6 +3647,105 @@ namespace gbe
ctx.ALU1(opcode, type, dst, src);
}
+ void GenWriter::regAllocateAtomicCmpXchgInst(AtomicCmpXchgInst &I) {
+ this->newRegister(&I);
+ }
+
+ void GenWriter::emitAtomicInstHelper(const ir::AtomicOps opcode,const ir::Type type, const ir::Register dst, llvm::Value* llvmPtr, const ir::Tuple payloadTuple) {
+ ir::Register pointer = this->getRegister(llvmPtr);
+ ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmPtr->getType()->getPointerAddressSpace());
+ // Get the function arguments
+ ir::Register ptr;
+ ir::Register btiReg;
+ unsigned SurfaceIndex = 0xff;
+ ir::AddressMode AM;
+ if (legacyMode) {
+ Value *bti = getBtiRegister(llvmPtr);
+ Value *ptrBase = getPointerBase(llvmPtr);
+ ir::Register baseReg = this->getRegister(ptrBase);
+ if (isa<ConstantInt>(bti)) {
+ AM = ir::AM_StaticBti;
+ SurfaceIndex = cast<ConstantInt>(bti)->getZExtValue();
+ addrSpace = btiToGen(SurfaceIndex);
+ } else {
+ AM = ir::AM_DynamicBti;
+ addrSpace = ir::MEM_MIXED;
+ btiReg = this->getRegister(bti);
+ }
+ const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
+ ptr = ctx.reg(pointerFamily);
+ ctx.SUB(ir::TYPE_U32, ptr, pointer, baseReg);
+ } else {
+ AM = ir::AM_Stateless;
+ ptr = pointer;
+ }
+
+ ctx.ATOMIC(opcode, type, dst, addrSpace, ptr, payloadTuple, AM, SurfaceIndex);
+ }
+
+ void GenWriter::emitAtomicCmpXchgInst(AtomicCmpXchgInst &I) {
+ // Get the function arguments
+ Value *llvmPtr = I.getPointerOperand();
+ ir::AtomicOps opcode = ir::ATOMIC_OP_CMPXCHG;
+ uint32_t payloadNum = 0;
+ vector<ir::Register> payload;
+ const ir::Register dst = this->getRegister(&I);
+
+ payload.push_back(this->getRegister(I.getCompareOperand()));
+ payloadNum++;
+ payload.push_back(this->getRegister(I.getNewValOperand()));
+ payloadNum++;
+ ir::Type type = getType(ctx, llvmPtr->getType()->getPointerElementType());
+ const ir::Tuple payloadTuple = payloadNum == 0 ?
+ ir::Tuple(0) :
+ ctx.arrayTuple(&payload[0], payloadNum);
+ this->emitAtomicInstHelper(opcode, type, dst, llvmPtr, payloadTuple);
+ }
+
+ void GenWriter::regAllocateAtomicRMWInst(AtomicRMWInst &I) {
+ this->newRegister(&I);
+ }
+
+ static INLINE ir::AtomicOps atomicOpsLLVMToGen(llvm::AtomicRMWInst::BinOp llvmOp) {
+ switch(llvmOp) {
+ case llvm::AtomicRMWInst::Xchg: return ir::ATOMIC_OP_XCHG;
+ case llvm::AtomicRMWInst::Add: return ir::ATOMIC_OP_ADD;
+ case llvm::AtomicRMWInst::Sub: return ir::ATOMIC_OP_SUB;
+ case llvm::AtomicRMWInst::And: return ir::ATOMIC_OP_AND;
+ case llvm::AtomicRMWInst::Or: return ir::ATOMIC_OP_OR;
+ case llvm::AtomicRMWInst::Xor: return ir::ATOMIC_OP_XOR;
+ case llvm::AtomicRMWInst::Max: return ir::ATOMIC_OP_IMAX;
+ case llvm::AtomicRMWInst::Min: return ir::ATOMIC_OP_IMIN;
+ case llvm::AtomicRMWInst::UMax: return ir::ATOMIC_OP_UMAX;
+ case llvm::AtomicRMWInst::UMin: return ir::ATOMIC_OP_UMIN;
+ case llvm::AtomicRMWInst::Nand:
+ case llvm::AtomicRMWInst::BAD_BINOP: break;
+ }
+ GBE_ASSERT(false);
+ return ir::ATOMIC_OP_INVALID;
+ }
+
+ void GenWriter::emitAtomicRMWInst(AtomicRMWInst &I) {
+ // Get the function arguments
+ llvm::AtomicOrdering Order = I.getOrdering();
+ llvm::AtomicRMWInst::BinOp llvmOpcode = I.getOperation();
+ Value *llvmPtr = I.getOperand(0);
+ ir::AtomicOps opcode = atomicOpsLLVMToGen(llvmOpcode);
+
+ const ir::Register dst = this->getRegister(&I);
+
+ uint32_t payloadNum = 0;
+ vector<ir::Register> payload;
+
+ payload.push_back(this->getRegister(I.getOperand(1)));
+ payloadNum++;
+ ir::Type type = getType(ctx, llvmPtr->getType()->getPointerElementType());
+ const ir::Tuple payloadTuple = payloadNum == 0 ?
+ ir::Tuple(0) :
+ ctx.arrayTuple(&payload[0], payloadNum);
+ this->emitAtomicInstHelper(opcode, type, dst, llvmPtr, payloadTuple);
+ }
+
void GenWriter::emitAtomicInst(CallInst &I, CallSite &CS, ir::AtomicOps opcode) {
CallSite::arg_iterator AI = CS.arg_begin();
CallSite::arg_iterator AE = CS.arg_end();
diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
index 0b3f0d2..11cb79f 100644
--- a/backend/src/llvm/llvm_to_gen.cpp
+++ b/backend/src/llvm/llvm_to_gen.cpp
@@ -201,7 +201,7 @@ namespace gbe
// Run instcombine after redundancy elimination to exploit opportunities
// opened up by them.
MPM.add(createInstructionCombiningPass());
- MPM.add(createJumpThreadingPass()); // Thread jumps
+ //MPM.add(createJumpThreadingPass()); // Thread jumps
MPM.add(createCorrelatedValuePropagationPass());
MPM.add(createDeadStoreEliminationPass()); // Delete dead stores
MPM.add(createAggressiveDCEPass()); // Delete dead instructions
--
2.1.4