[Beignet] [PATCH 4/8] [OCL20] add opencl builtin atomic functions implementation.
xionghu.luo at intel.com
Tue Mar 1 11:26:52 UTC 2016
From: Luo Xionghu <xionghu.luo at intel.com>
Add 'atomic_int, atomic_uint' type support for the operations
fetch_add, fetch_sub, fetch_or, fetch_xor, fetch_and, exchange,
fetch_min and fetch_max.
Add 'atomic_int, atomic_uint, atomic_long, atomic_ulong' type support
for the operations load, store, init, compare_exchange_strong and
compare_exchange_weak.
These builtins are implemented as LLVM bitcode using the native
atomicrmw and cmpxchg LLVM IR instructions, so the optimization passes
can recognize all the atomic instructions and avoid mis-optimizing them.
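For illustration only, a minimal kernel sketch (not part of this patch;
the kernel and argument names are made up) showing how the new builtins
are expected to be used and lowered through the bitcode library:

    /* hypothetical test kernel; increments a device-scope global counter */
    __kernel void inc_counter(volatile __global atomic_int *counter,
                              __global int *old_vals)
    {
      size_t gid = get_global_id(0);
      /* resolves to __gen_ocl_atomic_fetch_add32, which ocl_atomic.ll
         implements with a native atomicrmw add instruction */
      old_vals[gid] = atomic_fetch_add_explicit(counter, 1,
                                                memory_order_seq_cst,
                                                memory_scope_device);
    }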
Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
backend/src/ir/instruction.cpp | 5 +-
backend/src/libocl/CMakeLists.txt | 2 +-
backend/src/libocl/include/ocl_atom.h | 82 ++++++++++++-
backend/src/libocl/include/ocl_types.h | 2 +
backend/src/libocl/src/ocl_atom.cl | 208 +++++++++++++++++++++++++++++++++
backend/src/libocl/src/ocl_atomic.ll | 153 ++++++++++++++++++++++++
backend/src/llvm/llvm_gen_backend.cpp | 102 ++++++++++++++++
backend/src/llvm/llvm_to_gen.cpp | 2 +-
8 files changed, 551 insertions(+), 5 deletions(-)
create mode 100644 backend/src/libocl/src/ocl_atomic.ll
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 464e483..d8640f9 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -1088,10 +1088,11 @@ namespace ir {
{
if (UNLIKELY(checkSpecialRegForWrite(dst[0], fn, whyNot) == false))
return false;
- if (UNLIKELY(checkRegisterData(FAMILY_DWORD, dst[0], fn, whyNot) == false))
+ const RegisterFamily family = getFamily(this->type);
+ if (UNLIKELY(checkRegisterData(family, dst[0], fn, whyNot) == false))
return false;
for (uint32_t srcID = 0; srcID < srcNum-1u; ++srcID)
- if (UNLIKELY(checkRegisterData(FAMILY_DWORD, getSrc(fn, srcID+1u), fn, whyNot) == false))
+ if (UNLIKELY(checkRegisterData(family, getSrc(fn, srcID+1u), fn, whyNot) == false))
return false;
return true;
diff --git a/backend/src/libocl/CMakeLists.txt b/backend/src/libocl/CMakeLists.txt
index 8bb4c1e..5f0b2e2 100644
--- a/backend/src/libocl/CMakeLists.txt
+++ b/backend/src/libocl/CMakeLists.txt
@@ -182,7 +182,7 @@ MACRO(ADD_LL_TO_BC_TARGET M)
)
ENDMACRO(ADD_LL_TO_BC_TARGET)
-SET (OCL_LL_MODULES ocl_barrier ocl_clz)
+SET (OCL_LL_MODULES ocl_barrier ocl_clz ocl_atomic)
FOREACH(f ${OCL_LL_MODULES})
COPY_THE_LL(${f})
ADD_LL_TO_BC_TARGET(${f})
diff --git a/backend/src/libocl/include/ocl_atom.h b/backend/src/libocl/include/ocl_atom.h
index d0f6b10..f83e3c0 100644
--- a/backend/src/libocl/include/ocl_atom.h
+++ b/backend/src/libocl/include/ocl_atom.h
@@ -98,5 +98,85 @@ OVERLOADABLE int atomic_cmpxchg (volatile __local int *p, int cmp, int val);
#define atom_dec atomic_dec
#define atom_cmpxchg atomic_cmpxchg
-
+//OpenCL 2.0 features
+#define ATOMIC_GEN_FUNCTIONS(ATYPE, CTYPE, POSTFIX) \
+CTYPE __gen_ocl_atomic_exchange##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \
+CTYPE __gen_ocl_atomic_fetch_add##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \
+CTYPE __gen_ocl_atomic_fetch_sub##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \
+CTYPE __gen_ocl_atomic_fetch_or##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \
+CTYPE __gen_ocl_atomic_fetch_xor##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \
+CTYPE __gen_ocl_atomic_fetch_and##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \
+CTYPE __gen_ocl_atomic_fetch_imin##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \
+CTYPE __gen_ocl_atomic_fetch_umin##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \
+CTYPE __gen_ocl_atomic_fetch_imax##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope); \
+CTYPE __gen_ocl_atomic_fetch_umax##POSTFIX(volatile ATYPE *p, CTYPE val, int order, int scope);\
+CTYPE __gen_ocl_atomic_compare_exchange_strong##POSTFIX(volatile ATYPE* object, CTYPE expected, CTYPE desired, int success, int failure, int scope); \
+CTYPE __gen_ocl_atomic_compare_exchange_weak##POSTFIX(volatile ATYPE* object, CTYPE expected, CTYPE desired, int success, int failure, int scope);
+
+ATOMIC_GEN_FUNCTIONS(atomic_int, int, 32)
+ATOMIC_GEN_FUNCTIONS(atomic_long, long, 64)
+
+#undef ATOMIC_GEN_FUNCTIONS
+
+/* only used to initialize global address space */
+//#define ATOMIC_VAR_INIT(C value)
+#define ATOMIC_VAR_INIT
+#define ATOMIC_FLAG_INIT 0
+
+//store
+#define ATOMIC_FUNCTIONS(ATYPE, CTYPE, MTYPE1, MTYPE2) \
+OVERLOADABLE void atomic_init(volatile ATYPE *object, CTYPE desired); \
+OVERLOADABLE void atomic_store(volatile ATYPE *object, CTYPE desired); \
+OVERLOADABLE void atomic_store_explicit(volatile ATYPE *object, CTYPE desired, memory_order order); \
+OVERLOADABLE void atomic_store_explicit(volatile ATYPE *object, CTYPE desired, memory_order order, memory_scope scope); \
+OVERLOADABLE CTYPE atomic_load(volatile ATYPE *object); \
+OVERLOADABLE CTYPE atomic_load_explicit(volatile ATYPE *object, memory_order order); \
+OVERLOADABLE CTYPE atomic_load_explicit(volatile ATYPE *object, memory_order order, memory_scope scope); \
+OVERLOADABLE CTYPE atomic_exchange(volatile ATYPE *object, CTYPE desired); \
+OVERLOADABLE CTYPE atomic_exchange_explicit(volatile ATYPE *object, CTYPE desired, memory_order order); \
+OVERLOADABLE CTYPE atomic_exchange_explicit(volatile ATYPE *object, CTYPE desired, memory_order order, memory_scope scope); \
+OVERLOADABLE bool atomic_compare_exchange_strong(volatile ATYPE *object, CTYPE *expected, CTYPE desired); \
+OVERLOADABLE bool atomic_compare_exchange_strong_explicit(volatile ATYPE *object, CTYPE *expected, CTYPE desired, memory_order success, memory_order failure); \
+OVERLOADABLE bool atomic_compare_exchange_strong_explicit(volatile ATYPE *object, CTYPE *expected, CTYPE desired, memory_order success, memory_order failure, memory_scope scope); \
+OVERLOADABLE bool atomic_compare_exchange_weak(volatile ATYPE *object, CTYPE *expected, CTYPE desired); \
+OVERLOADABLE bool atomic_compare_exchange_weak_explicit(volatile ATYPE *object, CTYPE *expected, CTYPE desired, memory_order success, memory_order failure); \
+OVERLOADABLE bool atomic_compare_exchange_weak_explicit(volatile ATYPE *object, CTYPE *expected, CTYPE desired, memory_order success, memory_order failure, memory_scope scope); \
+OVERLOADABLE CTYPE atomic_fetch_add(volatile ATYPE *object, MTYPE1 desired); \
+OVERLOADABLE CTYPE atomic_fetch_add_explicit(volatile ATYPE *object, MTYPE1 desired, memory_order order); \
+OVERLOADABLE CTYPE atomic_fetch_add_explicit(volatile ATYPE *object, MTYPE1 desired, memory_order order, memory_scope scope); \
+OVERLOADABLE CTYPE atomic_fetch_sub(volatile ATYPE *object, MTYPE1 desired); \
+OVERLOADABLE CTYPE atomic_fetch_sub_explicit(volatile ATYPE *object, MTYPE1 desired, memory_order order); \
+OVERLOADABLE CTYPE atomic_fetch_sub_explicit(volatile ATYPE *object, MTYPE1 desired, memory_order order, memory_scope scope); \
+OVERLOADABLE CTYPE atomic_fetch_or(volatile ATYPE *object, MTYPE2 desired); \
+OVERLOADABLE CTYPE atomic_fetch_or_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order); \
+OVERLOADABLE CTYPE atomic_fetch_or_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order, memory_scope scope); \
+OVERLOADABLE CTYPE atomic_fetch_xor(volatile ATYPE *object, MTYPE2 desired); \
+OVERLOADABLE CTYPE atomic_fetch_xor_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order); \
+OVERLOADABLE CTYPE atomic_fetch_xor_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order, memory_scope scope); \
+OVERLOADABLE CTYPE atomic_fetch_and(volatile ATYPE *object, MTYPE2 desired); \
+OVERLOADABLE CTYPE atomic_fetch_and_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order); \
+OVERLOADABLE CTYPE atomic_fetch_and_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order, memory_scope scope); \
+OVERLOADABLE CTYPE atomic_fetch_min(volatile ATYPE *object, MTYPE2 desired); \
+OVERLOADABLE CTYPE atomic_fetch_min_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order); \
+OVERLOADABLE CTYPE atomic_fetch_min_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order, memory_scope scope); \
+OVERLOADABLE CTYPE atomic_fetch_max(volatile ATYPE *object, MTYPE2 desired); \
+OVERLOADABLE CTYPE atomic_fetch_max_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order); \
+OVERLOADABLE CTYPE atomic_fetch_max_explicit(volatile ATYPE *object, MTYPE2 desired, memory_order order, memory_scope scope);
+
+ATOMIC_FUNCTIONS(atomic_int, int, int, int)
+ATOMIC_FUNCTIONS(atomic_uint, uint, uint, uint)
+ATOMIC_FUNCTIONS(atomic_long, long, long, long)
+ATOMIC_FUNCTIONS(atomic_ulong, ulong, ulong, ulong)
+ATOMIC_FUNCTIONS(atomic_float, float, float, float)
+#undef ATOMIC_FUNCTIONS
+
+
+OVERLOADABLE bool atomic_flag_test_and_set(volatile atomic_flag *object);
+OVERLOADABLE bool atomic_flag_test_and_set_explicit(volatile atomic_flag *object, memory_order order);
+OVERLOADABLE bool atomic_flag_test_and_set_explicit(volatile atomic_flag *object, memory_order order, memory_scope scope);
+OVERLOADABLE void atomic_flag_clear(volatile atomic_flag *object);
+OVERLOADABLE void atomic_flag_clear_explicit(volatile atomic_flag *object, memory_order order);
+OVERLOADABLE void atomic_flag_clear_explicit(volatile atomic_flag *object, memory_order order, memory_scope scope);
+
+OVERLOADABLE void atomic_work_item_fence(cl_mem_fence_flags flags, memory_order order, memory_scope scope);
#endif /* __OCL_ATOM_H__ */
diff --git a/backend/src/libocl/include/ocl_types.h b/backend/src/libocl/include/ocl_types.h
index 736e4ce..2ff02c5 100644
--- a/backend/src/libocl/include/ocl_types.h
+++ b/backend/src/libocl/include/ocl_types.h
@@ -20,6 +20,8 @@
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
#include "ocl_defines.h"
#define NULL 0
diff --git a/backend/src/libocl/src/ocl_atom.cl b/backend/src/libocl/src/ocl_atom.cl
index 0b6c671..0f8377b 100644
--- a/backend/src/libocl/src/ocl_atom.cl
+++ b/backend/src/libocl/src/ocl_atom.cl
@@ -17,6 +17,7 @@
*/
#include "ocl_atom.h"
#include "ocl_as.h"
+#include "ocl_sync.h"
OVERLOADABLE uint __gen_ocl_atomic_add(__global uint *p, uint val);
OVERLOADABLE uint __gen_ocl_atomic_add(__local uint *p, uint val);
@@ -135,3 +136,210 @@ DECL_ATOMIC_OP(cmpxchg)
#define atom_inc atomic_inc
#define atom_dec atomic_dec
#define atom_cmpxchg atomic_cmpxchg
+
+// OpenCL 2.0 features.
+#define DECL_ATOMIC_OP_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE CTYPE atomic_##NAME (volatile ATYPE *p, CTYPE val) { \
+ return (CTYPE)__gen_ocl_atomic_##PREFIX((STYPE*)p, val, memory_order_seq_cst, memory_scope_device); \
+ }
+
+#define DECL_ATOMIC_COMPARE_EXCHANGE_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE bool atomic_##NAME (volatile ATYPE *p, CTYPE* expected, CTYPE val) { \
+ CTYPE oldValue = __gen_ocl_atomic_##PREFIX((STYPE*)p, *expected, val, memory_order_seq_cst, memory_order_seq_cst, memory_scope_device); \
+ bool ret = oldValue == *expected; \
+ *expected = oldValue; \
+ return ret; \
+ }
+
+#define DECL_ATOMIC_LOAD_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE CTYPE atomic_##NAME (volatile ATYPE *p) { \
+ return (CTYPE)__gen_ocl_atomic_##PREFIX((STYPE*)p, 0, memory_order_seq_cst, memory_scope_device); \
+ }
+
+#define DECL_ATOMIC_NO_RET_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE void atomic_##NAME (volatile ATYPE *p, CTYPE val) { \
+ __gen_ocl_atomic_##PREFIX((STYPE*)p, val, memory_order_seq_cst, memory_scope_device); \
+ }
+
+#define DECL_ATOMIC_OP(NAME, PREFIX) \
+ DECL_ATOMIC_OP_TYPE(NAME, PREFIX##32, atomic_uint, atomic_int, uint) \
+ DECL_ATOMIC_OP_TYPE(NAME, PREFIX##32, atomic_int, atomic_int, int) \
+ DECL_ATOMIC_OP_TYPE(NAME, PREFIX##64, atomic_ulong, atomic_long, ulong) \
+ DECL_ATOMIC_OP_TYPE(NAME, PREFIX##64, atomic_long, atomic_long, long) \
+
+#define DECL_ATOMIC_COMPARE_EXCHANGE_OP(NAME, PREFIX) \
+ DECL_ATOMIC_COMPARE_EXCHANGE_TYPE(NAME, PREFIX##32, atomic_uint, atomic_int, uint) \
+ DECL_ATOMIC_COMPARE_EXCHANGE_TYPE(NAME, PREFIX##32, atomic_int, atomic_int, int) \
+ DECL_ATOMIC_COMPARE_EXCHANGE_TYPE(NAME, PREFIX##64, atomic_ulong, atomic_long, ulong) \
+ DECL_ATOMIC_COMPARE_EXCHANGE_TYPE(NAME, PREFIX##64, atomic_long, atomic_long, long) \
+
+#define DECL_ATOMIC_LOAD_OP(NAME, PREFIX) \
+ DECL_ATOMIC_LOAD_TYPE(NAME, PREFIX##32, atomic_uint, atomic_int, uint) \
+ DECL_ATOMIC_LOAD_TYPE(NAME, PREFIX##32, atomic_int, atomic_int, int) \
+ DECL_ATOMIC_LOAD_TYPE(NAME, PREFIX##64, atomic_ulong, atomic_long, ulong) \
+ DECL_ATOMIC_LOAD_TYPE(NAME, PREFIX##64, atomic_long, atomic_long, long) \
+
+#define DECL_ATOMIC_NO_RET_OP(NAME, PREFIX) \
+ DECL_ATOMIC_NO_RET_TYPE(NAME, PREFIX##32, atomic_uint, atomic_int, uint) \
+ DECL_ATOMIC_NO_RET_TYPE(NAME, PREFIX##32, atomic_int, atomic_int, int) \
+ DECL_ATOMIC_NO_RET_TYPE(NAME, PREFIX##64, atomic_ulong, atomic_long, ulong) \
+ DECL_ATOMIC_NO_RET_TYPE(NAME, PREFIX##64, atomic_long, atomic_long, long) \
+
+DECL_ATOMIC_OP(exchange, exchange)
+DECL_ATOMIC_OP(fetch_add, fetch_add)
+DECL_ATOMIC_OP(fetch_sub, fetch_sub)
+DECL_ATOMIC_OP(fetch_and, fetch_and)
+DECL_ATOMIC_OP(fetch_or, fetch_or)
+DECL_ATOMIC_OP(fetch_xor, fetch_xor)
+DECL_ATOMIC_LOAD_OP(load, fetch_add)
+DECL_ATOMIC_NO_RET_OP(init, exchange)
+DECL_ATOMIC_NO_RET_OP(store, exchange)
+DECL_ATOMIC_COMPARE_EXCHANGE_OP(compare_exchange_strong, compare_exchange_strong)
+DECL_ATOMIC_COMPARE_EXCHANGE_OP(compare_exchange_weak, compare_exchange_weak)
+DECL_ATOMIC_OP_TYPE(fetch_min, fetch_imin32, atomic_int, atomic_int, int)
+DECL_ATOMIC_OP_TYPE(fetch_min, fetch_umin32, atomic_uint, atomic_int, uint)
+DECL_ATOMIC_OP_TYPE(fetch_max, fetch_imax32, atomic_int, atomic_int, int)
+DECL_ATOMIC_OP_TYPE(fetch_max, fetch_umax32, atomic_uint, atomic_int, uint)
+DECL_ATOMIC_OP_TYPE(fetch_min, fetch_imin64, atomic_long, atomic_long, long)
+DECL_ATOMIC_OP_TYPE(fetch_min, fetch_umin64, atomic_ulong, atomic_long, ulong)
+DECL_ATOMIC_OP_TYPE(fetch_max, fetch_imax64, atomic_long, atomic_long, long)
+DECL_ATOMIC_OP_TYPE(fetch_max, fetch_umax64, atomic_ulong, atomic_long, ulong)
+
+#undef DECL_ATOMIC_OP_TYPE
+#undef DECL_ATOMIC_LOAD_TYPE
+#undef DECL_ATOMIC_NO_RET_TYPE
+#undef DECL_ATOMIC_COMPARE_EXCHANGE_TYPE
+
+// with memory_order.
+
+#define DECL_ATOMIC_OP_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE CTYPE atomic_##NAME (volatile ATYPE *p, CTYPE val, memory_order order) { \
+ return (CTYPE)__gen_ocl_atomic_##PREFIX((STYPE*)p, val, order, memory_scope_device); \
+ }
+
+#define DECL_ATOMIC_COMPARE_EXCHANGE_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE bool atomic_##NAME (volatile ATYPE *p, CTYPE* expected, CTYPE val, memory_order success, memory_order failure) { \
+ CTYPE oldValue = __gen_ocl_atomic_##PREFIX((STYPE*)p, *expected, val, success, failure, memory_scope_device); \
+ bool ret = oldValue == *expected; \
+ *expected = oldValue; \
+ return ret; \
+ }
+
+#define DECL_ATOMIC_LOAD_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE CTYPE atomic_##NAME (volatile ATYPE *p, memory_order order) { \
+ return (CTYPE)__gen_ocl_atomic_##PREFIX((STYPE*)p, 0, order, memory_scope_device); \
+ }
+
+#define DECL_ATOMIC_NO_RET_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE void atomic_##NAME (volatile ATYPE *p, CTYPE val, memory_order order) { \
+ __gen_ocl_atomic_##PREFIX((STYPE*)p, val, order, memory_scope_device); \
+ }
+
+DECL_ATOMIC_OP(exchange_explicit, exchange)
+DECL_ATOMIC_OP(fetch_add_explicit, fetch_add)
+DECL_ATOMIC_OP(fetch_sub_explicit, fetch_sub)
+DECL_ATOMIC_OP(fetch_and_explicit, fetch_and)
+DECL_ATOMIC_OP(fetch_or_explicit, fetch_or)
+DECL_ATOMIC_OP(fetch_xor_explicit, fetch_xor)
+DECL_ATOMIC_LOAD_OP(load_explicit, fetch_add)
+DECL_ATOMIC_NO_RET_OP(store_explicit, exchange)
+DECL_ATOMIC_COMPARE_EXCHANGE_OP(compare_exchange_strong_explicit, compare_exchange_strong)
+DECL_ATOMIC_COMPARE_EXCHANGE_OP(compare_exchange_weak_explicit, compare_exchange_weak)
+DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_imin32, atomic_int, atomic_int, int)
+DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_umin32, atomic_uint, atomic_int, uint)
+DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_imax32, atomic_int, atomic_int, int)
+DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_umax32, atomic_uint, atomic_int, uint)
+DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_imin64, atomic_long, atomic_long, long)
+DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_umin64, atomic_ulong, atomic_long, ulong)
+DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_imax64, atomic_long, atomic_long, long)
+DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_umax64, atomic_ulong, atomic_long, ulong)
+
+#undef DECL_ATOMIC_OP_TYPE
+#undef DECL_ATOMIC_LOAD_TYPE
+#undef DECL_ATOMIC_NO_RET_TYPE
+#undef DECL_ATOMIC_COMPARE_EXCHANGE_TYPE
+
+// with memory_order and memory_scope
+#define DECL_ATOMIC_OP_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE CTYPE atomic_##NAME (volatile ATYPE *p, CTYPE val, memory_order order, memory_scope scope) { \
+ return (CTYPE)__gen_ocl_atomic_##PREFIX((STYPE*)p, val, order, scope); \
+ }
+
+#define DECL_ATOMIC_COMPARE_EXCHANGE_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE bool atomic_##NAME (volatile ATYPE *p, CTYPE* expected, CTYPE val, memory_order success, memory_order failure, memory_scope scope) { \
+ CTYPE oldValue = __gen_ocl_atomic_##PREFIX((STYPE*)p, *expected, val, success, failure, scope); \
+ bool ret = oldValue == *expected; \
+ *expected = oldValue; \
+ return ret; \
+ }
+
+#define DECL_ATOMIC_LOAD_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE CTYPE atomic_##NAME (volatile ATYPE *p, memory_order order, memory_scope scope) { \
+ return (CTYPE)__gen_ocl_atomic_##PREFIX((STYPE*)p, 0, order, scope); \
+ }
+
+#define DECL_ATOMIC_NO_RET_TYPE(NAME, PREFIX, ATYPE, STYPE, CTYPE) \
+ OVERLOADABLE void atomic_##NAME (volatile ATYPE *p, CTYPE val, memory_order order, memory_scope scope) { \
+ __gen_ocl_atomic_##PREFIX((STYPE*)p, val, order, scope); \
+ }
+
+DECL_ATOMIC_OP(exchange_explicit, exchange)
+DECL_ATOMIC_OP(fetch_add_explicit, fetch_add)
+DECL_ATOMIC_OP(fetch_sub_explicit, fetch_sub)
+DECL_ATOMIC_OP(fetch_and_explicit, fetch_and)
+DECL_ATOMIC_OP(fetch_or_explicit, fetch_or)
+DECL_ATOMIC_OP(fetch_xor_explicit, fetch_xor)
+DECL_ATOMIC_LOAD_OP(load_explicit, fetch_add)
+DECL_ATOMIC_NO_RET_OP(store_explicit, exchange)
+DECL_ATOMIC_COMPARE_EXCHANGE_OP(compare_exchange_strong_explicit, compare_exchange_strong)
+DECL_ATOMIC_COMPARE_EXCHANGE_OP(compare_exchange_weak_explicit, compare_exchange_weak)
+DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_imin32, atomic_int, atomic_int, int)
+DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_umin32, atomic_uint, atomic_int, uint)
+DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_imax32, atomic_int, atomic_int, int)
+DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_umax32, atomic_uint, atomic_int, uint)
+DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_imin64, atomic_long, atomic_long, long)
+DECL_ATOMIC_OP_TYPE(fetch_min_explicit, fetch_umin64, atomic_ulong, atomic_long, ulong)
+DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_imax64, atomic_long, atomic_long, long)
+DECL_ATOMIC_OP_TYPE(fetch_max_explicit, fetch_umax64, atomic_ulong, atomic_long, ulong)
+
+#undef DECL_ATOMIC_OP_TYPE
+#undef DECL_ATOMIC_LOAD_TYPE
+#undef DECL_ATOMIC_NO_RET_TYPE
+#undef DECL_ATOMIC_COMPARE_EXCHANGE_TYPE
+#undef DECL_ATOMIC_OP
+#undef DECL_ATOMIC_LOAD_OP
+#undef DECL_ATOMIC_NO_RET_OP
+#undef DECL_ATOMIC_COMPARE_EXCHANGE_OP
+
+OVERLOADABLE bool atomic_flag_test_and_set(volatile atomic_flag *object) {
+ atomic_int * temp = (atomic_int*)object;
+ return (bool)__gen_ocl_atomic_compare_exchange_strong32(temp, 0, 1, memory_order_seq_cst, memory_order_seq_cst, memory_scope_device);
+}
+
+OVERLOADABLE bool atomic_flag_test_and_set_explicit(volatile atomic_flag *object, memory_order order) {
+ atomic_int * temp = (atomic_int*)object;
+ return (bool)__gen_ocl_atomic_compare_exchange_strong32(temp, 0, 1, memory_order_seq_cst, memory_order_seq_cst, memory_scope_device);
+}
+
+OVERLOADABLE bool atomic_flag_test_and_set_explicit(volatile atomic_flag *object, memory_order order, memory_scope scope){
+ atomic_int * temp = (atomic_int*)object;
+ return (bool)__gen_ocl_atomic_compare_exchange_strong32(temp, 0, 1, memory_order_seq_cst, memory_order_seq_cst, memory_scope_device);
+}
+
+OVERLOADABLE void atomic_flag_clear(volatile atomic_flag *object){
+ atomic_int * temp = (atomic_int*)object;
+ __gen_ocl_atomic_exchange32(temp, 0, memory_order_seq_cst, memory_scope_device);
+}
+
+OVERLOADABLE void atomic_flag_clear_explicit(volatile atomic_flag *object, memory_order order){
+ atomic_int * temp = (atomic_int*)object;
+ __gen_ocl_atomic_exchange32(temp, 0, memory_order_seq_cst, memory_scope_device);
+}
+
+OVERLOADABLE void atomic_flag_clear_explicit(volatile atomic_flag *object, memory_order order, memory_scope scope){
+ atomic_int * temp = (atomic_int*)object;
+ __gen_ocl_atomic_exchange32(temp, 0, memory_order_seq_cst, memory_scope_device);
+}
+
+OVERLOADABLE void atomic_work_item_fence(cl_mem_fence_flags flags, memory_order order, memory_scope scope){
+}
diff --git a/backend/src/libocl/src/ocl_atomic.ll b/backend/src/libocl/src/ocl_atomic.ll
new file mode 100644
index 0000000..6b789b3
--- /dev/null
+++ b/backend/src/libocl/src/ocl_atomic.ll
@@ -0,0 +1,153 @@
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64"
+
+;32bit version.
+define i32 @__gen_ocl_atomic_exchange32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile xchg i32 addrspace(4)* %ptr, i32 %value seq_cst
+ ret i32 %0
+}
+
+define i32 @__gen_ocl_atomic_fetch_add32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile add i32 addrspace(4)* %ptr, i32 %value seq_cst
+ ret i32 %0
+}
+
+define i32 @__gen_ocl_atomic_fetch_sub32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile sub i32 addrspace(4)* %ptr, i32 %value seq_cst
+ ret i32 %0
+}
+
+define i32 @__gen_ocl_atomic_fetch_or32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile or i32 addrspace(4)* %ptr, i32 %value seq_cst
+ ret i32 %0
+}
+
+define i32 @__gen_ocl_atomic_fetch_xor32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile xor i32 addrspace(4)* %ptr, i32 %value seq_cst
+ ret i32 %0
+}
+
+define i32 @__gen_ocl_atomic_fetch_and32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile and i32 addrspace(4)* %ptr, i32 %value seq_cst
+ ret i32 %0
+}
+
+define i32 @__gen_ocl_atomic_fetch_imin32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile min i32 addrspace(4)* %ptr, i32 %value seq_cst
+ ret i32 %0
+}
+
+define i32 @__gen_ocl_atomic_fetch_imax32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile max i32 addrspace(4)* %ptr, i32 %value seq_cst
+ ret i32 %0
+}
+
+define i32 @__gen_ocl_atomic_fetch_umin32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile umin i32 addrspace(4)* %ptr, i32 %value seq_cst
+ ret i32 %0
+}
+
+define i32 @__gen_ocl_atomic_fetch_umax32(i32 addrspace(4)* nocapture %ptr, i32 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile umax i32 addrspace(4)* %ptr, i32 %value seq_cst
+ ret i32 %0
+}
+
+define i32 @__gen_ocl_atomic_compare_exchange_strong32(i32 addrspace(4)* nocapture %ptr,i32 %compare, i32 %value, i32 %success, i32 %failure, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = cmpxchg volatile i32 addrspace(4)* %ptr, i32 %compare, i32 %value seq_cst seq_cst
+ %1 = extractvalue { i32, i1 } %0, 0
+ ret i32 %1
+}
+
+define i32 @__gen_ocl_atomic_compare_exchange_weak32(i32 addrspace(4)* nocapture %ptr, i32 %compare, i32 %value, i32 %success, i32 %failure, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = cmpxchg weak volatile i32 addrspace(4)* %ptr, i32 %compare, i32 %value seq_cst seq_cst
+ %1 = extractvalue { i32, i1 } %0, 0
+ ret i32 %1
+}
+
+;64bit version
+
+define i64 @__gen_ocl_atomic_exchange64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile xchg i64 addrspace(4)* %ptr, i64 %value seq_cst
+ ret i64 %0
+}
+
+define i64 @__gen_ocl_atomic_fetch_add64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile add i64 addrspace(4)* %ptr, i64 %value seq_cst
+ ret i64 %0
+}
+
+define i64 @__gen_ocl_atomic_fetch_sub64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile sub i64 addrspace(4)* %ptr, i64 %value seq_cst
+ ret i64 %0
+}
+
+define i64 @__gen_ocl_atomic_fetch_or64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile or i64 addrspace(4)* %ptr, i64 %value seq_cst
+ ret i64 %0
+}
+
+define i64 @__gen_ocl_atomic_fetch_xor64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile xor i64 addrspace(4)* %ptr, i64 %value seq_cst
+ ret i64 %0
+}
+
+define i64 @__gen_ocl_atomic_fetch_and64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile and i64 addrspace(4)* %ptr, i64 %value seq_cst
+ ret i64 %0
+}
+
+define i64 @__gen_ocl_atomic_fetch_imin64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile min i64 addrspace(4)* %ptr, i64 %value seq_cst
+ ret i64 %0
+}
+
+define i64 @__gen_ocl_atomic_fetch_imax64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile max i64 addrspace(4)* %ptr, i64 %value seq_cst
+ ret i64 %0
+}
+
+define i64 @__gen_ocl_atomic_fetch_umin64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile umin i64 addrspace(4)* %ptr, i64 %value seq_cst
+ ret i64 %0
+}
+
+define i64 @__gen_ocl_atomic_fetch_umax64(i64 addrspace(4)* nocapture %ptr, i64 %value, i32 %order, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = atomicrmw volatile umax i64 addrspace(4)* %ptr, i64 %value seq_cst
+ ret i64 %0
+}
+
+define i64 @__gen_ocl_atomic_compare_exchange_strong64(i64 addrspace(4)* nocapture %ptr, i64 %compare, i64 %value, i32 %success, i32 %failure, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = cmpxchg volatile i64 addrspace(4)* %ptr, i64 %compare, i64 %value seq_cst seq_cst
+ %1 = extractvalue { i64, i1 } %0, 0
+ ret i64 %1
+}
+
+define i64 @__gen_ocl_atomic_compare_exchange_weak64(i64 addrspace(4)* nocapture %ptr, i64 %compare, i64 %value, i32 %success, i32 %failure, i32 %scope) nounwind alwaysinline {
+entry:
+ %0 = cmpxchg weak volatile i64 addrspace(4)* %ptr, i64 %compare, i64 %value seq_cst seq_cst
+ %1 = extractvalue { i64, i1 } %0, 0
+ ret i64 %1
+}
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index d5d02f5..fb0a72c 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -639,6 +639,8 @@ namespace gbe
DECL_VISIT_FN(BranchInst, BranchInst);
DECL_VISIT_FN(PHINode, PHINode);
DECL_VISIT_FN(AllocaInst, AllocaInst);
+ DECL_VISIT_FN(AtomicRMWInst, AtomicRMWInst);
+ DECL_VISIT_FN(AtomicCmpXchgInst, AtomicCmpXchgInst);
#undef DECL_VISIT_FN
// Emit unary instructions from gen native function
@@ -675,6 +677,7 @@ namespace gbe
// handle load of dword/qword with unaligned address
void emitUnalignedDQLoadStore(ir::Register ptr, Value *llvmValues, ir::AddressSpace addrSpace, ir::Register bti, bool isLoad, bool dwAligned, bool fixedBTI);
void visitInstruction(Instruction &I) {NOT_SUPPORTED;}
+ void emitAtomicInstHelper(const ir::AtomicOps opcode,const ir::Type type, const ir::Register dst, llvm::Value* llvmPtr, const ir::Tuple payloadTuple);
private:
ir::ImmediateIndex processConstantImmIndexImpl(Constant *CPV, int32_t index = 0u);
template <typename T, typename P = T>
@@ -3644,6 +3647,105 @@ namespace gbe
ctx.ALU1(opcode, type, dst, src);
}
+ void GenWriter::regAllocateAtomicCmpXchgInst(AtomicCmpXchgInst &I) {
+ this->newRegister(&I);
+ }
+
+ void GenWriter::emitAtomicInstHelper(const ir::AtomicOps opcode,const ir::Type type, const ir::Register dst, llvm::Value* llvmPtr, const ir::Tuple payloadTuple) {
+ ir::Register pointer = this->getRegister(llvmPtr);
+ ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmPtr->getType()->getPointerAddressSpace());
+ // Get the function arguments
+ ir::Register ptr;
+ ir::Register btiReg;
+ unsigned SurfaceIndex = 0xff;
+ ir::AddressMode AM;
+ if (legacyMode) {
+ Value *bti = getBtiRegister(llvmPtr);
+ Value *ptrBase = getPointerBase(llvmPtr);
+ ir::Register baseReg = this->getRegister(ptrBase);
+ if (isa<ConstantInt>(bti)) {
+ AM = ir::AM_StaticBti;
+ SurfaceIndex = cast<ConstantInt>(bti)->getZExtValue();
+ addrSpace = btiToGen(SurfaceIndex);
+ } else {
+ AM = ir::AM_DynamicBti;
+ addrSpace = ir::MEM_MIXED;
+ btiReg = this->getRegister(bti);
+ }
+ const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
+ ptr = ctx.reg(pointerFamily);
+ ctx.SUB(ir::TYPE_U32, ptr, pointer, baseReg);
+ } else {
+ AM = ir::AM_Stateless;
+ ptr = pointer;
+ }
+
+ ctx.ATOMIC(opcode, type, dst, addrSpace, ptr, payloadTuple, AM, SurfaceIndex);
+ }
+
+ void GenWriter::emitAtomicCmpXchgInst(AtomicCmpXchgInst &I) {
+ // Get the function arguments
+ Value *llvmPtr = I.getPointerOperand();
+ ir::AtomicOps opcode = ir::ATOMIC_OP_CMPXCHG;
+ uint32_t payloadNum = 0;
+ vector<ir::Register> payload;
+ const ir::Register dst = this->getRegister(&I);
+
+ payload.push_back(this->getRegister(I.getCompareOperand()));
+ payloadNum++;
+ payload.push_back(this->getRegister(I.getNewValOperand()));
+ payloadNum++;
+ ir::Type type = getType(ctx, llvmPtr->getType()->getPointerElementType());
+ const ir::Tuple payloadTuple = payloadNum == 0 ?
+ ir::Tuple(0) :
+ ctx.arrayTuple(&payload[0], payloadNum);
+ this->emitAtomicInstHelper(opcode, type, dst, llvmPtr, payloadTuple);
+ }
+
+ void GenWriter::regAllocateAtomicRMWInst(AtomicRMWInst &I) {
+ this->newRegister(&I);
+ }
+
+ static INLINE ir::AtomicOps atomicOpsLLVMToGen(llvm::AtomicRMWInst::BinOp llvmOp) {
+ switch(llvmOp) {
+ case llvm::AtomicRMWInst::Xchg: return ir::ATOMIC_OP_XCHG;
+ case llvm::AtomicRMWInst::Add: return ir::ATOMIC_OP_ADD;
+ case llvm::AtomicRMWInst::Sub: return ir::ATOMIC_OP_SUB;
+ case llvm::AtomicRMWInst::And: return ir::ATOMIC_OP_AND;
+ case llvm::AtomicRMWInst::Or: return ir::ATOMIC_OP_OR;
+ case llvm::AtomicRMWInst::Xor: return ir::ATOMIC_OP_XOR;
+ case llvm::AtomicRMWInst::Max: return ir::ATOMIC_OP_IMAX;
+ case llvm::AtomicRMWInst::Min: return ir::ATOMIC_OP_IMIN;
+ case llvm::AtomicRMWInst::UMax: return ir::ATOMIC_OP_UMAX;
+ case llvm::AtomicRMWInst::UMin: return ir::ATOMIC_OP_UMIN;
+ case llvm::AtomicRMWInst::Nand:
+ case llvm::AtomicRMWInst::BAD_BINOP: break;
+ }
+ GBE_ASSERT(false);
+ return ir::ATOMIC_OP_INVALID;
+ }
+
+ void GenWriter::emitAtomicRMWInst(AtomicRMWInst &I) {
+ // Get the function arguments
+ llvm::AtomicOrdering Order = I.getOrdering();
+ llvm::AtomicRMWInst::BinOp llvmOpcode = I.getOperation();
+ Value *llvmPtr = I.getOperand(0);
+ ir::AtomicOps opcode = atomicOpsLLVMToGen(llvmOpcode);
+
+ const ir::Register dst = this->getRegister(&I);
+
+ uint32_t payloadNum = 0;
+ vector<ir::Register> payload;
+
+ payload.push_back(this->getRegister(I.getOperand(1)));
+ payloadNum++;
+ ir::Type type = getType(ctx, llvmPtr->getType()->getPointerElementType());
+ const ir::Tuple payloadTuple = payloadNum == 0 ?
+ ir::Tuple(0) :
+ ctx.arrayTuple(&payload[0], payloadNum);
+ this->emitAtomicInstHelper(opcode, type, dst, llvmPtr, payloadTuple);
+ }
+
void GenWriter::emitAtomicInst(CallInst &I, CallSite &CS, ir::AtomicOps opcode) {
CallSite::arg_iterator AI = CS.arg_begin();
CallSite::arg_iterator AE = CS.arg_end();
diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
index 0b3f0d2..11cb79f 100644
--- a/backend/src/llvm/llvm_to_gen.cpp
+++ b/backend/src/llvm/llvm_to_gen.cpp
@@ -201,7 +201,7 @@ namespace gbe
// Run instcombine after redundancy elimination to exploit opportunities
// opened up by them.
MPM.add(createInstructionCombiningPass());
- MPM.add(createJumpThreadingPass()); // Thread jumps
+ //MPM.add(createJumpThreadingPass()); // Thread jumps
MPM.add(createCorrelatedValuePropagationPass());
MPM.add(createDeadStoreEliminationPass()); // Delete dead stores
MPM.add(createAggressiveDCEPass()); // Delete dead instructions
--
2.1.4