[Beignet] [PATCH 2/3] Implement async and prefetch built-in.
Yang Rong
rong.r.yang at intel.com
Thu Aug 15 02:10:16 PDT 2013
Using the normal load & store to implement async copy,
and so wait_group_events is same as barrier.
Prefetch just define and function is empty.
Signed-off-by: Yang Rong <rong.r.yang at intel.com>
---
backend/src/ocl_stdlib.tmpl.h | 67 +++++++++++++++++++++++++++++++++++++++++++
1 file changed, 67 insertions(+)
diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h
index d1cc6aa..9703405 100644
--- a/backend/src/ocl_stdlib.tmpl.h
+++ b/backend/src/ocl_stdlib.tmpl.h
@@ -37,6 +37,7 @@ typedef __typeof__(sizeof(int)) size_t;
typedef __typeof__((int *)0-(int *)0) ptrdiff_t;
typedef signed int intptr_t;
typedef unsigned int uintptr_t;
+typedef signed int event_t;
/////////////////////////////////////////////////////////////////////////////
// OpenCL address space
@@ -1226,6 +1227,72 @@ INLINE void write_mem_fence(cl_mem_fence_flags flags) {
}
/////////////////////////////////////////////////////////////////////////////
+// Async Copies and prefetch
+/////////////////////////////////////////////////////////////////////////////
+#define BODY(SRC_STRIDE, DST_STRIDE) \
+ uint size = get_local_size(2) * get_local_size(1) * get_local_size(0); \
+ uint count = num / size; \
+ uint offset = get_local_id(2) * get_local_size(1) + get_local_id(1); \
+ offset = offset * get_local_size(0) + get_local_id(0); \
+ for(uint i=0; i<count; i+=1) { \
+ *(dst + offset * DST_STRIDE) = *(src + offset * SRC_STRIDE); \
+ offset += size; \
+ } \
+ if(offset < num) \
+ *(dst + offset * DST_STRIDE) = *(src + offset * SRC_STRIDE); \
+ return 0;
+
+#define DEFN(TYPE) \
+INLINE_OVERLOADABLE event_t async_work_group_copy (local TYPE *dst, const global TYPE *src, \
+ size_t num, event_t event) { \
+ BODY(1, 1); \
+} \
+INLINE_OVERLOADABLE event_t async_work_group_copy (global TYPE *dst, const local TYPE *src, \
+ size_t num, event_t event) { \
+ BODY(1, 1); \
+} \
+INLINE_OVERLOADABLE event_t async_work_group_strided_copy (local TYPE *dst, const global TYPE *src, \
+ size_t num, size_t src_stride, event_t event) { \
+ BODY(src_stride, 1); \
+} \
+INLINE_OVERLOADABLE event_t async_work_group_strided_copy (global TYPE *dst, const local TYPE *src, \
+ size_t num, size_t dst_stride, event_t event) { \
+ BODY(1, dst_stride); \
+}
+#define DEF(TYPE) \
+ DEFN(TYPE); DEFN(TYPE##2); DEFN(TYPE##3); DEFN(TYPE##4); DEFN(TYPE##8); DEFN(TYPE##16);
+DEF(char)
+DEF(uchar)
+DEF(short)
+DEF(ushort)
+DEF(int)
+DEF(uint)
+DEF(float)
+#undef BODY
+#undef DEFN
+#undef DEF
+
+void wait_group_events (int num_events, event_t *event_list) {
+ barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+}
+
+#define DEFN(TYPE) \
+INLINE_OVERLOADABLE void prefetch(const global TYPE *p, size_t num) { }
+#define DEF(TYPE) \
+DEFN(TYPE); DEFN(TYPE##2); DEFN(TYPE##3); DEFN(TYPE##4); DEFN(TYPE##8); DEFN(TYPE##16)
+DEF(char);
+DEF(uchar);
+DEF(short);
+DEF(ushort);
+DEF(int);
+DEF(uint);
+DEF(long);
+DEF(ulong);
+DEF(float);
+#undef DEFN
+#undef DEF
+
+/////////////////////////////////////////////////////////////////////////////
// Atomic functions
/////////////////////////////////////////////////////////////////////////////
OVERLOADABLE uint __gen_ocl_atomic_add(__global uint *p, uint val);
--
1.8.1.2
More information about the Beignet
mailing list