[Beignet] [PATCH 2/3] Implement async copy and prefetch built-ins.

Yang Rong rong.r.yang at intel.com
Thu Aug 15 02:10:16 PDT 2013


Implement the async copies with normal loads and stores, so
wait_group_events is the same as a barrier.
prefetch is only defined; its function body is empty.

Signed-off-by: Yang Rong <rong.r.yang at intel.com>
---
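Usage note (not part of the patch): a kernel is expected to stage data
through local memory with these built-ins roughly as sketched below. This
is only an illustrative sketch; the kernel and argument names are made up,
and it assumes the work-group size is at least tile_size.

    __kernel void tile_scale(__global const float *in,
                             __global float *out,
                             __local float *tile,   /* >= tile_size floats */
                             uint tile_size)
    {
      uint lid = get_local_id(0);
      /* Every work-item of the group calls the copy with identical arguments;
         with this patch the copy is done with plain loads and stores. */
      event_t e = async_work_group_copy(tile, in + get_group_id(0) * tile_size,
                                        tile_size, 0);
      /* With this implementation the wait is just a work-group barrier. */
      wait_group_events(1, &e);
      if (lid < tile_size)
        out[get_group_id(0) * tile_size + lid] = 2.0f * tile[lid];
    }
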
 backend/src/ocl_stdlib.tmpl.h | 67 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)
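
The copy loop distributes the elements round-robin over the work-group:
the work-item with flattened local id lid in a group of size work-items
copies elements lid, lid + size, lid + 2*size, ... below num. The small
host-side C sketch below (example values only, not part of the patch)
prints that distribution and mirrors the BODY macro added below:

    #include <stdio.h>

    int main(void) {
      unsigned num = 10, size = 4;              /* e.g. 10 elements, 4 work-items */
      for (unsigned lid = 0; lid < size; lid++) {
        unsigned count = num / size, offset = lid;
        for (unsigned i = 0; i < count; i++) {  /* full rounds */
          printf("work-item %u copies element %u\n", lid, offset);
          offset += size;
        }
        if (offset < num)                       /* one tail element, if any */
          printf("work-item %u copies element %u\n", lid, offset);
      }
      return 0;
    }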

diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h
index d1cc6aa..9703405 100644
--- a/backend/src/ocl_stdlib.tmpl.h
+++ b/backend/src/ocl_stdlib.tmpl.h
@@ -37,6 +37,7 @@ typedef __typeof__(sizeof(int)) size_t;
 typedef __typeof__((int *)0-(int *)0) ptrdiff_t;
 typedef signed int intptr_t;
 typedef unsigned int uintptr_t;
+typedef signed int event_t;
 
 /////////////////////////////////////////////////////////////////////////////
 // OpenCL address space
@@ -1226,6 +1227,72 @@ INLINE void write_mem_fence(cl_mem_fence_flags flags) {
 }
 
 /////////////////////////////////////////////////////////////////////////////
+// Async Copies and prefetch
+/////////////////////////////////////////////////////////////////////////////
+#define BODY(SRC_STRIDE, DST_STRIDE) \
+  uint size = get_local_size(2) * get_local_size(1) * get_local_size(0); \
+  uint count = num / size;  \
+  uint offset = get_local_id(2) * get_local_size(1) + get_local_id(1);  \
+  offset = offset * get_local_size(0) + get_local_id(0); \
+  for(uint i=0; i<count; i+=1) { \
+    *(dst + offset * DST_STRIDE) = *(src + offset * SRC_STRIDE); \
+    offset += size;                                 \
+  } \
+  if(offset < num) \
+    *(dst + offset * DST_STRIDE) = *(src + offset * SRC_STRIDE); \
+  return 0;
+
+#define DEFN(TYPE) \
+INLINE_OVERLOADABLE event_t async_work_group_copy (local TYPE *dst, const global TYPE *src, \
+                                                   size_t num, event_t event) { \
+  BODY(1, 1); \
+} \
+INLINE_OVERLOADABLE event_t async_work_group_copy (global TYPE *dst, const local TYPE *src, \
+                                                   size_t num, event_t event) { \
+  BODY(1, 1); \
+} \
+INLINE_OVERLOADABLE event_t async_work_group_strided_copy (local TYPE *dst, const global TYPE *src, \
+                                                           size_t num, size_t src_stride, event_t event) { \
+  BODY(src_stride, 1); \
+} \
+INLINE_OVERLOADABLE event_t async_work_group_strided_copy (global TYPE *dst, const local TYPE *src, \
+                                                           size_t num, size_t dst_stride, event_t event) { \
+  BODY(1, dst_stride); \
+}
+#define DEF(TYPE) \
+  DEFN(TYPE); DEFN(TYPE##2); DEFN(TYPE##3); DEFN(TYPE##4); DEFN(TYPE##8); DEFN(TYPE##16);
+DEF(char)
+DEF(uchar)
+DEF(short)
+DEF(ushort)
+DEF(int)
+DEF(uint)
+DEF(float)
+#undef BODY
+#undef DEFN
+#undef DEF
+
+void wait_group_events (int num_events, event_t *event_list) {
+  barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+}
+
+#define DEFN(TYPE) \
+INLINE_OVERLOADABLE void prefetch(const global TYPE *p, size_t num) { }
+#define DEF(TYPE) \
+DEFN(TYPE); DEFN(TYPE##2); DEFN(TYPE##3); DEFN(TYPE##4); DEFN(TYPE##8); DEFN(TYPE##16)
+DEF(char);
+DEF(uchar);
+DEF(short);
+DEF(ushort);
+DEF(int);
+DEF(uint);
+DEF(long);
+DEF(ulong);
+DEF(float);
+#undef DEFN
+#undef DEF
+
+/////////////////////////////////////////////////////////////////////////////
 // Atomic functions
 /////////////////////////////////////////////////////////////////////////////
 OVERLOADABLE uint __gen_ocl_atomic_add(__global uint *p, uint val);
-- 
1.8.1.2


