[Beignet] [PATCH 2/8] OCL20: add device enqueue builtins.

Fri May 20 07:46:01 UTC 2016

Add three gen helper function calls for the enqueue builtins.
Store the ndrange info on the stack, and write the device enqueue info
to the auxiliary global buffer.
Store the SLM size information in the global buffer as well.
Skip all events: device enqueues run in order, which ensures that
all of the parent's enqueues have finished.

Signed-off-by: Yang Rong <rong.r.yang at intel.com>
---
 backend/src/backend/program.cpp          |   1 +
 backend/src/libocl/CMakeLists.txt        |   5 +-
 backend/src/libocl/include/ocl.h         |   1 +
 backend/src/libocl/include/ocl_enqueue.h |  92 ++++++++++++
 backend/src/libocl/src/ocl_enqueue.cl    | 248 +++++++++++++++++++++++++++++++
 5 files changed, 344 insertions(+), 3 deletions(-)
 create mode 100644 backend/src/libocl/include/ocl_enqueue.h
 create mode 100644 backend/src/libocl/src/ocl_enqueue.cl

diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 8eab1f6..f862881 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -553,6 +553,7 @@ namespace gbe {
     // FIXME we haven't implement those builtin functions,
     // so disable it currently.
     args.push_back("-fno-builtin");
+    args.push_back("-fblocks");
     args.push_back("-disable-llvm-optzns");
     if(bFastMath)
       args.push_back("-D __FAST_RELAXED_MATH__=1");
diff --git a/backend/src/libocl/CMakeLists.txt b/backend/src/libocl/CMakeLists.txt
index bda7793..68a8880 100644
--- a/backend/src/libocl/CMakeLists.txt
+++ b/backend/src/libocl/CMakeLists.txt
@@ -53,7 +53,7 @@ FOREACH(M ${OCL_COPY_HEADERS})
 ENDFOREACH(M) 
 
 SET (OCL_COPY_MODULES ocl_workitem ocl_atom ocl_async ocl_sync ocl_memcpy
-                      ocl_memset ocl_misc ocl_vload ocl_geometric ocl_image ocl_work_group ocl_pipe)
+                      ocl_memset ocl_misc ocl_vload ocl_geometric ocl_image ocl_work_group ocl_pipe ocl_enqueue)
 FOREACH(M ${OCL_COPY_MODULES})
     COPY_THE_HEADER(${M})
     COPY_THE_SOURCE(${M})
@@ -129,14 +129,13 @@ FOREACH(M ${OCL_BASH_GENERATED_MODULES})
     GENERATE_SOURCE_BASH(${M})
 ENDFOREACH(M) 
 
-
 if(NOT SPIRTARGET)
   set(SPIRTARGET spir)
   if(CMAKE_SIZEOF_VOID_P EQUAL 8)
     set(SPIRTARGET spir64)
   endif()
 endif()
-SET (CLANG_OCL_FLAGS -fno-builtin -ffp-contract=off -cl-kernel-arg-info -triple ${SPIRTARGET} -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND -cl-std=CL2.0)
+SET (CLANG_OCL_FLAGS -fno-builtin -ffp-contract=off -cl-kernel-arg-info -fblocks -triple ${SPIRTARGET} -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND -cl-std=CL2.0)
 MACRO(ADD_CL_TO_BC_TARGET _file)
     # CMake seems can not add pattern rule, use MACRO to replace.
     STRING(REGEX REPLACE "${LIBOCL_BINARY_DIR}/src/\(o.*\)\\.cl" "${OCL_OBJECT_DIR}/\\1.bc" output_name ${_file})
diff --git a/backend/src/libocl/include/ocl.h b/backend/src/libocl/include/ocl.h
index e2918c6..852a523 100644
--- a/backend/src/libocl/include/ocl.h
+++ b/backend/src/libocl/include/ocl.h
@@ -41,6 +41,7 @@
 #include "ocl_simd.h"
 #include "ocl_work_group.h"
 #include "ocl_pipe.h"
+#include "ocl_enqueue.h"
 #pragma OPENCL EXTENSION cl_khr_fp64 : disable
 #pragma OPENCL EXTENSION cl_khr_fp16 : disable
 #endif
diff --git a/backend/src/libocl/include/ocl_enqueue.h b/backend/src/libocl/include/ocl_enqueue.h
new file mode 100644
index 0000000..369ac45
--- /dev/null
+++ b/backend/src/libocl/include/ocl_enqueue.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef __OCL_ENQUEUE_H__
+#define __OCL_ENQUEUE_H__
+
+#include "ocl_types.h"
+#define CLK_ENQUEUE_FLAGS_WAIT_KERNEL 0
+#define CLK_ENQUEUE_FLAGS_NO_WAIT 1
+#define CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP 2
+#define CLK_SUCCESS 0
+#define CL_COMPLETE 0
+#define CLK_PROFILING_COMMAND_EXEC_TIME 0
+
+struct ndrange_info_t { // private-memory description of an ND-range, filled by the ndrange_1D/2D/3D builtins
+  int type; // tag written by the ndrange_*D overloads (0x1-0x3, 0x11-0x13, 0x21-0x23)
+  int global_work_size[3]; // only the first 1, 2 or 3 entries are written, depending on the overload
+  int local_work_size[3]; // written only by overloads that take a local_work_size argument
+  int global_work_offset[3]; // written only by overloads that take a global_work_offset argument
+};
+
+struct Block_literal { // view that enqueue_kernel casts a block pointer to; presumably mirrors the clang block ABI layout (confirm)
+  void *isa; // initialized to &_NSConcreteStackBlock or &_NSConcreteGlobalBlock
+  int flags;
+  int reserved;
+  __global void (*invoke)(void *, ...);
+  struct Block_descriptor_1 {
+    unsigned long int reserved;         // NULL in the ABI; the enqueue code stores the SLM-size byte count here (0 when there is none)
+    unsigned long int size;         // sizeof(struct Block_literal_1); the enqueue code copies this many bytes of the block
+    // optional helper functions
+    void (*copy_helper)(void *dst, void *src);     // IFF (1<<25)
+    void (*dispose_helper)(void *src);             // IFF (1<<25)
+    // required ABI.2010.3.16
+    const char *signature;                         // IFF (1<<30)
+  } *descriptor;
+  // imported variables
+};
+
+clk_event_t create_user_event(void);
+void retain_event(clk_event_t event);
+void release_event(clk_event_t event);
+void set_user_event_status(clk_event_t event, int status);
+bool is_valid_event(clk_event_t event);
+void capture_event_profiling_info(clk_event_t event, int name, global void *value);
+
+OVERLOADABLE uint get_kernel_work_group_size(void (^block)(void));
+OVERLOADABLE uint get_kernel_work_group_size(__private void *block);
+OVERLOADABLE uint get_kernel_preferred_work_group_size_multiple(void (^block)(void));
+OVERLOADABLE uint get_kernel_preferred_work_group_size_multiple(__private void *block);
+
+OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange, void (^block)(void));
+OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange,
+                                uint num_events_in_wait_list, const clk_event_t *event_wait_list,
+                                clk_event_t *event_ret, void (^block)(void));
+OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange, __private void *block, uint size0, ...);
+OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange,
+                                uint num_events_in_wait_list, const clk_event_t *event_wait_list,
+                                clk_event_t *event_ret,  __private void *block, uint size0, ...);
+
+queue_t get_default_queue(void);
+int __gen_enqueue_kernel(queue_t q, int flag, ndrange_t ndrange, void (^block)(void), int size);
+int __gen_enqueue_kernel_slm(queue_t q, int flag, ndrange_t ndrange, __private void * block, int count, __private int* slm_sizes);
+
+OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_size);
+OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_size, size_t local_work_size);
+OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_offset, size_t global_work_size, size_t local_work_size);
+
+OVERLOADABLE ndrange_t ndrange_2D(const size_t global_work_size[2]);
+OVERLOADABLE ndrange_t ndrange_2D(const size_t global_work_size[2], const size_t local_work_size[2]);
+OVERLOADABLE ndrange_t ndrange_2D(const size_t global_work_offset[2], const size_t global_work_size[2], const size_t local_work_size[2]);
+
+OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_size[3]);
+OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_size[3], const size_t local_work_size[3]);
+OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_offset[3], const size_t global_work_size[3], const size_t local_work_size[3]);
+
+int enqueue_marker (queue_t queue, uint num_events_in_wait_list, const clk_event_t *event_wait_list, clk_event_t *event_ret);
+#endif
diff --git a/backend/src/libocl/src/ocl_enqueue.cl b/backend/src/libocl/src/ocl_enqueue.cl
new file mode 100644
index 0000000..083e951
--- /dev/null
+++ b/backend/src/libocl/src/ocl_enqueue.cl
@@ -0,0 +1,248 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "ocl_types.h"
+#include "ocl_enqueue.h"
+#include "ocl_workitem.h"
+#include "ocl_atom.h"
+
+queue_t get_default_queue(void)
+{
+  queue_t queue; // never assigned before being returned
+  return queue; // intended as a NULL queue; NOTE(review): the value is actually indeterminate — confirm callers ignore it
+}
+
+ndrange_t __gen_ocl_set_ndrange_info(__private struct ndrange_info_t *info);
+__private struct ndrange_info_t* __gen_ocl_get_ndrange_info(ndrange_t info);
+__global int* __gen_ocl_get_enqueue_info_addr(void);
+
+OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange, void (^block)(void))
+{ // Appends one record (ndrange info + block literal bytes) to the enqueue-info buffer; q and flag are unused; always returns 0.
+  int i;
+  __private struct Block_literal *literal = (__private struct Block_literal *)block;
+  __private uchar *data = (__private uchar *)block; // byte view of the block literal, used by the copy loop
+  int size = literal->descriptor->size; // number of block-literal bytes to copy
+  literal->descriptor->reserved = 0; // no SLM sizes follow this record (__gen_enqueue_kernel_slm stores their byte count here)
+  __global int* start_addr = __gen_ocl_get_enqueue_info_addr();
+  int offset = atomic_add(start_addr, size + sizeof(struct ndrange_info_t)); // bump the int at start_addr by this record's size; presumably returns the previous value
+  __global uchar* addr = (__global uchar*)start_addr + offset + sizeof(int); // record starts after the leading int plus offset bytes
+  __private struct ndrange_info_t *info = __gen_ocl_get_ndrange_info(ndrange);
+
+  *((__global struct ndrange_info_t *)addr) = *info; // 1) ndrange info
+  addr += sizeof(*info);
+
+  for(i=0; i< size; i++) { // 2) block literal, copied byte by byte
+    addr[i] = data[i];
+  }
+  return 0;
+}
+
+OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange,
+                                uint num_events_in_wait_list, const clk_event_t *event_wait_list,
+                                clk_event_t *event_ret, void (^block)(void))
+{ // The three event arguments are ignored (event_ret is not written); forwards to the event-less overload.
+  return enqueue_kernel(q, flag, ndrange, block);
+}
+
+int __gen_enqueue_kernel_slm(queue_t q, int flag, ndrange_t ndrange, __private void * block, int count, __private int* slm_sizes)
+{ // Appends one record (ndrange info + block literal bytes + count SLM sizes) to the enqueue-info buffer; q and flag are unused; always returns 0.
+  int i;
+  __private struct Block_literal* literal = (__private struct Block_literal *)block;
+  __private uchar* data = (__private uchar *)block; // byte view of the block literal, used by the copy loop
+  int size = literal->descriptor->size; // number of block-literal bytes to copy
+  int slm_size = count * sizeof(int); // bytes taken by the count entries of slm_sizes
+  literal->descriptor->reserved = slm_size; // stored in the descriptor before the literal is copied out
+  __global int* start_addr = __gen_ocl_get_enqueue_info_addr();
+  int offset = atomic_add(start_addr, size + sizeof(struct ndrange_info_t) + slm_size); // bump the int at start_addr by this record's total size; presumably returns the previous value
+  __global uchar* addr = (__global uchar*)start_addr + offset + sizeof(int); // record starts after the leading int plus offset bytes
+  __private struct ndrange_info_t *info = __gen_ocl_get_ndrange_info(ndrange);
+
+  *((__global struct ndrange_info_t *)addr) = *info; // 1) ndrange info
+  addr += sizeof(*info);
+
+  for(i=0; i < size; i++) { // 2) block literal, copied byte by byte
+    addr[i] = data[i];
+  }
+
+  addr += size;
+  for(i=0; i < count; i++) { // 3) SLM sizes, one int each
+    ((__global int *)addr)[i] = slm_sizes[i]; // NOTE(review): addr is int-aligned only if size is a multiple of 4 — confirm
+  }
+  return 0;
+}
+
+clk_event_t create_user_event(void)
+{
+  clk_event_t e; // stub: e is returned without ever being assigned
+  return e;
+}
+
+void retain_event(clk_event_t event)
+{ // stub: event is ignored
+  return;
+}
+
+void release_event(clk_event_t event)
+{ // stub: event is ignored
+  return;
+}
+
+void set_user_event_status(clk_event_t event, int status)
+{ // stub: event and status are ignored
+  return;
+}
+
+bool is_valid_event(clk_event_t event)
+{ // stub: event is not inspected
+  return 1; // every event is reported as valid
+}
+
+OVERLOADABLE uint get_kernel_work_group_size(void (^block)(void))
+{ // block is ignored
+  return 512; // hard-coded; NOTE(review): the __private void* overload returns 256 — confirm the difference is intended
+}
+
+OVERLOADABLE uint get_kernel_work_group_size(__private void *block)
+{ // block is ignored
+  return 256; // hard-coded; NOTE(review): the void (^)(void) overload returns 512 — confirm the difference is intended
+}
+
+OVERLOADABLE uint get_kernel_preferred_work_group_size_multiple(void (^block)(void))
+{ // block is ignored
+  return 16; // hard-coded constant
+}
+
+OVERLOADABLE uint get_kernel_preferred_work_group_size_multiple(__private void *block)
+{ // block is ignored
+  return 16; // hard-coded constant
+}
+
+void capture_event_profiling_info(clk_event_t event, int name, global void *value)
+{
+  // Fake profiling data: event and name are ignored; two fixed ulong values are written to value.
+  ((__global ulong *)value)[0] = 0x3000;
+  ((__global ulong *)value)[1] = 0x6000;
+}
+OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_size)
+{
+  struct ndrange_info_t info; // only the fields assigned below are set
+  info.type = 0x1; // tag for the 1D, global-size-only overload
+  info.global_work_size[0] = global_work_size; // size_t converted to int
+  return __gen_ocl_set_ndrange_info(&info);
+  // local_work_size, global_work_offset and global_work_size[1..2] are left uninitialized in info.
+}
+
+OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_size, size_t local_work_size)
+{
+  struct ndrange_info_t info; // only the fields assigned below are set
+  info.type = 0x2; // tag for the 1D, global + local size overload
+  info.global_work_size[0] = global_work_size; // size_t converted to int
+  info.local_work_size[0] = local_work_size;
+  return __gen_ocl_set_ndrange_info(&info);
+ // global_work_offset and entries [1..2] of both size arrays are left uninitialized in info.
+}
+
+
+OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_offset, size_t global_work_size, size_t local_work_size)
+{
+  struct ndrange_info_t info; // only the fields assigned below are set
+  info.type = 0x3; // tag for the 1D, offset + global + local size overload
+  info.global_work_size[0] = global_work_size; // size_t converted to int
+  info.local_work_size[0] = local_work_size;
+  info.global_work_offset[0] = global_work_offset;
+  return __gen_ocl_set_ndrange_info(&info);
+  // Entries [1..2] of all three arrays are left uninitialized in info.
+}
+
+OVERLOADABLE ndrange_t ndrange_2D(const size_t global_work_size[2])
+{
+  struct ndrange_info_t info; // only the fields assigned below are set
+  info.type = 0x11; // tag for the 2D, global-size-only overload
+  info.global_work_size[0] = global_work_size[0]; // size_t converted to int
+  info.global_work_size[1] = global_work_size[1];
+  return __gen_ocl_set_ndrange_info(&info);
+  // local_work_size, global_work_offset and global_work_size[2] are left uninitialized in info.
+}
+
+OVERLOADABLE ndrange_t ndrange_2D(const size_t global_work_size[2], const size_t local_work_size[2])
+{
+  struct ndrange_info_t info; // global_work_offset and entry [2] of both size arrays stay uninitialized
+  info.type = 0x12; // tag for the 2D, global + local size overload
+  info.global_work_size[0] = global_work_size[0]; // size_t converted to int
+  info.global_work_size[1] = global_work_size[1];
+  info.local_work_size[0] = local_work_size[0];
+  info.local_work_size[1] = local_work_size[1];
+  return __gen_ocl_set_ndrange_info(&info);
+}
+
+
+OVERLOADABLE ndrange_t ndrange_2D(const size_t global_work_offset[2], const size_t global_work_size[2], const size_t local_work_size[2])
+{
+  struct ndrange_info_t info; // entry [2] of all three arrays stays uninitialized
+  info.type = 0x13; // tag for the 2D, offset + global + local size overload
+  info.global_work_size[0] = global_work_size[0]; // size_t converted to int
+  info.global_work_size[1] = global_work_size[1];
+  info.local_work_size[0] = local_work_size[0];
+  info.local_work_size[1] = local_work_size[1];
+  info.global_work_offset[0] = global_work_offset[0];
+  info.global_work_offset[1] = global_work_offset[1];
+  return __gen_ocl_set_ndrange_info(&info);
+}
+
+OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_size[3])
+{
+  struct ndrange_info_t info; // local_work_size and global_work_offset stay uninitialized
+  info.type = 0x21; // tag for the 3D, global-size-only overload
+  info.global_work_size[0] = global_work_size[0]; // size_t converted to int
+  info.global_work_size[1] = global_work_size[1];
+  info.global_work_size[2] = global_work_size[2];
+  return __gen_ocl_set_ndrange_info(&info);
+}
+
+OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_size[3], const size_t local_work_size[3])
+{
+  struct ndrange_info_t info; // global_work_offset stays uninitialized
+  info.type = 0x22; // tag for the 3D, global + local size overload
+  info.global_work_size[0] = global_work_size[0]; // size_t converted to int
+  info.global_work_size[1] = global_work_size[1];
+  info.global_work_size[2] = global_work_size[2];
+  info.local_work_size[0] = local_work_size[0];
+  info.local_work_size[1] = local_work_size[1];
+  info.local_work_size[2] = local_work_size[2];
+  return __gen_ocl_set_ndrange_info(&info);
+}
+
+OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_offset[3], const size_t global_work_size[3], const size_t local_work_size[3])
+{
+  struct ndrange_info_t info; // every field is assigned below
+  info.type = 0x23; // tag for the 3D, offset + global + local size overload
+  info.global_work_size[0] = global_work_size[0]; // size_t converted to int
+  info.global_work_size[1] = global_work_size[1];
+  info.global_work_size[2] = global_work_size[2];
+  info.local_work_size[0] = local_work_size[0];
+  info.local_work_size[1] = local_work_size[1];
+  info.local_work_size[2] = local_work_size[2];
+  info.global_work_offset[0] = global_work_offset[0];
+  info.global_work_offset[1] = global_work_offset[1];
+  info.global_work_offset[2] = global_work_offset[2];
+  return __gen_ocl_set_ndrange_info(&info);
+}
+
+int enqueue_marker (queue_t queue, uint num_events_in_wait_list, const clk_event_t *event_wait_list, clk_event_t *event_ret)
+{ // stub: all arguments are ignored and event_ret is not written
+  return 0;
+}
-- 
1.9.1