[Beignet] [PATCH 12/19] OCL20: add device enqueue builtins.

Yang Rong rong.r.yang at intel.com
Mon Nov 28 11:32:38 UTC 2016


Add three gen helper function calls for enqueue builtins.
Store the ndrange info on the stack, and write the device enqueue info
to the auxiliary global buffer.
Store the SLM information in the global buffer.
Skip all events: device enqueues run in order, which ensures that
all of the parent's enqueues have finished.

Signed-off-by: Yang Rong <rong.r.yang at intel.com>
Reviewed-by: Pan Xiuli <xiuli.pan at intel.com>
---
 backend/src/backend/program.cpp          |   5 +-
 backend/src/libocl/CMakeLists.txt        |   4 +-
 backend/src/libocl/include/ocl.h         |   1 +
 backend/src/libocl/include/ocl_enqueue.h |  90 ++++++++++++
 backend/src/libocl/src/ocl_enqueue.cl    | 238 +++++++++++++++++++++++++++++++
 5 files changed, 334 insertions(+), 4 deletions(-)
 create mode 100644 backend/src/libocl/include/ocl_enqueue.h
 create mode 100644 backend/src/libocl/src/ocl_enqueue.cl

diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 413f397..e0107dc 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -690,9 +690,10 @@ namespace gbe {
     args.push_back("-x");
     args.push_back("cl");
     args.push_back("-triple");
-    if (oclVersion >= 200)
+    if (oclVersion >= 200) {
       args.push_back("spir64");
-    else
+      args.push_back("-fblocks");
+    } else
       args.push_back("spir");
 #endif /* LLVM_VERSION_MINOR <= 2 */
     args.push_back("stringInput.cl");
diff --git a/backend/src/libocl/CMakeLists.txt b/backend/src/libocl/CMakeLists.txt
index a4f575f..c68ecb0 100644
--- a/backend/src/libocl/CMakeLists.txt
+++ b/backend/src/libocl/CMakeLists.txt
@@ -68,7 +68,7 @@ FOREACH(M ${OCL_COPY_MODULES_12})
     COPY_THE_SOURCE(OCL_SOURCE_FILES_12 ${M})
 ENDFOREACH(M)
 
-SET (OCL_COPY_MODULES_20 ocl_vload_20 ocl_atom_20 ocl_pipe)
+SET (OCL_COPY_MODULES_20 ocl_vload_20 ocl_atom_20 ocl_pipe ocl_enqueue)
 FOREACH(M ${OCL_COPY_MODULES_20})
     COPY_THE_HEADER(${M})
     COPY_THE_SOURCE(OCL_SOURCE_FILES_20 ${M})
@@ -157,7 +157,7 @@ FOREACH(M ${OCL_BASH_GENERATED_MODULES})
 ENDFOREACH(M) 
 
 SET (CLANG_OCL_FLAGS -fno-builtin -ffp-contract=off -triple spir -cl-kernel-arg-info -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND "-cl-std=CL1.2" -D__OPENCL_C_VERSION__=120)
-SET (CLANG_OCL_FLAGS_20 -fno-builtin -ffp-contract=off -triple spir64 -cl-kernel-arg-info -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND "-cl-std=CL2.0" -D__OPENCL_C_VERSION__=200)
+SET (CLANG_OCL_FLAGS_20 -fno-builtin -ffp-contract=off -triple spir64 -cl-kernel-arg-info -fblocks -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND "-cl-std=CL2.0" -D__OPENCL_C_VERSION__=200)
 
 MACRO(ADD_CL_TO_BC_TARGET _file _output _clang_flag)
     # CMake seems can not add pattern rule, use MACRO to replace.
diff --git a/backend/src/libocl/include/ocl.h b/backend/src/libocl/include/ocl.h
index 677d2d4..2548cb7 100644
--- a/backend/src/libocl/include/ocl.h
+++ b/backend/src/libocl/include/ocl.h
@@ -100,6 +100,7 @@
 #include "ocl_atom_20.h"
 #include "ocl_pipe.h"
 #include "ocl_math_20.h"
+#include "ocl_enqueue.h"
 #else
 #include "ocl_vload.h"
 #include "ocl_atom.h"
diff --git a/backend/src/libocl/include/ocl_enqueue.h b/backend/src/libocl/include/ocl_enqueue.h
new file mode 100644
index 0000000..6479df7
--- /dev/null
+++ b/backend/src/libocl/include/ocl_enqueue.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef __OCL_ENQUEUE_H__
+#define __OCL_ENQUEUE_H__
+
+#include "ocl_types.h"
+#define CLK_ENQUEUE_FLAGS_WAIT_KERNEL 0
+#define CLK_ENQUEUE_FLAGS_NO_WAIT 1
+#define CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP 2
+#define CLK_SUCCESS 0
+#define CL_COMPLETE 0
+#define CLK_PROFILING_COMMAND_EXEC_TIME 0
+
+struct ndrange_info_t { // int-only description of one ndrange, filled in by the ndrange_1D/2D/3D builders
+  int type; // tag set by the builders: high nibble 0/1/2 = 1D/2D/3D; low nibble 1 = global size only, 2 = + local size, 3 = + offset
+  int global_work_size[3]; // builders store size_t arguments here, so values are narrowed to int
+  int local_work_size[3]; // only assigned by builders whose type low nibble is 2 or 3
+  int global_work_offset[3]; // only assigned by builders whose type low nibble is 3
+};
+
+struct Block_literal { // view used by ocl_enqueue.cl to read a block object; only descriptor->size and descriptor->reserved are accessed there
+  void *isa; // initialized to &_NSConcreteStackBlock or &_NSConcreteGlobalBlock
+  int flags;
+  int reserved;
+  __global void (*invoke)(void *, ...);
+  struct Block_descriptor_1 {
+    unsigned long int reserved;         // NULL in the original layout; ocl_enqueue.cl overwrites it with the SLM size-list byte count (0 when there is none)
+    unsigned long int size;         // sizeof(struct Block_literal_1); ocl_enqueue.cl uses it as the number of block bytes to copy
+    // optional helper functions
+    void (*copy_helper)(void *dst, void *src);     // IFF (1<<25)
+    void (*dispose_helper)(void *src);             // IFF (1<<25)
+    // required ABI.2010.3.16
+    const char *signature;                         // IFF (1<<30)
+  } *descriptor;
+  // imported variables (captured values follow the fixed fields above)
+};
+
+clk_event_t create_user_event(void);
+void retain_event(clk_event_t event);
+void release_event(clk_event_t event);
+void set_user_event_status(clk_event_t event, int status);
+bool is_valid_event(clk_event_t event);
+void capture_event_profiling_info(clk_event_t event, int name, global void *value);
+
+uint __get_kernel_work_group_size_impl(__private void *block);
+uint __get_kernel_preferred_work_group_multiple_impl(__private void *block);
+
+OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange, void (^block)(void));
+OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange,
+                                uint num_events_in_wait_list, const clk_event_t *event_wait_list,
+                                clk_event_t *event_ret, void (^block)(void));
+OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange, __private void *block, uint size0, ...);
+OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange,
+                                uint num_events_in_wait_list, const clk_event_t *event_wait_list,
+                                clk_event_t *event_ret,  __private void *block, uint size0, ...);
+
+queue_t get_default_queue(void);
+int __gen_enqueue_kernel(queue_t q, int flag, ndrange_t ndrange, void (^block)(void), int size);
+int __gen_enqueue_kernel_slm(queue_t q, int flag, ndrange_t ndrange, __private void * block, int count, __private int* slm_sizes);
+
+OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_size);
+OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_size, size_t local_work_size);
+OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_offset, size_t global_work_size, size_t local_work_size);
+
+OVERLOADABLE ndrange_t ndrange_2D(const size_t global_work_size[2]);
+OVERLOADABLE ndrange_t ndrange_2D(const size_t global_work_size[2], const size_t local_work_size[2]);
+OVERLOADABLE ndrange_t ndrange_2D(const size_t global_work_offset[2], const size_t global_work_size[2], const size_t local_work_size[2]);
+
+OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_size[3]);
+OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_size[3], const size_t local_work_size[3]);
+OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_offset[3], const size_t global_work_size[3], const size_t local_work_size[3]);
+
+int enqueue_marker (queue_t queue, uint num_events_in_wait_list, const clk_event_t *event_wait_list, clk_event_t *event_ret);
+#endif
diff --git a/backend/src/libocl/src/ocl_enqueue.cl b/backend/src/libocl/src/ocl_enqueue.cl
new file mode 100644
index 0000000..dc8fa3b
--- /dev/null
+++ b/backend/src/libocl/src/ocl_enqueue.cl
@@ -0,0 +1,238 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "ocl_types.h"
+#include "ocl_enqueue.h"
+#include "ocl_workitem.h"
+#include "ocl_atom.h"
+
+queue_t get_default_queue(void) // stub: the enqueue_kernel implementations in this file never read their queue argument
+{
+  queue_t queue; // never assigned; NOTE(review): the value is indeterminate rather than a guaranteed NULL -- confirm nothing compares it
+  return queue; //return NULL queue
+}
+
+ndrange_t __gen_ocl_set_ndrange_info(__private struct ndrange_info_t *info);
+__private struct ndrange_info_t* __gen_ocl_get_ndrange_info(ndrange_t info);
+__global int* __gen_ocl_get_enqueue_info_addr(void);
+
+OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange, void (^block)(void))
+{ // Appends one record {ndrange_info_t, raw block bytes} to the enqueue-info buffer. q and flag are not used; always returns 0.
+  int i;
+  __private struct Block_literal *literal = (__private struct Block_literal *)block;
+  __private uchar *data = (__private uchar *)block; // byte view of the same block, used by the copy loop
+  int size = literal->descriptor->size; // number of block bytes to copy
+  literal->descriptor->reserved = 0; // no SLM size list for this variant (__gen_enqueue_kernel_slm stores a byte count here)
+  __global int* start_addr = __gen_ocl_get_enqueue_info_addr();
+  int offset = atomic_add(start_addr, size + sizeof(struct ndrange_info_t)); // advance the counter kept in the buffer's first int; NOTE(review): no capacity check is visible here
+  __global uchar* addr = (__global uchar*)start_addr + offset + sizeof(int); // records are placed after the counter int
+  __private struct ndrange_info_t *info = __gen_ocl_get_ndrange_info(ndrange);
+
+  *((__global struct ndrange_info_t *)addr) = *info; // the record starts with the ndrange description
+  addr += sizeof(*info);
+
+  for(i=0; i< size; i++) { // followed by the block, copied byte by byte
+    addr[i] = data[i];
+  }
+  return 0;
+}
+
+OVERLOADABLE int enqueue_kernel(queue_t q, int flag, ndrange_t ndrange,
+                                uint num_events_in_wait_list, const clk_event_t *event_wait_list,
+                                clk_event_t *event_ret, void (^block)(void))
+{ // The three event arguments are ignored (event_ret is never written); the call is forwarded to the overload without events.
+  return enqueue_kernel(q, flag, ndrange, block);
+}
+
+int __gen_enqueue_kernel_slm(queue_t q, int flag, ndrange_t ndrange, __private void * block, int count, __private int* slm_sizes)
+{ // Appends one record {ndrange_info_t, raw block bytes, count ints from slm_sizes} to the enqueue-info buffer. q and flag are not used; always returns 0.
+  int i;
+  __private struct Block_literal* literal = (__private struct Block_literal *)block;
+  __private uchar* data = (__private uchar *)block; // byte view of the same block, used by the copy loop
+  int size = literal->descriptor->size; // number of block bytes to copy
+  int slm_size = count * sizeof(int); // bytes occupied by the trailing size list
+  literal->descriptor->reserved = slm_size; // written through the descriptor pointer; presumably read back by the consumer of the buffer -- confirm
+  __global int* start_addr = __gen_ocl_get_enqueue_info_addr();
+  int offset = atomic_add(start_addr, size + sizeof(struct ndrange_info_t) + slm_size); // advance the counter kept in the buffer's first int; NOTE(review): no capacity check is visible here
+  __global uchar* addr = (__global uchar*)start_addr + offset + sizeof(int); // records are placed after the counter int
+  __private struct ndrange_info_t *info = __gen_ocl_get_ndrange_info(ndrange);
+
+  *((__global struct ndrange_info_t *)addr) = *info; // the record starts with the ndrange description
+  addr += sizeof(*info);
+
+  for(i=0; i < size; i++) { // followed by the block, copied byte by byte
+    addr[i] = data[i];
+  }
+
+  addr += size; // step past the block bytes to where the size list goes
+  for(i=0; i < count; i++) {
+    ((__global int *)addr)[i] = slm_sizes[i]; // NOTE(review): assumes size leaves addr int-aligned -- confirm
+  }
+  return 0;
+}
+
+clk_event_t create_user_event(void) // stub: no event object is created
+{
+  clk_event_t e; // never assigned; the returned value is indeterminate
+  return e;
+}
+
+void retain_event(clk_event_t event)
+{ // no-op stub: the event argument is not used
+  return;
+}
+
+void release_event(clk_event_t event)
+{ // no-op stub: the event argument is not used
+  return;
+}
+
+void set_user_event_status(clk_event_t event, int status)
+{ // no-op stub: neither event nor status is used
+  return;
+}
+
+bool is_valid_event(clk_event_t event)
+{ // stub: the event argument is not inspected
+  return 1; // every event is reported as valid
+}
+
+uint __get_kernel_work_group_size_impl(__private void *block)
+{ // the block argument is not inspected
+  return 256; // fixed answer for every block
+}
+
+uint __get_kernel_preferred_work_group_multiple_impl(__private  void *block)
+{ // the block argument is not inspected
+  return 16; // fixed answer for every block
+}
+
+void capture_event_profiling_info(clk_event_t event, int name, global void *value)
+{ // event and name are ignored; value must have room for two ulongs
+  // fake profiling data: the same two constants are written on every call
+  ((__global ulong *)value)[0] = 0x3000;
+  ((__global ulong *)value)[1] = 0x6000;
+}
+OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_size)
+{ // 1D range described by a global size only (type 0x1)
+  struct ndrange_info_t info; // local_work_size and global_work_offset are left unassigned for this type
+  info.type = 0x1;
+  info.global_work_size[0] = global_work_size; // size_t narrowed to the struct's int field
+  return __gen_ocl_set_ndrange_info(&info);
+  // info is passed by address; how the returned ndrange_t refers to it is not visible here
+}
+
+OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_size, size_t local_work_size)
+{ // 1D range described by global and local sizes (type 0x2)
+  struct ndrange_info_t info; // global_work_offset is left unassigned for this type
+  info.type = 0x2;
+  info.global_work_size[0] = global_work_size; // size_t narrowed to the struct's int field
+  info.local_work_size[0] = local_work_size;
+  return __gen_ocl_set_ndrange_info(&info);
+  // info is passed by address; how the returned ndrange_t refers to it is not visible here
+}
+
+
+OVERLOADABLE ndrange_t ndrange_1D(size_t global_work_offset, size_t global_work_size, size_t local_work_size)
+{ // 1D range described by offset, global size and local size (type 0x3)
+  struct ndrange_info_t info; // only element [0] of each array is assigned
+  info.type = 0x3;
+  info.global_work_size[0] = global_work_size; // size_t narrowed to the struct's int field
+  info.local_work_size[0] = local_work_size;
+  info.global_work_offset[0] = global_work_offset;
+  return __gen_ocl_set_ndrange_info(&info);
+  // info is passed by address; how the returned ndrange_t refers to it is not visible here
+}
+
+OVERLOADABLE ndrange_t ndrange_2D(const size_t global_work_size[2])
+{ // 2D range described by global sizes only (type 0x11)
+  struct ndrange_info_t info; // local_work_size and global_work_offset are left unassigned for this type
+  info.type = 0x11;
+  info.global_work_size[0] = global_work_size[0]; // size_t narrowed to the struct's int field
+  info.global_work_size[1] = global_work_size[1];
+  return __gen_ocl_set_ndrange_info(&info);
+  // info is passed by address; how the returned ndrange_t refers to it is not visible here
+}
+
+OVERLOADABLE ndrange_t ndrange_2D(const size_t global_work_size[2], const size_t local_work_size[2])
+{ // 2D range described by global and local sizes (type 0x12)
+  struct ndrange_info_t info; // global_work_offset is left unassigned for this type
+  info.type = 0x12;
+  info.global_work_size[0] = global_work_size[0]; // size_t narrowed to the struct's int field
+  info.global_work_size[1] = global_work_size[1];
+  info.local_work_size[0] = local_work_size[0];
+  info.local_work_size[1] = local_work_size[1];
+  return __gen_ocl_set_ndrange_info(&info); // info is passed by address
+}
+
+
+OVERLOADABLE ndrange_t ndrange_2D(const size_t global_work_offset[2], const size_t global_work_size[2], const size_t local_work_size[2])
+{ // 2D range described by offsets, global sizes and local sizes (type 0x13)
+  struct ndrange_info_t info; // only elements [0] and [1] of each array are assigned
+  info.type = 0x13;
+  info.global_work_size[0] = global_work_size[0]; // size_t narrowed to the struct's int field
+  info.global_work_size[1] = global_work_size[1];
+  info.local_work_size[0] = local_work_size[0];
+  info.local_work_size[1] = local_work_size[1];
+  info.global_work_offset[0] = global_work_offset[0];
+  info.global_work_offset[1] = global_work_offset[1];
+  return __gen_ocl_set_ndrange_info(&info); // info is passed by address
+}
+
+OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_size[3])
+{ // 3D range described by global sizes only (type 0x21)
+  struct ndrange_info_t info; // local_work_size and global_work_offset are left unassigned for this type
+  info.type = 0x21;
+  info.global_work_size[0] = global_work_size[0]; // size_t narrowed to the struct's int field
+  info.global_work_size[1] = global_work_size[1];
+  info.global_work_size[2] = global_work_size[2];
+  return __gen_ocl_set_ndrange_info(&info); // info is passed by address
+}
+
+OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_size[3], const size_t local_work_size[3])
+{ // 3D range described by global and local sizes (type 0x22)
+  struct ndrange_info_t info; // global_work_offset is left unassigned for this type
+  info.type = 0x22;
+  info.global_work_size[0] = global_work_size[0]; // size_t narrowed to the struct's int field
+  info.global_work_size[1] = global_work_size[1];
+  info.global_work_size[2] = global_work_size[2];
+  info.local_work_size[0] = local_work_size[0];
+  info.local_work_size[1] = local_work_size[1];
+  info.local_work_size[2] = local_work_size[2];
+  return __gen_ocl_set_ndrange_info(&info); // info is passed by address
+}
+
+OVERLOADABLE ndrange_t ndrange_3D(const size_t global_work_offset[3], const size_t global_work_size[3], const size_t local_work_size[3])
+{ // 3D range described by offsets, global sizes and local sizes (type 0x23); every field of info is assigned
+  struct ndrange_info_t info;
+  info.type = 0x23;
+  info.global_work_size[0] = global_work_size[0]; // size_t narrowed to the struct's int field
+  info.global_work_size[1] = global_work_size[1];
+  info.global_work_size[2] = global_work_size[2];
+  info.local_work_size[0] = local_work_size[0];
+  info.local_work_size[1] = local_work_size[1];
+  info.local_work_size[2] = local_work_size[2];
+  info.global_work_offset[0] = global_work_offset[0];
+  info.global_work_offset[1] = global_work_offset[1];
+  info.global_work_offset[2] = global_work_offset[2];
+  return __gen_ocl_set_ndrange_info(&info); // info is passed by address
+}
+
+int enqueue_marker (queue_t queue, uint num_events_in_wait_list, const clk_event_t *event_wait_list, clk_event_t *event_ret)
+{ // stub: no argument is used and event_ret is never written
+  return 0; // same value as CLK_SUCCESS in ocl_enqueue.h
+}
-- 
2.1.4



More information about the Beignet mailing list