[Beignet] [PATCH 7/8] OCL20: handle device enqueue in runtime.

Yang Rong rong.r.yang at intel.com
Fri May 20 07:46:06 UTC 2016


There are several steps to handle device enqueue (see the kernel sketch
after this list):
1. Allocate the device enqueue bo to store the device enqueue
information for the parent kernel. We must also convert all global
buffers to SVM buffers, to make sure the child kernels see the same
GPU addresses.
2. When flushing the command queue, check whether a device enqueue
happened. If it did, wait for the kernel to finish and parse the
device enqueue info.
3. Start the child NDRange according to the device enqueue info, with
the parent's global buffers as the exec info.
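
For reference, a minimal OpenCL C 2.0 kernel of the kind this patch
handles (kernel and argument names are made up; any kernel calling
enqueue_kernel will do):

    kernel void parent(global int *dst)
    {
      queue_t q = get_default_queue();
      /* The device writes this enqueue request into the device enqueue
         buffer; the runtime parses it and launches the child kernel. */
      enqueue_kernel(q, CLK_ENQUEUE_FLAGS_NO_WAIT,
                     ndrange_1D(get_global_size(0)),
                     ^{ dst[get_global_id(0)] = (int)get_global_id(0); });
    }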

Because of non-uniform work-group sizes, one enqueue API call may flush
several times, but device enqueue only needs to be handled once, so add
a flag to cl_command_queue_flush indicating the last flush.
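
In code, the wrap path computes how many non-empty sub-ranges remain
and flags only the final flush, condensed from
cl_command_queue_ND_range_wrap in the diff below (rem[] stands for the
per-dimension global size remainders):

    int count = (rem[0] ? 2 : 1) * (rem[1] ? 2 : 1) * (rem[2] ? 2 : 1);
    /* inside the sub-range loop: */
    err = cl_command_queue_flush(queue, --count == 0);

All the single-shot enqueue APIs simply pass 1.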

Signed-off-by: Yang Rong <rong.r.yang at intel.com>
---
 src/CMakeLists.txt          |   2 +
 src/cl_api.c                |  16 ++--
 src/cl_command_queue.c      |  35 ++++++---
 src/cl_command_queue.h      |   6 +-
 src/cl_command_queue_gen7.c |   5 +-
 src/cl_context.c            |  15 ++++
 src/cl_context.h            |   3 +
 src/cl_device_enqueue.c     | 181 ++++++++++++++++++++++++++++++++++++++++++++
 src/cl_device_enqueue.h     |  32 ++++++++
 src/cl_event.c              |   2 +-
 src/cl_kernel.c             |   4 +
 src/cl_kernel.h             |   6 +-
 src/intel/intel_driver.c    |   1 +
 13 files changed, 283 insertions(+), 25 deletions(-)
 create mode 100644 src/cl_device_enqueue.c
 create mode 100644 src/cl_device_enqueue.h

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 40a9afb..55c8002 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -80,6 +80,8 @@ set(OPENCL_SRC
     cl_context.c
     cl_command_queue.c
     cl_command_queue.h
+    cl_device_enqueue.c
+    cl_device_enqueue.h
     cl_command_queue_gen7.c
     cl_thread.c
     cl_driver.h
diff --git a/src/cl_api.c b/src/cl_api.c
index 254c029..f17870a 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -2297,7 +2297,7 @@ clEnqueueFillImage(cl_command_queue   command_queue,
       cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
     }
 
-    err = cl_command_queue_flush(command_queue);
+    err = cl_command_queue_flush(command_queue, 1);
   }
 
   if(b_output_kernel_perf)
@@ -2375,7 +2375,7 @@ clEnqueueFillBuffer(cl_command_queue   command_queue,
       cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
     }
 
-    err = cl_command_queue_flush(command_queue);
+    err = cl_command_queue_flush(command_queue, 1);
   }
 
   if(b_output_kernel_perf)
@@ -2463,7 +2463,7 @@ clEnqueueCopyBuffer(cl_command_queue     command_queue,
       cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
     }
 
-    err = cl_command_queue_flush(command_queue);
+    err = cl_command_queue_flush(command_queue, 1);
   }
 
   if(b_output_kernel_perf)
@@ -2567,7 +2567,7 @@ clEnqueueCopyBufferRect(cl_command_queue     command_queue,
       cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
     }
 
-    err = cl_command_queue_flush(command_queue);
+    err = cl_command_queue_flush(command_queue, 1);
   }
 
   if(b_output_kernel_perf)
@@ -2816,7 +2816,7 @@ clEnqueueCopyImage(cl_command_queue      command_queue,
       cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
     }
 
-    err = cl_command_queue_flush(command_queue);
+    err = cl_command_queue_flush(command_queue, 1);
   }
 
   if(b_output_kernel_perf)
@@ -2882,7 +2882,7 @@ clEnqueueCopyImageToBuffer(cl_command_queue  command_queue,
       cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
     }
 
-    err = cl_command_queue_flush(command_queue);
+    err = cl_command_queue_flush(command_queue, 1);
   }
 
   if(b_output_kernel_perf)
@@ -2948,7 +2948,7 @@ clEnqueueCopyBufferToImage(cl_command_queue  command_queue,
       cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
     }
 
-    err = cl_command_queue_flush(command_queue);
+    err = cl_command_queue_flush(command_queue, 1);
   }
 
   if(b_output_kernel_perf)
@@ -3480,7 +3480,7 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
       cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
     }
 
-    err = cl_command_queue_flush(command_queue);
+    err = cl_command_queue_flush(command_queue, 1);
   }
 
   if(b_output_kernel_perf)
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 8b11c1c..675d786 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -31,6 +31,7 @@
 #include "cl_khr_icd.h"
 #include "cl_event.h"
 #include "performance.h"
+#include "cl_device_enqueue.h"
 
 #include <assert.h>
 #include <stdio.h>
@@ -186,25 +187,30 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k, uint32_t *max
 }
 
 LOCAL cl_int
-cl_command_queue_bind_exec_info(cl_command_queue queue, cl_kernel k, uint32_t max_bti)
+cl_command_queue_bind_exec_info(cl_command_queue queue, cl_kernel k, uint32_t *max_bti)
 {
   uint32_t i;
-  size_t mem_offset, bti = max_bti;
-  cl_mem svm_mem;
+  size_t mem_offset, bti = *max_bti;
+  cl_mem mem;
 
   GET_QUEUE_THREAD_GPGPU(queue);
 
   for (i = 0; i < k->exec_info_n; i++) {
     void *ptr = k->exec_info[i];
-    if((svm_mem = cl_context_get_svm_from_ptr(k->program->ctx, ptr)) != NULL) {
-      mem_offset = (size_t)ptr - (size_t)svm_mem->host_ptr;
+    mem = cl_context_get_svm_from_ptr(k->program->ctx, ptr);
+    if(mem == NULL)
+      mem = cl_context_get_mem_from_ptr(k->program->ctx, ptr);
+
+    if(mem) {
+      mem_offset = (size_t)ptr - (size_t)mem->host_ptr;
       /* only need realloc in surface state, don't need realloc in curbe */
-      cl_gpgpu_bind_buf(gpgpu, svm_mem->bo, -1, svm_mem->offset + mem_offset, svm_mem->size, bti++);
+      cl_gpgpu_bind_buf(gpgpu, mem->bo, -1, mem->offset + mem_offset, mem->size, bti++);
       if(bti == BTI_WORKAROUND_IMAGE_OFFSET)
-        bti = max_bti + BTI_WORKAROUND_IMAGE_OFFSET;
+        bti = *max_bti + BTI_WORKAROUND_IMAGE_OFFSET;
       assert(bti < BTI_MAX_ID);
     }
   }
+  *max_bti = bti;
 
   return CL_SUCCESS;
 }
@@ -243,6 +249,7 @@ cl_command_queue_ND_range_wrap(cl_command_queue queue,
     global_wk_sz[1]%local_wk_sz[1],
     global_wk_sz[2]%local_wk_sz[2]
   };
+  count = (global_wk_sz_rem[0] ? 2 : 1) * (global_wk_sz_rem[1] ? 2 : 1) * (global_wk_sz_rem[2] ? 2 : 1);
 
   const size_t *global_wk_all[2] = {global_wk_sz_div, global_wk_sz_rem};
  /* Go through the at most 8 cases and enqueue if there are work items left */
@@ -262,9 +269,10 @@ cl_command_queue_ND_range_wrap(cl_command_queue queue,
         };
         if(local_wk_sz_use[0] == 0 || local_wk_sz_use[1] == 0 || local_wk_sz_use[2] == 0) continue;
         TRY (cl_command_queue_ND_range_gen7, queue, ker, work_dim, global_wk_off,global_dim_off, global_wk_sz,global_wk_sz_use,local_wk_sz, local_wk_sz_use);
+
        /* TODO: need to handle events for multiple enqueue, now is a workaround for uniform group size */
         if(!(global_wk_sz_rem[0] == 0 && global_wk_sz_rem[1] == 0 && global_wk_sz_rem[2] == 0))
-          err = cl_command_queue_flush(queue);
+          err = cl_command_queue_flush(queue, --count == 0);
       }
       if(work_dim < 2)
         break;
@@ -303,7 +311,7 @@ error:
 }
 
 LOCAL int
-cl_command_queue_flush_gpgpu(cl_command_queue queue, cl_gpgpu gpgpu)
+cl_command_queue_flush_gpgpu(cl_command_queue queue, cl_gpgpu gpgpu, cl_bool last_flush_of_enqueue)
 {
   void* printf_info = cl_gpgpu_get_printf_info(gpgpu);
 
@@ -320,15 +328,19 @@ cl_command_queue_flush_gpgpu(cl_command_queue queue, cl_gpgpu gpgpu)
     interp_release_printf_info(printf_info);
     cl_gpgpu_set_printf_info(gpgpu, NULL);
   }
+
+  if(last_flush_of_enqueue)
+    cl_device_enqueue_parse_result(queue, gpgpu);
+
   return CL_SUCCESS;
 }
 
 LOCAL cl_int
-cl_command_queue_flush(cl_command_queue queue)
+cl_command_queue_flush(cl_command_queue queue, cl_bool last_flush_of_enqueue)
 {
   int err;
   GET_QUEUE_THREAD_GPGPU(queue);
-  err = cl_command_queue_flush_gpgpu(queue, gpgpu);
+  err = cl_command_queue_flush_gpgpu(queue, gpgpu, last_flush_of_enqueue);
  // We now keep a list of uncompleted events and check if they complete
  // every flush. This makes sure all created events get a chance to
  // update their status, so the callback functions or references can be handled.
@@ -340,6 +352,7 @@ cl_command_queue_flush(cl_command_queue queue)
     set_current_event(queue, NULL);
   }
   cl_invalid_thread_gpgpu(queue);
+
   return err;
 }
 
diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
index ad56129..3c9b9c0 100644
--- a/src/cl_command_queue.h
+++ b/src/cl_command_queue.h
@@ -76,10 +76,10 @@ extern cl_int cl_command_queue_ND_range(cl_command_queue queue,
 extern cl_int cl_command_queue_set_report_buffer(cl_command_queue, cl_mem);
 
 /* Flush for the command queue */
-extern cl_int cl_command_queue_flush(cl_command_queue);
+extern cl_int cl_command_queue_flush(cl_command_queue, cl_bool);
 
 /* Flush for the specified gpgpu */
-extern int cl_command_queue_flush_gpgpu(cl_command_queue, cl_gpgpu);
+extern int cl_command_queue_flush_gpgpu(cl_command_queue, cl_gpgpu, cl_bool);
 
 /* Wait for the completion of the command queue */
 extern cl_int cl_command_queue_finish(cl_command_queue);
@@ -91,7 +91,7 @@ extern cl_int cl_command_queue_bind_surface(cl_command_queue, cl_kernel, uint32_
 extern cl_int cl_command_queue_bind_image(cl_command_queue, cl_kernel, uint32_t *);
 
 /* Bind all exec info to bind table */
-extern cl_int cl_command_queue_bind_exec_info(cl_command_queue, cl_kernel, uint32_t);
+extern cl_int cl_command_queue_bind_exec_info(cl_command_queue, cl_kernel, uint32_t *);
 
 /* Insert a user event to command's wait_events */
 extern void cl_command_queue_insert_event(cl_command_queue, cl_event);
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 3ec5822..7bb3a48 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -25,6 +25,7 @@
 #include "cl_mem.h"
 #include "cl_utils.h"
 #include "cl_alloc.h"
+#include "cl_device_enqueue.h"
 
 #include <assert.h>
 #include <stdio.h>
@@ -397,7 +398,9 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   /* Bind user images */
   cl_command_queue_bind_image(queue, ker, &max_bti);
   /* Bind all exec infos */
-  cl_command_queue_bind_exec_info(queue, ker, max_bti);
+  cl_command_queue_bind_exec_info(queue, ker, &max_bti);
+  /* Bind device enqueue buffer */
+  cl_device_enqueue_bind_buffer(gpgpu, ker, &max_bti, &kernel);
   /* Bind all samplers */
   cl_gpgpu_bind_sampler(gpgpu, ker->samplers, ker->sampler_sz);
 
diff --git a/src/cl_context.c b/src/cl_context.c
index 6bdf272..207960f 100644
--- a/src/cl_context.c
+++ b/src/cl_context.c
@@ -329,6 +329,7 @@ unlock:
   return cl_kernel_dup(ker);
 }
 
+
 cl_mem
 cl_context_get_svm_from_ptr(cl_context ctx, const void * p)
 {
@@ -342,3 +343,17 @@ cl_context_get_svm_from_ptr(cl_context ctx, const void * p)
   }
   return NULL;
 }
+
+cl_mem
+cl_context_get_mem_from_ptr(cl_context ctx, const void * p)
+{
+  cl_mem buf = ctx->buffers;
+  while(buf) {
+    if(buf->host_ptr == NULL) { buf = buf->next; continue; } /* must advance, or we loop forever */
+    if((size_t)buf->host_ptr <= (size_t)p &&
+       (size_t)p < ((size_t)buf->host_ptr + buf->size))
+      return buf;
+    buf = buf->next;
+  }
+  return NULL;
+}
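
How the new lookup is meant to be used, condensed from the exec info
binding in cl_command_queue.c above (ptr is a raw pointer from the
kernel's exec info):

    cl_mem mem = cl_context_get_svm_from_ptr(ctx, ptr);
    if (mem == NULL)                      /* not SVM: try regular buffers */
      mem = cl_context_get_mem_from_ptr(ctx, ptr);
    if (mem) {
      size_t mem_offset = (size_t)ptr - (size_t)mem->host_ptr;
      cl_gpgpu_bind_buf(gpgpu, mem->bo, -1, mem->offset + mem_offset,
                        mem->size, bti++);
    }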
diff --git a/src/cl_context.h b/src/cl_context.h
index 8c462b1..e4f6605 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -172,5 +172,8 @@ extern cl_kernel cl_context_get_static_kernel_from_bin(cl_context ctx, cl_int in
 
 /* Get the SVM from pointer, return NULL if pointer is not from SVM */
 extern cl_mem cl_context_get_svm_from_ptr(cl_context ctx, const void *p);
+/* Get the mem from a pointer; return NULL if the pointer is not from a mem */
+extern cl_mem cl_context_get_mem_from_ptr(cl_context ctx, const void *p);
+
 #endif /* __CL_CONTEXT_H__ */
 
diff --git a/src/cl_device_enqueue.c b/src/cl_device_enqueue.c
new file mode 100644
index 0000000..ae95a6b
--- /dev/null
+++ b/src/cl_device_enqueue.c
@@ -0,0 +1,181 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Rong Yang<rong.r.yang at intel.com>
+ */
+#include "cl_device_enqueue.h"
+#include "cl_mem.h"
+#include "cl_utils.h"
+#include "cl_context.h"
+#include "cl_program.h"
+#include "cl_alloc.h"
+#include "cl_kernel.h"
+#include "cl_command_queue.h"
+
+LOCAL cl_int
+cl_device_enqueue_fix_offset(cl_kernel ker) {
+  uint32_t i;
+  void *ptr;
+  cl_mem mem;
+  enum gbe_arg_type arg_type; /* kind of argument */
+  for (i = 0; i < ker->arg_n; ++i) {
+    arg_type = interp_kernel_get_arg_type(ker->opaque, i);
+    // TODO: what about image arguments?
+    if (!(arg_type == GBE_ARG_GLOBAL_PTR || arg_type == GBE_ARG_CONSTANT_PTR) || !ker->args[i].mem)
+      continue;
+
+    if(!ker->args[i].is_svm) {
+      mem = ker->args[i].mem;
+      ptr = cl_mem_map(mem, 0);
+      cl_buffer_set_softpin_offset(mem->bo, (size_t)ptr);
+      cl_buffer_set_bo_use_full_range(mem->bo, 1);
+      cl_buffer_disable_reuse(mem->bo);
+      mem->host_ptr = ptr;
+      cl_mem_unmap(mem);
+      ker->device_enqueue_infos[ker->device_enqueue_info_n++] = ptr;
+    } else {
+      ker->device_enqueue_infos[ker->device_enqueue_info_n++] = ker->args[i].mem->host_ptr;
+    }
+  }
+  return 0;
+}
+
+LOCAL cl_int
+cl_device_enqueue_bind_buffer(cl_gpgpu gpgpu, cl_kernel ker, uint32_t *max_bti, cl_gpgpu_kernel *kernel)
+{
+  int32_t value = GBE_CURBE_ENQUEUE_BUF_POINTER;
+  int32_t offset = interp_kernel_get_curbe_offset(ker->opaque, value, 0);
+  size_t buf_size = 32 * 1024 * 1024;  // fixed at 32MB for now
+  cl_mem mem;
+
+  if(offset > 0) {
+    if(ker->useDeviceEnqueue == false) {
+      if(ker->device_enqueue_ptr == NULL)
+        ker->device_enqueue_ptr = cl_mem_svm_allocate(ker->program->ctx, 0, buf_size, 0);
+      if(ker->device_enqueue_infos == NULL)
+        ker->device_enqueue_infos = cl_calloc(ker->arg_n, sizeof(void *));
+      ker->device_enqueue_info_n = 0;
+      ker->useDeviceEnqueue = CL_TRUE;
+      cl_device_enqueue_fix_offset(ker);
+      cl_kernel_add_ref(ker);
+    }
+
+    mem = cl_context_get_svm_from_ptr(ker->program->ctx, ker->device_enqueue_ptr);
+    cl_gpgpu_bind_buf(gpgpu, mem->bo, offset, 0, buf_size, *max_bti);
+
+    cl_gpgpu_set_kernel(gpgpu, ker);
+  }
+  return 0;
+}
+
+typedef struct ndrange_info_t {
+  int type;
+  int global_work_size[3];
+  int local_work_size[3];
+  int global_work_offset[3];
+} ndrange_info_t;
+
+typedef struct Block_literal {
+  void *isa; // initialized to &_NSConcreteStackBlock or &_NSConcreteGlobalBlock
+  int flags;
+  int reserved;
+  int index;
+  struct Block_descriptor_1 {
+    unsigned long int slm_size;    // reserved in the Block ABI; byte size of the trailing slm_sizes array here
+    unsigned long int size;         // sizeof(struct Block_literal_1)
+    // optional helper functions
+    void *copy_helper;     // IFF (1<<25)
+    void *dispose_helper;             // IFF (1<<25)
+    // required ABI.2010.3.16
+    const char *signature;                         // IFF (1<<30)
+  } *descriptor;
+  // imported variables
+} Block_literal;
+
+LOCAL cl_int
+cl_device_enqueue_parse_result(cl_command_queue queue, cl_gpgpu gpgpu)
+{
+  cl_mem mem;
+  int size, type, dim, i;
+  const char * kernel_name;
+  cl_kernel child_ker;
+
+  cl_kernel ker = cl_gpgpu_get_kernel(gpgpu);
+  if(ker == NULL || ker->useDeviceEnqueue == CL_FALSE)
+    return 0;
+
+  void *buf = cl_gpgpu_ref_batch_buf(gpgpu);
+  // Wait for the gpgpu's batch buffer to finish; the gpgpu in the queue
+  // may not be the same as the param gpgpu, for example when flushing an event.
+  cl_gpgpu_sync(buf);
+  cl_gpgpu_unref_batch_buf(buf);
+  cl_invalid_thread_gpgpu(queue);
+
+  mem = cl_context_get_svm_from_ptr(ker->program->ctx, ker->device_enqueue_ptr);
+  if(mem == NULL) return -1;
+  char *ptr = (char *)cl_mem_map(mem, 0);
+
+  size =  *(int *)ptr;
+  ptr += 4;
+  while(size > 0) {
+    size_t fixed_global_off[] = {0,0,0};
+    size_t fixed_global_sz[] = {1,1,1};
+    size_t fixed_local_sz[] = {1,1,1};
+    ndrange_info_t* ndrange_info = (ndrange_info_t *)ptr;
+    size -= sizeof(ndrange_info_t);
+    ptr += sizeof(ndrange_info_t);
+
+    Block_literal *block = (Block_literal *)ptr;
+    size -=  block->descriptor->size;
+    ptr += block->descriptor->size;
+
+    type = ndrange_info->type;
+    dim = (type & 0xf0) >> 4;
+    type = type & 0xf;
+    for(i = 0; i <= dim; i++) {
+      fixed_global_sz[i] = ndrange_info->global_work_size[i];
+      if(type > 1)
+        fixed_local_sz[i] = ndrange_info->local_work_size[i];
+      if(type > 2)
+        fixed_global_off[i] = ndrange_info->global_work_offset[i];
+    }
+
+    int *slm_sizes = (int *)ptr;
+    int slm_size = block->descriptor->slm_size;
+    size -= slm_size;
+    ptr += slm_size;
+
+    kernel_name = interp_program_get_device_enqueue_kernel_name(ker->program->opaque, block->index);
+    child_ker = cl_program_create_kernel(ker->program, kernel_name, NULL);
+    cl_kernel_set_arg_svm_pointer(child_ker, 0, block);
+    int index = 1;
+    for(i=0; i<slm_size/sizeof(int); i++, index++) {
+      cl_kernel_set_arg(child_ker, index, slm_sizes[i], NULL);
+    }
+    cl_kernel_set_exec_info(child_ker, ker->device_enqueue_info_n * sizeof(void *),
+                            ker->device_enqueue_infos);
+
+    cl_command_queue_ND_range(queue, child_ker, dim + 1, fixed_global_off, fixed_global_sz, fixed_local_sz);
+    // Call cl_command_queue_flush_gpgpu instead of cl_command_queue_flush to skip its event handling.
+    cl_command_queue_flush_gpgpu(queue, cl_get_thread_gpgpu(queue), 1);
+    cl_kernel_delete(child_ker);
+  }
+  // Wait for completion; don't call cl_command_queue_finish, as it would update the event.
+  cl_gpgpu_sync(cl_get_thread_batch_buf(queue));
+  cl_mem_unmap_auto(mem);
+  cl_kernel_delete(ker);
+  return 0;
+}
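
For reviewers, the layout cl_device_enqueue_parse_result expects in the
device enqueue buffer, as consumed by the loop above (a comment sketch;
the record framing is inferred from the parsing code):

    /* int size;                  total bytes of records that follow
     * repeated while size > 0:
     *   ndrange_info_t info;     dim in the high nibble of info->type;
     *                            low nibble > 1 adds a local size,
     *                            > 2 adds a global offset
     *   Block_literal block;     block->descriptor->size bytes,
     *                            passed as arg 0 of the child kernel
     *   int slm_sizes[];         block->descriptor->slm_size bytes, one
     *                            local-memory size per remaining child arg
     */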
diff --git a/src/cl_device_enqueue.h b/src/cl_device_enqueue.h
new file mode 100644
index 0000000..4e9968e
--- /dev/null
+++ b/src/cl_device_enqueue.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Rong Yang<rong.r.yang at intel.com>
+ */
+
+#ifndef __CL_DEVICE_ENQUEUE_H__
+#define __CL_DEVICE_ENQUEUE_H__
+
+#include "cl_internals.h"
+#include "cl_driver.h"
+#include "cl_thread.h"
+#include "CL/cl.h"
+#include <stdint.h>
+
+extern cl_int cl_device_enqueue_bind_buffer(cl_gpgpu gpgpu, cl_kernel ker,
+                                                     uint32_t *max_bti, cl_gpgpu_kernel *kernel);
+extern cl_int cl_device_enqueue_parse_result(cl_command_queue queue, cl_gpgpu gpgpu);
+#endif /* __CL_DEVICE_ENQUEUE_H__ */
diff --git a/src/cl_event.c b/src/cl_event.c
index 041c178..1b22b76 100644
--- a/src/cl_event.c
+++ b/src/cl_event.c
@@ -78,7 +78,7 @@ int cl_event_flush(cl_event event)
   int err = CL_SUCCESS;
   assert(event->gpgpu_event != NULL);
   if (event->gpgpu) {
-    err = cl_command_queue_flush_gpgpu(event->queue, event->gpgpu);
+    err = cl_command_queue_flush_gpgpu(event->queue, event->gpgpu, 1);
     cl_gpgpu_delete(event->gpgpu);
     event->gpgpu = NULL;
   }
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index fe042a7..723d61d 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -59,6 +59,10 @@ cl_kernel_delete(cl_kernel k)
     cl_free(k->images);
   if (k->exec_info)
     cl_free(k->exec_info);
+  if (k->device_enqueue_ptr)
+    cl_mem_svm_delete(k->program->ctx, k->device_enqueue_ptr);
+  if (k->device_enqueue_infos)
+    cl_free(k->device_enqueue_infos);
   k->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
   cl_free(k);
 }
diff --git a/src/cl_kernel.h b/src/cl_kernel.h
index 87187bc..76ba8e4 100644
--- a/src/cl_kernel.h
+++ b/src/cl_kernel.h
@@ -68,7 +68,11 @@ struct _cl_kernel {
   uint32_t arg_n:31;          /* Number of arguments */
   uint32_t ref_its_program:1; /* True only for the user kernel (created by clCreateKernel) */
   uint32_t exec_info_n;       /* The kernel's exec info count */
-  void** exec_info;            /* The kernel's exec info */
+  void** exec_info;             /* The kernel's exec info */
+  cl_bool useDeviceEnqueue;     /* kernel use device enqueue */
+  void* device_enqueue_ptr;     /* device_enqueue buffer*/
+  uint32_t device_enqueue_info_n; /* count of parent kernel's arguments buffers, as child enqueues' exec info */
+  void** device_enqueue_infos;   /* parent kernel's arguments buffers, as child enqueues' exec info   */
 };
 
 /* Allocate an empty kernel */
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
index ed6b33f..d256ab5 100644
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -862,6 +862,7 @@ intel_setup_callbacks(void)
   cl_buffer_alloc_userptr = (cl_buffer_alloc_userptr_cb*) intel_buffer_alloc_userptr;
   cl_buffer_set_softpin_offset = (cl_buffer_set_softpin_offset_cb *) drm_intel_bo_set_softpin_offset;
   cl_buffer_set_bo_use_full_range = (cl_buffer_set_bo_use_full_range_cb *) drm_intel_bo_use_48b_address_range;
+  cl_buffer_disable_reuse = (cl_buffer_disable_reuse_cb *) drm_intel_bo_disable_reuse;
   cl_buffer_set_tiling = (cl_buffer_set_tiling_cb *) intel_buffer_set_tiling;
 #if defined(HAS_EGL)
   cl_buffer_alloc_from_texture = (cl_buffer_alloc_from_texture_cb *) intel_alloc_buffer_from_texture;
-- 
1.9.1


