[Beignet] [PATCH 08/10] OCL20: handle device enqueue in runtime.

Yang Rong rong.r.yang at intel.com
Thu Mar 17 10:53:56 UTC 2016


There are several steps to handle device enqueue:
1. Allocate the device enqueue bo to store the device enqueue
information for the parent kernel. All global buffers must also be converted
to SVM buffers to make sure the child kernels see the same GPU addresses.
2. When flushing the command, check whether the kernel used device enqueue.
If it did, wait for it to finish and parse the device enqueue info.
3. Start the child ndranges according to the device enqueue info, using the
parent's global buffers as the exec info.

Signed-off-by: Yang Rong <rong.r.yang at intel.com>
---
 src/CMakeLists.txt          |   2 +
 src/cl_command_queue.c      |  22 ++++--
 src/cl_command_queue.h      |   2 +-
 src/cl_command_queue_gen7.c |   5 +-
 src/cl_context.c            |  15 ++++
 src/cl_context.h            |   3 +
 src/cl_device_enqueue.c     | 171 ++++++++++++++++++++++++++++++++++++++++++++
 src/cl_device_enqueue.h     |  33 +++++++++
 src/cl_driver.h             |   9 ++-
 src/cl_driver_defs.c        |   1 +
 src/cl_kernel.c             |   4 ++
 src/cl_kernel.h             |   6 +-
 src/intel/intel_driver.c    |   1 +
 13 files changed, 261 insertions(+), 13 deletions(-)
 create mode 100644 src/cl_device_enqueue.c
 create mode 100644 src/cl_device_enqueue.h

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 40a9afb..55c8002 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -80,6 +80,8 @@ set(OPENCL_SRC
     cl_context.c
     cl_command_queue.c
     cl_command_queue.h
+    cl_device_enqueue.c
+    cl_device_enqueue.h
     cl_command_queue_gen7.c
     cl_thread.c
     cl_driver.h
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 6572c47..3e82f52 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -31,6 +31,7 @@
 #include "cl_khr_icd.h"
 #include "cl_event.h"
 #include "performance.h"
+#include "cl_device_enqueue.h"
 
 #include <assert.h>
 #include <stdio.h>
@@ -186,25 +187,30 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k, uint32_t *max
 }
 
 LOCAL cl_int
-cl_command_queue_bind_exec_info(cl_command_queue queue, cl_kernel k, uint32_t max_bti)
+cl_command_queue_bind_exec_info(cl_command_queue queue, cl_kernel k, uint32_t *max_bti)
 {
   uint32_t i;
-  size_t mem_offset, bti = max_bti;
-  cl_mem svm_mem;
+  size_t mem_offset, bti = *max_bti;
+  cl_mem mem;
 
   GET_QUEUE_THREAD_GPGPU(queue);
 
   for (i = 0; i < k->exec_info_n; i++) {
     void *ptr = k->exec_info[i];
-    if((svm_mem = cl_context_get_svm_from_ptr(k->program->ctx, ptr)) != NULL) {
-      mem_offset = (size_t)ptr - (size_t)svm_mem->host_ptr;
+    mem = cl_context_get_svm_from_ptr(k->program->ctx, ptr);  /* look among SVM allocations first */
+    if(mem == NULL)
+      mem = cl_context_get_mem_from_ptr(k->program->ctx, ptr);  /* then among the context's ordinary buffers */
+
+    if(mem) {
+      mem_offset = (size_t)ptr - (size_t)mem->host_ptr;  /* byte offset of ptr from the buffer's host_ptr */
       /* only need realloc in surface state, don't need realloc in curbe */
-      cl_gpgpu_bind_buf(gpgpu, svm_mem->bo, -1, svm_mem->offset + mem_offset, svm_mem->size, bti++);
+      cl_gpgpu_bind_buf(gpgpu, mem->bo, -1, mem->offset + mem_offset, mem->size, bti++);
       if(bti == BTI_WORKAROUND_IMAGE_OFFSET)
-        bti = max_bti + BTI_WORKAROUND_IMAGE_OFFSET;
+        bti = *max_bti + BTI_WORKAROUND_IMAGE_OFFSET;
       assert(bti < BTI_MAX_ID);
     }
   }
+  *max_bti = bti;  /* report the next unused binding table index back to the caller */
 
   return CL_SUCCESS;
 }
@@ -350,7 +356,9 @@ cl_command_queue_flush(cl_command_queue queue)
     err = cl_event_flush(current_event);
     set_current_event(queue, NULL);
   }
+  cl_device_enqueue_parse_result(queue, gpgpu);
   cl_invalid_thread_gpgpu(queue);
+
   return err;
 }
 
diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
index bdf1a43..9a1a81f 100644
--- a/src/cl_command_queue.h
+++ b/src/cl_command_queue.h
@@ -90,7 +90,7 @@ extern cl_int cl_command_queue_bind_surface(cl_command_queue, cl_kernel, uint32_
 extern cl_int cl_command_queue_bind_image(cl_command_queue, cl_kernel, uint32_t *);
 
 /* Bind all exec info to bind table */
-extern cl_int cl_command_queue_bind_exec_info(cl_command_queue, cl_kernel, uint32_t);
+extern cl_int cl_command_queue_bind_exec_info(cl_command_queue, cl_kernel, uint32_t *);
 
 /* Insert a user event to command's wait_events */
 extern void cl_command_queue_insert_event(cl_command_queue, cl_event);
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index b00e383..5b8ab54 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -25,6 +25,7 @@
 #include "cl_mem.h"
 #include "cl_utils.h"
 #include "cl_alloc.h"
+#include "cl_device_enqueue.h"
 
 #include <assert.h>
 #include <stdio.h>
@@ -409,7 +410,9 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   /* Bind user images */
   cl_command_queue_bind_image(queue, ker, &max_bti);
   /* Bind all exec infos */
-  cl_command_queue_bind_exec_info(queue, ker, max_bti);
+  cl_command_queue_bind_exec_info(queue, ker, &max_bti);
+  /* Bind device enqueue buffer */
+  cl_device_enqueue_bind_buffer(gpgpu, ker, &max_bti, &kernel);
   /* Bind all samplers */
   cl_gpgpu_bind_sampler(gpgpu, ker->samplers, ker->sampler_sz);
 
diff --git a/src/cl_context.c b/src/cl_context.c
index 6bdf272..207960f 100644
--- a/src/cl_context.c
+++ b/src/cl_context.c
@@ -329,6 +329,7 @@ unlock:
   return cl_kernel_dup(ker);
 }
 
+
 cl_mem
 cl_context_get_svm_from_ptr(cl_context ctx, const void * p)
 {
@@ -342,3 +343,17 @@ cl_context_get_svm_from_ptr(cl_context ctx, const void * p)
   }
   return NULL;
 }
+
+/* Return the buffer in ctx->buffers whose host_ptr range contains p, or NULL if none does. */
+cl_mem
+cl_context_get_mem_from_ptr(cl_context ctx, const void * p)
+{
+  cl_mem buf;
+  for(buf = ctx->buffers; buf != NULL; buf = buf->next) {  /* advancing here keeps "continue" from spinning forever */
+    if(buf->host_ptr == NULL) continue;  /* a buffer without a host pointer cannot contain p */
+    if((size_t)buf->host_ptr <= (size_t)p &&
+       (size_t)p < ((size_t)buf->host_ptr + buf->size))
+      return buf;
+  }
+  return NULL;
+}
diff --git a/src/cl_context.h b/src/cl_context.h
index 8c462b1..e4f6605 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -172,5 +172,8 @@ extern cl_kernel cl_context_get_static_kernel_from_bin(cl_context ctx, cl_int in
 
 /* Get the SVM from pointer, return NULL if pointer is not from SVM */
 extern cl_mem cl_context_get_svm_from_ptr(cl_context ctx, const void *p);
+/* Get the mem from pointer, return NULL if pointer is not from mem*/
+extern cl_mem cl_context_get_mem_from_ptr(cl_context ctx, const void *p);
+
 #endif /* __CL_CONTEXT_H__ */
 
diff --git a/src/cl_device_enqueue.c b/src/cl_device_enqueue.c
new file mode 100644
index 0000000..7e2e758
--- /dev/null
+++ b/src/cl_device_enqueue.c
@@ -0,0 +1,171 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Rong Yang<rong.r.yang at intel.com>
+ */
+#include "cl_device_enqueue.h"
+#include "cl_mem.h"
+#include "cl_utils.h"
+#include "cl_context.h"
+#include "cl_program.h"
+#include "cl_alloc.h"
+#include "cl_kernel.h"
+#include "cl_command_queue.h"
+
+/* Softpin each non-SVM global/constant buffer argument of ker at its mapped CPU
+ * address and record every such argument's host pointer in
+ * ker->device_enqueue_infos. Always returns 0. */
+LOCAL cl_int
+cl_device_enqueue_fix_offset(cl_kernel ker) {
+  uint32_t i;
+  void *ptr;
+  cl_mem mem;
+  enum gbe_arg_type arg_type; /* kind of argument */
+  for (i = 0; i < ker->arg_n; ++i) {
+    arg_type = interp_kernel_get_arg_type(ker->opaque, i);
+    /* TODO: image arguments are not handled here */
+    if (!(arg_type == GBE_ARG_GLOBAL_PTR || arg_type == GBE_ARG_CONSTANT_PTR) || !ker->args[i].mem)
+      continue;
+
+    mem = ker->args[i].mem;
+    if(!ker->args[i].is_svm) {
+      ptr = cl_mem_map(mem, 0);
+      cl_buffer_set_softpin_offset(mem->bo, (size_t)ptr);
+      cl_buffer_set_bo_use_full_range(mem->bo, 1);
+      cl_buffer_disable_reuse(mem->bo);
+      mem->host_ptr = ptr;
+    }
+    ker->device_enqueue_infos[ker->device_enqueue_info_n++] = mem->host_ptr;
+  }
+  /* Unmap in a second pass. Each argument's own buffer is unmapped; a shared
+   * local would only name the last buffer visited by the loop above. */
+  for (i = 0; i < ker->arg_n; ++i) {
+    arg_type = interp_kernel_get_arg_type(ker->opaque, i);
+    if (!(arg_type == GBE_ARG_GLOBAL_PTR || arg_type == GBE_ARG_CONSTANT_PTR) || !ker->args[i].mem)
+      continue;
+
+    if(!ker->args[i].is_svm)
+      cl_mem_unmap(ker->args[i].mem);
+  }
+  return 0;
+}
+
+/* If ker has a positive enqueue-buffer curbe offset, allocate its 32M device enqueue SVM
+ * buffer and bind it at *max_bti (kernel is unused). Returns CL_OUT_OF_RESOURCES on failure. */
+LOCAL cl_int
+cl_device_enqueue_bind_buffer(cl_gpgpu gpgpu, cl_kernel ker, uint32_t *max_bti, cl_gpgpu_kernel *kernel)
+{
+  int32_t offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_ENQUEUE_BUF_POINTER, 0);
+  size_t buf_size = 32 * 1024 * 1024;  //fix 32M
+  cl_mem mem;
+
+  if(offset <= 0) return 0;  /* no positive curbe offset: nothing to allocate or bind */
+  ker->device_enqueue_ptr = cl_mem_svm_allocate(ker->program->ctx, 0, buf_size, 0);
+  mem = cl_context_get_svm_from_ptr(ker->program->ctx, ker->device_enqueue_ptr);
+  ker->device_enqueue_infos = cl_calloc(ker->arg_n, sizeof(void *));
+  ker->device_enqueue_info_n = 0;
+  if(mem == NULL || (ker->arg_n > 0 && ker->device_enqueue_infos == NULL))
+    return CL_OUT_OF_RESOURCES;  /* never dereference a failed allocation below */
+  ker->useDeviceEnqueue = CL_TRUE;
+  cl_device_enqueue_fix_offset(ker);
+  cl_gpgpu_bind_buf(gpgpu, mem->bo, offset, 0, buf_size, *max_bti);
+  cl_gpgpu_set_kernel(gpgpu, ker);
+  cl_kernel_add_ref(ker);  /* paired with cl_kernel_delete(ker) in cl_device_enqueue_parse_result */
+  return 0;
+}
+
+typedef struct ndrange_info_t {
+  int type;                   /* bits 4-7: dimension count minus one; bits 0-3: >1 means local sizes are read, >2 means offsets too */
+  int global_work_size[3];    /* global size per dimension, always read */
+  int local_work_size[3];     /* read only when (type & 0xf) > 1 */
+  int global_work_offset[3];  /* read only when (type & 0xf) > 2 */
+} ndrange_info_t;
+
+typedef struct Block_literal {
+  void *isa;  // initialized to &_NSConcreteStackBlock or &_NSConcreteGlobalBlock
+  int flags;
+  int reserved;
+  int index;  // passed to interp_program_get_device_enqueue_kernel_name to pick the child kernel
+  struct Block_descriptor_1 {
+    unsigned long int reserved;  // NULL
+    unsigned long int size;      // sizeof(struct Block_literal_1); the result parser advances by this many bytes
+    // optional helper functions
+    void *copy_helper;           // IFF (1<<25)
+    void *dispose_helper;        // IFF (1<<25)
+    // required ABI.2010.3.16
+    const char *signature;       // IFF (1<<30)
+  } *descriptor;
+  // imported variables
+} Block_literal;
+
+/* Parse the enqueue records that the device-enqueue kernel attached to gpgpu wrote to its
+ * buffer and run each child ND range on queue; returns 0, or -1 if the buffer is unknown. */
+LOCAL cl_int
+cl_device_enqueue_parse_result(cl_command_queue queue, cl_gpgpu gpgpu)
+{
+  cl_mem mem;
+  int size, type, dim, i;
+  const char * kernel_name;
+  cl_kernel child_ker;
+  cl_kernel ker = cl_gpgpu_get_kernel(gpgpu);
+  if(ker == NULL || ker->useDeviceEnqueue == CL_FALSE)
+    return 0;
+
+  cl_invalid_thread_gpgpu(queue);
+  mem = cl_context_get_svm_from_ptr(ker->program->ctx, ker->device_enqueue_ptr);
+  if(mem == NULL) return -1;
+  cl_command_queue_finish(queue);  /* the parent must finish before its records are read */
+  char *ptr = (char *)cl_mem_map(mem, 0);
+
+  size =  *(int *)ptr;  /* leading int: byte count of the records that follow */
+  ptr += 4;
+  while(size > 0) {
+    size_t fixed_global_off[] = {0,0,0};  /* defaults are reset for every record */
+    size_t fixed_global_sz[] = {1,1,1};
+    size_t fixed_local_sz[] = {1,1,1};
+    ndrange_info_t* ndrange_info = (ndrange_info_t *)ptr;
+    size -= sizeof(ndrange_info_t);
+    ptr += sizeof(ndrange_info_t);
+    Block_literal *block = (Block_literal *)ptr;
+    size -=  block->descriptor->size;
+    ptr += block->descriptor->size;
+    type = ndrange_info->type;
+    dim = (type & 0xf0) >> 4;
+    type = type & 0xf;
+    if(dim > 2) dim = 2;  /* the size arrays hold three dimensions only */
+    for(i = 0; i <= dim; i++) {
+      fixed_global_sz[i] = ndrange_info->global_work_size[i];
+      if(type > 1)
+        fixed_local_sz[i] = ndrange_info->local_work_size[i];
+      if(type > 2)
+        fixed_global_off[i] = ndrange_info->global_work_offset[i];
+    }
+    kernel_name = interp_program_get_device_enqueue_kernel_name(ker->program->opaque, block->index);
+    child_ker = cl_program_create_kernel(ker->program, kernel_name, NULL);
+    if(child_ker == NULL) continue;  /* skip a record whose kernel cannot be created */
+    cl_kernel_set_arg_svm_pointer(child_ker, 0, block);
+    cl_kernel_set_exec_info(child_ker, ker->device_enqueue_info_n * sizeof(void *),
+                            ker->device_enqueue_infos);
+    cl_command_queue_ND_range(queue, child_ker, dim + 1, fixed_global_off, fixed_global_sz, fixed_local_sz);
+    cl_command_queue_flush(queue);
+    cl_kernel_delete(child_ker);  /* release the kernel created for this record */
+  }
+  cl_command_queue_finish(queue);
+  cl_mem_unmap(mem);  /* balance the cl_mem_map above */
+  cl_kernel_delete(ker);
+  return 0;
+}
+
diff --git a/src/cl_device_enqueue.h b/src/cl_device_enqueue.h
new file mode 100644
index 0000000..34842e3
--- /dev/null
+++ b/src/cl_device_enqueue.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Rong Yang<rong.r.yang at intel.com>
+ */
+
+#ifndef __CL_DEVICE_ENQUEUE_H__
+#define __CL_DEVICE_ENQUEUE_H__
+
+#include "cl_internals.h"
+#include "cl_driver.h"
+#include "cl_thread.h"
+#include "CL/cl.h"
+#include <stdint.h>
+
+extern cl_int cl_device_enqueue_bind_buffer(cl_gpgpu gpgpu, cl_kernel ker,
+                                                     uint32_t *max_bti, cl_gpgpu_kernel *kernel);  /* allocate and bind ker's device enqueue buffer */
+extern cl_int cl_device_enqueue_parse_result(cl_command_queue queue, cl_gpgpu gpgpu);  /* run the child ND ranges recorded by the kernel attached to gpgpu */
+#endif /* __CL_DEVICE_ENQUEUE_H__ */
+
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 631b21f..898638d 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -305,14 +305,17 @@ extern cl_buffer_alloc_cb *cl_buffer_alloc;
 typedef cl_buffer (cl_buffer_alloc_userptr_cb)(cl_buffer_mgr, const char*, void *, size_t, unsigned long);
 extern cl_buffer_alloc_userptr_cb *cl_buffer_alloc_userptr;
 
-typedef cl_buffer (cl_buffer_set_softpin_offset_cb)(cl_buffer, uint64_t);
+typedef int (cl_buffer_set_softpin_offset_cb)(cl_buffer, uint64_t);
 extern cl_buffer_set_softpin_offset_cb *cl_buffer_set_softpin_offset;
 
-typedef cl_buffer (cl_buffer_set_bo_use_full_range_cb)(cl_buffer, uint32_t);
+typedef int (cl_buffer_set_bo_use_full_range_cb)(cl_buffer, uint32_t);
 extern cl_buffer_set_bo_use_full_range_cb *cl_buffer_set_bo_use_full_range;
 
+typedef int (cl_buffer_disable_reuse_cb)(cl_buffer);
+extern cl_buffer_disable_reuse_cb *cl_buffer_disable_reuse;
+
 /* Set a buffer's tiling mode */
-typedef cl_buffer (cl_buffer_set_tiling_cb)(cl_buffer, int tiling, size_t stride);
+typedef int (cl_buffer_set_tiling_cb)(cl_buffer, int tiling, size_t stride);
 extern cl_buffer_set_tiling_cb *cl_buffer_set_tiling;
 
 #include "cl_context.h"
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index 5ab0fa4..c1ae868 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -34,6 +34,7 @@ LOCAL cl_buffer_alloc_cb *cl_buffer_alloc = NULL;
 LOCAL cl_buffer_alloc_userptr_cb *cl_buffer_alloc_userptr = NULL;
 LOCAL cl_buffer_set_softpin_offset_cb *cl_buffer_set_softpin_offset = NULL;
 LOCAL cl_buffer_set_bo_use_full_range_cb *cl_buffer_set_bo_use_full_range = NULL;
+LOCAL cl_buffer_disable_reuse_cb *cl_buffer_disable_reuse = NULL;
 LOCAL cl_buffer_set_tiling_cb *cl_buffer_set_tiling = NULL;
 LOCAL cl_buffer_alloc_from_texture_cb *cl_buffer_alloc_from_texture = NULL;
 LOCAL cl_buffer_release_from_texture_cb *cl_buffer_release_from_texture = NULL;
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index fe042a7..96f7f01 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -59,6 +59,10 @@ cl_kernel_delete(cl_kernel k)
     cl_free(k->images);
   if (k->exec_info)
     cl_free(k->exec_info);
+  if (k->device_enqueue_infos)
+    cl_free(k->device_enqueue_infos);  /* free the enqueue info array itself; exec_info is already freed above */
+  if (k->device_enqueue_ptr)
+    cl_mem_svm_delete(k->program->ctx, k->device_enqueue_ptr);
   k->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
   cl_free(k);
 }
diff --git a/src/cl_kernel.h b/src/cl_kernel.h
index 87187bc..76ba8e4 100644
--- a/src/cl_kernel.h
+++ b/src/cl_kernel.h
@@ -68,7 +68,11 @@ struct _cl_kernel {
   uint32_t arg_n:31;          /* Number of arguments */
   uint32_t ref_its_program:1; /* True only for the user kernel (created by clCreateKernel) */
   uint32_t exec_info_n;       /* The kernel's exec info count */
-  void** exec_info;            /* The kernel's exec info */
+  void** exec_info;             /* The kernel's exec info */
+  cl_bool useDeviceEnqueue;     /* kernel use device enqueue */
+  void* device_enqueue_ptr;     /* device_enqueue buffer*/
+  uint32_t device_enqueue_info_n; /* count of parent kernel's arguments buffers, as child enqueues' exec info */
+  void** device_enqueue_infos;   /* parent kernel's arguments buffers, as child enqueues' exec info   */
 };
 
 /* Allocate an empty kernel */
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
index ed6b33f..d256ab5 100644
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -862,6 +862,7 @@ intel_setup_callbacks(void)
   cl_buffer_alloc_userptr = (cl_buffer_alloc_userptr_cb*) intel_buffer_alloc_userptr;
   cl_buffer_set_softpin_offset = (cl_buffer_set_softpin_offset_cb *) drm_intel_bo_set_softpin_offset;
   cl_buffer_set_bo_use_full_range = (cl_buffer_set_bo_use_full_range_cb *) drm_intel_bo_use_48b_address_range;
+  cl_buffer_disable_reuse = (cl_buffer_disable_reuse_cb *) drm_intel_bo_disable_reuse;
   cl_buffer_set_tiling = (cl_buffer_set_tiling_cb *) intel_buffer_set_tiling;
 #if defined(HAS_EGL)
   cl_buffer_alloc_from_texture = (cl_buffer_alloc_from_texture_cb *) intel_alloc_buffer_from_texture;
-- 
1.9.1



More information about the Beignet mailing list