[Beignet] [PATCH V2] Add clEnqueueCopyBufferRect api.

Yang Rong rong.r.yang at intel.com
Mon Sep 9 01:10:08 PDT 2013


Using enqueue ND range to copy two buffers. Now compile the kernel string, after
load binary ready, should using static binary.

V2: Add a comment for function check_copy_overlap and rename CL_INVALID TO CL_INTERNAL_KERNEL_MAX.
---
 src/cl_api.c     | 135 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 src/cl_context.h |  16 +++++++
 src/cl_enqueue.c |   1 +
 src/cl_mem.c     |  86 +++++++++++++++++++++++++++++++++++
 src/cl_mem.h     |   4 ++
 5 files changed, 240 insertions(+), 2 deletions(-)

diff --git a/src/cl_api.c b/src/cl_api.c
index 3630b48..2c9b399 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -79,6 +79,66 @@ handle_events(cl_command_queue queue, cl_int num, const cl_event *wait_list,
   return status;
 }
 
+/* The following code checking overlap is from Appendix of openCL spec 1.1 */
+inline cl_bool check_copy_overlap(const size_t src_offset[3],
+                                  const size_t dst_offset[3],
+                                  const size_t region[3],
+                                  size_t row_pitch, size_t slice_pitch)
+{
+  const size_t src_min[] = {src_offset[0], src_offset[1], src_offset[2]};
+  const size_t src_max[] = {src_offset[0] + region[0],
+                            src_offset[1] + region[1],
+                            src_offset[2] + region[2]};
+  const size_t dst_min[] = {dst_offset[0], dst_offset[1], dst_offset[2]};
+  const size_t dst_max[] = {dst_offset[0] + region[0],
+                            dst_offset[1] + region[1],
+                            dst_offset[2] + region[2]};
+  // Check for overlap
+  cl_bool overlap = CL_TRUE;
+  unsigned i;
+  size_t dst_start = dst_offset[2] * slice_pitch +
+                     dst_offset[1] * row_pitch + dst_offset[0];
+  size_t dst_end = dst_start + (region[2] * slice_pitch +
+                   region[1] * row_pitch + region[0]);
+  size_t src_start = src_offset[2] * slice_pitch +
+                     src_offset[1] * row_pitch + src_offset[0];
+  size_t src_end = src_start + (region[2] * slice_pitch +
+                   region[1] * row_pitch + region[0]);
+
+  for (i=0; i != 3; ++i) {
+    overlap = overlap && (src_min[i] < dst_max[i])
+                      && (src_max[i] > dst_min[i]);
+  }
+
+  if (!overlap) {
+    size_t delta_src_x = (src_offset[0] + region[0] > row_pitch) ?
+                          src_offset[0] + region[0] - row_pitch : 0;
+    size_t delta_dst_x = (dst_offset[0] + region[0] > row_pitch) ?
+                          dst_offset[0] + region[0] - row_pitch : 0;
+    if ( (delta_src_x > 0 && delta_src_x > dst_offset[0]) ||
+         (delta_dst_x > 0 && delta_dst_x > src_offset[0]) ) {
+      if ( (src_start <= dst_start && dst_start < src_end) ||
+           (dst_start <= src_start && src_start < dst_end) )
+        overlap = CL_TRUE;
+    }
+    if (region[2] > 1) {
+      size_t src_height = slice_pitch / row_pitch;
+      size_t dst_height = slice_pitch / row_pitch;
+      size_t delta_src_y = (src_offset[1] + region[1] > src_height) ?
+                            src_offset[1] + region[1] - src_height : 0;
+      size_t delta_dst_y = (dst_offset[1] + region[1] > dst_height) ?
+                            dst_offset[1] + region[1] - dst_height : 0;
+      if ( (delta_src_y > 0 && delta_src_y > dst_offset[1]) ||
+           (delta_dst_y > 0 && delta_dst_y > src_offset[1]) ) {
+        if ( (src_start <= dst_start && dst_start < src_end) ||
+             (dst_start <= src_start && src_start < dst_end) )
+          overlap = CL_TRUE;
+      }
+    }
+  }
+  return overlap;
+}
+
 static cl_int
 cl_check_device_type(cl_device_type device_type)
 {
@@ -1483,8 +1543,79 @@ clEnqueueCopyBufferRect(cl_command_queue     command_queue,
                         const cl_event *     event_wait_list,
                         cl_event *           event)
 {
-  NOT_IMPLEMENTED;
-  return 0;
+  cl_int err = CL_SUCCESS;
+  enqueue_data *data, no_wait_data = { 0 };
+
+  CHECK_QUEUE(command_queue);
+  CHECK_MEM(src_buffer);
+  CHECK_MEM(dst_buffer);
+
+  if ((command_queue->ctx != src_buffer->ctx) ||
+      (command_queue->ctx != dst_buffer->ctx)) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
+  if (!region || region[0] == 0 || region[1] == 0 || region[2] == 0) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if(src_row_pitch == 0)
+    src_row_pitch = region[0];
+  if(src_slice_pitch == 0)
+    src_slice_pitch = region[1] * src_row_pitch;
+
+  if(dst_row_pitch == 0)
+    dst_row_pitch = region[0];
+  if(dst_slice_pitch == 0)
+    dst_slice_pitch = region[1] * dst_row_pitch;
+
+  if (src_row_pitch < region[0] ||
+      dst_row_pitch < region[0]) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if ((src_slice_pitch < region[1] * src_row_pitch || src_slice_pitch % src_row_pitch != 0 ) ||
+      (dst_slice_pitch < region[1] * dst_row_pitch || dst_slice_pitch % dst_row_pitch != 0 )) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if ((src_origin[2]+region[2])*src_slice_pitch + (src_origin[1]+region[1])*src_row_pitch + src_origin[0] + region[0] > src_buffer->size ||
+      (dst_origin[2]+region[2])*dst_slice_pitch + (dst_origin[1]+region[1])*dst_row_pitch + dst_origin[0] + region[0] > dst_buffer->size) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (src_buffer == dst_buffer && (src_row_pitch != dst_row_pitch || src_slice_pitch != dst_slice_pitch)) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (src_buffer == dst_buffer &&
+      check_copy_overlap(src_origin, dst_origin, region, src_row_pitch, src_slice_pitch)) {
+    err = CL_MEM_COPY_OVERLAP;
+    goto error;
+  }
+
+  cl_mem_copy_buffer_rect(command_queue, src_buffer, dst_buffer, src_origin, dst_origin, region,
+                          src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch);
+
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_buffer->ctx);
+
+  data = &no_wait_data;
+  data->type = EnqueueCopyBufferRect;
+  data->queue = command_queue;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_COPY_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) {
+    err = cl_command_queue_flush(command_queue);
+  }
+
+error:
+  return err;
 }
 
 cl_int
diff --git a/src/cl_context.h b/src/cl_context.h
index ac61f57..461113a 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -39,6 +39,18 @@ enum _cl_gl_context_type {
   CL_GL_CGL_SHAREGROUP
 };
 
+enum _cl_internal_ker_type {
+  CL_ENQUEUE_COPY_BUFFER = 0,
+  CL_ENQUEUE_COPY_BUFFER_RECT = 1,
+  CL_ENQUEUE_COPY_IMAGE_0 = 2,             //copy image 2d to image 2d
+  CL_ENQUEUE_COPY_IMAGE_1 = 3,             //copy image 2d to image 2d
+  CL_ENQUEUE_COPY_IMAGE_2 = 4,             //copy image 2d to image 2d
+  CL_ENQUEUE_COPY_IMAGE_3 = 5,             //copy image 2d to image 2d
+  CL_ENQUEUE_COPY_IMAGE_TO_BUFFER = 6,
+  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE = 7,
+  CL_INTERNAL_KERNEL_MAX = 8
+};
+
 struct _cl_context_prop {
   cl_context_properties platform_id;
   enum _cl_gl_context_type gl_type;
@@ -71,6 +83,10 @@ struct _cl_context {
   pthread_mutex_t buffer_lock;      /* To allocate and deallocate buffers */
   pthread_mutex_t sampler_lock;     /* To allocate and deallocate samplers */
   pthread_mutex_t event_lock;       /* To allocate and deallocate events */
+  cl_program internal_prgs[CL_INTERNAL_KERNEL_MAX];
+                                    /* All programs internal used, for example clEnqueuexxx api use */
+  cl_kernel  internel_kernels[CL_INTERNAL_KERNEL_MAX];
+                                    /* All kernels  for clenqueuexxx api, for example clEnqueuexxx api use */
   uint32_t ver;                     /* Gen version */
   struct _cl_context_prop props;
   cl_context_properties * prop_user; /* a copy of user passed context properties when create context */
diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c
index 156ea8c..3446ac3 100644
--- a/src/cl_enqueue.c
+++ b/src/cl_enqueue.c
@@ -372,6 +372,7 @@ cl_int cl_enqueue_handle(enqueue_data* data)
       return cl_enqueue_map_image(data);
     case EnqueueUnmapMemObject:
       return cl_enqueue_unmap_mem_object(data);
+    case EnqueueCopyBufferRect:
     case EnqueueNDRangeKernel:
       cl_gpgpu_event_resume((cl_gpgpu_event)data->ptr);   //goto default
     default:
diff --git a/src/cl_mem.c b/src/cl_mem.c
index f74b5cf..886af8c 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -25,6 +25,9 @@
 #include "cl_device_id.h"
 #include "cl_driver.h"
 #include "cl_khr_icd.h"
+#include "cl_program.h"
+#include "cl_kernel.h"
+#include "cl_command_queue.h"
 
 #include "CL/cl.h"
 #include "CL/cl_intel.h"
@@ -565,6 +568,89 @@ cl_mem_add_ref(cl_mem mem)
   atomic_inc(&mem->ref_n);
 }
 
+#define LOCAL_SZ_0   16
+#define LOCAL_SZ_1   4
+#define LOCAL_SZ_2   4
+
+LOCAL cl_int
+cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
+                       const size_t *src_origin, const size_t *dst_origin, const size_t *region,
+                       size_t src_row_pitch, size_t src_slice_pitch,
+                       size_t dst_row_pitch, size_t dst_slice_pitch) {
+  cl_int ret;
+  cl_kernel ker;
+  size_t global_off[] = {0,0,0};
+  size_t global_sz[] = {1,1,1};
+  size_t local_sz[] = {LOCAL_SZ_2,LOCAL_SZ_1,LOCAL_SZ_0};
+  if(region[1] == 1) local_sz[1] = 1;
+  if(region[2] == 1) local_sz[2] = 1;
+  global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+  global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
+  global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
+  cl_int index = CL_ENQUEUE_COPY_BUFFER_RECT;
+  cl_int src_offset = src_origin[2]*src_slice_pitch + src_origin[1]*src_row_pitch + src_origin[0];
+  cl_int dst_offset = dst_origin[2]*dst_slice_pitch + dst_origin[1]*dst_row_pitch + dst_origin[0];
+
+  static const char *kernel_str =
+      "kernel void __cl_cpy_buffer_rect ( \n"
+      "       global char* src, global char* dst, \n"
+      "       unsigned int region0, unsigned int region1, unsigned int region2, \n"
+      "       unsigned int src_offset, unsigned int dst_offset, \n"
+      "       unsigned int src_row_pitch, unsigned int src_slice_pitch, \n"
+      "       unsigned int dst_row_pitch, unsigned int dst_slice_pitch) { \n"
+      "  int i = get_global_id(0); \n"
+      "  int j = get_global_id(1); \n"
+      "  int k = get_global_id(2); \n"
+      "  if((i >= region0) || (j>= region1) || (k>=region2)) \n"
+      "    return; \n"
+      "  src_offset += k * src_slice_pitch + j * src_row_pitch + i; \n"
+      "  dst_offset += k * dst_slice_pitch + j * dst_row_pitch + i; \n"
+      "  dst[dst_offset] = src[src_offset]; \n"
+      "}";
+
+
+  /* We use one kernel to copy the data. The kernel is lazily created. */
+  assert(src_buf->ctx == dst_buf->ctx);
+  if (!src_buf->ctx->internal_prgs[index])
+  {
+    size_t length = strlen(kernel_str) + 1;
+    src_buf->ctx->internal_prgs[index] = cl_program_create_from_source(src_buf->ctx, 1, &kernel_str, &length, NULL);
+
+    if (!src_buf->ctx->internal_prgs[index])
+      return CL_OUT_OF_RESOURCES;
+
+    ret = cl_program_build(src_buf->ctx->internal_prgs[index], NULL);
+    if (ret != CL_SUCCESS)
+      return CL_OUT_OF_RESOURCES;
+
+    src_buf->ctx->internal_prgs[index]->is_built = 1;
+
+    src_buf->ctx->internel_kernels[index] = cl_kernel_dup(src_buf->ctx->internal_prgs[index]->ker[0]);
+  }
+
+  /* setup the kernel and run. */
+  ker = src_buf->ctx->internel_kernels[index];
+  if (!ker)
+    return CL_OUT_OF_RESOURCES;
+
+  cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
+  cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &dst_buf);
+  cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region[0]);
+  cl_kernel_set_arg(ker, 3, sizeof(cl_int), &region[1]);
+  cl_kernel_set_arg(ker, 4, sizeof(cl_int), &region[2]);
+  cl_kernel_set_arg(ker, 5, sizeof(cl_int), &src_offset);
+  cl_kernel_set_arg(ker, 6, sizeof(cl_int), &dst_offset);
+  cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_row_pitch);
+  cl_kernel_set_arg(ker, 8, sizeof(cl_int), &src_slice_pitch);
+  cl_kernel_set_arg(ker, 9, sizeof(cl_int), &dst_row_pitch);
+  cl_kernel_set_arg(ker, 10, sizeof(cl_int), &dst_slice_pitch);
+
+  ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+  cl_command_queue_finish(queue);
+
+  return ret;
+}
+
 LOCAL void*
 cl_mem_map(cl_mem mem)
 {
diff --git a/src/cl_mem.h b/src/cl_mem.h
index 9a70913..b9b3c0a 100644
--- a/src/cl_mem.h
+++ b/src/cl_mem.h
@@ -186,6 +186,10 @@ extern void cl_mem_gl_delete(struct _cl_mem_gl_image *);
 /* Add one more reference to this object */
 extern void cl_mem_add_ref(cl_mem);
 
+/* api clEnqueueCopy buffer rect help function */
+extern cl_int cl_mem_copy_buffer_rect(cl_command_queue, cl_mem, cl_mem,
+                                     const size_t *, const size_t *, const size_t *,
+                                     size_t, size_t, size_t, size_t);
 /* Directly map a memory object */
 extern void *cl_mem_map(cl_mem);
 
-- 
1.8.1.2



More information about the Beignet mailing list