[Beignet] [PATCH 3/4] Implement the clEnqueueCopyBuffer API using internal binary kernel

Yang, Rong R rong.r.yang at intel.com
Tue Oct 8 22:47:50 PDT 2013


In cl_mem_copy, the local work-group size is only 1; setting it to 16 or larger may improve performance.

-----Original Message-----
From: beignet-bounces+rong.r.yang=intel.com at lists.freedesktop.org [mailto:beignet-bounces+rong.r.yang=intel.com at lists.freedesktop.org] On Behalf Of junyan.he at inbox.com
Sent: Monday, September 23, 2013 5:02 PM
To: beignet at lists.freedesktop.org
Cc: Junyan He
Subject: [Beignet] [PATCH 3/4] Implement the clEnqueueCopyBuffer API using internal binary kernel

From: Junyan He <junyan.he at linux.intel.com>

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 src/cl_api.c     |   51 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 src/cl_context.c |   26 ++++++++++++++++++++++++++
 src/cl_context.h |   28 +++++++++++++++++-----------
 src/cl_mem.c     |   53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/cl_mem.h     |    4 ++++
 5 files changed, 150 insertions(+), 12 deletions(-)

diff --git a/src/cl_api.c b/src/cl_api.c index c81f730..e9303d8 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -1521,8 +1521,65 @@ clEnqueueCopyBuffer(cl_command_queue     command_queue,
                     const cl_event *     event_wait_list,
                     cl_event *           event)
 {
-  NOT_IMPLEMENTED;
+  /* Validate the arguments, enqueue the copy through cl_mem_copy, then hand
+   * the command to the event handling. */
+  cl_int err = CL_SUCCESS;
+  enqueue_data *data, no_wait_data = { 0 };
+
+  CHECK_QUEUE(command_queue);
+  CHECK_MEM(src_buffer);
+  CHECK_MEM(dst_buffer);
+
+  /* Both buffers must belong to the same context as the queue. */
+  if (command_queue->ctx != src_buffer->ctx) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
+  if (command_queue->ctx != dst_buffer->ctx) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
+  /* The copied range must fit inside each buffer; the destination range is
+   * checked against the destination buffer's own size. */
+  if (src_offset < 0 || src_offset + cb > src_buffer->size) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+  if (dst_offset < 0 || dst_offset + cb > dst_buffer->size) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  /* Check overlap: within a single buffer the two ranges overlap when either
+   * range's first byte falls inside the other range. */
+  if (src_buffer == dst_buffer
+         && ((src_offset <= dst_offset && dst_offset <= src_offset + cb - 1)
+             || (dst_offset <= src_offset && src_offset <= dst_offset + cb - 1))) {
+    err = CL_MEM_COPY_OVERLAP;
+    goto error;
+  }
+
+  // TODO: Need to check the sub buffer cases.
+  err = cl_mem_copy(command_queue, src_buffer, dst_buffer, src_offset, dst_offset, cb);
+  if (err != CL_SUCCESS)
+    goto error;
+
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_buffer->ctx);
+
+  data = &no_wait_data;
+  data->type = EnqueueCopyBuffer;
+  data->queue = command_queue;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_COPY_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+    err = cl_command_queue_flush(command_queue);
+  }
-  return 0;
+
+  /* Success and failure share this exit so that a flush error is reported. */
+error:
+  return err;
 }
 
 cl_int
diff --git a/src/cl_context.c b/src/cl_context.c index 4f1c611..b62e946 100644
--- a/src/cl_context.c
+++ b/src/cl_context.c
@@ -268,3 +268,33 @@ cl_context_get_static_kernel(cl_context ctx, cl_int index, const char * str_kern
 
   return ctx->internel_kernels[index];
 }
+
+/* Return the internal kernel stored in slot `index`, creating it on first use
+ * from the program binary `str_kernel` of `size` bytes, built with
+ * `str_option`. Returns NULL when the program cannot be created or built. */
+cl_kernel
+cl_context_get_static_kernel_form_bin(cl_context ctx, cl_int index,
+                  const char * str_kernel, size_t size, const char * str_option)
+{
+  cl_int status;
+  cl_int bin_status = CL_SUCCESS;
+
+  /* A program already occupies this slot: return whatever kernel is cached. */
+  if (ctx->internal_prgs[index] != NULL)
+    return ctx->internel_kernels[index];
+
+  ctx->internal_prgs[index] = cl_program_create_from_binary(ctx, 1, &ctx->device,
+      &size, (const unsigned char **)&str_kernel, &bin_status, &status);
+  if (ctx->internal_prgs[index] == NULL)
+    return NULL;
+
+  status = cl_program_build(ctx->internal_prgs[index], str_option);
+  if (status != CL_SUCCESS)
+    return NULL;
+
+  /* Flag the program as built and cache cl_kernel_dup of its first kernel. */
+  ctx->internal_prgs[index]->is_built = 1;
+  ctx->internel_kernels[index] = cl_kernel_dup(ctx->internal_prgs[index]->ker[0]);
+
+  return ctx->internel_kernels[index];
+}
diff --git a/src/cl_context.h b/src/cl_context.h index 7016733..29bcb9f 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -40,17 +40,19 @@ enum _cl_gl_context_type {  };
 
 enum _cl_internal_ker_type {
-  CL_ENQUEUE_COPY_BUFFER = 0,
-  CL_ENQUEUE_COPY_BUFFER_RECT = 1,
-  CL_ENQUEUE_COPY_IMAGE_0 = 2,             //copy image 2d to image 2d
-  CL_ENQUEUE_COPY_IMAGE_1 = 3,             //copy image 3d to image 2d
-  CL_ENQUEUE_COPY_IMAGE_2 = 4,             //copy image 2d to image 3d
-  CL_ENQUEUE_COPY_IMAGE_3 = 5,             //copy image 3d to image 3d
-  CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_0 = 6,   //copy image 2d to buffer
-  CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_1 = 7,   //copy image 3d tobuffer
-  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_0 = 8,   //copy buffer to image 2d
-  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_1 = 9,   //copy buffer to image 3d
-  CL_INTERNAL_KERNEL_MAX = 10
+  CL_ENQUEUE_COPY_BUFFER_ALIGN1 = 0,   //copy buffer, size or an offset not a multiple of 4 bytes
+  CL_ENQUEUE_COPY_BUFFER_ALIGN4,       //copy buffer, all multiples of 4 but not all of 16 bytes
+  CL_ENQUEUE_COPY_BUFFER_ALIGN16,      //copy buffer, size and offsets all multiples of 16 bytes
+  CL_ENQUEUE_COPY_BUFFER_RECT,
+  CL_ENQUEUE_COPY_IMAGE_0,             //copy image 2d to image 2d
+  CL_ENQUEUE_COPY_IMAGE_1,             //copy image 3d to image 2d
+  CL_ENQUEUE_COPY_IMAGE_2,             //copy image 2d to image 3d
+  CL_ENQUEUE_COPY_IMAGE_3,             //copy image 3d to image 3d
+  CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_0,   //copy image 2d to buffer
+  CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_1,   //copy image 3d to buffer
+  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_0,   //copy buffer to image 2d
+  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_1,   //copy buffer to image 3d
+  CL_INTERNAL_KERNEL_MAX               //one past the last internal kernel index
 };
 
 struct _cl_context_prop {
@@ -137,5 +139,9 @@ extern cl_buffer_mgr cl_context_get_bufmgr(cl_context ctx);
 /* Get the internal used kernel */
 extern cl_kernel cl_context_get_static_kernel(cl_context ctx, cl_int index, const char *str_kernel, const char * str_option);
 
+/* Return the internal kernel for slot `index`, creating it on first use from the program binary `str_kernel` (`size` bytes) built with `str_option`; NULL on failure. */ extern cl_kernel 
+cl_context_get_static_kernel_form_bin(cl_context ctx, cl_int index,
+                  const char * str_kernel, size_t size, const char * 
+str_option);
+
 #endif /* __CL_CONTEXT_H__ */
 
diff --git a/src/cl_mem.c b/src/cl_mem.c index 769e1cb..56bc6b1 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -596,6 +596,78 @@ cl_mem_add_ref(cl_mem mem)
   atomic_inc(&mem->ref_n);
 }
 
+/* Helper for clEnqueueCopyBuffer: copy `cb` bytes from `src_buf` (starting at
+ * byte `src_offset`) to `dst_buf` (starting at byte `dst_offset`) by running
+ * one of three internal copy kernels on `queue`. The kernel is picked from the
+ * alignment of `cb` and both offsets. Returns CL_OUT_OF_RESOURCES when the
+ * kernel cannot be obtained, otherwise the result of
+ * cl_command_queue_ND_range. */
+LOCAL cl_int
+cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
+            size_t src_offset, size_t dst_offset, size_t cb)
+{
+  cl_int ret;
+  cl_kernel ker;
+  size_t global_off[] = {0,0,0};
+  size_t global_sz[] = {1,1,1};
+  size_t local_sz[] = {1,1,1};
+  /* The offsets are handed to the kernel as sizeof(int)-sized arguments, so
+   * they are narrowed into real ints instead of pointing the argument at the
+   * first bytes of a size_t. */
+  int src_off_arg;
+  int dst_off_arg;
+
+  /* We use one kernel to copy the data. The kernel is lazily created. */
+  assert(src_buf->ctx == dst_buf->ctx);
+
+  if ((cb % 4) || (src_offset % 4) || (dst_offset % 4)) {
+    /* Something is not a multiple of 4: the global size is the byte count. */
+    extern char cl_internal_copy_buf_align1_str[];
+    extern int cl_internal_copy_buf_align1_str_size;
+
+    ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_ALIGN1,
+             cl_internal_copy_buf_align1_str, (size_t)cl_internal_copy_buf_align1_str_size, NULL);
+    global_sz[0] = cb;
+  } else if ((cb % 16) || (src_offset % 16) || (dst_offset % 16)) {
+    /* Multiples of 4 but not all of 16: global size and offsets count 4-byte units. */
+    extern char cl_internal_copy_buf_align4_str[];
+    extern int cl_internal_copy_buf_align4_str_size;
+
+    ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_ALIGN4,
+             cl_internal_copy_buf_align4_str, (size_t)cl_internal_copy_buf_align4_str_size, NULL);
+    global_sz[0] = cb/4;
+    src_offset = src_offset/4;
+    dst_offset = dst_offset/4;
+  } else {
+    /* Everything is a multiple of 16: the global size counts 16-byte units. */
+    extern char cl_internal_copy_buf_align16_str[];
+    extern int cl_internal_copy_buf_align16_str_size;
+
+    ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_ALIGN16,
+             cl_internal_copy_buf_align16_str, (size_t)cl_internal_copy_buf_align16_str_size, NULL);
+    global_sz[0] = cb/16;
+    /* NOTE(review): the offsets are divided by 4 here, not 16 - presumably the
+     * align16 kernel indexes 4-byte elements; confirm against the kernel. */
+    src_offset = src_offset/4;
+    dst_offset = dst_offset/4;
+  }
+
+  if (!ker)
+    return CL_OUT_OF_RESOURCES;
+
+  src_off_arg = (int)src_offset;
+  dst_off_arg = (int)dst_offset;
+
+  cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
+  cl_kernel_set_arg(ker, 1, sizeof(int), &src_off_arg);
+  cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
+  cl_kernel_set_arg(ker, 3, sizeof(int), &dst_off_arg);
+
+  ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+
+  return ret;
+}
+
 #define LOCAL_SZ_0   16
 #define LOCAL_SZ_1   4
 #define LOCAL_SZ_2   4
diff --git a/src/cl_mem.h b/src/cl_mem.h index ac09c3b..3313224 100644
--- a/src/cl_mem.h
+++ b/src/cl_mem.h
@@ -187,6 +187,10 @@ extern void cl_mem_gl_delete(struct _cl_mem_gl_image *);
 /* Add one more reference to this object */  extern void cl_mem_add_ref(cl_mem);
 
+/* Helper for the clEnqueueCopyBuffer API: enqueue on `queue` a copy of `cb` bytes from `src_buf` at byte `src_offset` to `dst_buf` at byte `dst_offset`. */ extern cl_int 
+cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
+              size_t src_offset, size_t dst_offset, size_t cb);
+
 /* api clEnqueueCopyBufferRect help function */  extern cl_int cl_mem_copy_buffer_rect(cl_command_queue, cl_mem, cl_mem,
                                      const size_t *, const size_t *, const size_t *,
--
1.7.9.5

_______________________________________________
Beignet mailing list
Beignet at lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


More information about the Beignet mailing list