[Beignet] [PATCH] [PATCH_V3] GBE: Improve the clEnqueueCopyBuffer performance in not-aligned case

Lv Meng meng.lv at intel.com
Thu Mar 20 00:06:47 PDT 2014


Signed-off-by: Lv Meng <meng.lv at intel.com>
---
 src/CMakeLists.txt                             |  3 +-
 src/cl_context.h                               |  1 +
 src/cl_mem.c                                   | 79 ++++++++++++++++++++++----
 src/kernels/cl_internel_copy_buf_dword_copy.cl | 19 +++++++
 4 files changed, 89 insertions(+), 13 deletions(-)
 mode change 100644 => 100755 src/CMakeLists.txt
 mode change 100644 => 100755 src/cl_context.h
 mode change 100644 => 100755 src/cl_mem.c
 create mode 100755 src/kernels/cl_internel_copy_buf_dword_copy.cl

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
old mode 100644
new mode 100755
index 95ff56f..3c23d3d
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -18,7 +18,8 @@ endforeach (KF)
 endmacro (MakeKernelBinStr)
 
 set (KERNEL_STR_FILES)
-set (KERNEL_NAMES cl_internal_copy_buf_align1 cl_internal_copy_buf_align4 cl_internal_copy_buf_align16)
+set (KERNEL_NAMES cl_internal_copy_buf_align1 cl_internal_copy_buf_align4
+cl_internal_copy_buf_align16 cl_internel_copy_buf_dword_copy)
 MakeKernelBinStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" "${KERNEL_NAMES}")
 
 set(OPENCL_SRC
diff --git a/src/cl_context.h b/src/cl_context.h
old mode 100644
new mode 100755
index 29bcb9f..7326458
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -43,6 +43,7 @@ enum _cl_internal_ker_type {
   CL_ENQUEUE_COPY_BUFFER_ALIGN1 = 0,
   CL_ENQUEUE_COPY_BUFFER_ALIGN4,
   CL_ENQUEUE_COPY_BUFFER_ALIGN16,
+  CL_ENQUEUE_COPY_BUFFER_DWORD_COPY,
   CL_ENQUEUE_COPY_BUFFER_RECT,
   CL_ENQUEUE_COPY_IMAGE_0,             //copy image 2d to image 2d
   CL_ENQUEUE_COPY_IMAGE_1,             //copy image 3d to image 2d
diff --git a/src/cl_mem.c b/src/cl_mem.c
old mode 100644
new mode 100755
index 9e0d334..0fd2959
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -749,6 +749,7 @@ cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
   size_t global_off[] = {0,0,0};
   size_t global_sz[] = {1,1,1};
   size_t local_sz[] = {1,1,1};
+  int baligned = 1;
 
   /* We use one kernel to copy the data. The kernel is lazily created. */
   assert(src_buf->ctx == dst_buf->ctx);
@@ -759,6 +760,7 @@ cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
 
     ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_ALIGN1,
              cl_internal_copy_buf_align1_str, (size_t)cl_internal_copy_buf_align1_str_size, NULL);
+    baligned = 0;
   } else if ((cb % 16) || (src_offset % 16) || (dst_offset % 16)) {
     extern char cl_internal_copy_buf_align4_str[];
     extern int cl_internal_copy_buf_align4_str_size;
@@ -782,20 +784,73 @@ cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
   if (!ker)
     return CL_OUT_OF_RESOURCES;
 
-  if (cb < LOCAL_SZ_0) {
-    local_sz[0] = 1;
+  if(baligned) {
+    if (cb < LOCAL_SZ_0) {
+      local_sz[0] = 1;
+    } else {
+      local_sz[0] = LOCAL_SZ_0;
+    }
+    global_sz[0] = ((cb + LOCAL_SZ_0 - 1)/LOCAL_SZ_0)*LOCAL_SZ_0;
+    cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
+    cl_kernel_set_arg(ker, 1, sizeof(int), &src_offset);
+    cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
+    cl_kernel_set_arg(ker, 3, sizeof(int), &dst_offset);
+    cl_kernel_set_arg(ker, 4, sizeof(int), &cb);
+    ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
   } else {
-    local_sz[0] = LOCAL_SZ_0;
+    extern char cl_internel_copy_buf_dword_copy_str[];
+    extern int cl_internel_copy_buf_dword_copy_str_size;
+    cl_kernel dword_ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_DWORD_COPY,
+             cl_internel_copy_buf_dword_copy_str, (size_t)cl_internel_copy_buf_dword_copy_str_size, NULL);
+    if (!dword_ker)
+      return CL_OUT_OF_RESOURCES;
+    int upbyte = dst_offset%4;
+    if(upbyte)
+      upbyte = 4-upbyte;
+    int alignbyte = cb - upbyte;
+    int aligndword = alignbyte/4;
+    int downbyte = alignbyte%4;
+    int dstalignoffset = dst_offset/4;
+    if(upbyte){
+      cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
+      cl_kernel_set_arg(ker, 1, sizeof(int), &src_offset);
+      cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
+      cl_kernel_set_arg(ker, 3, sizeof(int), &dst_offset);
+      cl_kernel_set_arg(ker, 4, sizeof(int), &upbyte);
+      global_sz[0] = LOCAL_SZ_0;
+      local_sz[0] = LOCAL_SZ_0;
+      ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+      if(aligndword)
+        cl_command_queue_flush(queue);
+      dst_offset += upbyte;
+      src_offset += upbyte;
+      dstalignoffset += 1;
+    }
+    if(aligndword){
+      cl_kernel_set_arg(dword_ker, 0, sizeof(cl_mem), &src_buf);
+      cl_kernel_set_arg(dword_ker, 1, sizeof(int), &src_offset);
+      cl_kernel_set_arg(dword_ker, 2, sizeof(cl_mem), &dst_buf);
+      cl_kernel_set_arg(dword_ker, 3, sizeof(int), &dstalignoffset);
+      cl_kernel_set_arg(dword_ker, 4, sizeof(int), &aligndword);
+      global_sz[0] = ((aligndword + LOCAL_SZ_0 - 1)/LOCAL_SZ_0)*LOCAL_SZ_0;
+      local_sz[0] = LOCAL_SZ_0;
+      ret = cl_command_queue_ND_range(queue, dword_ker, 1, global_off, global_sz, local_sz);
+      if(downbyte)
+        cl_command_queue_flush(queue);
+      src_offset += aligndword*4;
+      dst_offset += aligndword*4;
+    }
+    if(downbyte){
+      cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
+      cl_kernel_set_arg(ker, 1, sizeof(int), &src_offset);
+      cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
+      cl_kernel_set_arg(ker, 3, sizeof(int), &dst_offset);
+      cl_kernel_set_arg(ker, 4, sizeof(int), &downbyte);
+      global_sz[0] = LOCAL_SZ_0;
+      local_sz[0] = LOCAL_SZ_0;
+      ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+    }
   }
-  global_sz[0] = ((cb + LOCAL_SZ_0 - 1)/LOCAL_SZ_0)*LOCAL_SZ_0;
-
-  cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
-  cl_kernel_set_arg(ker, 1, sizeof(int), &src_offset);
-  cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
-  cl_kernel_set_arg(ker, 3, sizeof(int), &dst_offset);
-  cl_kernel_set_arg(ker, 4, sizeof(int), &cb);
-
-  ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
 
   return ret;
 }
diff --git a/src/kernels/cl_internel_copy_buf_dword_copy.cl b/src/kernels/cl_internel_copy_buf_dword_copy.cl
new file mode 100755
index 0000000..55a76d0
--- /dev/null
+++ b/src/kernels/cl_internel_copy_buf_dword_copy.cl
@@ -0,0 +1,19 @@
+kernel void dword_copy(__global unsigned int*src,int srcoffset,__global unsigned int*dst,int dstalignoffset,int size){
+    unsigned int outdata = 0;
+    unsigned char lsm[8];
+    unsigned int* li = lsm;
+    int lsmoffset = srcoffset%4;
+    __global unsigned int *src_algin = src+(srcoffset/4);
+    __global unsigned int *dst_align = dst+dstalignoffset;
+    int gid = get_global_id(0);
+    if(gid<size){
+     *li = src_algin[gid];
+     if(lsmoffset){
+       *(li+1) = src_algin[gid+1];
+       outdata = (lsm[lsmoffset])|(lsm[lsmoffset+1]<<8)|(lsm[lsmoffset+2]<<16)|(lsm[lsmoffset+3]<<24);
+     }
+     else
+       outdata = *li;
+     dst_align[gid] = outdata;
+    }
+}
\ No newline at end of file
-- 
1.8.3.2



More information about the Beignet mailing list