[Beignet] [PATCH] [PATCH_V3] GBE: Improve the clEnqueueCopyBuffer performance in not-aligned case
Lv Meng
meng.lv at intel.com
Thu Mar 20 00:06:47 PDT 2014
Signed-off-by: Lv Meng <meng.lv at intel.com>
---
src/CMakeLists.txt | 3 +-
src/cl_context.h | 1 +
src/cl_mem.c | 79 ++++++++++++++++++++++----
src/kernels/cl_internel_copy_buf_dword_copy.cl | 19 +++++++
4 files changed, 89 insertions(+), 13 deletions(-)
mode change 100644 => 100755 src/CMakeLists.txt
mode change 100644 => 100755 src/cl_context.h
mode change 100644 => 100755 src/cl_mem.c
create mode 100755 src/kernels/cl_internel_copy_buf_dword_copy.cl
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
old mode 100644
new mode 100755
index 95ff56f..3c23d3d
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -18,7 +18,8 @@ endforeach (KF)
endmacro (MakeKernelBinStr)
set (KERNEL_STR_FILES)
-set (KERNEL_NAMES cl_internal_copy_buf_align1 cl_internal_copy_buf_align4 cl_internal_copy_buf_align16)
+set (KERNEL_NAMES cl_internal_copy_buf_align1 cl_internal_copy_buf_align4
+cl_internal_copy_buf_align16 cl_internel_copy_buf_dword_copy)
MakeKernelBinStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" "${KERNEL_NAMES}")
set(OPENCL_SRC
diff --git a/src/cl_context.h b/src/cl_context.h
old mode 100644
new mode 100755
index 29bcb9f..7326458
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -43,6 +43,7 @@ enum _cl_internal_ker_type {
CL_ENQUEUE_COPY_BUFFER_ALIGN1 = 0,
CL_ENQUEUE_COPY_BUFFER_ALIGN4,
CL_ENQUEUE_COPY_BUFFER_ALIGN16,
+ CL_ENQUEUE_COPY_BUFFER_DWORD_COPY,
CL_ENQUEUE_COPY_BUFFER_RECT,
CL_ENQUEUE_COPY_IMAGE_0, //copy image 2d to image 2d
CL_ENQUEUE_COPY_IMAGE_1, //copy image 3d to image 2d
diff --git a/src/cl_mem.c b/src/cl_mem.c
old mode 100644
new mode 100755
index 9e0d334..0fd2959
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -749,6 +749,7 @@ cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
size_t global_off[] = {0,0,0};
size_t global_sz[] = {1,1,1};
size_t local_sz[] = {1,1,1};
+ int baligned = 1;
/* We use one kernel to copy the data. The kernel is lazily created. */
assert(src_buf->ctx == dst_buf->ctx);
@@ -759,6 +760,7 @@ cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_ALIGN1,
cl_internal_copy_buf_align1_str, (size_t)cl_internal_copy_buf_align1_str_size, NULL);
+ baligned = 0;
} else if ((cb % 16) || (src_offset % 16) || (dst_offset % 16)) {
extern char cl_internal_copy_buf_align4_str[];
extern int cl_internal_copy_buf_align4_str_size;
@@ -782,20 +784,73 @@ cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
if (!ker)
return CL_OUT_OF_RESOURCES;
- if (cb < LOCAL_SZ_0) {
- local_sz[0] = 1;
+ if(baligned) {
+ if (cb < LOCAL_SZ_0) {
+ local_sz[0] = 1;
+ } else {
+ local_sz[0] = LOCAL_SZ_0;
+ }
+ global_sz[0] = ((cb + LOCAL_SZ_0 - 1)/LOCAL_SZ_0)*LOCAL_SZ_0;
+ cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
+ cl_kernel_set_arg(ker, 1, sizeof(int), &src_offset);
+ cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
+ cl_kernel_set_arg(ker, 3, sizeof(int), &dst_offset);
+ cl_kernel_set_arg(ker, 4, sizeof(int), &cb);
+ ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
} else {
- local_sz[0] = LOCAL_SZ_0;
+ extern char cl_internel_copy_buf_dword_copy_str[];
+ extern int cl_internel_copy_buf_dword_copy_str_size;
+ cl_kernel dword_ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_DWORD_COPY,
+ cl_internel_copy_buf_dword_copy_str, (size_t)cl_internel_copy_buf_dword_copy_str_size, NULL);
+ if (!dword_ker)
+ return CL_OUT_OF_RESOURCES;
+ int upbyte = dst_offset%4;
+ if(upbyte)
+ upbyte = 4-upbyte;
+ int alignbyte = cb - upbyte;
+ int aligndword = alignbyte/4;
+ int downbyte = alignbyte%4;
+ int dstalignoffset = dst_offset/4;
+ if(upbyte){
+ cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
+ cl_kernel_set_arg(ker, 1, sizeof(int), &src_offset);
+ cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
+ cl_kernel_set_arg(ker, 3, sizeof(int), &dst_offset);
+ cl_kernel_set_arg(ker, 4, sizeof(int), &upbyte);
+ global_sz[0] = LOCAL_SZ_0;
+ local_sz[0] = LOCAL_SZ_0;
+ ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+ if(aligndword)
+ cl_command_queue_flush(queue);
+ dst_offset += upbyte;
+ src_offset += upbyte;
+ dstalignoffset += 1;
+ }
+ if(aligndword){
+ cl_kernel_set_arg(dword_ker, 0, sizeof(cl_mem), &src_buf);
+ cl_kernel_set_arg(dword_ker, 1, sizeof(int), &src_offset);
+ cl_kernel_set_arg(dword_ker, 2, sizeof(cl_mem), &dst_buf);
+ cl_kernel_set_arg(dword_ker, 3, sizeof(int), &dstalignoffset);
+ cl_kernel_set_arg(dword_ker, 4, sizeof(int), &aligndword);
+ global_sz[0] = ((aligndword + LOCAL_SZ_0 - 1)/LOCAL_SZ_0)*LOCAL_SZ_0;
+ local_sz[0] = LOCAL_SZ_0;
+ ret = cl_command_queue_ND_range(queue, dword_ker, 1, global_off, global_sz, local_sz);
+ if(downbyte)
+ cl_command_queue_flush(queue);
+ src_offset += aligndword*4;
+ dst_offset += aligndword*4;
+ }
+ if(downbyte){
+ cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
+ cl_kernel_set_arg(ker, 1, sizeof(int), &src_offset);
+ cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
+ cl_kernel_set_arg(ker, 3, sizeof(int), &dst_offset);
+ cl_kernel_set_arg(ker, 4, sizeof(int), &downbyte);
+ global_sz[0] = LOCAL_SZ_0;
+ local_sz[0] = LOCAL_SZ_0;
+ ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+ }
}
- global_sz[0] = ((cb + LOCAL_SZ_0 - 1)/LOCAL_SZ_0)*LOCAL_SZ_0;
-
- cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
- cl_kernel_set_arg(ker, 1, sizeof(int), &src_offset);
- cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
- cl_kernel_set_arg(ker, 3, sizeof(int), &dst_offset);
- cl_kernel_set_arg(ker, 4, sizeof(int), &cb);
-
- ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
return ret;
}
diff --git a/src/kernels/cl_internel_copy_buf_dword_copy.cl b/src/kernels/cl_internel_copy_buf_dword_copy.cl
new file mode 100755
index 0000000..55a76d0
--- /dev/null
+++ b/src/kernels/cl_internel_copy_buf_dword_copy.cl
@@ -0,0 +1,19 @@
+kernel void dword_copy(__global unsigned int*src,int srcoffset,__global unsigned int*dst,int dstalignoffset,int size){
+ unsigned int outdata = 0;
+ unsigned char lsm[8];
+ unsigned int* li = lsm;
+ int lsmoffset = srcoffset%4;
+ __global unsigned int *src_algin = src+(srcoffset/4);
+ __global unsigned int *dst_align = dst+dstalignoffset;
+ int gid = get_global_id(0);
+ if(gid<size){
+ *li = src_algin[gid];
+ if(lsmoffset){
+ *(li+1) = src_algin[gid+1];
+ outdata = (lsm[lsmoffset])|(lsm[lsmoffset+1]<<8)|(lsm[lsmoffset+2]<<16)|(lsm[lsmoffset+3]<<24);
+ }
+ else
+ outdata = *li;
+ dst_align[gid] = outdata;
+ }
+}
\ No newline at end of file
--
1.8.3.2
More information about the Beignet
mailing list