[Beignet] [PATCH] [PATCH_V3] GBE: Improve the clEnqueueCopyBuffer performance in not-aligned case
Sun, Yi
yi.sun at intel.com
Mon Mar 31 20:19:15 PDT 2014
Verified.
This patch can save 2/3 time cost for clEnqueueCopyBuffer.
The test case has been sent out:
[Beignet][PATCH 1/2 v2] Prepare to add uperformance test suite
[Beignet] [PATCH 2/2 v2] Add unit performance suite
Thanks
--Sun, Yi
> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of Lv
> Meng
> Sent: Thursday, March 20, 2014 3:07 PM
> To: beignet at lists.freedesktop.org
> Cc: Lv, Meng
> Subject: [Beignet] [PATCH] [PATCH_V3] GBE: Improve the clEnqueueCopyBuffer
> performance in not-aligned case
>
> Signed-off-by: Lv Meng <meng.lv at intel.com>
> ---
> src/CMakeLists.txt | 3 +-
> src/cl_context.h | 1 +
> src/cl_mem.c | 79
> ++++++++++++++++++++++----
> src/kernels/cl_internel_copy_buf_dword_copy.cl | 19 +++++++
> 4 files changed, 89 insertions(+), 13 deletions(-) mode change 100644 =>
> 100755 src/CMakeLists.txt mode change 100644 => 100755 src/cl_context.h
> mode change 100644 => 100755 src/cl_mem.c create mode 100755
> src/kernels/cl_internel_copy_buf_dword_copy.cl
>
> diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt old mode 100644 new
> mode 100755 index 95ff56f..3c23d3d
> --- a/src/CMakeLists.txt
> +++ b/src/CMakeLists.txt
> @@ -18,7 +18,8 @@ endforeach (KF)
> endmacro (MakeKernelBinStr)
>
> set (KERNEL_STR_FILES)
> -set (KERNEL_NAMES cl_internal_copy_buf_align1 cl_internal_copy_buf_align4
> cl_internal_copy_buf_align16)
> +set (KERNEL_NAMES cl_internal_copy_buf_align1
> +cl_internal_copy_buf_align4
> +cl_internal_copy_buf_align16 cl_internel_copy_buf_dword_copy)
> MakeKernelBinStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/"
> "${KERNEL_NAMES}")
>
> set(OPENCL_SRC
> diff --git a/src/cl_context.h b/src/cl_context.h old mode 100644 new mode
> 100755 index 29bcb9f..7326458
> --- a/src/cl_context.h
> +++ b/src/cl_context.h
> @@ -43,6 +43,7 @@ enum _cl_internal_ker_type {
> CL_ENQUEUE_COPY_BUFFER_ALIGN1 = 0,
> CL_ENQUEUE_COPY_BUFFER_ALIGN4,
> CL_ENQUEUE_COPY_BUFFER_ALIGN16,
> + CL_ENQUEUE_COPY_BUFFER_DWORD_COPY,
> CL_ENQUEUE_COPY_BUFFER_RECT,
> CL_ENQUEUE_COPY_IMAGE_0, //copy image 2d to image
> 2d
> CL_ENQUEUE_COPY_IMAGE_1, //copy image 3d to image
> 2d
> diff --git a/src/cl_mem.c b/src/cl_mem.c old mode 100644 new mode 100755
> index 9e0d334..0fd2959
> --- a/src/cl_mem.c
> +++ b/src/cl_mem.c
> @@ -749,6 +749,7 @@ cl_mem_copy(cl_command_queue queue, cl_mem
> src_buf, cl_mem dst_buf,
> size_t global_off[] = {0,0,0};
> size_t global_sz[] = {1,1,1};
> size_t local_sz[] = {1,1,1};
> + int baligned = 1;
>
> /* We use one kernel to copy the data. The kernel is lazily created. */
> assert(src_buf->ctx == dst_buf->ctx); @@ -759,6 +760,7 @@
> cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
>
> ker = cl_context_get_static_kernel_form_bin(queue->ctx,
> CL_ENQUEUE_COPY_BUFFER_ALIGN1,
> cl_internal_copy_buf_align1_str,
> (size_t)cl_internal_copy_buf_align1_str_size, NULL);
> + baligned = 0;
> } else if ((cb % 16) || (src_offset % 16) || (dst_offset % 16)) {
> extern char cl_internal_copy_buf_align4_str[];
> extern int cl_internal_copy_buf_align4_str_size;
> @@ -782,20 +784,73 @@ cl_mem_copy(cl_command_queue queue, cl_mem
> src_buf, cl_mem dst_buf,
> if (!ker)
> return CL_OUT_OF_RESOURCES;
>
> - if (cb < LOCAL_SZ_0) {
> - local_sz[0] = 1;
> + if(baligned) {
> + if (cb < LOCAL_SZ_0) {
> + local_sz[0] = 1;
> + } else {
> + local_sz[0] = LOCAL_SZ_0;
> + }
> + global_sz[0] = ((cb + LOCAL_SZ_0 - 1)/LOCAL_SZ_0)*LOCAL_SZ_0;
> + cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
> + cl_kernel_set_arg(ker, 1, sizeof(int), &src_offset);
> + cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
> + cl_kernel_set_arg(ker, 3, sizeof(int), &dst_offset);
> + cl_kernel_set_arg(ker, 4, sizeof(int), &cb);
> + ret = cl_command_queue_ND_range(queue, ker, 1, global_off,
> + global_sz, local_sz);
> } else {
> - local_sz[0] = LOCAL_SZ_0;
> + extern char cl_internel_copy_buf_dword_copy_str[];
> + extern int cl_internel_copy_buf_dword_copy_str_size;
> + cl_kernel dword_ker =
> cl_context_get_static_kernel_form_bin(queue->ctx,
> CL_ENQUEUE_COPY_BUFFER_DWORD_COPY,
> + cl_internel_copy_buf_dword_copy_str,
> (size_t)cl_internel_copy_buf_dword_copy_str_size, NULL);
> + if (!dword_ker)
> + return CL_OUT_OF_RESOURCES;
> + int upbyte = dst_offset%4;
> + if(upbyte)
> + upbyte = 4-upbyte;
> + int alignbyte = cb - upbyte;
> + int aligndword = alignbyte/4;
> + int downbyte = alignbyte%4;
> + int dstalignoffset = dst_offset/4;
> + if(upbyte){
> + cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
> + cl_kernel_set_arg(ker, 1, sizeof(int), &src_offset);
> + cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
> + cl_kernel_set_arg(ker, 3, sizeof(int), &dst_offset);
> + cl_kernel_set_arg(ker, 4, sizeof(int), &upbyte);
> + global_sz[0] = LOCAL_SZ_0;
> + local_sz[0] = LOCAL_SZ_0;
> + ret = cl_command_queue_ND_range(queue, ker, 1, global_off,
> global_sz, local_sz);
> + if(aligndword)
> + cl_command_queue_flush(queue);
> + dst_offset += upbyte;
> + src_offset += upbyte;
> + dstalignoffset += 1;
> + }
> + if(aligndword){
> + cl_kernel_set_arg(dword_ker, 0, sizeof(cl_mem), &src_buf);
> + cl_kernel_set_arg(dword_ker, 1, sizeof(int), &src_offset);
> + cl_kernel_set_arg(dword_ker, 2, sizeof(cl_mem), &dst_buf);
> + cl_kernel_set_arg(dword_ker, 3, sizeof(int), &dstalignoffset);
> + cl_kernel_set_arg(dword_ker, 4, sizeof(int), &aligndword);
> + global_sz[0] = ((aligndword + LOCAL_SZ_0 -
> 1)/LOCAL_SZ_0)*LOCAL_SZ_0;
> + local_sz[0] = LOCAL_SZ_0;
> + ret = cl_command_queue_ND_range(queue, dword_ker, 1, global_off,
> global_sz, local_sz);
> + if(downbyte)
> + cl_command_queue_flush(queue);
> + src_offset += aligndword*4;
> + dst_offset += aligndword*4;
> + }
> + if(downbyte){
> + cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
> + cl_kernel_set_arg(ker, 1, sizeof(int), &src_offset);
> + cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
> + cl_kernel_set_arg(ker, 3, sizeof(int), &dst_offset);
> + cl_kernel_set_arg(ker, 4, sizeof(int), &downbyte);
> + global_sz[0] = LOCAL_SZ_0;
> + local_sz[0] = LOCAL_SZ_0;
> + ret = cl_command_queue_ND_range(queue, ker, 1, global_off,
> global_sz, local_sz);
> + }
> }
> - global_sz[0] = ((cb + LOCAL_SZ_0 - 1)/LOCAL_SZ_0)*LOCAL_SZ_0;
> -
> - cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
> - cl_kernel_set_arg(ker, 1, sizeof(int), &src_offset);
> - cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
> - cl_kernel_set_arg(ker, 3, sizeof(int), &dst_offset);
> - cl_kernel_set_arg(ker, 4, sizeof(int), &cb);
> -
> - ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz,
> local_sz);
>
> return ret;
> }
> diff --git a/src/kernels/cl_internel_copy_buf_dword_copy.cl
> b/src/kernels/cl_internel_copy_buf_dword_copy.cl
> new file mode 100755
> index 0000000..55a76d0
> --- /dev/null
> +++ b/src/kernels/cl_internel_copy_buf_dword_copy.cl
> @@ -0,0 +1,19 @@
> +kernel void dword_copy(__global unsigned int*src,int srcoffset,__global
> unsigned int*dst,int dstalignoffset,int size){
> + unsigned int outdata = 0;
> + unsigned char lsm[8];
> + unsigned int* li = lsm;
> + int lsmoffset = srcoffset%4;
> + __global unsigned int *src_algin = src+(srcoffset/4);
> + __global unsigned int *dst_align = dst+dstalignoffset;
> + int gid = get_global_id(0);
> + if(gid<size){
> + *li = src_algin[gid];
> + if(lsmoffset){
> + *(li+1) = src_algin[gid+1];
> + outdata =
> (lsm[lsmoffset])|(lsm[lsmoffset+1]<<8)|(lsm[lsmoffset+2]<<16)|(lsm[lsmoffset
> +3]<<24);
> + }
> + else
> + outdata = *li;
> + dst_align[gid] = outdata;
> + }
> +}
> \ No newline at end of file
> --
> 1.8.3.2
>
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet
More information about the Beignet
mailing list