[Beignet] [PATCH] [PATCH_V3] GBE: Improve the clEnqueueCopyBuffer performance in not-aligned case

Sun, Yi yi.sun at intel.com
Mon Mar 31 20:19:15 PDT 2014


Verified.

This patch can save 2/3 time cost for clEnqueueCopyBuffer.
The test case has been sent out:
[Beignet][PATCH 1/2 v2] Prepare to add uperformance test suite
[Beignet] [PATCH 2/2 v2] Add unit performance suite

Thanks
  --Sun, Yi

> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of Lv
> Meng
> Sent: Thursday, March 20, 2014 3:07 PM
> To: beignet at lists.freedesktop.org
> Cc: Lv, Meng
> Subject: [Beignet] [PATCH] [PATCH_V3] GBE: Improve the clEnqueueCopyBuffer
> performance in not-aligned case
> 
> Signed-off-by: Lv Meng <meng.lv at intel.com>
> ---
>  src/CMakeLists.txt                             |  3 +-
>  src/cl_context.h                               |  1 +
>  src/cl_mem.c                                   | 79
> ++++++++++++++++++++++----
>  src/kernels/cl_internel_copy_buf_dword_copy.cl | 19 +++++++
>  4 files changed, 89 insertions(+), 13 deletions(-)  mode change 100644 =>
> 100755 src/CMakeLists.txt  mode change 100644 => 100755 src/cl_context.h
> mode change 100644 => 100755 src/cl_mem.c  create mode 100755
> src/kernels/cl_internel_copy_buf_dword_copy.cl
> 
> diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt old mode 100644 new
> mode 100755 index 95ff56f..3c23d3d
> --- a/src/CMakeLists.txt
> +++ b/src/CMakeLists.txt
> @@ -18,7 +18,8 @@ endforeach (KF)
>  endmacro (MakeKernelBinStr)
> 
>  set (KERNEL_STR_FILES)
> -set (KERNEL_NAMES cl_internal_copy_buf_align1 cl_internal_copy_buf_align4
> cl_internal_copy_buf_align16)
> +set (KERNEL_NAMES cl_internal_copy_buf_align1
> +cl_internal_copy_buf_align4
> +cl_internal_copy_buf_align16 cl_internel_copy_buf_dword_copy)
>  MakeKernelBinStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/"
> "${KERNEL_NAMES}")
> 
>  set(OPENCL_SRC
> diff --git a/src/cl_context.h b/src/cl_context.h old mode 100644 new mode
> 100755 index 29bcb9f..7326458
> --- a/src/cl_context.h
> +++ b/src/cl_context.h
> @@ -43,6 +43,7 @@ enum _cl_internal_ker_type {
>    CL_ENQUEUE_COPY_BUFFER_ALIGN1 = 0,
>    CL_ENQUEUE_COPY_BUFFER_ALIGN4,
>    CL_ENQUEUE_COPY_BUFFER_ALIGN16,
> +  CL_ENQUEUE_COPY_BUFFER_DWORD_COPY,
>    CL_ENQUEUE_COPY_BUFFER_RECT,
>    CL_ENQUEUE_COPY_IMAGE_0,             //copy image 2d to image
> 2d
>    CL_ENQUEUE_COPY_IMAGE_1,             //copy image 3d to image
> 2d
> diff --git a/src/cl_mem.c b/src/cl_mem.c old mode 100644 new mode 100755
> index 9e0d334..0fd2959
> --- a/src/cl_mem.c
> +++ b/src/cl_mem.c
> @@ -749,6 +749,7 @@ cl_mem_copy(cl_command_queue queue, cl_mem
> src_buf, cl_mem dst_buf,
>    size_t global_off[] = {0,0,0};
>    size_t global_sz[] = {1,1,1};
>    size_t local_sz[] = {1,1,1};
> +  int baligned = 1;
> 
>    /* We use one kernel to copy the data. The kernel is lazily created. */
>    assert(src_buf->ctx == dst_buf->ctx); @@ -759,6 +760,7 @@
> cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
> 
>      ker = cl_context_get_static_kernel_form_bin(queue->ctx,
> CL_ENQUEUE_COPY_BUFFER_ALIGN1,
>               cl_internal_copy_buf_align1_str,
> (size_t)cl_internal_copy_buf_align1_str_size, NULL);
> +    baligned = 0;
>    } else if ((cb % 16) || (src_offset % 16) || (dst_offset % 16)) {
>      extern char cl_internal_copy_buf_align4_str[];
>      extern int cl_internal_copy_buf_align4_str_size;
> @@ -782,20 +784,73 @@ cl_mem_copy(cl_command_queue queue, cl_mem
> src_buf, cl_mem dst_buf,
>    if (!ker)
>      return CL_OUT_OF_RESOURCES;
> 
> -  if (cb < LOCAL_SZ_0) {
> -    local_sz[0] = 1;
> +  if(baligned) {
> +    if (cb < LOCAL_SZ_0) {
> +      local_sz[0] = 1;
> +    } else {
> +      local_sz[0] = LOCAL_SZ_0;
> +    }
> +    global_sz[0] = ((cb + LOCAL_SZ_0 - 1)/LOCAL_SZ_0)*LOCAL_SZ_0;
> +    cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
> +    cl_kernel_set_arg(ker, 1, sizeof(int), &src_offset);
> +    cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
> +    cl_kernel_set_arg(ker, 3, sizeof(int), &dst_offset);
> +    cl_kernel_set_arg(ker, 4, sizeof(int), &cb);
> +    ret = cl_command_queue_ND_range(queue, ker, 1, global_off,
> + global_sz, local_sz);
>    } else {
> -    local_sz[0] = LOCAL_SZ_0;
> +    extern char cl_internel_copy_buf_dword_copy_str[];
> +    extern int cl_internel_copy_buf_dword_copy_str_size;
> +    cl_kernel dword_ker =
> cl_context_get_static_kernel_form_bin(queue->ctx,
> CL_ENQUEUE_COPY_BUFFER_DWORD_COPY,
> +             cl_internel_copy_buf_dword_copy_str,
> (size_t)cl_internel_copy_buf_dword_copy_str_size, NULL);
> +    if (!dword_ker)
> +      return CL_OUT_OF_RESOURCES;
> +    int upbyte = dst_offset%4;
> +    if(upbyte)
> +      upbyte = 4-upbyte;
> +    int alignbyte = cb - upbyte;
> +    int aligndword = alignbyte/4;
> +    int downbyte = alignbyte%4;
> +    int dstalignoffset = dst_offset/4;
> +    if(upbyte){
> +      cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
> +      cl_kernel_set_arg(ker, 1, sizeof(int), &src_offset);
> +      cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
> +      cl_kernel_set_arg(ker, 3, sizeof(int), &dst_offset);
> +      cl_kernel_set_arg(ker, 4, sizeof(int), &upbyte);
> +      global_sz[0] = LOCAL_SZ_0;
> +      local_sz[0] = LOCAL_SZ_0;
> +      ret = cl_command_queue_ND_range(queue, ker, 1, global_off,
> global_sz, local_sz);
> +      if(aligndword)
> +        cl_command_queue_flush(queue);
> +      dst_offset += upbyte;
> +      src_offset += upbyte;
> +      dstalignoffset += 1;
> +    }
> +    if(aligndword){
> +      cl_kernel_set_arg(dword_ker, 0, sizeof(cl_mem), &src_buf);
> +      cl_kernel_set_arg(dword_ker, 1, sizeof(int), &src_offset);
> +      cl_kernel_set_arg(dword_ker, 2, sizeof(cl_mem), &dst_buf);
> +      cl_kernel_set_arg(dword_ker, 3, sizeof(int), &dstalignoffset);
> +      cl_kernel_set_arg(dword_ker, 4, sizeof(int), &aligndword);
> +      global_sz[0] = ((aligndword + LOCAL_SZ_0 -
> 1)/LOCAL_SZ_0)*LOCAL_SZ_0;
> +      local_sz[0] = LOCAL_SZ_0;
> +      ret = cl_command_queue_ND_range(queue, dword_ker, 1, global_off,
> global_sz, local_sz);
> +      if(downbyte)
> +        cl_command_queue_flush(queue);
> +      src_offset += aligndword*4;
> +      dst_offset += aligndword*4;
> +    }
> +    if(downbyte){
> +      cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
> +      cl_kernel_set_arg(ker, 1, sizeof(int), &src_offset);
> +      cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
> +      cl_kernel_set_arg(ker, 3, sizeof(int), &dst_offset);
> +      cl_kernel_set_arg(ker, 4, sizeof(int), &downbyte);
> +      global_sz[0] = LOCAL_SZ_0;
> +      local_sz[0] = LOCAL_SZ_0;
> +      ret = cl_command_queue_ND_range(queue, ker, 1, global_off,
> global_sz, local_sz);
> +    }
>    }
> -  global_sz[0] = ((cb + LOCAL_SZ_0 - 1)/LOCAL_SZ_0)*LOCAL_SZ_0;
> -
> -  cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
> -  cl_kernel_set_arg(ker, 1, sizeof(int), &src_offset);
> -  cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
> -  cl_kernel_set_arg(ker, 3, sizeof(int), &dst_offset);
> -  cl_kernel_set_arg(ker, 4, sizeof(int), &cb);
> -
> -  ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz,
> local_sz);
> 
>    return ret;
>  }
> diff --git a/src/kernels/cl_internel_copy_buf_dword_copy.cl
> b/src/kernels/cl_internel_copy_buf_dword_copy.cl
> new file mode 100755
> index 0000000..55a76d0
> --- /dev/null
> +++ b/src/kernels/cl_internel_copy_buf_dword_copy.cl
> @@ -0,0 +1,19 @@
> +kernel void dword_copy(__global unsigned int*src,int srcoffset,__global
> unsigned int*dst,int dstalignoffset,int size){
> +    unsigned int outdata = 0;
> +    unsigned char lsm[8];
> +    unsigned int* li = lsm;
> +    int lsmoffset = srcoffset%4;
> +    __global unsigned int *src_algin = src+(srcoffset/4);
> +    __global unsigned int *dst_align = dst+dstalignoffset;
> +    int gid = get_global_id(0);
> +    if(gid<size){
> +     *li = src_algin[gid];
> +     if(lsmoffset){
> +       *(li+1) = src_algin[gid+1];
> +       outdata =
> (lsm[lsmoffset])|(lsm[lsmoffset+1]<<8)|(lsm[lsmoffset+2]<<16)|(lsm[lsmoffset
> +3]<<24);
> +     }
> +     else
> +       outdata = *li;
> +     dst_align[gid] = outdata;
> +    }
> +}
> \ No newline at end of file
> --
> 1.8.3.2
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet


More information about the Beignet mailing list