[Beignet] [PATCH] [PATCH_V2]improve the clEnqueueCopyBufferRect performance in some cases

Zhigang Gong zhigang.gong at linux.intel.com
Tue Jul 15 18:12:06 PDT 2014


I dig into this patch carefully, and found one serious issue in this patch.
You should also take into account the local_sz[0] and the region[0]. Either
divide it by 4 on the kernel or on the host side. I recommend you to divide
them on CPU side. And you need to check whether region[0] is multipe of 4
as well.

There is another issue beyond this patch but is related to cl_mem_copy_buffer_rect(),
the task dimension is hard coded to 1 which is incorrect. It could be 1D/2D/3D
rect. We should check whether the region[2] and region[3] are zero to determine
the actual dimension.

On Wed, Jul 16, 2014 at 12:47:02AM +0000, Lv, Meng wrote:
> 
> 
> -----Original Message-----
> From: Zhigang Gong [mailto:zhigang.gong at linux.intel.com] 
> Sent: Tuesday, July 15, 2014 5:55 PM
> To: Lv, Meng
> Cc: beignet at lists.freedesktop.org
> Subject: Re: [Beignet] [PATCH] [PATCH_V2]improve the clEnqueueCopyBufferRect performance in some cases
> 
> On Tue, Jul 15, 2014 at 12:22:55PM +0800, Lv Meng wrote:
> > Signed-off-by: Lv Meng <meng.lv at intel.com>
> > ---
> >  src/CMakeLists.txt                              |  3 ++-
> >  src/cl_context.h                                |  1 +
> >  src/cl_mem.c                                    | 31 ++++++++++++++++++++++---
> >  src/kernels/cl_internal_copy_buf_rect_align4.cl | 15 ++++++++++++
> >  4 files changed, 46 insertions(+), 4 deletions(-)  create mode 100644 
> > src/kernels/cl_internal_copy_buf_rect_align4.cl
> > 
> > diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 
> > 46426d9..dff8fdf 100644
> > --- a/src/CMakeLists.txt
> > +++ b/src/CMakeLists.txt
> > @@ -41,7 +41,8 @@ set (KERNEL_STR_FILES)  set (KERNEL_NAMES 
> > cl_internal_copy_buf_align4
> >  cl_internal_copy_buf_align16 cl_internal_copy_buf_unalign_same_offset
> >  cl_internal_copy_buf_unalign_dst_offset 
> > cl_internal_copy_buf_unalign_src_offset
> > -cl_internal_copy_buf_rect cl_internal_copy_image_1d_to_1d 
> > cl_internal_copy_image_2d_to_2d
> > +cl_internal_copy_buf_rect cl_internal_copy_buf_rect_align4 
> > +cl_internal_copy_image_1d_to_1d cl_internal_copy_image_2d_to_2d
> >  cl_internal_copy_image_3d_to_2d cl_internal_copy_image_2d_to_3d 
> > cl_internal_copy_image_3d_to_3d  cl_internal_copy_image_2d_to_buffer 
> > cl_internal_copy_image_3d_to_buffer
> >  cl_internal_copy_buffer_to_image_2d 
> > cl_internal_copy_buffer_to_image_3d
> > diff --git a/src/cl_context.h b/src/cl_context.h index 
> > 75afbf6..f8342d3 100644
> > --- a/src/cl_context.h
> > +++ b/src/cl_context.h
> > @@ -47,6 +47,7 @@ enum _cl_internal_ker_type {
> >    CL_ENQUEUE_COPY_BUFFER_UNALIGN_DST_OFFSET,
> >    CL_ENQUEUE_COPY_BUFFER_UNALIGN_SRC_OFFSET,
> >    CL_ENQUEUE_COPY_BUFFER_RECT,
> > +  CL_ENQUEUE_COPY_BUFFER_RECT_ALIGN4,
> >    CL_ENQUEUE_COPY_IMAGE_1D_TO_1D,             //copy image 1d to image 1d
> >    CL_ENQUEUE_COPY_IMAGE_2D_TO_2D,             //copy image 2d to image 2d
> >    CL_ENQUEUE_COPY_IMAGE_3D_TO_2D,             //copy image 3d to image 2d
> > diff --git a/src/cl_mem.c b/src/cl_mem.c index 70bc3eb..c125f62 100644
> > --- a/src/cl_mem.c
> > +++ b/src/cl_mem.c
> > @@ -1399,6 +1399,16 @@ cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
> >    size_t global_off[] = {0,0,0};
> >    size_t global_sz[] = {1,1,1};
> >    size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_1};
> > +  // the src and dst mem rect is continuous, the copy is degraded to 
> > + buf copy  if((region[0] == dst_row_pitch) && (region[0] == 
> > + src_row_pitch) &&  (region[1] * src_row_pitch == src_slice_pitch) && (region[1] * dst_row_pitch == dst_slice_pitch)){
> > +    cl_int src_offset = src_origin[2]*src_slice_pitch + src_origin[1]*src_row_pitch + src_origin[0];
> > +    cl_int dst_offset = dst_origin[2]*dst_slice_pitch + dst_origin[1]*dst_row_pitch + dst_origin[0];
> > +    cl_int size = region[0]*region[1]*region[2];
> > +    ret = cl_mem_copy(queue, src_buf, dst_buf,src_offset, dst_offset, size);
> > +    return ret;
> > +  }
> > +
> >    if(region[1] == 1) local_sz[1] = 1;
> >    if(region[2] == 1) local_sz[2] = 1;
> >    global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * 
> > local_sz[0]; @@ -1411,11 +1421,26 @@ cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
> >    assert(src_buf->ctx == dst_buf->ctx);
> >  
> >    /* setup the kernel and run. */
> > -  extern char cl_internal_copy_buf_rect_str[];
> > -  extern size_t cl_internal_copy_buf_rect_str_size;
> >  
> > -  ker = cl_context_get_static_kernel_from_bin(queue->ctx, 
> > CL_ENQUEUE_COPY_BUFFER_RECT,
> > +  if( (src_offset % 4== 0) && (dst_offset % 4== 0) && (src_row_pitch 
> > + % 4== 0) && (dst_row_pitch % 4== 0)  && (src_slice_pitch % 4== 0) && (dst_slice_pitch % 4== 0) && (global_sz[0] % 4 == 0) ){
> > +    extern char cl_internal_copy_buf_rect_align4_str[];
> > +    extern size_t cl_internal_copy_buf_rect_align4_str_size;
> > +    global_sz[0] /= 4;
> > +    src_offset /= 4;
> > +    dst_offset /= 4;
> > +    src_row_pitch /= 4;
> > +    dst_row_pitch /= 4;
> > +    src_slice_pitch /= 4;
> > +    dst_slice_pitch /= 4;
> > +    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_RECT_ALIGN4,
> > +      cl_internal_copy_buf_rect_align4_str, 
> > + (size_t)cl_internal_copy_buf_rect_align4_str_size, NULL);  }else{
> > +    extern char cl_internal_copy_buf_rect_str[];
> > +    extern size_t cl_internal_copy_buf_rect_str_size;
> > +    ker = cl_context_get_static_kernel_from_bin(queue->ctx, 
> > + CL_ENQUEUE_COPY_BUFFER_RECT,
> >        cl_internal_copy_buf_rect_str, 
> > (size_t)cl_internal_copy_buf_rect_str_size, NULL);
> > +  }
> >  
> >    if (!ker)
> >      return CL_OUT_OF_RESOURCES;
> > diff --git a/src/kernels/cl_internal_copy_buf_rect_align4.cl 
> > b/src/kernels/cl_internal_copy_buf_rect_align4.cl
> > new file mode 100644
> > index 0000000..fbfe7b2
> > --- /dev/null
> > +++ b/src/kernels/cl_internal_copy_buf_rect_align4.cl
> > @@ -0,0 +1,15 @@
> > +kernel void __cl_copy_buffer_rect_align4 ( global int* src, global int* dst,
> > +                                          unsigned int region0, unsigned int region1, unsigned int region2,
> > +                                          unsigned int src_offset, unsigned int dst_offset,
> > +                                          unsigned int src_row_pitch, unsigned int src_slice_pitch,
> > +                                          unsigned int dst_row_pitch, 
> > +unsigned int dst_slice_pitch) {
> > +  int i = get_global_id(0);
> > +  int j = get_global_id(1);
> > +  int k = get_global_id(2);
> > +  if((i >= region0) || (j>= region1) || (k>=region2))
> > +    return;
> > +  src_offset += k * src_slice_pitch + j * src_row_pitch + i;
> > +  dst_offset += k * dst_slice_pitch + j * dst_row_pitch + i;
> > +  dst[dst_offset] = src[src_offset];
> > +}
> This kernel is the same as __cl_copy_buffer_rect(). Is it expected?
> This kernel is different from __cl_copy_buffer_rect(), the src and dst in this kernel is 4byte aligned, and which in __cl_copy_buffer_rect() is 1 byte aligned.
> > --
> > 1.8.3.2
> > 
> > _______________________________________________
> > Beignet mailing list
> > Beignet at lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/beignet
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet


More information about the Beignet mailing list