[Beignet] [PATCH v2] Optimization of clEnqueueCopyImageToBuffer for 16 aligned case.

Zhigang Gong zhigang.gong at linux.intel.com
Thu Feb 12 19:44:45 PST 2015


This version LGTM, will push latter thanks.

On Fri, Feb 13, 2015 at 11:33:44AM +0800, Chuanbo Weng wrote:
> We can change the image_channel_order to CL_RGBA and
> image_channel_data_type to CL_UNSIGNED_INT32 for some special
> case, thus 16 bytes can be read by one work item. Bandwidth is
> fully used.
> 
> v2:
> Now we just optimize for IMAGE2D, so add judgement to not affect
> other image type's code path.
> 
> Signed-off-by: Chuanbo Weng <chuanbo.weng at intel.com>
> ---
>  src/CMakeLists.txt                                 |  2 +-
>  src/cl_context.h                                   |  1 +
>  src/cl_mem.c                                       | 44 ++++++++++++++++++----
>  .../cl_internal_copy_image_2d_to_buffer_align16.cl | 19 ++++++++++
>  4 files changed, 57 insertions(+), 9 deletions(-)
>  create mode 100644 src/kernels/cl_internal_copy_image_2d_to_buffer_align16.cl
> 
> diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
> index 939f58d..d4181d8 100644
> --- a/src/CMakeLists.txt
> +++ b/src/CMakeLists.txt
> @@ -49,7 +49,7 @@ cl_internal_copy_image_3d_to_2d cl_internal_copy_image_2d_to_3d cl_internal_copy
>  cl_internal_copy_image_2d_to_2d_array cl_internal_copy_image_1d_array_to_1d_array
>  cl_internal_copy_image_2d_array_to_2d_array cl_internal_copy_image_2d_array_to_2d
>  cl_internal_copy_image_2d_array_to_3d cl_internal_copy_image_3d_to_2d_array
> -cl_internal_copy_image_2d_to_buffer cl_internal_copy_image_3d_to_buffer
> +cl_internal_copy_image_2d_to_buffer cl_internal_copy_image_2d_to_buffer_align16 cl_internal_copy_image_3d_to_buffer
>  cl_internal_copy_buffer_to_image_2d cl_internal_copy_buffer_to_image_3d
>  cl_internal_fill_buf_align8 cl_internal_fill_buf_align4
>  cl_internal_fill_buf_align2 cl_internal_fill_buf_unalign
> diff --git a/src/cl_context.h b/src/cl_context.h
> index 2ea0a73..fdbfd2a 100644
> --- a/src/cl_context.h
> +++ b/src/cl_context.h
> @@ -60,6 +60,7 @@ enum _cl_internal_ker_type {
>    CL_ENQUEUE_COPY_IMAGE_2D_ARRAY_TO_3D,       //copy image 2d array to image 3d
>    CL_ENQUEUE_COPY_IMAGE_3D_TO_2D_ARRAY,       //copy image 3d to image 2d array
>    CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER,   //copy image 2d to buffer
> +  CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER_ALIGN16,
>    CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER,   //copy image 3d tobuffer
>    CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D,   //copy buffer to image 2d
>    CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D,   //copy buffer to image 3d
> diff --git a/src/cl_mem.c b/src/cl_mem.c
> index e58a183..b41ec14 100644
> --- a/src/cl_mem.c
> +++ b/src/cl_mem.c
> @@ -1714,6 +1714,10 @@ cl_mem_copy_image_to_buffer(cl_command_queue queue, struct _cl_mem_image* image,
>    uint32_t intel_fmt, bpp;
>    cl_image_format fmt;
>    size_t origin0, region0;
> +  size_t kn_dst_offset;
> +  int align16 = 0;
> +  size_t align_size = 1;
> +  size_t w_saved;
>  
>    if(region[1] == 1) local_sz[1] = 1;
>    if(region[2] == 1) local_sz[2] = 1;
> @@ -1724,24 +1728,48 @@ cl_mem_copy_image_to_buffer(cl_command_queue queue, struct _cl_mem_image* image,
>    /* We use one kernel to copy the data. The kernel is lazily created. */
>    assert(image->base.ctx == buffer->ctx);
>  
> -  fmt.image_channel_order = CL_R;
> -  fmt.image_channel_data_type = CL_UNSIGNED_INT8;
>    intel_fmt = image->intel_fmt;
>    bpp = image->bpp;
> -  image->intel_fmt = cl_image_get_intel_format(&fmt);
> -  image->w = image->w * image->bpp;
> -  image->bpp = 1;
> +  w_saved = image->w;
>    region0 = region[0] * bpp;
> -  origin0 = src_origin[0] * bpp;
> +  kn_dst_offset = dst_offset;
> +  if((image->image_type == CL_MEM_OBJECT_IMAGE2D) && ((image->w * image->bpp) % 16 == 0) &&
> +      ((src_origin[0] * bpp) % 16 == 0) && (region0 % 16 == 0) && (dst_offset % 16 == 0)){
> +    fmt.image_channel_order = CL_RGBA;
> +    fmt.image_channel_data_type = CL_UNSIGNED_INT32;
> +    align16 = 1;
> +    align_size = 16;
> +  }
> +  else{
> +    fmt.image_channel_order = CL_R;
> +    fmt.image_channel_data_type = CL_UNSIGNED_INT8;
> +    align_size = 1;
> +  }
> +  image->intel_fmt = cl_image_get_intel_format(&fmt);
> +  image->w = (image->w * image->bpp) / align_size;
> +  image->bpp = align_size;
> +  region0 = (region[0] * bpp) / align_size;
> +  origin0 = (src_origin[0] * bpp) / align_size;
> +  kn_dst_offset /= align_size;
>    global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
>  
>    /* setup the kernel and run. */
>    if(image->image_type == CL_MEM_OBJECT_IMAGE2D) {
> +    if(align16){
> +      extern char cl_internal_copy_image_2d_to_buffer_align16_str[];
> +      extern size_t cl_internal_copy_image_2d_to_buffer_align16_str_size;
> +
> +      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER_ALIGN16,
> +                cl_internal_copy_image_2d_to_buffer_align16_str,
> +                (size_t)cl_internal_copy_image_2d_to_buffer_align16_str_size, NULL);
> +    }
> +    else{
>        extern char cl_internal_copy_image_2d_to_buffer_str[];
>        extern size_t cl_internal_copy_image_2d_to_buffer_str_size;
>  
>        ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER,
>            cl_internal_copy_image_2d_to_buffer_str, (size_t)cl_internal_copy_image_2d_to_buffer_str_size, NULL);
> +    }
>    }else if(image->image_type == CL_MEM_OBJECT_IMAGE3D) {
>      extern char cl_internal_copy_image_3d_to_buffer_str[];
>      extern size_t cl_internal_copy_image_3d_to_buffer_str_size;
> @@ -1763,7 +1791,7 @@ cl_mem_copy_image_to_buffer(cl_command_queue queue, struct _cl_mem_image* image,
>    cl_kernel_set_arg(ker, 5, sizeof(cl_int), &origin0);
>    cl_kernel_set_arg(ker, 6, sizeof(cl_int), &src_origin[1]);
>    cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_origin[2]);
> -  cl_kernel_set_arg(ker, 8, sizeof(cl_int), &dst_offset);
> +  cl_kernel_set_arg(ker, 8, sizeof(cl_int), &kn_dst_offset);
>  
>    ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
>  
> @@ -1771,7 +1799,7 @@ fail:
>  
>    image->intel_fmt = intel_fmt;
>    image->bpp = bpp;
> -  image->w = image->w / bpp;
> +  image->w = w_saved;
>  
>    return ret;
>  }
> diff --git a/src/kernels/cl_internal_copy_image_2d_to_buffer_align16.cl b/src/kernels/cl_internal_copy_image_2d_to_buffer_align16.cl
> new file mode 100644
> index 0000000..a32e5f2
> --- /dev/null
> +++ b/src/kernels/cl_internal_copy_image_2d_to_buffer_align16.cl
> @@ -0,0 +1,19 @@
> +kernel void __cl_copy_image_2d_to_buffer_align16( __read_only image2d_t image, global uint4* buffer,
> +                                        unsigned int region0, unsigned int region1, unsigned int region2,
> +                                        unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
> +                                        unsigned int dst_offset)
> +{
> +
> +  int i = get_global_id(0);
> +  int j = get_global_id(1);
> +  if((i >= region0) || (j>= region1))
> +    return;
> +  uint4 color;
> +  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
> +  int2 src_coord;
> +  src_coord.x = src_origin0 + i;
> +  src_coord.y = src_origin1 + j;
> +  color = read_imageui(image, sampler, src_coord);
> +
> +  *(buffer + dst_offset + region0*j + i) = color;
> +}
> -- 
> 1.9.1
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet


More information about the Beignet mailing list