[Beignet] [PATCH 2/2] move enqueue_copy_image kernels outside of runtime code.

Luo, Xionghu xionghu.luo at intel.com
Tue May 13 01:46:55 PDT 2014


"[PATCH  V1 2/2] move enqueue_copy_image kernels outside of runtime code" is the updated patch of this patch set, which excludes the clEnqueueFillBuffer codes more thoroughly. 

ZhiGang need push the later patch set, please.


Luo Xionghu
Best Regards


-----Original Message-----
From: He Junyan [mailto:junyan.he at inbox.com] 
Sent: Tuesday, May 13, 2014 4:41 PM
To: Luo, Xionghu
Cc: beignet at lists.freedesktop.org
Subject: Re: [Beignet] [PATCH 2/2] move enqueue_copy_image kernels outside of runtime code.

2/2 is OK

On Mon, 2014-05-12 at 12:41 +0800, xionghu.luo at intel.com wrote:
> From: Luo <xionghu.luo at intel.com>
> 
> seperate the kernel code from host code to make it clean; build the 
> kernels offline by gbe_bin_generator to improve the performance.
> ---
>  src/CMakeLists.txt                                 |  23 ++-
>  src/cl_context.h                                   |  24 ++-
>  src/cl_gt_device.h                                 |  23 ++-
>  src/cl_mem.c                                       | 214 ++++++---------------
>  src/kernels/cl_internal_copy_buf_align1.cl         |   8 -
>  src/kernels/cl_internal_copy_buf_align16.cl        |   2 +-
>  src/kernels/cl_internal_copy_buf_align4.cl         |   2 +-
>  src/kernels/cl_internal_copy_buf_rect.cl           |  15 ++
>  .../cl_internal_copy_buf_unalign_dst_offset.cl     |   2 +-
>  .../cl_internal_copy_buf_unalign_same_offset.cl    |   2 +-
>  .../cl_internal_copy_buf_unalign_src_offset.cl     |   2 +-
>  src/kernels/cl_internal_copy_buffer_to_image_2d.cl |  18 ++  
> src/kernels/cl_internal_copy_buffer_to_image_3d.cl |  19 ++
>  src/kernels/cl_internal_copy_image_2d_to_2d.cl     |  21 ++
>  src/kernels/cl_internal_copy_image_2d_to_3d.cl     |  22 +++
>  src/kernels/cl_internal_copy_image_2d_to_buffer.cl |  19 ++
>  src/kernels/cl_internal_copy_image_3d_to_2d.cl     |  22 +++
>  src/kernels/cl_internal_copy_image_3d_to_3d.cl     |  23 +++
>  src/kernels/cl_internal_copy_image_3d_to_buffer.cl |  22 +++
>  19 files changed, 308 insertions(+), 175 deletions(-)  delete mode 
> 100644 src/kernels/cl_internal_copy_buf_align1.cl
>  create mode 100644 src/kernels/cl_internal_copy_buf_rect.cl
>  create mode 100644 src/kernels/cl_internal_copy_buffer_to_image_2d.cl
>  create mode 100644 src/kernels/cl_internal_copy_buffer_to_image_3d.cl
>  create mode 100644 src/kernels/cl_internal_copy_image_2d_to_2d.cl
>  create mode 100644 src/kernels/cl_internal_copy_image_2d_to_3d.cl
>  create mode 100644 src/kernels/cl_internal_copy_image_2d_to_buffer.cl
>  create mode 100644 src/kernels/cl_internal_copy_image_3d_to_2d.cl
>  create mode 100644 src/kernels/cl_internal_copy_image_3d_to_3d.cl
>  create mode 100644 src/kernels/cl_internal_copy_image_3d_to_buffer.cl
> 
> diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 
> 8164a44..ecc04ab 100644
> --- a/src/CMakeLists.txt
> +++ b/src/CMakeLists.txt
> @@ -17,11 +17,30 @@ foreach (KF ${KERNEL_FILES})  endforeach (KF)  
> endmacro (MakeKernelBinStr)
>  
> +macro (MakeBuiltInKernelStr KERNEL_PATH KERNEL_FILES)
> +  set (output_file ${KERNEL_PATH}/${BUILT_IN_NAME}.cl)
> +  set (file_content)
> +  file (REMOVE ${output_file})
> +  foreach (KF ${KERNEL_NAMES})
> +    set (input_file ${KERNEL_PATH}/${KF}.cl)
> +    file(READ ${input_file} file_content )
> +    STRING(REGEX REPLACE ";" "\\\\;" file_content "${file_content}")
> +    file(APPEND ${output_file} ${file_content})
> +  endforeach (KF)
> +endmacro (MakeBuiltInKernelStr)
> +
>  set (KERNEL_STR_FILES)
> -set (KERNEL_NAMES cl_internal_copy_buf_align1 
> cl_internal_copy_buf_align4
> +set (KERNEL_NAMES cl_internal_copy_buf_align4
>  cl_internal_copy_buf_align16 cl_internal_copy_buf_unalign_same_offset
> -cl_internal_copy_buf_unalign_dst_offset 
> cl_internal_copy_buf_unalign_src_offset)
> +cl_internal_copy_buf_unalign_dst_offset 
> +cl_internal_copy_buf_unalign_src_offset
> +cl_internal_copy_buf_rect cl_internal_copy_image_2d_to_2d 
> +cl_internal_copy_image_3d_to_2d cl_internal_copy_image_2d_to_3d 
> +cl_internal_copy_image_3d_to_3d cl_internal_copy_image_2d_to_buffer 
> +cl_internal_copy_image_3d_to_buffer
> +cl_internal_copy_buffer_to_image_2d 
> +cl_internal_copy_buffer_to_image_3d)
> +set (BUILT_IN_NAME  cl_internal_built_in_kernel) MakeBuiltInKernelStr 
> +("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" "${KERNEL_NAMES}")
>  MakeKernelBinStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" 
> "${KERNEL_NAMES}")
> +MakeKernelBinStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" 
> +"${BUILT_IN_NAME}")
>  
>  set(OPENCL_SRC
>      ${KERNEL_STR_FILES}
> diff --git a/src/cl_context.h b/src/cl_context.h index 
> 782a9af..24281be 100644
> --- a/src/cl_context.h
> +++ b/src/cl_context.h
> @@ -46,14 +46,22 @@ enum _cl_internal_ker_type {
>    CL_ENQUEUE_COPY_BUFFER_UNALIGN_DST_OFFSET,
>    CL_ENQUEUE_COPY_BUFFER_UNALIGN_SRC_OFFSET,
>    CL_ENQUEUE_COPY_BUFFER_RECT,
> -  CL_ENQUEUE_COPY_IMAGE_0,             //copy image 2d to image 2d
> -  CL_ENQUEUE_COPY_IMAGE_1,             //copy image 3d to image 2d
> -  CL_ENQUEUE_COPY_IMAGE_2,             //copy image 2d to image 3d
> -  CL_ENQUEUE_COPY_IMAGE_3,             //copy image 3d to image 3d
> -  CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_0,   //copy image 2d to buffer
> -  CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_1,   //copy image 3d tobuffer
> -  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_0,   //copy buffer to image 2d
> -  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_1,   //copy buffer to image 3d
> +  CL_ENQUEUE_COPY_IMAGE_2D_TO_2D,             //copy image 2d to image 2d
> +  CL_ENQUEUE_COPY_IMAGE_3D_TO_2D,             //copy image 3d to image 2d
> +  CL_ENQUEUE_COPY_IMAGE_2D_TO_3D,             //copy image 2d to image 3d
> +  CL_ENQUEUE_COPY_IMAGE_3D_TO_3D,             //copy image 3d to image 3d
> +  CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER,   //copy image 2d to buffer
> +  CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER,   //copy image 3d tobuffer
> +  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D,   //copy buffer to image 2d
> +  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D,   //copy buffer to image 3d
> +  CL_ENQUEUE_FILL_BUFFER_UNALIGN,      //fill buffer with 1 aligne pattern, pattern size=1
> +  CL_ENQUEUE_FILL_BUFFER_ALIGN2,       //fill buffer with 2 aligne pattern, pattern size=2
> +  CL_ENQUEUE_FILL_BUFFER_ALIGN4,       //fill buffer with 4 aligne pattern, pattern size=4
> +  CL_ENQUEUE_FILL_BUFFER_ALIGN8_8,     //fill buffer with 8 aligne pattern, pattern size=8
> +  CL_ENQUEUE_FILL_BUFFER_ALIGN8_16,    //fill buffer with 16 aligne pattern, pattern size=16
> +  CL_ENQUEUE_FILL_BUFFER_ALIGN8_32,    //fill buffer with 16 aligne pattern, pattern size=32
> +  CL_ENQUEUE_FILL_BUFFER_ALIGN8_64,    //fill buffer with 16 aligne pattern, pattern size=64
> +  CL_ENQUEUE_FILL_BUFFER_ALIGN128,     //fill buffer with 128 aligne pattern, pattern size=128
>    CL_INTERNAL_KERNEL_MAX
>  };
>  
> diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h index 
> 7e45b4e..8690190 100644
> --- a/src/cl_gt_device.h
> +++ b/src/cl_gt_device.h
> @@ -75,7 +75,28 @@ DECL_INFO_STRING(version, LIBCL_VERSION_STRING)  
> DECL_INFO_STRING(profile, "FULL_PROFILE")  
> DECL_INFO_STRING(opencl_c_version, LIBCL_C_VERSION_STRING)  
> DECL_INFO_STRING(extensions, "") -DECL_INFO_STRING(built_in_kernels, 
> "")
> +DECL_INFO_STRING(built_in_kernels, "__cl_copy_region_align4;"
> +                                   "__cl_copy_region_align16;"
> +                                   "__cl_cpy_region_unalign_same_offset;"
> +                                   "__cl_copy_region_unalign_dst_offset;"
> +                                   "__cl_copy_region_unalign_src_offset;"
> +                                   "__cl_copy_buffer_rect;"
> +                                   "__cl_copy_image_2d_to_2d;"
> +                                   "__cl_copy_image_3d_to_2d;"
> +                                   "__cl_copy_image_2d_to_3d;"
> +                                   "__cl_copy_image_3d_to_3d;"
> +                                   "__cl_copy_image_2d_to_buffer;"
> +                                   "__cl_copy_image_3d_to_buffer;"
> +                                   "__cl_copy_buffer_to_image_2d;"
> +                                   "__cl_copy_buffer_to_image_3d;"
> +                                   "__cl_fill_region_unalign;"
> +                                   "__cl_fill_region_align2;"
> +                                   "__cl_fill_region_align4;"
> +                                   "__cl_fill_region_align8_2;"
> +                                   "__cl_fill_region_align8_4;"
> +                                   "__cl_fill_region_align8_8;"
> +                                   "__cl_fill_region_align8_16;"
> +                                   "__cl_fill_region_align128;")
>  DECL_INFO_STRING(driver_version, LIBCL_DRIVER_VERSION_STRING)  #undef 
> DECL_INFO_STRING
>  
> diff --git a/src/cl_mem.c b/src/cl_mem.c index 5faef4b..7eaf95f 100644
> --- a/src/cl_mem.c
> +++ b/src/cl_mem.c
> @@ -937,33 +937,19 @@ cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
>    global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
>    global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
>    global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * 
> local_sz[2];
> -  cl_int index = CL_ENQUEUE_COPY_BUFFER_RECT;
>    cl_int src_offset = src_origin[2]*src_slice_pitch + src_origin[1]*src_row_pitch + src_origin[0];
>    cl_int dst_offset = dst_origin[2]*dst_slice_pitch + 
> dst_origin[1]*dst_row_pitch + dst_origin[0];
>  
> -  static const char *str_kernel =
> -      "kernel void __cl_cpy_buffer_rect ( \n"
> -      "       global char* src, global char* dst, \n"
> -      "       unsigned int region0, unsigned int region1, unsigned int region2, \n"
> -      "       unsigned int src_offset, unsigned int dst_offset, \n"
> -      "       unsigned int src_row_pitch, unsigned int src_slice_pitch, \n"
> -      "       unsigned int dst_row_pitch, unsigned int dst_slice_pitch) { \n"
> -      "  int i = get_global_id(0); \n"
> -      "  int j = get_global_id(1); \n"
> -      "  int k = get_global_id(2); \n"
> -      "  if((i >= region0) || (j>= region1) || (k>=region2)) \n"
> -      "    return; \n"
> -      "  src_offset += k * src_slice_pitch + j * src_row_pitch + i; \n"
> -      "  dst_offset += k * dst_slice_pitch + j * dst_row_pitch + i; \n"
> -      "  dst[dst_offset] = src[src_offset]; \n"
> -      "}";
> -
> -
>    /* We use one kernel to copy the data. The kernel is lazily created. */
>    assert(src_buf->ctx == dst_buf->ctx);
>  
>    /* setup the kernel and run. */
> -  ker = cl_context_get_static_kernel(queue->ctx, index, str_kernel, 
> NULL);
> +  extern char cl_internal_copy_buf_rect_str[];  extern int 
> + cl_internal_copy_buf_rect_str_size;
> +
> +  ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_RECT,
> +      cl_internal_copy_buf_rect_str, 
> + (size_t)cl_internal_copy_buf_rect_str_size, NULL);
> +
>    if (!ker)
>      return CL_OUT_OF_RESOURCES;
>  
> @@ -992,8 +978,6 @@ cl_mem_kernel_copy_image(cl_command_queue queue, struct _cl_mem_image* src_image
>    size_t global_off[] = {0,0,0};
>    size_t global_sz[] = {1,1,1};
>    size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2};
> -  cl_int index = CL_ENQUEUE_COPY_IMAGE_0;
> -  char option[40] = "";
>    uint32_t fixupDataType;
>    uint32_t savedIntelFmt;
>  
> @@ -1003,15 +987,6 @@ cl_mem_kernel_copy_image(cl_command_queue queue, struct _cl_mem_image* src_image
>    global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
>    global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * 
> local_sz[2];
>  
> -  if(src_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
> -    strcat(option, "-D SRC_IMAGE_3D");
> -    index += 1;
> -  }
> -  if(dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
> -    strcat(option, " -D DST_IMAGE_3D");
> -    index += 2;
> -  }
> -
>    switch (src_image->fmt.image_channel_data_type) {
>      case CL_SNORM_INT8:
>      case CL_UNORM_INT8:  fixupDataType = CL_UNSIGNED_INT8; break; @@ 
> -1034,54 +1009,41 @@ cl_mem_kernel_copy_image(cl_command_queue queue, struct _cl_mem_image* src_image
>      src_image->intel_fmt = cl_image_get_intel_format(&fmt);
>      dst_image->intel_fmt = src_image->intel_fmt;
>    }
> -  static const char *str_kernel =
> -      "#ifdef SRC_IMAGE_3D \n"
> -      "  #define SRC_IMAGE_TYPE image3d_t \n"
> -      "  #define SRC_COORD_TYPE int4 \n"
> -      "#else \n"
> -      "  #define SRC_IMAGE_TYPE image2d_t \n"
> -      "  #define SRC_COORD_TYPE int2 \n"
> -      "#endif \n"
> -      "#ifdef DST_IMAGE_3D \n"
> -      "  #define DST_IMAGE_TYPE image3d_t \n"
> -      "  #define DST_COORD_TYPE int4 \n"
> -      "#else \n"
> -      "  #define DST_IMAGE_TYPE image2d_t \n"
> -      "  #define DST_COORD_TYPE int2 \n"
> -      "#endif \n"
> -      "kernel void __cl_copy_image ( \n"
> -      "       __read_only SRC_IMAGE_TYPE src_image, __write_only DST_IMAGE_TYPE dst_image, \n"
> -      "       unsigned int region0, unsigned int region1, unsigned int region2, \n"
> -      "       unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2, \n"
> -      "       unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2) { \n"
> -      "  int i = get_global_id(0); \n"
> -      "  int j = get_global_id(1); \n"
> -      "  int k = get_global_id(2); \n"
> -      "  int4 color; \n"
> -      "  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; \n"
> -      "  SRC_COORD_TYPE src_coord; \n"
> -      "  DST_COORD_TYPE dst_coord; \n"
> -      "  if((i >= region0) || (j>= region1) || (k>=region2)) \n"
> -      "    return; \n"
> -      "  src_coord.x = src_origin0 + i; \n"
> -      "  src_coord.y = src_origin1 + j; \n"
> -      "#ifdef SRC_IMAGE_3D \n"
> -      "  src_coord.z = src_origin2 + k; \n"
> -      "#endif \n"
> -      "  dst_coord.x = dst_origin0 + i; \n"
> -      "  dst_coord.y = dst_origin1 + j; \n"
> -      "#ifdef DST_IMAGE_3D \n"
> -      "  dst_coord.z = dst_origin2 + k; \n"
> -      "#endif \n"
> -      "  color = read_imagei(src_image, sampler, src_coord); \n"
> -      "  write_imagei(dst_image, dst_coord, color); \n"
> -      "}";
>  
>    /* We use one kernel to copy the data. The kernel is lazily created. */
>    assert(src_image->base.ctx == dst_image->base.ctx);
>  
>    /* setup the kernel and run. */
> -  ker = cl_context_get_static_kernel(queue->ctx, index, str_kernel, 
> option);
> +  if(src_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
> +    if(dst_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
> +      extern char cl_internal_copy_image_2d_to_2d_str[];
> +      extern int cl_internal_copy_image_2d_to_2d_str_size;
> +
> +      ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_TO_2D,
> +          cl_internal_copy_image_2d_to_2d_str, (size_t)cl_internal_copy_image_2d_to_2d_str_size, NULL);
> +    }else if(dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
> +      extern char cl_internal_copy_image_2d_to_3d_str[];
> +      extern int cl_internal_copy_image_2d_to_3d_str_size;
> +
> +      ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_TO_3D,
> +          cl_internal_copy_image_2d_to_3d_str, (size_t)cl_internal_copy_image_2d_to_3d_str_size, NULL);
> +    }
> +  }else if(src_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
> +    if(dst_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
> +      extern char cl_internal_copy_image_3d_to_2d_str[];
> +      extern int cl_internal_copy_image_3d_to_2d_str_size;
> +
> +      ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_3D_TO_2D,
> +          cl_internal_copy_image_3d_to_2d_str, (size_t)cl_internal_copy_image_3d_to_2d_str_size, NULL);
> +    }else if(dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
> +      extern char cl_internal_copy_image_3d_to_3d_str[];
> +      extern int cl_internal_copy_image_3d_to_3d_str_size;
> +
> +      ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_3D_TO_3D,
> +          cl_internal_copy_image_3d_to_3d_str, (size_t)cl_internal_copy_image_3d_to_3d_str_size, NULL);
> +    }
> +  }
> +
>    if (!ker) {
>      ret = CL_OUT_OF_RESOURCES;
>      goto fail;
> @@ -1117,8 +1079,6 @@ cl_mem_copy_image_to_buffer(cl_command_queue queue, struct _cl_mem_image* image,
>    size_t global_off[] = {0,0,0};
>    size_t global_sz[] = {1,1,1};
>    size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2};
> -  cl_int index = CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_0;
> -  char option[40] = "";
>    uint32_t intel_fmt, bpp;
>    cl_image_format fmt;
>    size_t origin0, region0;
> @@ -1129,42 +1089,6 @@ cl_mem_copy_image_to_buffer(cl_command_queue queue, struct _cl_mem_image* image,
>    global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
>    global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * 
> local_sz[2];
>  
> -  if(image->image_type == CL_MEM_OBJECT_IMAGE3D) {
> -    strcat(option, "-D IMAGE_3D");
> -    index += 1;
> -  }
> -
> -  static const char *str_kernel =
> -      "#ifdef IMAGE_3D \n"
> -      "  #define IMAGE_TYPE image3d_t \n"
> -      "  #define COORD_TYPE int4 \n"
> -      "#else \n"
> -      "  #define IMAGE_TYPE image2d_t \n"
> -      "  #define COORD_TYPE int2 \n"
> -      "#endif \n"
> -      "kernel void __cl_copy_image_to_buffer ( \n"
> -      "       __read_only IMAGE_TYPE image, global uchar* buffer, \n"
> -      "       unsigned int region0, unsigned int region1, unsigned int region2, \n"
> -      "       unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2, \n"
> -      "       unsigned int dst_offset) { \n"
> -      "  int i = get_global_id(0); \n"
> -      "  int j = get_global_id(1); \n"
> -      "  int k = get_global_id(2); \n"
> -      "  uint4 color; \n"
> -      "  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; \n"
> -      "  COORD_TYPE src_coord; \n"
> -      "  if((i >= region0) || (j>= region1) || (k>=region2)) \n"
> -      "    return; \n"
> -      "  src_coord.x = src_origin0 + i; \n"
> -      "  src_coord.y = src_origin1 + j; \n"
> -      "#ifdef IMAGE_3D \n"
> -      "  src_coord.z = src_origin2 + k; \n"
> -      "#endif \n"
> -      "  color = read_imageui(image, sampler, src_coord); \n"
> -      "  dst_offset += (k * region1 + j) * region0 + i; \n"
> -      "  buffer[dst_offset] = color.x; \n"
> -      "}";
> -
>    /* We use one kernel to copy the data. The kernel is lazily created. */
>    assert(image->base.ctx == buffer->ctx);
>  
> @@ -1180,7 +1104,20 @@ cl_mem_copy_image_to_buffer(cl_command_queue queue, struct _cl_mem_image* image,
>    global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * 
> local_sz[0];
>  
>    /* setup the kernel and run. */
> -  ker = cl_context_get_static_kernel(queue->ctx, index, str_kernel, 
> option);
> +  if(image->image_type == CL_MEM_OBJECT_IMAGE2D) {
> +      extern char cl_internal_copy_image_2d_to_buffer_str[];
> +      extern int cl_internal_copy_image_2d_to_buffer_str_size;
> +
> +      ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER,
> +          cl_internal_copy_image_2d_to_buffer_str, 
> + (size_t)cl_internal_copy_image_2d_to_buffer_str_size, NULL);  }else if(image->image_type == CL_MEM_OBJECT_IMAGE3D) {
> +    extern char cl_internal_copy_image_3d_to_buffer_str[];
> +    extern int cl_internal_copy_image_3d_to_buffer_str_size;
> +
> +    ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER,
> +          cl_internal_copy_image_3d_to_buffer_str, 
> + (size_t)cl_internal_copy_image_3d_to_buffer_str_size, NULL);  }
> +
>    if (!ker) {
>      ret = CL_OUT_OF_RESOURCES;
>      goto fail;
> @@ -1216,8 +1153,6 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_me
>    size_t global_off[] = {0,0,0};
>    size_t global_sz[] = {1,1,1};
>    size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2};
> -  cl_int index = CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_0;
> -  char option[40] = "";
>    uint32_t intel_fmt, bpp;
>    cl_image_format fmt;
>    size_t origin0, region0;
> @@ -1228,41 +1163,6 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_me
>    global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
>    global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * 
> local_sz[2];
>  
> -  if(image->image_type == CL_MEM_OBJECT_IMAGE3D) {
> -    strcat(option, "-D IMAGE_3D");
> -    index += 1;
> -  }
> -
> -  static const char *str_kernel =
> -      "#ifdef IMAGE_3D \n"
> -      "  #define IMAGE_TYPE image3d_t \n"
> -      "  #define COORD_TYPE int4 \n"
> -      "#else \n"
> -      "  #define IMAGE_TYPE image2d_t \n"
> -      "  #define COORD_TYPE int2 \n"
> -      "#endif \n"
> -      "kernel void __cl_copy_image_to_buffer ( \n"
> -      "       __read_only IMAGE_TYPE image, global uchar* buffer, \n"
> -      "       unsigned int region0, unsigned int region1, unsigned int region2, \n"
> -      "       unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2, \n"
> -      "       unsigned int src_offset) { \n"
> -      "  int i = get_global_id(0); \n"
> -      "  int j = get_global_id(1); \n"
> -      "  int k = get_global_id(2); \n"
> -      "  uint4 color = (uint4)(0); \n"
> -      "  COORD_TYPE dst_coord; \n"
> -      "  if((i >= region0) || (j>= region1) || (k>=region2)) \n"
> -      "    return; \n"
> -      "  dst_coord.x = dst_origin0 + i; \n"
> -      "  dst_coord.y = dst_origin1 + j; \n"
> -      "#ifdef IMAGE_3D \n"
> -      "  dst_coord.z = dst_origin2 + k; \n"
> -      "#endif \n"
> -      "  src_offset += (k * region1 + j) * region0 + i; \n"
> -      "  color.x = buffer[src_offset]; \n"
> -      "  write_imageui(image, dst_coord, color); \n"
> -      "}";
> -
>    /* We use one kernel to copy the data. The kernel is lazily created. */
>    assert(image->base.ctx == buffer->ctx);
>  
> @@ -1278,7 +1178,19 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_me
>    global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * 
> local_sz[0];
>  
>    /* setup the kernel and run. */
> -  ker = cl_context_get_static_kernel(queue->ctx, index, str_kernel, 
> option);
> +  if(image->image_type == CL_MEM_OBJECT_IMAGE2D) {
> +      extern char cl_internal_copy_buffer_to_image_2d_str[];
> +      extern int cl_internal_copy_buffer_to_image_2d_str_size;
> +
> +      ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D,
> +          cl_internal_copy_buffer_to_image_2d_str, 
> + (size_t)cl_internal_copy_buffer_to_image_2d_str_size, NULL);  }else if(image->image_type == CL_MEM_OBJECT_IMAGE3D) {
> +      extern char cl_internal_copy_buffer_to_image_3d_str[];
> +      extern int cl_internal_copy_buffer_to_image_3d_str_size;
> +
> +      ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D,
> +          cl_internal_copy_buffer_to_image_3d_str, 
> + (size_t)cl_internal_copy_buffer_to_image_3d_str_size, NULL);  }
>    if (!ker)
>      return CL_OUT_OF_RESOURCES;
>  
> diff --git a/src/kernels/cl_internal_copy_buf_align1.cl 
> b/src/kernels/cl_internal_copy_buf_align1.cl
> deleted file mode 100644
> index cd3ec7b..0000000
> --- a/src/kernels/cl_internal_copy_buf_align1.cl
> +++ /dev/null
> @@ -1,8 +0,0 @@
> -kernel void __cl_cpy_region_align1 ( global char* src, unsigned int src_offset,
> -                                     global char* dst, unsigned int dst_offset,
> -				     unsigned int size)
> -{
> -    int i = get_global_id(0);
> -    if (i < size)
> -        dst[i+dst_offset] = src[i+src_offset];
> -}
> diff --git a/src/kernels/cl_internal_copy_buf_align16.cl 
> b/src/kernels/cl_internal_copy_buf_align16.cl
> index 75b1a4a..1abb4e9 100644
> --- a/src/kernels/cl_internal_copy_buf_align16.cl
> +++ b/src/kernels/cl_internal_copy_buf_align16.cl
> @@ -1,4 +1,4 @@
> -kernel void __cl_cpy_region_align16 ( global float* src, unsigned int 
> src_offset,
> +kernel void __cl_copy_region_align16 ( global float* src, unsigned 
> +int src_offset,
>                                        global float* dst, unsigned int dst_offset,
>  				      unsigned int size)
>  {
> diff --git a/src/kernels/cl_internal_copy_buf_align4.cl 
> b/src/kernels/cl_internal_copy_buf_align4.cl
> index 44a0f81..27174ca 100644
> --- a/src/kernels/cl_internal_copy_buf_align4.cl
> +++ b/src/kernels/cl_internal_copy_buf_align4.cl
> @@ -1,4 +1,4 @@
> -kernel void __cl_cpy_region_align4 ( global float* src, unsigned int 
> src_offset,
> +kernel void __cl_copy_region_align4 ( global float* src, unsigned int 
> +src_offset,
>                                       global float* dst, unsigned int dst_offset,
>  				     unsigned int size)
>  {
> diff --git a/src/kernels/cl_internal_copy_buf_rect.cl 
> b/src/kernels/cl_internal_copy_buf_rect.cl
> new file mode 100644
> index 0000000..71e7484
> --- /dev/null
> +++ b/src/kernels/cl_internal_copy_buf_rect.cl
> @@ -0,0 +1,15 @@
> +kernel void __cl_copy_buffer_rect ( global char* src, global char* dst,
> +                                          unsigned int region0, unsigned int region1, unsigned int region2,
> +                                          unsigned int src_offset, unsigned int dst_offset,
> +                                          unsigned int src_row_pitch, unsigned int src_slice_pitch,
> +                                          unsigned int dst_row_pitch, 
> +unsigned int dst_slice_pitch) {
> +  int i = get_global_id(0);
> +  int j = get_global_id(1);
> +  int k = get_global_id(2);
> +  if((i >= region0) || (j>= region1) || (k>=region2))
> +    return;
> +  src_offset += k * src_slice_pitch + j * src_row_pitch + i;
> +  dst_offset += k * dst_slice_pitch + j * dst_row_pitch + i;
> +  dst[dst_offset] = src[src_offset];
> +}
> diff --git a/src/kernels/cl_internal_copy_buf_unalign_dst_offset.cl 
> b/src/kernels/cl_internal_copy_buf_unalign_dst_offset.cl
> index 13f4162..e02d0e5 100644
> --- a/src/kernels/cl_internal_copy_buf_unalign_dst_offset.cl
> +++ b/src/kernels/cl_internal_copy_buf_unalign_dst_offset.cl
> @@ -1,4 +1,4 @@
> -kernel void __cl_cpy_region_unalign_dst_offset ( global int* src, 
> unsigned int src_offset,
> +kernel void __cl_copy_region_unalign_dst_offset ( global int* src, 
> +unsigned int src_offset,
>                                       global int* dst, unsigned int dst_offset,
>  				     unsigned int size,
>  				     unsigned int first_mask, unsigned int last_mask, diff --git 
> a/src/kernels/cl_internal_copy_buf_unalign_same_offset.cl 
> b/src/kernels/cl_internal_copy_buf_unalign_same_offset.cl
> index 8510246..83b6e97 100644
> --- a/src/kernels/cl_internal_copy_buf_unalign_same_offset.cl
> +++ b/src/kernels/cl_internal_copy_buf_unalign_same_offset.cl
> @@ -1,4 +1,4 @@
> -kernel void __cl_cpy_region_unalign_same_offset ( global int* src, 
> unsigned int src_offset,
> +kernel void __cl_copy_region_unalign_same_offset ( global int* src, 
> +unsigned int src_offset,
>                                       global int* dst, unsigned int dst_offset,
>  				     unsigned int size,
>  				     unsigned int first_mask, unsigned int last_mask) diff --git 
> a/src/kernels/cl_internal_copy_buf_unalign_src_offset.cl 
> b/src/kernels/cl_internal_copy_buf_unalign_src_offset.cl
> index f98368a..ce0aa1d 100644
> --- a/src/kernels/cl_internal_copy_buf_unalign_src_offset.cl
> +++ b/src/kernels/cl_internal_copy_buf_unalign_src_offset.cl
> @@ -1,4 +1,4 @@
> -kernel void __cl_cpy_region_unalign_src_offset ( global int* src, 
> unsigned int src_offset,
> +kernel void __cl_copy_region_unalign_src_offset ( global int* src, 
> +unsigned int src_offset,
>                                       global int* dst, unsigned int dst_offset,
>  				     unsigned int size,
>  				     unsigned int first_mask, unsigned int last_mask, diff --git 
> a/src/kernels/cl_internal_copy_buffer_to_image_2d.cl 
> b/src/kernels/cl_internal_copy_buffer_to_image_2d.cl
> new file mode 100644
> index 0000000..a218b58
> --- /dev/null
> +++ b/src/kernels/cl_internal_copy_buffer_to_image_2d.cl
> @@ -0,0 +1,18 @@
> +kernel void __cl_copy_buffer_to_image_2d(__read_only image2d_t image, global uchar* buffer,
> +                                        unsigned int region0, unsigned int region1, unsigned int region2,
> +                                        unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2,
> +                                        unsigned int src_offset) {
> +  int i = get_global_id(0);
> +  int j = get_global_id(1);
> +  int k = get_global_id(2);
> +  uint4 color = (uint4)(0);
> +  int2 dst_coord;
> +  if((i >= region0) || (j>= region1) || (k>=region2))
> +    return;
> +  dst_coord.x = dst_origin0 + i;
> +  dst_coord.y = dst_origin1 + j;
> +  src_offset += (k * region1 + j) * region0 + i;
> +  color.x = buffer[src_offset];
> +  write_imageui(image, dst_coord, color); }
> diff --git a/src/kernels/cl_internal_copy_buffer_to_image_3d.cl 
> b/src/kernels/cl_internal_copy_buffer_to_image_3d.cl
> new file mode 100644
> index 0000000..84d3b27
> --- /dev/null
> +++ b/src/kernels/cl_internal_copy_buffer_to_image_3d.cl
> @@ -0,0 +1,19 @@
> +kernel void __cl_copy_buffer_to_image_3d(__read_only image3d_t image, global uchar* buffer,
> +                                        unsigned int region0, unsigned int region1, unsigned int region2,
> +                                        unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2,
> +                                        unsigned int src_offset) {
> +  int i = get_global_id(0);
> +  int j = get_global_id(1);
> +  int k = get_global_id(2);
> +  uint4 color = (uint4)(0);
> +  int4 dst_coord;
> +  if((i >= region0) || (j>= region1) || (k>=region2))
> +    return;
> +  dst_coord.x = dst_origin0 + i;
> +  dst_coord.y = dst_origin1 + j;
> +  dst_coord.z = dst_origin2 + k;
> +  src_offset += (k * region1 + j) * region0 + i;
> +  color.x = buffer[src_offset];
> +  write_imageui(image, dst_coord, color); }
> diff --git a/src/kernels/cl_internal_copy_image_2d_to_2d.cl 
> b/src/kernels/cl_internal_copy_image_2d_to_2d.cl
> new file mode 100644
> index 0000000..c5eaab1
> --- /dev/null
> +++ b/src/kernels/cl_internal_copy_image_2d_to_2d.cl
> @@ -0,0 +1,21 @@
> +kernel void __cl_copy_image_2d_to_2d(__read_only image2d_t src_image, __write_only image2d_t dst_image,
> +                             unsigned int region0, unsigned int region1, unsigned int region2,
> +                             unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
> +                             unsigned int dst_origin0, unsigned int 
> +dst_origin1, unsigned int dst_origin2) {
> +  int i = get_global_id(0);
> +  int j = get_global_id(1);
> +  int k = get_global_id(2);
> +  int4 color;
> +  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | 
> +CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
> +  int2 src_coord;
> +  int2 dst_coord;
> +  if((i >= region0) || (j>= region1) || (k>=region2))
> +    return;
> +  src_coord.x = src_origin0 + i;
> +  src_coord.y = src_origin1 + j;
> +  dst_coord.x = dst_origin0 + i;
> +  dst_coord.y = dst_origin1 + j;
> +  color = read_imagei(src_image, sampler, src_coord);
> +  write_imagei(dst_image, dst_coord, color); }
> diff --git a/src/kernels/cl_internal_copy_image_2d_to_3d.cl 
> b/src/kernels/cl_internal_copy_image_2d_to_3d.cl
> new file mode 100644
> index 0000000..4c73a74
> --- /dev/null
> +++ b/src/kernels/cl_internal_copy_image_2d_to_3d.cl
> @@ -0,0 +1,22 @@
> +kernel void __cl_copy_image_2d_to_3d(__read_only image2d_t src_image, __write_only image3d_t dst_image,
> +                                         unsigned int region0, unsigned int region1, unsigned int region2,
> +                                         unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
> +                                         unsigned int dst_origin0, 
> +unsigned int dst_origin1, unsigned int dst_origin2) {
> +  int i = get_global_id(0);
> +  int j = get_global_id(1);
> +  int k = get_global_id(2);
> +  int4 color;
> +  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | 
> +CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
> +  int2 src_coord;
> +  int4 dst_coord;
> +  if((i >= region0) || (j>= region1) || (k>=region2))
> +    return;
> +  src_coord.x = src_origin0 + i;
> +  src_coord.y = src_origin1 + j;
> +  dst_coord.x = dst_origin0 + i;
> +  dst_coord.y = dst_origin1 + j;
> +  dst_coord.z = dst_origin2 + k;
> +  color = read_imagei(src_image, sampler, src_coord);
> +  write_imagei(dst_image, dst_coord, color); }
> diff --git a/src/kernels/cl_internal_copy_image_2d_to_buffer.cl 
> b/src/kernels/cl_internal_copy_image_2d_to_buffer.cl
> new file mode 100644
> index 0000000..b6c352e
> --- /dev/null
> +++ b/src/kernels/cl_internal_copy_image_2d_to_buffer.cl
> @@ -0,0 +1,19 @@
> +kernel void __cl_copy_image_2d_to_buffer( __read_only image2d_t image, global uchar* buffer,
> +                                        unsigned int region0, unsigned int region1, unsigned int region2,
> +                                        unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
> +                                        unsigned int dst_offset) {
> +  int i = get_global_id(0);
> +  int j = get_global_id(1);
> +  int k = get_global_id(2);
> +  uint4 color;
> +  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | 
> +CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
> +  int2 src_coord;
> +  if((i >= region0) || (j>= region1) || (k>=region2))
> +    return;
> +  src_coord.x = src_origin0 + i;
> +  src_coord.y = src_origin1 + j;
> +  color = read_imageui(image, sampler, src_coord);
> +  dst_offset += (k * region1 + j) * region0 + i;
> +  buffer[dst_offset] = color.x;
> +}
> diff --git a/src/kernels/cl_internal_copy_image_3d_to_2d.cl 
> b/src/kernels/cl_internal_copy_image_3d_to_2d.cl
> new file mode 100644
> index 0000000..e0effa0
> --- /dev/null
> +++ b/src/kernels/cl_internal_copy_image_3d_to_2d.cl
> @@ -0,0 +1,22 @@
> +kernel void __cl_copy_image_3d_to_2d(__read_only image3d_t src_image, __write_only image2d_t dst_image,
> +                             unsigned int region0, unsigned int region1, unsigned int region2,
> +                             unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
> +                             unsigned int dst_origin0, unsigned int 
> +dst_origin1, unsigned int dst_origin2) {
> +  int i = get_global_id(0);
> +  int j = get_global_id(1);
> +  int k = get_global_id(2);
> +  int4 color;
> +  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | 
> +CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
> +  int4 src_coord;
> +  int2 dst_coord;
> +  if((i >= region0) || (j>= region1) || (k>=region2))
> +    return;
> +  src_coord.x = src_origin0 + i;
> +  src_coord.y = src_origin1 + j;
> +  src_coord.z = src_origin2 + k;
> +  dst_coord.x = dst_origin0 + i;
> +  dst_coord.y = dst_origin1 + j;
> +  color = read_imagei(src_image, sampler, src_coord);
> +  write_imagei(dst_image, dst_coord, color); }
> diff --git a/src/kernels/cl_internal_copy_image_3d_to_3d.cl 
> b/src/kernels/cl_internal_copy_image_3d_to_3d.cl
> new file mode 100644
> index 0000000..de80a0a
> --- /dev/null
> +++ b/src/kernels/cl_internal_copy_image_3d_to_3d.cl
> @@ -0,0 +1,23 @@
> +kernel void __cl_copy_image_3d_to_3d(__read_only image3d_t src_image, __write_only image3d_t dst_image,
> +                             unsigned int region0, unsigned int region1, unsigned int region2,
> +                             unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
> +                             unsigned int dst_origin0, unsigned int 
> +dst_origin1, unsigned int dst_origin2) {
> +  int i = get_global_id(0);
> +  int j = get_global_id(1);
> +  int k = get_global_id(2);
> +  int4 color;
> +  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | 
> +CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
> +  int4 src_coord;
> +  int4 dst_coord;
> +  if((i >= region0) || (j>= region1) || (k>=region2))
> +    return;
> +  src_coord.x = src_origin0 + i;
> +  src_coord.y = src_origin1 + j;
> +  src_coord.z = src_origin2 + k;
> +  dst_coord.x = dst_origin0 + i;
> +  dst_coord.y = dst_origin1 + j;
> +  dst_coord.z = dst_origin2 + k;
> +  color = read_imagei(src_image, sampler, src_coord);
> +  write_imagei(dst_image, dst_coord, color); }
> diff --git a/src/kernels/cl_internal_copy_image_3d_to_buffer.cl 
> b/src/kernels/cl_internal_copy_image_3d_to_buffer.cl
> new file mode 100644
> index 0000000..dcfc8a2
> --- /dev/null
> +++ b/src/kernels/cl_internal_copy_image_3d_to_buffer.cl
> @@ -0,0 +1,22 @@
> +#define IMAGE_TYPE image3d_t
> +#define COORD_TYPE int4
> +kernel void __cl_copy_image_3d_to_buffer ( __read_only IMAGE_TYPE image, global uchar* buffer,
> +                                        unsigned int region0, unsigned int region1, unsigned int region2,
> +                                        unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
> +                                        unsigned int dst_offset) {
> +  int i = get_global_id(0);
> +  int j = get_global_id(1);
> +  int k = get_global_id(2);
> +  uint4 color;
> +  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | 
> +CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
> +  COORD_TYPE src_coord;
> +  if((i >= region0) || (j>= region1) || (k>=region2))
> +    return;
> +  src_coord.x = src_origin0 + i;
> +  src_coord.y = src_origin1 + j;
> +  src_coord.z = src_origin2 + k;
> +  color = read_imageui(image, sampler, src_coord);
> +  dst_offset += (k * region1 + j) * region0 + i;
> +  buffer[dst_offset] = color.x;
> +}





More information about the Beignet mailing list