[Beignet] [PATCH 2/2] move enqueue_copy_image kernels outside of runtime code.

He Junyan junyan.he at inbox.com
Tue May 13 01:40:38 PDT 2014


2/2 is OK

On Mon, 2014-05-12 at 12:41 +0800, xionghu.luo at intel.com wrote:
> From: Luo <xionghu.luo at intel.com>
> 
> seperate the kernel code from host code to make it clean; build the
> kernels offline by gbe_bin_generator to improve the performance.
> ---
>  src/CMakeLists.txt                                 |  23 ++-
>  src/cl_context.h                                   |  24 ++-
>  src/cl_gt_device.h                                 |  23 ++-
>  src/cl_mem.c                                       | 214 ++++++---------------
>  src/kernels/cl_internal_copy_buf_align1.cl         |   8 -
>  src/kernels/cl_internal_copy_buf_align16.cl        |   2 +-
>  src/kernels/cl_internal_copy_buf_align4.cl         |   2 +-
>  src/kernels/cl_internal_copy_buf_rect.cl           |  15 ++
>  .../cl_internal_copy_buf_unalign_dst_offset.cl     |   2 +-
>  .../cl_internal_copy_buf_unalign_same_offset.cl    |   2 +-
>  .../cl_internal_copy_buf_unalign_src_offset.cl     |   2 +-
>  src/kernels/cl_internal_copy_buffer_to_image_2d.cl |  18 ++
>  src/kernels/cl_internal_copy_buffer_to_image_3d.cl |  19 ++
>  src/kernels/cl_internal_copy_image_2d_to_2d.cl     |  21 ++
>  src/kernels/cl_internal_copy_image_2d_to_3d.cl     |  22 +++
>  src/kernels/cl_internal_copy_image_2d_to_buffer.cl |  19 ++
>  src/kernels/cl_internal_copy_image_3d_to_2d.cl     |  22 +++
>  src/kernels/cl_internal_copy_image_3d_to_3d.cl     |  23 +++
>  src/kernels/cl_internal_copy_image_3d_to_buffer.cl |  22 +++
>  19 files changed, 308 insertions(+), 175 deletions(-)
>  delete mode 100644 src/kernels/cl_internal_copy_buf_align1.cl
>  create mode 100644 src/kernels/cl_internal_copy_buf_rect.cl
>  create mode 100644 src/kernels/cl_internal_copy_buffer_to_image_2d.cl
>  create mode 100644 src/kernels/cl_internal_copy_buffer_to_image_3d.cl
>  create mode 100644 src/kernels/cl_internal_copy_image_2d_to_2d.cl
>  create mode 100644 src/kernels/cl_internal_copy_image_2d_to_3d.cl
>  create mode 100644 src/kernels/cl_internal_copy_image_2d_to_buffer.cl
>  create mode 100644 src/kernels/cl_internal_copy_image_3d_to_2d.cl
>  create mode 100644 src/kernels/cl_internal_copy_image_3d_to_3d.cl
>  create mode 100644 src/kernels/cl_internal_copy_image_3d_to_buffer.cl
> 
> diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
> index 8164a44..ecc04ab 100644
> --- a/src/CMakeLists.txt
> +++ b/src/CMakeLists.txt
> @@ -17,11 +17,30 @@ foreach (KF ${KERNEL_FILES})
>  endforeach (KF)
>  endmacro (MakeKernelBinStr)
>  
> +macro (MakeBuiltInKernelStr KERNEL_PATH KERNEL_FILES)
> +  set (output_file ${KERNEL_PATH}/${BUILT_IN_NAME}.cl)
> +  set (file_content)
> +  file (REMOVE ${output_file})
> +  foreach (KF ${KERNEL_NAMES})
> +    set (input_file ${KERNEL_PATH}/${KF}.cl)
> +    file(READ ${input_file} file_content )
> +    STRING(REGEX REPLACE ";" "\\\\;" file_content "${file_content}")
> +    file(APPEND ${output_file} ${file_content})
> +  endforeach (KF)
> +endmacro (MakeBuiltInKernelStr)
> +
>  set (KERNEL_STR_FILES)
> -set (KERNEL_NAMES cl_internal_copy_buf_align1 cl_internal_copy_buf_align4
> +set (KERNEL_NAMES cl_internal_copy_buf_align4
>  cl_internal_copy_buf_align16 cl_internal_copy_buf_unalign_same_offset
> -cl_internal_copy_buf_unalign_dst_offset cl_internal_copy_buf_unalign_src_offset)
> +cl_internal_copy_buf_unalign_dst_offset cl_internal_copy_buf_unalign_src_offset
> +cl_internal_copy_buf_rect cl_internal_copy_image_2d_to_2d cl_internal_copy_image_3d_to_2d
> +cl_internal_copy_image_2d_to_3d cl_internal_copy_image_3d_to_3d
> +cl_internal_copy_image_2d_to_buffer cl_internal_copy_image_3d_to_buffer
> +cl_internal_copy_buffer_to_image_2d cl_internal_copy_buffer_to_image_3d)
> +set (BUILT_IN_NAME  cl_internal_built_in_kernel)
> +MakeBuiltInKernelStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" "${KERNEL_NAMES}")
>  MakeKernelBinStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" "${KERNEL_NAMES}")
> +MakeKernelBinStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" "${BUILT_IN_NAME}")
>  
>  set(OPENCL_SRC
>      ${KERNEL_STR_FILES}
> diff --git a/src/cl_context.h b/src/cl_context.h
> index 782a9af..24281be 100644
> --- a/src/cl_context.h
> +++ b/src/cl_context.h
> @@ -46,14 +46,22 @@ enum _cl_internal_ker_type {
>    CL_ENQUEUE_COPY_BUFFER_UNALIGN_DST_OFFSET,
>    CL_ENQUEUE_COPY_BUFFER_UNALIGN_SRC_OFFSET,
>    CL_ENQUEUE_COPY_BUFFER_RECT,
> -  CL_ENQUEUE_COPY_IMAGE_0,             //copy image 2d to image 2d
> -  CL_ENQUEUE_COPY_IMAGE_1,             //copy image 3d to image 2d
> -  CL_ENQUEUE_COPY_IMAGE_2,             //copy image 2d to image 3d
> -  CL_ENQUEUE_COPY_IMAGE_3,             //copy image 3d to image 3d
> -  CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_0,   //copy image 2d to buffer
> -  CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_1,   //copy image 3d tobuffer
> -  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_0,   //copy buffer to image 2d
> -  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_1,   //copy buffer to image 3d
> +  CL_ENQUEUE_COPY_IMAGE_2D_TO_2D,             //copy image 2d to image 2d
> +  CL_ENQUEUE_COPY_IMAGE_3D_TO_2D,             //copy image 3d to image 2d
> +  CL_ENQUEUE_COPY_IMAGE_2D_TO_3D,             //copy image 2d to image 3d
> +  CL_ENQUEUE_COPY_IMAGE_3D_TO_3D,             //copy image 3d to image 3d
> +  CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER,   //copy image 2d to buffer
> +  CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER,   //copy image 3d tobuffer
> +  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D,   //copy buffer to image 2d
> +  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D,   //copy buffer to image 3d
> +  CL_ENQUEUE_FILL_BUFFER_UNALIGN,      //fill buffer with 1 aligne pattern, pattern size=1
> +  CL_ENQUEUE_FILL_BUFFER_ALIGN2,       //fill buffer with 2 aligne pattern, pattern size=2
> +  CL_ENQUEUE_FILL_BUFFER_ALIGN4,       //fill buffer with 4 aligne pattern, pattern size=4
> +  CL_ENQUEUE_FILL_BUFFER_ALIGN8_8,     //fill buffer with 8 aligne pattern, pattern size=8
> +  CL_ENQUEUE_FILL_BUFFER_ALIGN8_16,    //fill buffer with 16 aligne pattern, pattern size=16
> +  CL_ENQUEUE_FILL_BUFFER_ALIGN8_32,    //fill buffer with 16 aligne pattern, pattern size=32
> +  CL_ENQUEUE_FILL_BUFFER_ALIGN8_64,    //fill buffer with 16 aligne pattern, pattern size=64
> +  CL_ENQUEUE_FILL_BUFFER_ALIGN128,     //fill buffer with 128 aligne pattern, pattern size=128
>    CL_INTERNAL_KERNEL_MAX
>  };
>  
> diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
> index 7e45b4e..8690190 100644
> --- a/src/cl_gt_device.h
> +++ b/src/cl_gt_device.h
> @@ -75,7 +75,28 @@ DECL_INFO_STRING(version, LIBCL_VERSION_STRING)
>  DECL_INFO_STRING(profile, "FULL_PROFILE")
>  DECL_INFO_STRING(opencl_c_version, LIBCL_C_VERSION_STRING)
>  DECL_INFO_STRING(extensions, "")
> -DECL_INFO_STRING(built_in_kernels, "")
> +DECL_INFO_STRING(built_in_kernels, "__cl_copy_region_align4;"
> +                                   "__cl_copy_region_align16;"
> +                                   "__cl_cpy_region_unalign_same_offset;"
> +                                   "__cl_copy_region_unalign_dst_offset;"
> +                                   "__cl_copy_region_unalign_src_offset;"
> +                                   "__cl_copy_buffer_rect;"
> +                                   "__cl_copy_image_2d_to_2d;"
> +                                   "__cl_copy_image_3d_to_2d;"
> +                                   "__cl_copy_image_2d_to_3d;"
> +                                   "__cl_copy_image_3d_to_3d;"
> +                                   "__cl_copy_image_2d_to_buffer;"
> +                                   "__cl_copy_image_3d_to_buffer;"
> +                                   "__cl_copy_buffer_to_image_2d;"
> +                                   "__cl_copy_buffer_to_image_3d;"
> +                                   "__cl_fill_region_unalign;"
> +                                   "__cl_fill_region_align2;"
> +                                   "__cl_fill_region_align4;"
> +                                   "__cl_fill_region_align8_2;"
> +                                   "__cl_fill_region_align8_4;"
> +                                   "__cl_fill_region_align8_8;"
> +                                   "__cl_fill_region_align8_16;"
> +                                   "__cl_fill_region_align128;")
>  DECL_INFO_STRING(driver_version, LIBCL_DRIVER_VERSION_STRING)
>  #undef DECL_INFO_STRING
>  
> diff --git a/src/cl_mem.c b/src/cl_mem.c
> index 5faef4b..7eaf95f 100644
> --- a/src/cl_mem.c
> +++ b/src/cl_mem.c
> @@ -937,33 +937,19 @@ cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
>    global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
>    global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
>    global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
> -  cl_int index = CL_ENQUEUE_COPY_BUFFER_RECT;
>    cl_int src_offset = src_origin[2]*src_slice_pitch + src_origin[1]*src_row_pitch + src_origin[0];
>    cl_int dst_offset = dst_origin[2]*dst_slice_pitch + dst_origin[1]*dst_row_pitch + dst_origin[0];
>  
> -  static const char *str_kernel =
> -      "kernel void __cl_cpy_buffer_rect ( \n"
> -      "       global char* src, global char* dst, \n"
> -      "       unsigned int region0, unsigned int region1, unsigned int region2, \n"
> -      "       unsigned int src_offset, unsigned int dst_offset, \n"
> -      "       unsigned int src_row_pitch, unsigned int src_slice_pitch, \n"
> -      "       unsigned int dst_row_pitch, unsigned int dst_slice_pitch) { \n"
> -      "  int i = get_global_id(0); \n"
> -      "  int j = get_global_id(1); \n"
> -      "  int k = get_global_id(2); \n"
> -      "  if((i >= region0) || (j>= region1) || (k>=region2)) \n"
> -      "    return; \n"
> -      "  src_offset += k * src_slice_pitch + j * src_row_pitch + i; \n"
> -      "  dst_offset += k * dst_slice_pitch + j * dst_row_pitch + i; \n"
> -      "  dst[dst_offset] = src[src_offset]; \n"
> -      "}";
> -
> -
>    /* We use one kernel to copy the data. The kernel is lazily created. */
>    assert(src_buf->ctx == dst_buf->ctx);
>  
>    /* setup the kernel and run. */
> -  ker = cl_context_get_static_kernel(queue->ctx, index, str_kernel, NULL);
> +  extern char cl_internal_copy_buf_rect_str[];
> +  extern int cl_internal_copy_buf_rect_str_size;
> +
> +  ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_RECT,
> +      cl_internal_copy_buf_rect_str, (size_t)cl_internal_copy_buf_rect_str_size, NULL);
> +
>    if (!ker)
>      return CL_OUT_OF_RESOURCES;
>  
> @@ -992,8 +978,6 @@ cl_mem_kernel_copy_image(cl_command_queue queue, struct _cl_mem_image* src_image
>    size_t global_off[] = {0,0,0};
>    size_t global_sz[] = {1,1,1};
>    size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2};
> -  cl_int index = CL_ENQUEUE_COPY_IMAGE_0;
> -  char option[40] = "";
>    uint32_t fixupDataType;
>    uint32_t savedIntelFmt;
>  
> @@ -1003,15 +987,6 @@ cl_mem_kernel_copy_image(cl_command_queue queue, struct _cl_mem_image* src_image
>    global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
>    global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
>  
> -  if(src_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
> -    strcat(option, "-D SRC_IMAGE_3D");
> -    index += 1;
> -  }
> -  if(dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
> -    strcat(option, " -D DST_IMAGE_3D");
> -    index += 2;
> -  }
> -
>    switch (src_image->fmt.image_channel_data_type) {
>      case CL_SNORM_INT8:
>      case CL_UNORM_INT8:  fixupDataType = CL_UNSIGNED_INT8; break;
> @@ -1034,54 +1009,41 @@ cl_mem_kernel_copy_image(cl_command_queue queue, struct _cl_mem_image* src_image
>      src_image->intel_fmt = cl_image_get_intel_format(&fmt);
>      dst_image->intel_fmt = src_image->intel_fmt;
>    }
> -  static const char *str_kernel =
> -      "#ifdef SRC_IMAGE_3D \n"
> -      "  #define SRC_IMAGE_TYPE image3d_t \n"
> -      "  #define SRC_COORD_TYPE int4 \n"
> -      "#else \n"
> -      "  #define SRC_IMAGE_TYPE image2d_t \n"
> -      "  #define SRC_COORD_TYPE int2 \n"
> -      "#endif \n"
> -      "#ifdef DST_IMAGE_3D \n"
> -      "  #define DST_IMAGE_TYPE image3d_t \n"
> -      "  #define DST_COORD_TYPE int4 \n"
> -      "#else \n"
> -      "  #define DST_IMAGE_TYPE image2d_t \n"
> -      "  #define DST_COORD_TYPE int2 \n"
> -      "#endif \n"
> -      "kernel void __cl_copy_image ( \n"
> -      "       __read_only SRC_IMAGE_TYPE src_image, __write_only DST_IMAGE_TYPE dst_image, \n"
> -      "       unsigned int region0, unsigned int region1, unsigned int region2, \n"
> -      "       unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2, \n"
> -      "       unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2) { \n"
> -      "  int i = get_global_id(0); \n"
> -      "  int j = get_global_id(1); \n"
> -      "  int k = get_global_id(2); \n"
> -      "  int4 color; \n"
> -      "  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; \n"
> -      "  SRC_COORD_TYPE src_coord; \n"
> -      "  DST_COORD_TYPE dst_coord; \n"
> -      "  if((i >= region0) || (j>= region1) || (k>=region2)) \n"
> -      "    return; \n"
> -      "  src_coord.x = src_origin0 + i; \n"
> -      "  src_coord.y = src_origin1 + j; \n"
> -      "#ifdef SRC_IMAGE_3D \n"
> -      "  src_coord.z = src_origin2 + k; \n"
> -      "#endif \n"
> -      "  dst_coord.x = dst_origin0 + i; \n"
> -      "  dst_coord.y = dst_origin1 + j; \n"
> -      "#ifdef DST_IMAGE_3D \n"
> -      "  dst_coord.z = dst_origin2 + k; \n"
> -      "#endif \n"
> -      "  color = read_imagei(src_image, sampler, src_coord); \n"
> -      "  write_imagei(dst_image, dst_coord, color); \n"
> -      "}";
>  
>    /* We use one kernel to copy the data. The kernel is lazily created. */
>    assert(src_image->base.ctx == dst_image->base.ctx);
>  
>    /* setup the kernel and run. */
> -  ker = cl_context_get_static_kernel(queue->ctx, index, str_kernel, option);
> +  if(src_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
> +    if(dst_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
> +      extern char cl_internal_copy_image_2d_to_2d_str[];
> +      extern int cl_internal_copy_image_2d_to_2d_str_size;
> +
> +      ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_TO_2D,
> +          cl_internal_copy_image_2d_to_2d_str, (size_t)cl_internal_copy_image_2d_to_2d_str_size, NULL);
> +    }else if(dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
> +      extern char cl_internal_copy_image_2d_to_3d_str[];
> +      extern int cl_internal_copy_image_2d_to_3d_str_size;
> +
> +      ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_TO_3D,
> +          cl_internal_copy_image_2d_to_3d_str, (size_t)cl_internal_copy_image_2d_to_3d_str_size, NULL);
> +    }
> +  }else if(src_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
> +    if(dst_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
> +      extern char cl_internal_copy_image_3d_to_2d_str[];
> +      extern int cl_internal_copy_image_3d_to_2d_str_size;
> +
> +      ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_3D_TO_2D,
> +          cl_internal_copy_image_3d_to_2d_str, (size_t)cl_internal_copy_image_3d_to_2d_str_size, NULL);
> +    }else if(dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
> +      extern char cl_internal_copy_image_3d_to_3d_str[];
> +      extern int cl_internal_copy_image_3d_to_3d_str_size;
> +
> +      ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_3D_TO_3D,
> +          cl_internal_copy_image_3d_to_3d_str, (size_t)cl_internal_copy_image_3d_to_3d_str_size, NULL);
> +    }
> +  }
> +
>    if (!ker) {
>      ret = CL_OUT_OF_RESOURCES;
>      goto fail;
> @@ -1117,8 +1079,6 @@ cl_mem_copy_image_to_buffer(cl_command_queue queue, struct _cl_mem_image* image,
>    size_t global_off[] = {0,0,0};
>    size_t global_sz[] = {1,1,1};
>    size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2};
> -  cl_int index = CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_0;
> -  char option[40] = "";
>    uint32_t intel_fmt, bpp;
>    cl_image_format fmt;
>    size_t origin0, region0;
> @@ -1129,42 +1089,6 @@ cl_mem_copy_image_to_buffer(cl_command_queue queue, struct _cl_mem_image* image,
>    global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
>    global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
>  
> -  if(image->image_type == CL_MEM_OBJECT_IMAGE3D) {
> -    strcat(option, "-D IMAGE_3D");
> -    index += 1;
> -  }
> -
> -  static const char *str_kernel =
> -      "#ifdef IMAGE_3D \n"
> -      "  #define IMAGE_TYPE image3d_t \n"
> -      "  #define COORD_TYPE int4 \n"
> -      "#else \n"
> -      "  #define IMAGE_TYPE image2d_t \n"
> -      "  #define COORD_TYPE int2 \n"
> -      "#endif \n"
> -      "kernel void __cl_copy_image_to_buffer ( \n"
> -      "       __read_only IMAGE_TYPE image, global uchar* buffer, \n"
> -      "       unsigned int region0, unsigned int region1, unsigned int region2, \n"
> -      "       unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2, \n"
> -      "       unsigned int dst_offset) { \n"
> -      "  int i = get_global_id(0); \n"
> -      "  int j = get_global_id(1); \n"
> -      "  int k = get_global_id(2); \n"
> -      "  uint4 color; \n"
> -      "  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; \n"
> -      "  COORD_TYPE src_coord; \n"
> -      "  if((i >= region0) || (j>= region1) || (k>=region2)) \n"
> -      "    return; \n"
> -      "  src_coord.x = src_origin0 + i; \n"
> -      "  src_coord.y = src_origin1 + j; \n"
> -      "#ifdef IMAGE_3D \n"
> -      "  src_coord.z = src_origin2 + k; \n"
> -      "#endif \n"
> -      "  color = read_imageui(image, sampler, src_coord); \n"
> -      "  dst_offset += (k * region1 + j) * region0 + i; \n"
> -      "  buffer[dst_offset] = color.x; \n"
> -      "}";
> -
>    /* We use one kernel to copy the data. The kernel is lazily created. */
>    assert(image->base.ctx == buffer->ctx);
>  
> @@ -1180,7 +1104,20 @@ cl_mem_copy_image_to_buffer(cl_command_queue queue, struct _cl_mem_image* image,
>    global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
>  
>    /* setup the kernel and run. */
> -  ker = cl_context_get_static_kernel(queue->ctx, index, str_kernel, option);
> +  if(image->image_type == CL_MEM_OBJECT_IMAGE2D) {
> +      extern char cl_internal_copy_image_2d_to_buffer_str[];
> +      extern int cl_internal_copy_image_2d_to_buffer_str_size;
> +
> +      ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER,
> +          cl_internal_copy_image_2d_to_buffer_str, (size_t)cl_internal_copy_image_2d_to_buffer_str_size, NULL);
> +  }else if(image->image_type == CL_MEM_OBJECT_IMAGE3D) {
> +    extern char cl_internal_copy_image_3d_to_buffer_str[];
> +    extern int cl_internal_copy_image_3d_to_buffer_str_size;
> +
> +    ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER,
> +          cl_internal_copy_image_3d_to_buffer_str, (size_t)cl_internal_copy_image_3d_to_buffer_str_size, NULL);
> +  }
> +
>    if (!ker) {
>      ret = CL_OUT_OF_RESOURCES;
>      goto fail;
> @@ -1216,8 +1153,6 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_me
>    size_t global_off[] = {0,0,0};
>    size_t global_sz[] = {1,1,1};
>    size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2};
> -  cl_int index = CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_0;
> -  char option[40] = "";
>    uint32_t intel_fmt, bpp;
>    cl_image_format fmt;
>    size_t origin0, region0;
> @@ -1228,41 +1163,6 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_me
>    global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
>    global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
>  
> -  if(image->image_type == CL_MEM_OBJECT_IMAGE3D) {
> -    strcat(option, "-D IMAGE_3D");
> -    index += 1;
> -  }
> -
> -  static const char *str_kernel =
> -      "#ifdef IMAGE_3D \n"
> -      "  #define IMAGE_TYPE image3d_t \n"
> -      "  #define COORD_TYPE int4 \n"
> -      "#else \n"
> -      "  #define IMAGE_TYPE image2d_t \n"
> -      "  #define COORD_TYPE int2 \n"
> -      "#endif \n"
> -      "kernel void __cl_copy_image_to_buffer ( \n"
> -      "       __read_only IMAGE_TYPE image, global uchar* buffer, \n"
> -      "       unsigned int region0, unsigned int region1, unsigned int region2, \n"
> -      "       unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2, \n"
> -      "       unsigned int src_offset) { \n"
> -      "  int i = get_global_id(0); \n"
> -      "  int j = get_global_id(1); \n"
> -      "  int k = get_global_id(2); \n"
> -      "  uint4 color = (uint4)(0); \n"
> -      "  COORD_TYPE dst_coord; \n"
> -      "  if((i >= region0) || (j>= region1) || (k>=region2)) \n"
> -      "    return; \n"
> -      "  dst_coord.x = dst_origin0 + i; \n"
> -      "  dst_coord.y = dst_origin1 + j; \n"
> -      "#ifdef IMAGE_3D \n"
> -      "  dst_coord.z = dst_origin2 + k; \n"
> -      "#endif \n"
> -      "  src_offset += (k * region1 + j) * region0 + i; \n"
> -      "  color.x = buffer[src_offset]; \n"
> -      "  write_imageui(image, dst_coord, color); \n"
> -      "}";
> -
>    /* We use one kernel to copy the data. The kernel is lazily created. */
>    assert(image->base.ctx == buffer->ctx);
>  
> @@ -1278,7 +1178,19 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_me
>    global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
>  
>    /* setup the kernel and run. */
> -  ker = cl_context_get_static_kernel(queue->ctx, index, str_kernel, option);
> +  if(image->image_type == CL_MEM_OBJECT_IMAGE2D) {
> +      extern char cl_internal_copy_buffer_to_image_2d_str[];
> +      extern int cl_internal_copy_buffer_to_image_2d_str_size;
> +
> +      ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D,
> +          cl_internal_copy_buffer_to_image_2d_str, (size_t)cl_internal_copy_buffer_to_image_2d_str_size, NULL);
> +  }else if(image->image_type == CL_MEM_OBJECT_IMAGE3D) {
> +      extern char cl_internal_copy_buffer_to_image_3d_str[];
> +      extern int cl_internal_copy_buffer_to_image_3d_str_size;
> +
> +      ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D,
> +          cl_internal_copy_buffer_to_image_3d_str, (size_t)cl_internal_copy_buffer_to_image_3d_str_size, NULL);
> +  }
>    if (!ker)
>      return CL_OUT_OF_RESOURCES;
>  
> diff --git a/src/kernels/cl_internal_copy_buf_align1.cl b/src/kernels/cl_internal_copy_buf_align1.cl
> deleted file mode 100644
> index cd3ec7b..0000000
> --- a/src/kernels/cl_internal_copy_buf_align1.cl
> +++ /dev/null
> @@ -1,8 +0,0 @@
> -kernel void __cl_cpy_region_align1 ( global char* src, unsigned int src_offset,
> -                                     global char* dst, unsigned int dst_offset,
> -				     unsigned int size)
> -{
> -    int i = get_global_id(0);
> -    if (i < size)
> -        dst[i+dst_offset] = src[i+src_offset];
> -}
> diff --git a/src/kernels/cl_internal_copy_buf_align16.cl b/src/kernels/cl_internal_copy_buf_align16.cl
> index 75b1a4a..1abb4e9 100644
> --- a/src/kernels/cl_internal_copy_buf_align16.cl
> +++ b/src/kernels/cl_internal_copy_buf_align16.cl
> @@ -1,4 +1,4 @@
> -kernel void __cl_cpy_region_align16 ( global float* src, unsigned int src_offset,
> +kernel void __cl_copy_region_align16 ( global float* src, unsigned int src_offset,
>                                        global float* dst, unsigned int dst_offset,
>  				      unsigned int size)
>  {
> diff --git a/src/kernels/cl_internal_copy_buf_align4.cl b/src/kernels/cl_internal_copy_buf_align4.cl
> index 44a0f81..27174ca 100644
> --- a/src/kernels/cl_internal_copy_buf_align4.cl
> +++ b/src/kernels/cl_internal_copy_buf_align4.cl
> @@ -1,4 +1,4 @@
> -kernel void __cl_cpy_region_align4 ( global float* src, unsigned int src_offset,
> +kernel void __cl_copy_region_align4 ( global float* src, unsigned int src_offset,
>                                       global float* dst, unsigned int dst_offset,
>  				     unsigned int size)
>  {
> diff --git a/src/kernels/cl_internal_copy_buf_rect.cl b/src/kernels/cl_internal_copy_buf_rect.cl
> new file mode 100644
> index 0000000..71e7484
> --- /dev/null
> +++ b/src/kernels/cl_internal_copy_buf_rect.cl
> @@ -0,0 +1,15 @@
> +kernel void __cl_copy_buffer_rect ( global char* src, global char* dst,
> +                                          unsigned int region0, unsigned int region1, unsigned int region2,
> +                                          unsigned int src_offset, unsigned int dst_offset,
> +                                          unsigned int src_row_pitch, unsigned int src_slice_pitch,
> +                                          unsigned int dst_row_pitch, unsigned int dst_slice_pitch)
> +{
> +  int i = get_global_id(0);
> +  int j = get_global_id(1);
> +  int k = get_global_id(2);
> +  if((i >= region0) || (j>= region1) || (k>=region2))
> +    return;
> +  src_offset += k * src_slice_pitch + j * src_row_pitch + i;
> +  dst_offset += k * dst_slice_pitch + j * dst_row_pitch + i;
> +  dst[dst_offset] = src[src_offset];
> +}
> diff --git a/src/kernels/cl_internal_copy_buf_unalign_dst_offset.cl b/src/kernels/cl_internal_copy_buf_unalign_dst_offset.cl
> index 13f4162..e02d0e5 100644
> --- a/src/kernels/cl_internal_copy_buf_unalign_dst_offset.cl
> +++ b/src/kernels/cl_internal_copy_buf_unalign_dst_offset.cl
> @@ -1,4 +1,4 @@
> -kernel void __cl_cpy_region_unalign_dst_offset ( global int* src, unsigned int src_offset,
> +kernel void __cl_copy_region_unalign_dst_offset ( global int* src, unsigned int src_offset,
>                                       global int* dst, unsigned int dst_offset,
>  				     unsigned int size,
>  				     unsigned int first_mask, unsigned int last_mask,
> diff --git a/src/kernels/cl_internal_copy_buf_unalign_same_offset.cl b/src/kernels/cl_internal_copy_buf_unalign_same_offset.cl
> index 8510246..83b6e97 100644
> --- a/src/kernels/cl_internal_copy_buf_unalign_same_offset.cl
> +++ b/src/kernels/cl_internal_copy_buf_unalign_same_offset.cl
> @@ -1,4 +1,4 @@
> -kernel void __cl_cpy_region_unalign_same_offset ( global int* src, unsigned int src_offset,
> +kernel void __cl_copy_region_unalign_same_offset ( global int* src, unsigned int src_offset,
>                                       global int* dst, unsigned int dst_offset,
>  				     unsigned int size,
>  				     unsigned int first_mask, unsigned int last_mask)
> diff --git a/src/kernels/cl_internal_copy_buf_unalign_src_offset.cl b/src/kernels/cl_internal_copy_buf_unalign_src_offset.cl
> index f98368a..ce0aa1d 100644
> --- a/src/kernels/cl_internal_copy_buf_unalign_src_offset.cl
> +++ b/src/kernels/cl_internal_copy_buf_unalign_src_offset.cl
> @@ -1,4 +1,4 @@
> -kernel void __cl_cpy_region_unalign_src_offset ( global int* src, unsigned int src_offset,
> +kernel void __cl_copy_region_unalign_src_offset ( global int* src, unsigned int src_offset,
>                                       global int* dst, unsigned int dst_offset,
>  				     unsigned int size,
>  				     unsigned int first_mask, unsigned int last_mask,
> diff --git a/src/kernels/cl_internal_copy_buffer_to_image_2d.cl b/src/kernels/cl_internal_copy_buffer_to_image_2d.cl
> new file mode 100644
> index 0000000..a218b58
> --- /dev/null
> +++ b/src/kernels/cl_internal_copy_buffer_to_image_2d.cl
> @@ -0,0 +1,18 @@
> +kernel void __cl_copy_buffer_to_image_2d(__read_only image2d_t image, global uchar* buffer,
> +                                        unsigned int region0, unsigned int region1, unsigned int region2,
> +                                        unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2,
> +                                        unsigned int src_offset)
> +{
> +  int i = get_global_id(0);
> +  int j = get_global_id(1);
> +  int k = get_global_id(2);
> +  uint4 color = (uint4)(0);
> +  int2 dst_coord;
> +  if((i >= region0) || (j>= region1) || (k>=region2))
> +    return;
> +  dst_coord.x = dst_origin0 + i;
> +  dst_coord.y = dst_origin1 + j;
> +  src_offset += (k * region1 + j) * region0 + i;
> +  color.x = buffer[src_offset];
> +  write_imageui(image, dst_coord, color);
> +}
> diff --git a/src/kernels/cl_internal_copy_buffer_to_image_3d.cl b/src/kernels/cl_internal_copy_buffer_to_image_3d.cl
> new file mode 100644
> index 0000000..84d3b27
> --- /dev/null
> +++ b/src/kernels/cl_internal_copy_buffer_to_image_3d.cl
> @@ -0,0 +1,19 @@
> +kernel void __cl_copy_buffer_to_image_3d(__read_only image3d_t image, global uchar* buffer,
> +                                        unsigned int region0, unsigned int region1, unsigned int region2,
> +                                        unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2,
> +                                        unsigned int src_offset)
> +{
> +  int i = get_global_id(0);
> +  int j = get_global_id(1);
> +  int k = get_global_id(2);
> +  uint4 color = (uint4)(0);
> +  int4 dst_coord;
> +  if((i >= region0) || (j>= region1) || (k>=region2))
> +    return;
> +  dst_coord.x = dst_origin0 + i;
> +  dst_coord.y = dst_origin1 + j;
> +  dst_coord.z = dst_origin2 + k;
> +  src_offset += (k * region1 + j) * region0 + i;
> +  color.x = buffer[src_offset];
> +  write_imageui(image, dst_coord, color);
> +}
> diff --git a/src/kernels/cl_internal_copy_image_2d_to_2d.cl b/src/kernels/cl_internal_copy_image_2d_to_2d.cl
> new file mode 100644
> index 0000000..c5eaab1
> --- /dev/null
> +++ b/src/kernels/cl_internal_copy_image_2d_to_2d.cl
> @@ -0,0 +1,21 @@
> +kernel void __cl_copy_image_2d_to_2d(__read_only image2d_t src_image, __write_only image2d_t dst_image,
> +                             unsigned int region0, unsigned int region1, unsigned int region2,
> +                             unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
> +                             unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2)
> +{
> +  int i = get_global_id(0);
> +  int j = get_global_id(1);
> +  int k = get_global_id(2);
> +  int4 color;
> +  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
> +  int2 src_coord;
> +  int2 dst_coord;
> +  if((i >= region0) || (j>= region1) || (k>=region2))
> +    return;
> +  src_coord.x = src_origin0 + i;
> +  src_coord.y = src_origin1 + j;
> +  dst_coord.x = dst_origin0 + i;
> +  dst_coord.y = dst_origin1 + j;
> +  color = read_imagei(src_image, sampler, src_coord);
> +  write_imagei(dst_image, dst_coord, color);
> +}
> diff --git a/src/kernels/cl_internal_copy_image_2d_to_3d.cl b/src/kernels/cl_internal_copy_image_2d_to_3d.cl
> new file mode 100644
> index 0000000..4c73a74
> --- /dev/null
> +++ b/src/kernels/cl_internal_copy_image_2d_to_3d.cl
> @@ -0,0 +1,22 @@
> +kernel void __cl_copy_image_2d_to_3d(__read_only image2d_t src_image, __write_only image3d_t dst_image,
> +                                         unsigned int region0, unsigned int region1, unsigned int region2,
> +                                         unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
> +                                         unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2)
> +{
> +  int i = get_global_id(0);
> +  int j = get_global_id(1);
> +  int k = get_global_id(2);
> +  int4 color;
> +  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
> +  int2 src_coord;
> +  int4 dst_coord;
> +  if((i >= region0) || (j>= region1) || (k>=region2))
> +    return;
> +  src_coord.x = src_origin0 + i;
> +  src_coord.y = src_origin1 + j;
> +  dst_coord.x = dst_origin0 + i;
> +  dst_coord.y = dst_origin1 + j;
> +  dst_coord.z = dst_origin2 + k;
> +  color = read_imagei(src_image, sampler, src_coord);
> +  write_imagei(dst_image, dst_coord, color);
> +}
> diff --git a/src/kernels/cl_internal_copy_image_2d_to_buffer.cl b/src/kernels/cl_internal_copy_image_2d_to_buffer.cl
> new file mode 100644
> index 0000000..b6c352e
> --- /dev/null
> +++ b/src/kernels/cl_internal_copy_image_2d_to_buffer.cl
> @@ -0,0 +1,19 @@
> +kernel void __cl_copy_image_2d_to_buffer( __read_only image2d_t image, global uchar* buffer,
> +                                        unsigned int region0, unsigned int region1, unsigned int region2,
> +                                        unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
> +                                        unsigned int dst_offset)
> +{
> +  int i = get_global_id(0);
> +  int j = get_global_id(1);
> +  int k = get_global_id(2);
> +  uint4 color;
> +  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
> +  int2 src_coord;
> +  if((i >= region0) || (j>= region1) || (k>=region2))
> +    return;
> +  src_coord.x = src_origin0 + i;
> +  src_coord.y = src_origin1 + j;
> +  color = read_imageui(image, sampler, src_coord);
> +  dst_offset += (k * region1 + j) * region0 + i;
> +  buffer[dst_offset] = color.x;
> +}
> diff --git a/src/kernels/cl_internal_copy_image_3d_to_2d.cl b/src/kernels/cl_internal_copy_image_3d_to_2d.cl
> new file mode 100644
> index 0000000..e0effa0
> --- /dev/null
> +++ b/src/kernels/cl_internal_copy_image_3d_to_2d.cl
> @@ -0,0 +1,22 @@
> +kernel void __cl_copy_image_3d_to_2d(__read_only image3d_t src_image, __write_only image2d_t dst_image,
> +                             unsigned int region0, unsigned int region1, unsigned int region2,
> +                             unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
> +                             unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2)
> +{
> +  int i = get_global_id(0);
> +  int j = get_global_id(1);
> +  int k = get_global_id(2);
> +  int4 color;
> +  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
> +  int4 src_coord;
> +  int2 dst_coord;
> +  if((i >= region0) || (j>= region1) || (k>=region2))
> +    return;
> +  src_coord.x = src_origin0 + i;
> +  src_coord.y = src_origin1 + j;
> +  src_coord.z = src_origin2 + k;
> +  dst_coord.x = dst_origin0 + i;
> +  dst_coord.y = dst_origin1 + j;
> +  color = read_imagei(src_image, sampler, src_coord);
> +  write_imagei(dst_image, dst_coord, color);
> +}
> diff --git a/src/kernels/cl_internal_copy_image_3d_to_3d.cl b/src/kernels/cl_internal_copy_image_3d_to_3d.cl
> new file mode 100644
> index 0000000..de80a0a
> --- /dev/null
> +++ b/src/kernels/cl_internal_copy_image_3d_to_3d.cl
> @@ -0,0 +1,23 @@
> +kernel void __cl_copy_image_3d_to_3d(__read_only image3d_t src_image, __write_only image3d_t dst_image,
> +                             unsigned int region0, unsigned int region1, unsigned int region2,
> +                             unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
> +                             unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2)
> +{
> +  int i = get_global_id(0);
> +  int j = get_global_id(1);
> +  int k = get_global_id(2);
> +  int4 color;
> +  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
> +  int4 src_coord;
> +  int4 dst_coord;
> +  if((i >= region0) || (j>= region1) || (k>=region2))
> +    return;
> +  src_coord.x = src_origin0 + i;
> +  src_coord.y = src_origin1 + j;
> +  src_coord.z = src_origin2 + k;
> +  dst_coord.x = dst_origin0 + i;
> +  dst_coord.y = dst_origin1 + j;
> +  dst_coord.z = dst_origin2 + k;
> +  color = read_imagei(src_image, sampler, src_coord);
> +  write_imagei(dst_image, dst_coord, color);
> +}
> diff --git a/src/kernels/cl_internal_copy_image_3d_to_buffer.cl b/src/kernels/cl_internal_copy_image_3d_to_buffer.cl
> new file mode 100644
> index 0000000..dcfc8a2
> --- /dev/null
> +++ b/src/kernels/cl_internal_copy_image_3d_to_buffer.cl
> @@ -0,0 +1,22 @@
> +#define IMAGE_TYPE image3d_t
> +#define COORD_TYPE int4
> +kernel void __cl_copy_image_3d_to_buffer ( __read_only IMAGE_TYPE image, global uchar* buffer,
> +                                        unsigned int region0, unsigned int region1, unsigned int region2,
> +                                        unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
> +                                        unsigned int dst_offset)
> +{
> +  int i = get_global_id(0);
> +  int j = get_global_id(1);
> +  int k = get_global_id(2);
> +  uint4 color;
> +  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
> +  COORD_TYPE src_coord;
> +  if((i >= region0) || (j>= region1) || (k>=region2))
> +    return;
> +  src_coord.x = src_origin0 + i;
> +  src_coord.y = src_origin1 + j;
> +  src_coord.z = src_origin2 + k;
> +  color = read_imageui(image, sampler, src_coord);
> +  dst_offset += (k * region1 + j) * region0 + i;
> +  buffer[dst_offset] = color.x;
> +}





More information about the Beignet mailing list