[Beignet] [patch v3 1/2] Optimization of clEnqueueCopyBufferToImage for 16 aligned case.

xionghu.luo at intel.com xionghu.luo at intel.com
Wed Apr 8 20:37:57 PDT 2015


From: Luo Xionghu <xionghu.luo at intel.com>

We can change the image_channel_order to CL_RGBA and
image_channel_data_type to CL_UNSIGNED_INT32 for some special
case, thus 16 bytes can be read by one work item. Bandwidth is
fully used.

v2: merge patch 3 of initializing region0;
remove k dimension in kernel for 2d image.
Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
 src/CMakeLists.txt                                 |  2 +-
 src/cl_context.h                                   |  1 +
 src/cl_mem.c                                       | 44 ++++++++++++++++++----
 .../cl_internal_copy_buffer_to_image_2d_align16.cl | 18 +++++++++
 4 files changed, 56 insertions(+), 9 deletions(-)
 create mode 100644 src/kernels/cl_internal_copy_buffer_to_image_2d_align16.cl

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index da69532..4e67c71 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -51,7 +51,7 @@ cl_internal_copy_image_2d_to_2d_array cl_internal_copy_image_1d_array_to_1d_arra
 cl_internal_copy_image_2d_array_to_2d_array cl_internal_copy_image_2d_array_to_2d
 cl_internal_copy_image_2d_array_to_3d cl_internal_copy_image_3d_to_2d_array
 cl_internal_copy_image_2d_to_buffer cl_internal_copy_image_2d_to_buffer_align16 cl_internal_copy_image_3d_to_buffer
-cl_internal_copy_buffer_to_image_2d cl_internal_copy_buffer_to_image_3d
+cl_internal_copy_buffer_to_image_2d cl_internal_copy_buffer_to_image_2d_align16 cl_internal_copy_buffer_to_image_3d
 cl_internal_fill_buf_align8 cl_internal_fill_buf_align4
 cl_internal_fill_buf_align2 cl_internal_fill_buf_unalign
 cl_internal_fill_buf_align128 cl_internal_fill_image_1d
diff --git a/src/cl_context.h b/src/cl_context.h
index fdbfd2a..249fed8 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -63,6 +63,7 @@ enum _cl_internal_ker_type {
   CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER_ALIGN16,
   CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER,   //copy image 3d tobuffer
   CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D,   //copy buffer to image 2d
+  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D_ALIGN16,
   CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D,   //copy buffer to image 3d
   CL_ENQUEUE_FILL_BUFFER_UNALIGN,      //fill buffer with 1 aligne pattern, pattern size=1
   CL_ENQUEUE_FILL_BUFFER_ALIGN2,       //fill buffer with 2 aligne pattern, pattern size=2
diff --git a/src/cl_mem.c b/src/cl_mem.c
index b41ec14..0a2613d 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -1816,6 +1816,10 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_me
   uint32_t intel_fmt, bpp;
   cl_image_format fmt;
   size_t origin0, region0;
+  size_t kn_src_offset;
+  int align16 = 0;
+  size_t align_size = 1;
+  size_t w_saved = 0;
 
   if(region[1] == 1) local_sz[1] = 1;
   if(region[2] == 1) local_sz[2] = 1;
@@ -1826,24 +1830,48 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_me
   /* We use one kernel to copy the data. The kernel is lazily created. */
   assert(image->base.ctx == buffer->ctx);
 
-  fmt.image_channel_order = CL_R;
-  fmt.image_channel_data_type = CL_UNSIGNED_INT8;
   intel_fmt = image->intel_fmt;
   bpp = image->bpp;
-  image->intel_fmt = cl_image_get_intel_format(&fmt);
-  image->w = image->w * image->bpp;
-  image->bpp = 1;
+  w_saved = image->w;
   region0 = region[0] * bpp;
-  origin0 = dst_origin[0] * bpp;
+  kn_src_offset = src_offset;
+  if((image->image_type == CL_MEM_OBJECT_IMAGE2D) && ((image->w * image->bpp) % 16 == 0) &&
+      ((dst_origin[0] * bpp) % 16 == 0) && (region0 % 16 == 0) && (src_offset % 16 == 0)){
+    fmt.image_channel_order = CL_RGBA;
+    fmt.image_channel_data_type = CL_UNSIGNED_INT32;
+    align16 = 1;
+    align_size = 16;
+  }
+  else{
+    fmt.image_channel_order = CL_R;
+    fmt.image_channel_data_type = CL_UNSIGNED_INT8;
+    align_size = 1;
+  }
+  image->intel_fmt = cl_image_get_intel_format(&fmt);
+  image->w = (image->w * image->bpp) / align_size;
+  image->bpp = align_size;
+  region0 = (region[0] * bpp) / align_size;
+  origin0 = (dst_origin[0] * bpp) / align_size;
+  kn_src_offset /= align_size;
   global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
 
   /* setup the kernel and run. */
   if(image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+    if(align16){
+      extern char cl_internal_copy_buffer_to_image_2d_align16_str[];
+      extern size_t cl_internal_copy_buffer_to_image_2d_align16_str_size;
+
+      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D_ALIGN16,
+                cl_internal_copy_buffer_to_image_2d_align16_str,
+                (size_t)cl_internal_copy_buffer_to_image_2d_align16_str_size, NULL);
+    }
+    else{
       extern char cl_internal_copy_buffer_to_image_2d_str[];
       extern size_t cl_internal_copy_buffer_to_image_2d_str_size;
 
       ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D,
           cl_internal_copy_buffer_to_image_2d_str, (size_t)cl_internal_copy_buffer_to_image_2d_str_size, NULL);
+    }
   }else if(image->image_type == CL_MEM_OBJECT_IMAGE3D) {
       extern char cl_internal_copy_buffer_to_image_3d_str[];
       extern size_t cl_internal_copy_buffer_to_image_3d_str_size;
@@ -1862,13 +1890,13 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_me
   cl_kernel_set_arg(ker, 5, sizeof(cl_int), &origin0);
   cl_kernel_set_arg(ker, 6, sizeof(cl_int), &dst_origin[1]);
   cl_kernel_set_arg(ker, 7, sizeof(cl_int), &dst_origin[2]);
-  cl_kernel_set_arg(ker, 8, sizeof(cl_int), &src_offset);
+  cl_kernel_set_arg(ker, 8, sizeof(cl_int), &kn_src_offset);
 
   ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
 
   image->intel_fmt = intel_fmt;
   image->bpp = bpp;
-  image->w = image->w / bpp;
+  image->w = w_saved;
 
   return ret;
 }
diff --git a/src/kernels/cl_internal_copy_buffer_to_image_2d_align16.cl b/src/kernels/cl_internal_copy_buffer_to_image_2d_align16.cl
new file mode 100644
index 0000000..e4cef73
--- /dev/null
+++ b/src/kernels/cl_internal_copy_buffer_to_image_2d_align16.cl
@@ -0,0 +1,18 @@
+kernel void __cl_copy_buffer_to_image_2d_align16(__read_only image2d_t image, global uint4* buffer,
+                                        unsigned int region0, unsigned int region1, unsigned int region2,
+                                        unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2,
+                                        unsigned int src_offset)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  uint4 color = (uint4)(0);
+  int2 dst_coord;
+  if((i >= region0) || (j>= region1))
+    return;
+  dst_coord.x = dst_origin0 + i;
+  dst_coord.y = dst_origin1 + j;
+  src_offset += j * region0 + i;
+  color = buffer[src_offset];
+  write_imageui(image, dst_coord, color);
+}
+
-- 
1.9.1



More information about the Beignet mailing list