[Beignet] [patch v2 1/2] Optimization of clEnqueueCopyBufferToImage for 16 aligned case.
Luo, Xionghu
xionghu.luo at intel.com
Thu Apr 2 20:16:35 PDT 2015
I don't agree with your description. Take your CopyImageToBuffer kernel as example:
Acturally, the unaligned kernel also read 16bytes to the color, but only the color.x is useful, so 1 bytes written to the buffer;
And for aligned case, read 16bytes then write 16 bytes. So the difference is "write" instead of "read" or "16 times per work item ".
kernel void __cl_copy_image_2d_to_buffer( __read_only image2d_t image, global uchar* buffer,
unsigned int region0, unsigned int region1, unsigned int region2,
unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
unsigned int dst_offset)
{
int i = get_global_id(0);
int j = get_global_id(1);
int k = get_global_id(2);
uint4 color;
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
int2 src_coord;
if((i >= region0) || (j>= region1) || (k>=region2))
return;
src_coord.x = src_origin0 + i;
src_coord.y = src_origin1 + j;
color = read_imageui(image, sampler, src_coord);
dst_offset += (k * region1 + j) * region0 + i;
buffer[dst_offset] = color.x;
}
Luo Xionghu
Best Regards
-----Original Message-----
From: Weng, Chuanbo
Sent: Friday, April 3, 2015 10:57 AM
To: Luo, Xionghu; beignet at lists.freedesktop.org
Subject: RE: [Beignet] [patch v2 1/2] Optimization of clEnqueueCopyBufferToImage for 16 aligned case.
The key point of this optimization is that we change image_channel_order and image_channel_data_type of image internally, so 16 bytes image data can be processed one time instead of processed 16 times per work item. So we give patch description as below:
(For CopyImageToBuffer kernel) thus 16 bytes can be read by one work item.
(For CopyBufferToImage kernel) thus 16 bytes can be written by one work item.
For "__read_only " issue, maybe our compiler should give a warning for this. This is another topic, I suggest you can send out another mail to discuss this issue:)
-----Original Message-----
From: Luo, Xionghu
Sent: Thursday, April 02, 2015 11:16
To: Weng, Chuanbo; beignet at lists.freedesktop.org
Subject: RE: [Beignet] [patch v2 1/2] Optimization of clEnqueueCopyBufferToImage for 16 aligned case.
For CopyBufferToImage kernel, "thus 16 bytes can be read by one work item " is correct from the Buffer side, data is read from the buffer then write to the image.
PS: a typo in the image attribute of this kernel, this image should be "__write_only" instead of "__read_only" as it is written to. Why this attribute doesn't work as expected even set to "__read_only" but still writable?
Luo Xionghu
Best Regards
-----Original Message-----
From: Weng, Chuanbo
Sent: Wednesday, April 1, 2015 4:15 PM
To: Luo, Xionghu; beignet at lists.freedesktop.org
Cc: Luo, Xionghu
Subject: RE: [Beignet] [patch v2 1/2] Optimization of clEnqueueCopyBufferToImage for 16 aligned case.
One warning when running "git am" command:
warning: 1 line adds whitespace errors.
And the words of the patch description "thus 16 bytes can be read by one work item " should be changed to "thus 16 bytes can be written by one work item ".
I think Zhigang can help to do this minor modification before pushing this patch.
Other part of this patch LGTM.
-----Original Message-----
From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of xionghu.luo at intel.com
Sent: Wednesday, April 01, 2015 13:11
To: beignet at lists.freedesktop.org
Cc: Luo, Xionghu
Subject: [Beignet] [patch v2 1/2] Optimization of clEnqueueCopyBufferToImage for 16 aligned case.
From: Luo Xionghu <xionghu.luo at intel.com>
We can change the image_channel_order to CL_RGBA and image_channel_data_type to CL_UNSIGNED_INT32 for some special case, thus 16 bytes can be read by one work item. Bandwidth is fully used.
v2: merge patch 3 of initializing region0; remove k dimension in kernel for 2d image.
Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
src/CMakeLists.txt | 2 +-
src/cl_context.h | 1 +
src/cl_mem.c | 44 ++++++++++++++++++----
.../cl_internal_copy_buffer_to_image_2d_align16.cl | 18 +++++++++
4 files changed, 56 insertions(+), 9 deletions(-) create mode 100644 src/kernels/cl_internal_copy_buffer_to_image_2d_align16.cl
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index da69532..4e67c71 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -51,7 +51,7 @@ cl_internal_copy_image_2d_to_2d_array cl_internal_copy_image_1d_array_to_1d_arra
cl_internal_copy_image_2d_array_to_2d_array cl_internal_copy_image_2d_array_to_2d
cl_internal_copy_image_2d_array_to_3d cl_internal_copy_image_3d_to_2d_array
cl_internal_copy_image_2d_to_buffer cl_internal_copy_image_2d_to_buffer_align16 cl_internal_copy_image_3d_to_buffer
-cl_internal_copy_buffer_to_image_2d cl_internal_copy_buffer_to_image_3d
+cl_internal_copy_buffer_to_image_2d
+cl_internal_copy_buffer_to_image_2d_align16
+cl_internal_copy_buffer_to_image_3d
cl_internal_fill_buf_align8 cl_internal_fill_buf_align4
cl_internal_fill_buf_align2 cl_internal_fill_buf_unalign
cl_internal_fill_buf_align128 cl_internal_fill_image_1d diff --git a/src/cl_context.h b/src/cl_context.h index fdbfd2a..249fed8 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -63,6 +63,7 @@ enum _cl_internal_ker_type {
CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER_ALIGN16,
CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER, //copy image 3d tobuffer
CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D, //copy buffer to image 2d
+ CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D_ALIGN16,
CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D, //copy buffer to image 3d
CL_ENQUEUE_FILL_BUFFER_UNALIGN, //fill buffer with 1 aligne pattern, pattern size=1
CL_ENQUEUE_FILL_BUFFER_ALIGN2, //fill buffer with 2 aligne pattern, pattern size=2
diff --git a/src/cl_mem.c b/src/cl_mem.c index b41ec14..0a2613d 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -1816,6 +1816,10 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_me
uint32_t intel_fmt, bpp;
cl_image_format fmt;
size_t origin0, region0;
+ size_t kn_src_offset;
+ int align16 = 0;
+ size_t align_size = 1;
+ size_t w_saved = 0;
if(region[1] == 1) local_sz[1] = 1;
if(region[2] == 1) local_sz[2] = 1;
@@ -1826,24 +1830,48 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_me
/* We use one kernel to copy the data. The kernel is lazily created. */
assert(image->base.ctx == buffer->ctx);
- fmt.image_channel_order = CL_R;
- fmt.image_channel_data_type = CL_UNSIGNED_INT8;
intel_fmt = image->intel_fmt;
bpp = image->bpp;
- image->intel_fmt = cl_image_get_intel_format(&fmt);
- image->w = image->w * image->bpp;
- image->bpp = 1;
+ w_saved = image->w;
region0 = region[0] * bpp;
- origin0 = dst_origin[0] * bpp;
+ kn_src_offset = src_offset;
+ if((image->image_type == CL_MEM_OBJECT_IMAGE2D) && ((image->w * image->bpp) % 16 == 0) &&
+ ((dst_origin[0] * bpp) % 16 == 0) && (region0 % 16 == 0) && (src_offset % 16 == 0)){
+ fmt.image_channel_order = CL_RGBA;
+ fmt.image_channel_data_type = CL_UNSIGNED_INT32;
+ align16 = 1;
+ align_size = 16;
+ }
+ else{
+ fmt.image_channel_order = CL_R;
+ fmt.image_channel_data_type = CL_UNSIGNED_INT8;
+ align_size = 1;
+ }
+ image->intel_fmt = cl_image_get_intel_format(&fmt); image->w =
+ (image->w * image->bpp) / align_size; image->bpp = align_size;
+ region0 = (region[0] * bpp) / align_size;
+ origin0 = (dst_origin[0] * bpp) / align_size; kn_src_offset /=
+ align_size;
global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
/* setup the kernel and run. */
if(image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+ if(align16){
+ extern char cl_internal_copy_buffer_to_image_2d_align16_str[];
+ extern size_t
+ cl_internal_copy_buffer_to_image_2d_align16_str_size;
+
+ ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D_ALIGN16,
+ cl_internal_copy_buffer_to_image_2d_align16_str,
+ (size_t)cl_internal_copy_buffer_to_image_2d_align16_str_size, NULL);
+ }
+ else{
extern char cl_internal_copy_buffer_to_image_2d_str[];
extern size_t cl_internal_copy_buffer_to_image_2d_str_size;
ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D,
cl_internal_copy_buffer_to_image_2d_str, (size_t)cl_internal_copy_buffer_to_image_2d_str_size, NULL);
+ }
}else if(image->image_type == CL_MEM_OBJECT_IMAGE3D) {
extern char cl_internal_copy_buffer_to_image_3d_str[];
extern size_t cl_internal_copy_buffer_to_image_3d_str_size;
@@ -1862,13 +1890,13 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_me
cl_kernel_set_arg(ker, 5, sizeof(cl_int), &origin0);
cl_kernel_set_arg(ker, 6, sizeof(cl_int), &dst_origin[1]);
cl_kernel_set_arg(ker, 7, sizeof(cl_int), &dst_origin[2]);
- cl_kernel_set_arg(ker, 8, sizeof(cl_int), &src_offset);
+ cl_kernel_set_arg(ker, 8, sizeof(cl_int), &kn_src_offset);
ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
image->intel_fmt = intel_fmt;
image->bpp = bpp;
- image->w = image->w / bpp;
+ image->w = w_saved;
return ret;
}
diff --git a/src/kernels/cl_internal_copy_buffer_to_image_2d_align16.cl b/src/kernels/cl_internal_copy_buffer_to_image_2d_align16.cl
new file mode 100644
index 0000000..e4cef73
--- /dev/null
+++ b/src/kernels/cl_internal_copy_buffer_to_image_2d_align16.cl
@@ -0,0 +1,18 @@
+kernel void __cl_copy_buffer_to_image_2d_align16(__read_only image2d_t image, global uint4* buffer,
+ unsigned int region0, unsigned int region1, unsigned int region2,
+ unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2,
+ unsigned int src_offset) {
+ int i = get_global_id(0);
+ int j = get_global_id(1);
+ uint4 color = (uint4)(0);
+ int2 dst_coord;
+ if((i >= region0) || (j>= region1))
+ return;
+ dst_coord.x = dst_origin0 + i;
+ dst_coord.y = dst_origin1 + j;
+ src_offset += j * region0 + i;
+ color = buffer[src_offset];
+ write_imageui(image, dst_coord, color); }
+
--
1.9.1
_______________________________________________
Beignet mailing list
Beignet at lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet
More information about the Beignet
mailing list