[Beignet] [PATCH 5/6 newRT] Move the enqueue Copy/Fill to gen dir.
junyan.he at inbox.com
Tue Mar 28 08:25:36 UTC 2017
From: Junyan He <junyan.he at intel.com>
We will enqueue built-in kernel ND ranges to implement GPU-side mem/image
copy and fill. This is gen-specific logic, so move it to the gen dir.
Signed-off-by: Junyan He <junyan.he at intel.com>
---
src/gen/cl_gen.h | 24 +++
src/gen/cl_image_gen.c | 394 +++++++++++++++++++++++++++++++++++++++++++++++++
src/gen/cl_mem_gen.c | 327 ++++++++++++++++++++++++++++++++++++++++
3 files changed, 745 insertions(+)
create mode 100644 src/gen/cl_image_gen.c
create mode 100644 src/gen/cl_mem_gen.c
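Note for reviewers: every helper below collapses the flat dimensions and
rounds the global work size up to a multiple of the local size before
calling cl_command_queue_ND_range. A minimal standalone sketch of that
setup, illustrative only and not part of the patch (the built-in kernels
are expected to bounds-check against the real region themselves):

  #include <stddef.h>

  /* Illustrative: mirrors the work-size setup used by the helpers below.
   * Dimensions 1 and 2 fall back to a local size of 1 when the region is
   * flat; every global dimension is rounded up to a local-size multiple. */
  static void setup_work_sizes(const size_t region[3],
                               size_t local_sz[3], size_t global_sz[3])
  {
    size_t i;
    if (region[1] == 1)
      local_sz[1] = 1;
    if (region[2] == 1)
      local_sz[2] = 1;
    for (i = 0; i < 3; i++)
      global_sz[i] = ((region[i] + local_sz[i] - 1) / local_sz[i]) * local_sz[i];
  }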
diff --git a/src/gen/cl_gen.h b/src/gen/cl_gen.h
index 710068a..c4294eb 100644
--- a/src/gen/cl_gen.h
+++ b/src/gen/cl_gen.h
@@ -172,4 +172,28 @@ extern char *cl_internal_built_in_kernel_str;
extern size_t cl_internal_built_in_kernel_str_size;
extern cl_device_id cl_get_device_id_gen(cl_platform_id platform);
+
+/*************************************** Mem *******************************************/
+extern cl_int cl_mem_copy_gen(cl_command_queue queue, cl_event event, cl_mem src_buf, cl_mem dst_buf,
+ size_t src_offset, size_t dst_offset, size_t cb);
+extern cl_int cl_mem_fill_gen(cl_command_queue queue, cl_event e, const void *pattern, size_t pattern_size,
+ cl_mem buffer, size_t offset, size_t size);
+extern cl_int cl_mem_copy_buffer_rect_gen(cl_command_queue queue, cl_event event, cl_mem src_buf,
+ cl_mem dst_buf, const size_t *src_origin, const size_t *dst_origin,
+ const size_t *region, size_t src_row_pitch, size_t src_slice_pitch,
+ size_t dst_row_pitch, size_t dst_slice_pitch);
+
+/*************************************** Image ******************************************/
+extern cl_int cl_image_fill_gen(cl_command_queue queue, cl_event e, const void *pattern,
+ cl_mem src_image, const size_t *origin, const size_t *region);
+extern cl_int cl_image_copy_gen(cl_command_queue queue, cl_event event, cl_mem src_image,
+ cl_mem dst_image, const size_t *src_origin,
+ const size_t *dst_origin, const size_t *region);
+extern cl_int cl_mem_copy_image_to_buffer_gen(cl_command_queue queue, cl_event event, cl_mem image,
+ cl_mem buffer, const size_t *src_origin,
+ const size_t dst_offset, const size_t *region);
+extern cl_int cl_mem_copy_buffer_to_image_gen(cl_command_queue queue, cl_event event, cl_mem buffer,
+ cl_mem image, const size_t src_offset,
+ const size_t *dst_origin, const size_t *region);
+
#endif /* End of __CL_GEN_H__ */
diff --git a/src/gen/cl_image_gen.c b/src/gen/cl_image_gen.c
new file mode 100644
index 0000000..39d3d23
--- /dev/null
+++ b/src/gen/cl_image_gen.c
@@ -0,0 +1,394 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "cl_gen.h"
+#include <math.h>
+
+#define LOCAL_SZ_0 16
+#define LOCAL_SZ_1 4
+#define LOCAL_SZ_2 4
+
+LOCAL cl_int
+cl_image_fill_gen(cl_command_queue queue, cl_event e, const void *pattern,
+ cl_mem mem, const size_t *origin, const size_t *region)
+{
+ cl_int ret = CL_SUCCESS;
+ cl_kernel ker = NULL;
+ size_t global_off[] = {0, 0, 0};
+ size_t global_sz[] = {1, 1, 1};
+ size_t local_sz[] = {LOCAL_SZ_0, LOCAL_SZ_1, LOCAL_SZ_2};
+ struct _cl_mem_image *src_image = cl_mem_image(mem);
+ uint32_t savedIntelFmt = src_image->intel_fmt;
+
+ if (region[1] == 1)
+ local_sz[1] = 1;
+ if (region[2] == 1)
+ local_sz[2] = 1;
+ global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+ global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
+ global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
+
+ if (src_image->image_type == CL_MEM_OBJECT_IMAGE1D) {
+ ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_FILL_IMAGE_1D);
+ } else if (src_image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
+ ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_FILL_IMAGE_1D_ARRAY);
+ } else if (src_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+ ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_FILL_IMAGE_2D);
+ } else if (src_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
+ ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_FILL_IMAGE_2D_ARRAY);
+ } else if (src_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+ ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_FILL_IMAGE_3D);
+ } else {
+ return CL_IMAGE_FORMAT_NOT_SUPPORTED;
+ }
+
+ if (!ker)
+ return CL_OUT_OF_RESOURCES;
+
+ cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_image);
+ if (src_image->fmt.image_channel_order >= CL_sRGBA) {
+#define RGB2sRGB(linear) ((linear) <= 0.0031308f ? 12.92f * (linear) : 1.055f * powf((linear), 1.0f / 2.4f) - 0.055f)
+ cl_image_format fmt;
+ float newpattern[4] = {0.0, 0.0, 0.0, ((float *)pattern)[3]};
+ int i;
+ for (i = 0; i < 3; i++) {
+ if (src_image->fmt.image_channel_order == CL_sRGBA) {
+ newpattern[i] = RGB2sRGB(((float *)pattern)[i]);
+ } else
+ newpattern[2 - i] = RGB2sRGB(((float *)pattern)[i]);
+ }
+ cl_kernel_set_arg(ker, 1, sizeof(float) * 4, newpattern);
+ fmt.image_channel_order = CL_RGBA;
+ fmt.image_channel_data_type = CL_UNORM_INT8;
+ src_image->intel_fmt = cl_image_get_intel_format(&fmt);
+#undef RGB2sRGB
+ } else
+ cl_kernel_set_arg(ker, 1, sizeof(float) * 4, pattern);
+ cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region[0]);
+ cl_kernel_set_arg(ker, 3, sizeof(cl_int), &region[1]);
+ cl_kernel_set_arg(ker, 4, sizeof(cl_int), &region[2]);
+ cl_kernel_set_arg(ker, 5, sizeof(cl_int), &origin[0]);
+ cl_kernel_set_arg(ker, 6, sizeof(cl_int), &origin[1]);
+ cl_kernel_set_arg(ker, 7, sizeof(cl_int), &origin[2]);
+
+ ret = cl_command_queue_ND_range(queue, ker, e, 3, global_off, global_sz, local_sz);
+ src_image->intel_fmt = savedIntelFmt;
+ return ret;
+}
+
+LOCAL cl_int
+cl_image_copy_gen(cl_command_queue queue, cl_event event, cl_mem src, cl_mem dst,
+ const size_t *src_origin, const size_t *dst_origin, const size_t *region)
+{
+ cl_int ret;
+ cl_kernel ker = NULL;
+ size_t global_off[] = {0, 0, 0};
+ size_t global_sz[] = {1, 1, 1};
+ size_t local_sz[] = {LOCAL_SZ_0, LOCAL_SZ_1, LOCAL_SZ_2};
+ uint32_t fixupDataType;
+ uint32_t savedIntelFmt;
+ struct _cl_mem_image *src_image = cl_mem_image(src);
+ struct _cl_mem_image *dst_image = cl_mem_image(dst);
+
+ if (region[1] == 1)
+ local_sz[1] = 1;
+ if (region[2] == 1)
+ local_sz[2] = 1;
+ global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+ global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
+ global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
+
+ switch (src_image->fmt.image_channel_data_type) {
+ case CL_SNORM_INT8:
+ case CL_UNORM_INT8:
+ fixupDataType = CL_UNSIGNED_INT8;
+ break;
+ case CL_HALF_FLOAT:
+ case CL_SNORM_INT16:
+ case CL_UNORM_INT16:
+ fixupDataType = CL_UNSIGNED_INT16;
+ break;
+ case CL_FLOAT:
+ fixupDataType = CL_UNSIGNED_INT32;
+ break;
+ default:
+ fixupDataType = 0;
+ }
+
+ if (fixupDataType) {
+ cl_image_format fmt;
+ if (src_image->fmt.image_channel_order != CL_BGRA &&
+ src_image->fmt.image_channel_order != CL_sBGRA &&
+ src_image->fmt.image_channel_order != CL_sRGBA)
+ fmt.image_channel_order = src_image->fmt.image_channel_order;
+ else
+ fmt.image_channel_order = CL_RGBA;
+ fmt.image_channel_data_type = fixupDataType;
+ savedIntelFmt = src_image->intel_fmt;
+ src_image->intel_fmt = cl_image_get_intel_format(&fmt);
+ dst_image->intel_fmt = src_image->intel_fmt;
+ }
+
+ /* We use one kernel to copy the data. The kernel is lazily created. */
+ assert(src_image->base.ctx == dst_image->base.ctx);
+
+ /* setup the kernel and run. */
+ if (src_image->image_type == CL_MEM_OBJECT_IMAGE1D) {
+ if (dst_image->image_type == CL_MEM_OBJECT_IMAGE1D) {
+ ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_COPY_IMAGE_1D_TO_1D);
+ }
+ } else if (src_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+ if (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+ ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_COPY_IMAGE_2D_TO_2D);
+ } else if (dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+ ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_COPY_IMAGE_2D_TO_3D);
+ } else if (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
+ ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_COPY_IMAGE_2D_TO_2D_ARRAY);
+ }
+ } else if (src_image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
+ if (dst_image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
+ ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device,
+ CL_ENQUEUE_COPY_IMAGE_1D_ARRAY_TO_1D_ARRAY);
+ }
+ } else if (src_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
+ if (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
+ ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device,
+ CL_ENQUEUE_COPY_IMAGE_2D_ARRAY_TO_2D_ARRAY);
+ } else if (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+ ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device,
+ CL_ENQUEUE_COPY_IMAGE_2D_ARRAY_TO_2D);
+ } else if (dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+ ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device,
+ CL_ENQUEUE_COPY_IMAGE_2D_ARRAY_TO_3D);
+ }
+ } else if (src_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+ if (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+ ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_COPY_IMAGE_3D_TO_2D);
+ } else if (dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+ ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_COPY_IMAGE_3D_TO_3D);
+ } else if (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
+ ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device,
+ CL_ENQUEUE_COPY_IMAGE_3D_TO_2D_ARRAY);
+ }
+ }
+
+ if (!ker) {
+ ret = CL_OUT_OF_RESOURCES;
+ goto fail;
+ }
+
+ cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_image);
+ cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &dst_image);
+ cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region[0]);
+ cl_kernel_set_arg(ker, 3, sizeof(cl_int), &region[1]);
+ cl_kernel_set_arg(ker, 4, sizeof(cl_int), &region[2]);
+ cl_kernel_set_arg(ker, 5, sizeof(cl_int), &src_origin[0]);
+ cl_kernel_set_arg(ker, 6, sizeof(cl_int), &src_origin[1]);
+ cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_origin[2]);
+ cl_kernel_set_arg(ker, 8, sizeof(cl_int), &dst_origin[0]);
+ cl_kernel_set_arg(ker, 9, sizeof(cl_int), &dst_origin[1]);
+ cl_kernel_set_arg(ker, 10, sizeof(cl_int), &dst_origin[2]);
+
+ ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_sz, local_sz);
+
+fail:
+ if (fixupDataType) {
+ src_image->intel_fmt = savedIntelFmt;
+ dst_image->intel_fmt = savedIntelFmt;
+ }
+ return ret;
+}
+
+LOCAL cl_int
+cl_mem_copy_image_to_buffer_gen(cl_command_queue queue, cl_event event, cl_mem the_image, cl_mem buffer,
+ const size_t *src_origin, const size_t dst_offset, const size_t *region)
+{
+ cl_int ret;
+ cl_kernel ker = NULL;
+ size_t global_off[] = {0, 0, 0};
+ size_t global_sz[] = {1, 1, 1};
+ size_t local_sz[] = {LOCAL_SZ_0, LOCAL_SZ_1, LOCAL_SZ_2};
+ uint32_t intel_fmt, bpp;
+ cl_image_format fmt;
+ size_t origin0, region0;
+ size_t kn_dst_offset;
+ int align16 = 0;
+ size_t align_size = 1;
+ size_t w_saved;
+ struct _cl_mem_image *image = cl_mem_image(the_image);
+
+ if (region[1] == 1)
+ local_sz[1] = 1;
+ if (region[2] == 1)
+ local_sz[2] = 1;
+ global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+ global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
+ global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
+
+ /* We use one kernel to copy the data. The kernel is lazily created. */
+ assert(image->base.ctx == buffer->ctx);
+
+ intel_fmt = image->intel_fmt;
+ bpp = image->bpp;
+ w_saved = image->w;
+ region0 = region[0] * bpp;
+ kn_dst_offset = dst_offset;
+ if ((image->image_type == CL_MEM_OBJECT_IMAGE2D) && ((image->w * image->bpp) % 16 == 0) &&
+ ((src_origin[0] * bpp) % 16 == 0) && (region0 % 16 == 0) && (dst_offset % 16 == 0)) {
+ fmt.image_channel_order = CL_RGBA;
+ fmt.image_channel_data_type = CL_UNSIGNED_INT32;
+ align16 = 1;
+ align_size = 16;
+ } else {
+ fmt.image_channel_order = CL_R;
+ fmt.image_channel_data_type = CL_UNSIGNED_INT8;
+ align_size = 1;
+ }
+ image->intel_fmt = cl_image_get_intel_format(&fmt);
+ image->w = (image->w * image->bpp) / align_size;
+ image->bpp = align_size;
+ region0 = (region[0] * bpp) / align_size;
+ origin0 = (src_origin[0] * bpp) / align_size;
+ kn_dst_offset /= align_size;
+ global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+
+ /* setup the kernel and run. */
+ if (image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+ if (align16) {
+ ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device,
+ CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER_ALIGN16);
+ } else {
+ ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device,
+ CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER);
+ }
+ } else if (image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+ ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device,
+ CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER);
+ }
+
+ if (!ker) {
+ ret = CL_OUT_OF_RESOURCES;
+ goto fail;
+ }
+
+ cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &image);
+ cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &buffer);
+ cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region0);
+ cl_kernel_set_arg(ker, 3, sizeof(cl_int), &region[1]);
+ cl_kernel_set_arg(ker, 4, sizeof(cl_int), &region[2]);
+ cl_kernel_set_arg(ker, 5, sizeof(cl_int), &origin0);
+ cl_kernel_set_arg(ker, 6, sizeof(cl_int), &src_origin[1]);
+ cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_origin[2]);
+ cl_kernel_set_arg(ker, 8, sizeof(cl_int), &kn_dst_offset);
+
+ ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_sz, local_sz);
+
+fail:
+
+ image->intel_fmt = intel_fmt;
+ image->bpp = bpp;
+ image->w = w_saved;
+
+ return ret;
+}
+
+LOCAL cl_int
+cl_mem_copy_buffer_to_image_gen(cl_command_queue queue, cl_event event, cl_mem buffer, cl_mem the_image,
+ const size_t src_offset, const size_t *dst_origin, const size_t *region)
+{
+ cl_int ret;
+ cl_kernel ker = NULL;
+ size_t global_off[] = {0, 0, 0};
+ size_t global_sz[] = {1, 1, 1};
+ size_t local_sz[] = {LOCAL_SZ_0, LOCAL_SZ_1, LOCAL_SZ_2};
+ uint32_t intel_fmt, bpp;
+ cl_image_format fmt;
+ size_t origin0, region0;
+ size_t kn_src_offset;
+ int align16 = 0;
+ size_t align_size = 1;
+ size_t w_saved = 0;
+ struct _cl_mem_image *image = cl_mem_image(the_image);
+
+ if (region[1] == 1)
+ local_sz[1] = 1;
+ if (region[2] == 1)
+ local_sz[2] = 1;
+ global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+ global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
+ global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
+
+ /* We use one kernel to copy the data. The kernel is lazily created. */
+ assert(image->base.ctx == buffer->ctx);
+
+ intel_fmt = image->intel_fmt;
+ bpp = image->bpp;
+ w_saved = image->w;
+ region0 = region[0] * bpp;
+ kn_src_offset = src_offset;
+ if ((image->image_type == CL_MEM_OBJECT_IMAGE2D) && ((image->w * image->bpp) % 16 == 0) &&
+ ((dst_origin[0] * bpp) % 16 == 0) && (region0 % 16 == 0) && (src_offset % 16 == 0)) {
+ fmt.image_channel_order = CL_RGBA;
+ fmt.image_channel_data_type = CL_UNSIGNED_INT32;
+ align16 = 1;
+ align_size = 16;
+ } else {
+ fmt.image_channel_order = CL_R;
+ fmt.image_channel_data_type = CL_UNSIGNED_INT8;
+ align_size = 1;
+ }
+ image->intel_fmt = cl_image_get_intel_format(&fmt);
+ image->w = (image->w * image->bpp) / align_size;
+ image->bpp = align_size;
+ region0 = (region[0] * bpp) / align_size;
+ origin0 = (dst_origin[0] * bpp) / align_size;
+ kn_src_offset /= align_size;
+ global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+
+ /* setup the kernel and run. */
+ if (image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+ if (align16) {
+ ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device,
+ CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D_ALIGN16);
+ } else {
+ ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D);
+ }
+ } else if (image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+ ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D);
+ }
+ if (!ker)
+ return CL_OUT_OF_RESOURCES;
+
+ cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &image);
+ cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &buffer);
+ cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region0);
+ cl_kernel_set_arg(ker, 3, sizeof(cl_int), &region[1]);
+ cl_kernel_set_arg(ker, 4, sizeof(cl_int), &region[2]);
+ cl_kernel_set_arg(ker, 5, sizeof(cl_int), &origin0);
+ cl_kernel_set_arg(ker, 6, sizeof(cl_int), &dst_origin[1]);
+ cl_kernel_set_arg(ker, 7, sizeof(cl_int), &dst_origin[2]);
+ cl_kernel_set_arg(ker, 8, sizeof(cl_int), &kn_src_offset);
+
+ ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_sz, local_sz);
+
+ image->intel_fmt = intel_fmt;
+ image->bpp = bpp;
+ image->w = w_saved;
+
+ return ret;
+}
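Note for reviewers: cl_mem_copy_gen in the next file handles byte-unaligned
copies dword by dword, computing a mask for the partial first and last
destination dwords. A small worked example of that mask computation,
illustrative only (the offsets are hypothetical, and the actual merging is
presumably done inside the built-in kernels):

  #include <stdio.h>

  int main(void)
  {
    /* Same mask table as cl_mem_copy_gen: masks[k] has the low k bytes set
     * (masks[0] is the whole-dword case). */
    const unsigned int masks[4] = {0xffffffff, 0x0ff, 0x0ffff, 0x0ffffff};
    unsigned int dst_offset = 3, cb = 10; /* hypothetical unaligned copy */

    int dw_num = ((dst_offset % 4 + cb) + 3) / 4;
    unsigned int first_mask = (dst_offset % 4 == 0) ? 0x0 : masks[dst_offset % 4];
    unsigned int last_mask = masks[(dst_offset + cb) % 4];
    if (cb < 4 && dw_num == 1) /* copy fits inside a single dword */
      first_mask |= ~last_mask;

    /* Prints: dwords=4 first_mask=0x00ffffff last_mask=0x000000ff */
    printf("dwords=%d first_mask=0x%08x last_mask=0x%08x\n",
           dw_num, first_mask, last_mask);
    return 0;
  }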
diff --git a/src/gen/cl_mem_gen.c b/src/gen/cl_mem_gen.c
new file mode 100644
index 0000000..0d8c35c
--- /dev/null
+++ b/src/gen/cl_mem_gen.c
@@ -0,0 +1,327 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "cl_gen.h"
+
+#define LOCAL_SZ_0 16
+#define LOCAL_SZ_1 4
+#define LOCAL_SZ_2 4
+
+LOCAL cl_int
+cl_mem_copy_gen(cl_command_queue queue, cl_event event, cl_mem src_buf, cl_mem dst_buf,
+ size_t src_offset, size_t dst_offset, size_t cb)
+{
+ cl_int ret = CL_SUCCESS;
+ cl_kernel ker = NULL;
+ size_t global_off[] = {0, 0, 0};
+ size_t global_sz[] = {1, 1, 1};
+ size_t local_sz[] = {1, 1, 1};
+ const unsigned int masks[4] = {0xffffffff, 0x0ff, 0x0ffff, 0x0ffffff};
+ int aligned = 0;
+ int dw_src_offset = src_offset / 4;
+ int dw_dst_offset = dst_offset / 4;
+
+ if (!cb)
+ return ret;
+
+ /* We use one kernel to copy the data. The kernel is lazily created. */
+ assert(src_buf->ctx == dst_buf->ctx);
+
+ /* Everything 16-byte aligned: the fast and easy case. */
+ if ((cb % 16 == 0) && (src_offset % 16 == 0) && (dst_offset % 16 == 0)) {
+ ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device,
+ CL_ENQUEUE_COPY_BUFFER_ALIGN16);
+ cb = cb / 16;
+ aligned = 1;
+ } else if ((cb % 4 == 0) && (src_offset % 4 == 0) && (dst_offset % 4 == 0)) { /* all Dword aligned.*/
+ ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device,
+ CL_ENQUEUE_COPY_BUFFER_ALIGN4);
+ cb = cb / 4;
+ aligned = 1;
+ }
+
+ if (aligned) {
+ if (!ker)
+ return CL_OUT_OF_RESOURCES;
+
+ if (cb < LOCAL_SZ_0) {
+ local_sz[0] = 1;
+ } else {
+ local_sz[0] = LOCAL_SZ_0;
+ }
+ global_sz[0] = ((cb + LOCAL_SZ_0 - 1) / LOCAL_SZ_0) * LOCAL_SZ_0;
+ cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
+ cl_kernel_set_arg(ker, 1, sizeof(int), &dw_src_offset);
+ cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
+ cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset);
+ cl_kernel_set_arg(ker, 4, sizeof(int), &cb);
+ ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_sz, local_sz);
+ return ret;
+ }
+
+ /* Now handle the unaligned cases. */
+ int dw_num = ((dst_offset % 4 + cb) + 3) / 4;
+ unsigned int first_mask = dst_offset % 4 == 0 ? 0x0 : masks[dst_offset % 4];
+ unsigned int last_mask = masks[(dst_offset + cb) % 4];
+ /* handle the very small range copy. */
+ if (cb < 4 && dw_num == 1) {
+ first_mask = first_mask | ~last_mask;
+ }
+
+ if (cb < LOCAL_SZ_0) {
+ local_sz[0] = 1;
+ } else {
+ local_sz[0] = LOCAL_SZ_0;
+ }
+ global_sz[0] = ((dw_num + LOCAL_SZ_0 - 1) / LOCAL_SZ_0) * LOCAL_SZ_0;
+
+ if (src_offset % 4 == dst_offset % 4) {
+ /* Src and dst have the same unaligned offset; just handle the
+ header and tail. */
+ ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device,
+ CL_ENQUEUE_COPY_BUFFER_UNALIGN_SAME_OFFSET);
+
+ if (!ker)
+ return CL_OUT_OF_RESOURCES;
+
+ cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
+ cl_kernel_set_arg(ker, 1, sizeof(int), &dw_src_offset);
+ cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
+ cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset);
+ cl_kernel_set_arg(ker, 4, sizeof(int), &dw_num);
+ cl_kernel_set_arg(ker, 5, sizeof(int), &first_mask);
+ cl_kernel_set_arg(ker, 6, sizeof(int), &last_mask);
+ ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_sz, local_sz);
+ return ret;
+ }
+
+ /* Dst's offset < Src's offset, so one dst dword needs two sequential src dwords to fill it. */
+ if (dst_offset % 4 < src_offset % 4) {
+ int align_diff = src_offset % 4 - dst_offset % 4;
+ unsigned int dw_mask = masks[align_diff];
+ int shift = align_diff * 8;
+
+ ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device,
+ CL_ENQUEUE_COPY_BUFFER_UNALIGN_DST_OFFSET);
+ if (!ker)
+ return CL_OUT_OF_RESOURCES;
+
+ cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
+ cl_kernel_set_arg(ker, 1, sizeof(int), &dw_src_offset);
+ cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
+ cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset);
+ cl_kernel_set_arg(ker, 4, sizeof(int), &dw_num);
+ cl_kernel_set_arg(ker, 5, sizeof(int), &first_mask);
+ cl_kernel_set_arg(ker, 6, sizeof(int), &last_mask);
+ cl_kernel_set_arg(ker, 7, sizeof(int), &shift);
+ cl_kernel_set_arg(ker, 8, sizeof(int), &dw_mask);
+ ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_sz, local_sz);
+ return ret;
+ }
+
+ /* Dst's offset > Src's offset, so one dst dword also needs two sequential src dwords to fill it. */
+ if (dst_offset % 4 > src_offset % 4) {
+ int align_diff = dst_offset % 4 - src_offset % 4;
+ unsigned int dw_mask = masks[4 - align_diff];
+ int shift = align_diff * 8;
+ int src_less = !(src_offset % 4) && !((src_offset + cb) % 4);
+
+ ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device,
+ CL_ENQUEUE_COPY_BUFFER_UNALIGN_SRC_OFFSET);
+ if (!ker)
+ return CL_OUT_OF_RESOURCES;
+
+ cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
+ cl_kernel_set_arg(ker, 1, sizeof(int), &dw_src_offset);
+ cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
+ cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset);
+ cl_kernel_set_arg(ker, 4, sizeof(int), &dw_num);
+ cl_kernel_set_arg(ker, 5, sizeof(int), &first_mask);
+ cl_kernel_set_arg(ker, 6, sizeof(int), &last_mask);
+ cl_kernel_set_arg(ker, 7, sizeof(int), &shift);
+ cl_kernel_set_arg(ker, 8, sizeof(int), &dw_mask);
+ cl_kernel_set_arg(ker, 9, sizeof(int), &src_less);
+ ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_sz, local_sz);
+ return ret;
+ }
+
+ /* We should never get here: all alignment cases are handled above. */
+ assert(0);
+
+ return ret;
+}
+
+LOCAL cl_int
+cl_mem_fill_gen(cl_command_queue queue, cl_event e, const void *pattern, size_t pattern_size,
+ cl_mem buffer, size_t offset, size_t size)
+{
+ cl_int ret = CL_SUCCESS;
+ cl_kernel ker = NULL;
+ size_t global_off[] = {0, 0, 0};
+ size_t global_sz[] = {1, 1, 1};
+ size_t local_sz[] = {1, 1, 1};
+ char pattern_comb[4];
+ int is_128 = 0;
+ const void *pattern1 = NULL;
+
+ assert(offset % pattern_size == 0);
+ assert(size % pattern_size == 0);
+
+ if (!size)
+ return ret;
+
+ if (pattern_size == 128) {
+ /* A pattern size of 128 corresponds to double16, but double does not
+ work well on some platforms, so we use two float16 values instead. */
+ ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device,
+ CL_ENQUEUE_FILL_BUFFER_ALIGN128);
+ is_128 = 1;
+ pattern_size = pattern_size / 2;
+ pattern1 = pattern + pattern_size;
+ size = size / 2;
+ } else if (pattern_size % 8 == 0) { /* Handle the 8 16 32 64 cases here. */
+ int order = ffs(pattern_size / 8) - 1;
+
+ ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device,
+ CL_ENQUEUE_FILL_BUFFER_ALIGN8_8 + order);
+ } else if (pattern_size == 4) {
+ ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device,
+ CL_ENQUEUE_FILL_BUFFER_ALIGN4);
+ } else if (size >= 4 && size % 4 == 0 && offset % 4 == 0) {
+ /* Unaligned pattern size. If the fill size and offset are still 4-byte aligned,
+ we can duplicate the pattern into a 4-byte one and use the aligned-4 kernel. */
+ assert(pattern_size == 1 || pattern_size == 2);
+
+ if (pattern_size == 2) {
+ memcpy(pattern_comb, pattern, sizeof(char) * 2);
+ memcpy(pattern_comb + 2, pattern, sizeof(char) * 2);
+ } else {
+ pattern_comb[0] = pattern_comb[1] = pattern_comb[2] = pattern_comb[3] = *(char *)pattern;
+ }
+
+ ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device,
+ CL_ENQUEUE_FILL_BUFFER_ALIGN4);
+ pattern_size = 4;
+ pattern = pattern_comb;
+ }
+ //TODO: Unaligned cases could be optimized like cl_mem_copy, using masks in the kernel
+ //functions. This depends on the usage, but for now we just use the 1- and 2-byte kernels.
+ else if (pattern_size == 2) {
+ ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device,
+ CL_ENQUEUE_FILL_BUFFER_ALIGN2);
+ } else if (pattern_size == 1) {
+ ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device,
+ CL_ENQUEUE_FILL_BUFFER_UNALIGN);
+ } else
+ assert(0);
+
+ if (!ker)
+ return CL_OUT_OF_RESOURCES;
+
+ size = size / pattern_size;
+ offset = offset / pattern_size;
+
+ if (size < LOCAL_SZ_0) {
+ local_sz[0] = 1;
+ } else {
+ local_sz[0] = LOCAL_SZ_0;
+ }
+ global_sz[0] = ((size + LOCAL_SZ_0 - 1) / LOCAL_SZ_0) * LOCAL_SZ_0;
+ cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &buffer);
+ cl_kernel_set_arg(ker, 1, pattern_size, pattern);
+ cl_kernel_set_arg(ker, 2, sizeof(cl_uint), &offset);
+ cl_kernel_set_arg(ker, 3, sizeof(cl_uint), &size);
+ if (is_128)
+ cl_kernel_set_arg(ker, 4, pattern_size, pattern1);
+
+ ret = cl_command_queue_ND_range(queue, ker, e, 1, global_off, global_sz, local_sz);
+ return ret;
+}
+
+LOCAL cl_int
+cl_mem_copy_buffer_rect_gen(cl_command_queue queue, cl_event event, cl_mem src_buf, cl_mem dst_buf,
+ const size_t *src_origin, const size_t *dst_origin, const size_t *region,
+ size_t src_row_pitch, size_t src_slice_pitch,
+ size_t dst_row_pitch, size_t dst_slice_pitch)
+{
+ cl_int ret;
+ cl_kernel ker;
+ size_t global_off[] = {0, 0, 0};
+ size_t global_sz[] = {1, 1, 1};
+ size_t local_sz[] = {LOCAL_SZ_0, LOCAL_SZ_1, LOCAL_SZ_1};
+ // If the src and dst rects are contiguous, the copy degenerates to a plain buffer copy.
+ if ((region[0] == dst_row_pitch) && (region[0] == src_row_pitch) &&
+ (region[1] * src_row_pitch == src_slice_pitch) &&
+ (region[1] * dst_row_pitch == dst_slice_pitch)) {
+ cl_int src_offset = src_origin[2] * src_slice_pitch +
+ src_origin[1] * src_row_pitch + src_origin[0];
+ cl_int dst_offset = dst_origin[2] * dst_slice_pitch +
+ dst_origin[1] * dst_row_pitch + dst_origin[0];
+ cl_int size = region[0] * region[1] * region[2];
+ ret = cl_mem_copy_gen(queue, NULL, src_buf, dst_buf, src_offset, dst_offset, size);
+ return ret;
+ }
+
+ if (region[1] == 1)
+ local_sz[1] = 1;
+ if (region[2] == 1)
+ local_sz[2] = 1;
+ global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+ global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
+ global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
+ cl_int src_offset = src_origin[2] * src_slice_pitch + src_origin[1] * src_row_pitch + src_origin[0];
+ cl_int dst_offset = dst_origin[2] * dst_slice_pitch + dst_origin[1] * dst_row_pitch + dst_origin[0];
+
+ /* We use one kernel to copy the data. The kernel is lazily created. */
+ assert(src_buf->ctx == dst_buf->ctx);
+
+ /* setup the kernel and run. */
+ size_t region0 = region[0];
+ if ((src_offset % 4 == 0) && (dst_offset % 4 == 0) &&
+ (src_row_pitch % 4 == 0) && (dst_row_pitch % 4 == 0) &&
+ (src_slice_pitch % 4 == 0) && (dst_slice_pitch % 4 == 0) && (region0 % 4 == 0)) {
+ region0 /= 4;
+ src_offset /= 4;
+ dst_offset /= 4;
+ src_row_pitch /= 4;
+ dst_row_pitch /= 4;
+ src_slice_pitch /= 4;
+ dst_slice_pitch /= 4;
+ ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_COPY_BUFFER_RECT_ALIGN4);
+ } else {
+ ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_COPY_BUFFER_RECT);
+ }
+
+ if (!ker)
+ return CL_OUT_OF_RESOURCES;
+
+ cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
+ cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &dst_buf);
+ cl_kernel_set_arg(ker, 2, sizeof(cl_int), ®ion0);
+ cl_kernel_set_arg(ker, 3, sizeof(cl_int), ®ion[1]);
+ cl_kernel_set_arg(ker, 4, sizeof(cl_int), ®ion[2]);
+ cl_kernel_set_arg(ker, 5, sizeof(cl_int), &src_offset);
+ cl_kernel_set_arg(ker, 6, sizeof(cl_int), &dst_offset);
+ cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_row_pitch);
+ cl_kernel_set_arg(ker, 8, sizeof(cl_int), &src_slice_pitch);
+ cl_kernel_set_arg(ker, 9, sizeof(cl_int), &dst_row_pitch);
+ cl_kernel_set_arg(ker, 10, sizeof(cl_int), &dst_slice_pitch);
+
+ ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_sz, local_sz);
+ return ret;
+}
--
2.7.4