[Beignet] [PATCH 3/4] Implement the clEnqueueCopyBuffer API using internal binary kernel
Yang, Rong R
rong.r.yang at intel.com
Tue Oct 8 22:47:50 PDT 2013
In function cl_mem_copy, the local size is only 1; setting it to 16 or larger might improve performance.
-----Original Message-----
From: beignet-bounces+rong.r.yang=intel.com at lists.freedesktop.org [mailto:beignet-bounces+rong.r.yang=intel.com at lists.freedesktop.org] On Behalf Of junyan.he at inbox.com
Sent: Monday, September 23, 2013 5:02 PM
To: beignet at lists.freedesktop.org
Cc: Junyan He
Subject: [Beignet] [PATCH 3/4] Implement the clEnqueueCopyBuffer API using internal binary kernel
From: Junyan He <junyan.he at linux.intel.com>
Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
src/cl_api.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++-
src/cl_context.c | 26 ++++++++++++++++++++++++++
src/cl_context.h | 28 +++++++++++++++++-----------
src/cl_mem.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
src/cl_mem.h | 4 ++++
5 files changed, 150 insertions(+), 12 deletions(-)
diff --git a/src/cl_api.c b/src/cl_api.c index c81f730..e9303d8 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -1521,8 +1521,57 @@ clEnqueueCopyBuffer(cl_command_queue command_queue,
const cl_event * event_wait_list,
cl_event * event)
{
- NOT_IMPLEMENTED;
+  /* Enqueue a copy of cb bytes from src_buffer (starting at src_offset) to
+   * dst_buffer (starting at dst_offset) on command_queue.
+   * Returns CL_SUCCESS, or the first error encountered; every failure path
+   * leaves through the error label with err set. */
+  cl_int err = CL_SUCCESS;
+  enqueue_data *data, no_wait_data = { 0 };
+
+  CHECK_QUEUE(command_queue);
+  CHECK_MEM(src_buffer);
+  CHECK_MEM(dst_buffer);
+
+  /* Both buffers must live in the same context as the queue. */
+  if (command_queue->ctx != src_buffer->ctx ||
+      command_queue->ctx != dst_buffer->ctx) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
+  /* Bounds checks, written so that offset + cb cannot wrap around.
+   * The offsets are unsigned, so a "< 0" test would never fire. */
+  if (cb > src_buffer->size || src_offset > src_buffer->size - cb) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+  /* The destination range is validated against the destination buffer. */
+  if (cb > dst_buffer->size || dst_offset > dst_buffer->size - cb) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  /* Check overlap: within one buffer, [src_offset, src_offset + cb) and
+   * [dst_offset, dst_offset + cb) intersect iff each range starts before
+   * the other one ends. The sums cannot overflow after the checks above. */
+  if (src_buffer == dst_buffer
+      && src_offset < dst_offset + cb
+      && dst_offset < src_offset + cb) {
+    err = CL_MEM_COPY_OVERLAP;
+    goto error;
+  }
+
+  /* Validate the wait list before any work is queued.
+   * NOTE(review): relies on TRY storing the status in err and jumping to
+   * the error label on failure -- confirm against the macro definition. */
+  TRY(cl_event_check_waitlist, num_events_in_wait_list,
+      event_wait_list, event, src_buffer->ctx);
+
+  // TODO: Need to check the sub buffer cases.
+  err = cl_mem_copy(command_queue, src_buffer, dst_buffer, src_offset,
+                    dst_offset, cb);
+  if (err != CL_SUCCESS) {
+    goto error;
+  }
+
+  data = &no_wait_data;
+  data->type = EnqueueCopyBuffer;
+  data->queue = command_queue;
+
+  if (handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                    event, data, CL_COMMAND_COPY_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+    err = cl_command_queue_flush(command_queue);
+  }
- return 0;
+
+  /* Success falls through with err == CL_SUCCESS (or the flush status). */
+error:
+  return err;
}
cl_int
diff --git a/src/cl_context.c b/src/cl_context.c index 4f1c611..b62e946 100644
--- a/src/cl_context.c
+++ b/src/cl_context.c
@@ -268,3 +268,29 @@ cl_context_get_static_kernel(cl_context ctx, cl_int index, const char * str_kern
return ctx->internel_kernels[index];
}
+
+/* Return the internal kernel cached in slot `index` of `ctx`. On first use
+ * the slot's program is created from the binary `str_kernel` (`size` bytes),
+ * built with `str_option`, and kernel 0 of that program is duplicated into
+ * the slot. Returns NULL if the program cannot be created or built.
+ * NOTE(review): after a failed build the program stays in internal_prgs, so
+ * later calls skip the build and return whatever internel_kernels[index]
+ * currently holds. */
+cl_kernel
+cl_context_get_static_kernel_form_bin(cl_context ctx, cl_int index,
+                  const char * str_kernel, size_t size, const char * str_option)
+{
+  cl_int status;
+  cl_int bin_status = CL_SUCCESS;
+
+  /* Already set up for this slot: hand back the cached kernel. */
+  if (ctx->internal_prgs[index])
+    return ctx->internel_kernels[index];
+
+  ctx->internal_prgs[index] = cl_program_create_from_binary(ctx, 1, &ctx->device,
+      &size, (const unsigned char **)&str_kernel, &bin_status, &status);
+  if (!ctx->internal_prgs[index])
+    return NULL;
+
+  status = cl_program_build(ctx->internal_prgs[index], str_option);
+  if (status != CL_SUCCESS)
+    return NULL;
+
+  ctx->internal_prgs[index]->is_built = 1;
+  ctx->internel_kernels[index] = cl_kernel_dup(ctx->internal_prgs[index]->ker[0]);
+
+  return ctx->internel_kernels[index];
+}
diff --git a/src/cl_context.h b/src/cl_context.h index 7016733..29bcb9f 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -40,17 +40,19 @@ enum _cl_gl_context_type { };
enum _cl_internal_ker_type {
- CL_ENQUEUE_COPY_BUFFER = 0,
- CL_ENQUEUE_COPY_BUFFER_RECT = 1,
- CL_ENQUEUE_COPY_IMAGE_0 = 2, //copy image 2d to image 2d
- CL_ENQUEUE_COPY_IMAGE_1 = 3, //copy image 3d to image 2d
- CL_ENQUEUE_COPY_IMAGE_2 = 4, //copy image 2d to image 3d
- CL_ENQUEUE_COPY_IMAGE_3 = 5, //copy image 3d to image 3d
- CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_0 = 6, //copy image 2d to buffer
- CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_1 = 7, //copy image 3d tobuffer
- CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_0 = 8, //copy buffer to image 2d
- CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_1 = 9, //copy buffer to image 3d
- CL_INTERNAL_KERNEL_MAX = 10
+ CL_ENQUEUE_COPY_BUFFER_ALIGN1 = 0, // copy buffer: picked by cl_mem_copy when the size or an offset is not a multiple of 4
+ CL_ENQUEUE_COPY_BUFFER_ALIGN4, // copy buffer: size and offsets are multiples of 4, but not all multiples of 16
+ CL_ENQUEUE_COPY_BUFFER_ALIGN16, // copy buffer: size and both offsets are multiples of 16
+ CL_ENQUEUE_COPY_BUFFER_RECT,
+ CL_ENQUEUE_COPY_IMAGE_0, // copy image 2d to image 2d
+ CL_ENQUEUE_COPY_IMAGE_1, // copy image 3d to image 2d
+ CL_ENQUEUE_COPY_IMAGE_2, // copy image 2d to image 3d
+ CL_ENQUEUE_COPY_IMAGE_3, // copy image 3d to image 3d
+ CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_0, // copy image 2d to buffer
+ CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_1, // copy image 3d to buffer
+ CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_0, // copy buffer to image 2d
+ CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_1, // copy buffer to image 3d
+ CL_INTERNAL_KERNEL_MAX // one past the last kernel index; values are now numbered implicitly
};
struct _cl_context_prop {
@@ -137,5 +139,9 @@ extern cl_buffer_mgr cl_context_get_bufmgr(cl_context ctx);
/* Get the internal used kernel */
extern cl_kernel cl_context_get_static_kernel(cl_context ctx, cl_int index, const char *str_kernel, const char * str_option);
+/* Get the internally used kernel for slot `index`, created on first use from the program binary `str_kernel` of `size` bytes */ extern cl_kernel
+cl_context_get_static_kernel_form_bin(cl_context ctx, cl_int index,
+ const char * str_kernel, size_t size, const char *
+str_option);
+
#endif /* __CL_CONTEXT_H__ */
diff --git a/src/cl_mem.c b/src/cl_mem.c index 769e1cb..56bc6b1 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -596,6 +596,59 @@ cl_mem_add_ref(cl_mem mem)
atomic_inc(&mem->ref_n);
}
+/* Copy cb bytes from src_buf (starting at src_offset) to dst_buf (starting
+ * at dst_offset) by launching one of three internal copy kernels, selected
+ * by the common alignment of cb and both offsets.
+ * Returns CL_OUT_OF_RESOURCES when the kernel cannot be obtained, otherwise
+ * the status of cl_command_queue_ND_range. */
+LOCAL cl_int
+cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
+            size_t src_offset, size_t dst_offset, size_t cb)
+{
+  cl_kernel ker;
+  size_t global_off[] = {0,0,0};
+  size_t global_sz[] = {1,1,1};
+  /* One work-item per work-group. */
+  size_t local_sz[] = {1,1,1};
+  /* Kernel arguments 1 and 3 are set with sizeof(int). Hold them in real
+   * ints: pointing at a size_t and passing sizeof(int) would hand over only
+   * its lowest-addressed bytes, which is the wrong half on big-endian. */
+  int src_arg;
+  int dst_arg;
+
+  /* We use one kernel to copy the data. The kernel is lazily created. */
+  assert(src_buf->ctx == dst_buf->ctx);
+
+  if ((cb % 4) || (src_offset % 4) || (dst_offset % 4)) {
+    /* Something is not 4-byte aligned: one work-item per byte. */
+    extern char cl_internal_copy_buf_align1_str[];
+    extern int cl_internal_copy_buf_align1_str_size;
+
+    ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_ALIGN1,
+        cl_internal_copy_buf_align1_str, (size_t)cl_internal_copy_buf_align1_str_size, NULL);
+    global_sz[0] = cb;
+    src_arg = (int)src_offset;
+    dst_arg = (int)dst_offset;
+  } else if ((cb % 16) || (src_offset % 16) || (dst_offset % 16)) {
+    /* 4-byte aligned: one work-item per 4 bytes, offsets in 4-byte units. */
+    extern char cl_internal_copy_buf_align4_str[];
+    extern int cl_internal_copy_buf_align4_str_size;
+
+    ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_ALIGN4,
+        cl_internal_copy_buf_align4_str, (size_t)cl_internal_copy_buf_align4_str_size, NULL);
+    global_sz[0] = cb/4;
+    src_arg = (int)(src_offset/4);
+    dst_arg = (int)(dst_offset/4);
+  } else {
+    /* 16-byte aligned: one work-item per 16 bytes.
+     * NOTE(review): the offsets are still scaled by 4, not 16 -- presumably
+     * the kernel indexes 4-byte elements; confirm against the kernel source. */
+    extern char cl_internal_copy_buf_align16_str[];
+    extern int cl_internal_copy_buf_align16_str_size;
+
+    ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_ALIGN16,
+        cl_internal_copy_buf_align16_str, (size_t)cl_internal_copy_buf_align16_str_size, NULL);
+    global_sz[0] = cb/16;
+    src_arg = (int)(src_offset/4);
+    dst_arg = (int)(dst_offset/4);
+  }
+
+  if (!ker)
+    return CL_OUT_OF_RESOURCES;
+
+  cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
+  cl_kernel_set_arg(ker, 1, sizeof(int), &src_arg);
+  cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
+  cl_kernel_set_arg(ker, 3, sizeof(int), &dst_arg);
+
+  return cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz,
+                                   local_sz);
+}
+
#define LOCAL_SZ_0 16
#define LOCAL_SZ_1 4
#define LOCAL_SZ_2 4
diff --git a/src/cl_mem.h b/src/cl_mem.h index ac09c3b..3313224 100644
--- a/src/cl_mem.h
+++ b/src/cl_mem.h
@@ -187,6 +187,10 @@ extern void cl_mem_gl_delete(struct _cl_mem_gl_image *);
/* Add one more reference to this object */ extern void cl_mem_add_ref(cl_mem);
+/* Helper for the clEnqueueCopyBuffer API: copies cb bytes from src_buf at src_offset to dst_buf at dst_offset */ extern cl_int
+cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
+ size_t src_offset, size_t dst_offset, size_t cb);
+
/* api clEnqueueCopyBufferRect help function */ extern cl_int cl_mem_copy_buffer_rect(cl_command_queue, cl_mem, cl_mem,
const size_t *, const size_t *, const size_t *,
--
1.7.9.5
_______________________________________________
Beignet mailing list
Beignet at lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet
More information about the Beignet
mailing list