[Beignet] [PATCH 3/3] Add clEnqueueCopyBufferRect api.
Zhigang Gong
zhigang.gong at linux.intel.com
Sun Sep 8 22:16:31 PDT 2013
Two comments as below, please check it out.
On Wed, Sep 04, 2013 at 04:58:09PM +0800, Yang Rong wrote:
> Using enqueue ND range to copy two buffers. Now compile the kernel string, after
> load binary ready, should using static binary.
>
> Signed-off-by: Yang Rong <rong.r.yang at intel.com>
> ---
> src/cl_api.c | 134 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> src/cl_context.h | 14 ++++++
> src/cl_enqueue.c | 1 +
> src/cl_mem.c | 86 +++++++++++++++++++++++++++++++++++
> src/cl_mem.h | 4 ++
> 5 files changed, 237 insertions(+), 2 deletions(-)
>
> diff --git a/src/cl_api.c b/src/cl_api.c
> index 3630b48..fda5c11 100644
> --- a/src/cl_api.c
> +++ b/src/cl_api.c
> @@ -79,6 +79,65 @@ handle_events(cl_command_queue queue, cl_int num, const cl_event *wait_list,
> return status;
> }
>
Maybe you can put a comment here to specify that the following code is
from the OpenCL spec.
> +inline cl_bool check_copy_overlap(const size_t src_offset[3],
> + const size_t dst_offset[3],
> + const size_t region[3],
> + size_t row_pitch, size_t slice_pitch)
> +{
> + const size_t src_min[] = {src_offset[0], src_offset[1], src_offset[2]};
> + const size_t src_max[] = {src_offset[0] + region[0],
> + src_offset[1] + region[1],
> + src_offset[2] + region[2]};
> + const size_t dst_min[] = {dst_offset[0], dst_offset[1], dst_offset[2]};
> + const size_t dst_max[] = {dst_offset[0] + region[0],
> + dst_offset[1] + region[1],
> + dst_offset[2] + region[2]};
> + // Check for overlap
> + cl_bool overlap = CL_TRUE;
> + unsigned i;
> + size_t dst_start = dst_offset[2] * slice_pitch +
> + dst_offset[1] * row_pitch + dst_offset[0];
> + size_t dst_end = dst_start + (region[2] * slice_pitch +
> + region[1] * row_pitch + region[0]);
> + size_t src_start = src_offset[2] * slice_pitch +
> + src_offset[1] * row_pitch + src_offset[0];
> + size_t src_end = src_start + (region[2] * slice_pitch +
> + region[1] * row_pitch + region[0]);
> +
> + for (i=0; i != 3; ++i) {
> + overlap = overlap && (src_min[i] < dst_max[i])
> + && (src_max[i] > dst_min[i]);
> + }
> +
> + if (!overlap) {
> + size_t delta_src_x = (src_offset[0] + region[0] > row_pitch) ?
> + src_offset[0] + region[0] - row_pitch : 0;
> + size_t delta_dst_x = (dst_offset[0] + region[0] > row_pitch) ?
> + dst_offset[0] + region[0] - row_pitch : 0;
> + if ( (delta_src_x > 0 && delta_src_x > dst_offset[0]) ||
> + (delta_dst_x > 0 && delta_dst_x > src_offset[0]) ) {
> + if ( (src_start <= dst_start && dst_start < src_end) ||
> + (dst_start <= src_start && src_start < dst_end) )
> + overlap = CL_TRUE;
> + }
> + if (region[2] > 1) {
> + size_t src_height = slice_pitch / row_pitch;
> + size_t dst_height = slice_pitch / row_pitch;
> + size_t delta_src_y = (src_offset[1] + region[1] > src_height) ?
> + src_offset[1] + region[1] - src_height : 0;
> + size_t delta_dst_y = (dst_offset[1] + region[1] > dst_height) ?
> + dst_offset[1] + region[1] - dst_height : 0;
> + if ( (delta_src_y > 0 && delta_src_y > dst_offset[1]) ||
> + (delta_dst_y > 0 && delta_dst_y > src_offset[1]) ) {
> + if ( (src_start <= dst_start && dst_start < src_end) ||
> + (dst_start <= src_start && src_start < dst_end) )
> + overlap = CL_TRUE;
> + }
> + }
> + }
> + return overlap;
> +}
> +
> static cl_int
> cl_check_device_type(cl_device_type device_type)
> {
> @@ -1483,8 +1542,79 @@ clEnqueueCopyBufferRect(cl_command_queue command_queue,
> const cl_event * event_wait_list,
> cl_event * event)
> {
> - NOT_IMPLEMENTED;
> - return 0;
> + cl_int err = CL_SUCCESS;
> + enqueue_data *data, no_wait_data = { 0 };
> +
> + CHECK_QUEUE(command_queue);
> + CHECK_MEM(src_buffer);
> + CHECK_MEM(dst_buffer);
> +
> + if ((command_queue->ctx != src_buffer->ctx) ||
> + (command_queue->ctx != dst_buffer->ctx)) {
> + err = CL_INVALID_CONTEXT;
> + goto error;
> + }
> +
> + if (!region || region[0] == 0 || region[1] == 0 || region[2] == 0) {
> + err = CL_INVALID_VALUE;
> + goto error;
> + }
> +
> + if(src_row_pitch == 0)
> + src_row_pitch = region[0];
> + if(src_slice_pitch == 0)
> + src_slice_pitch = region[1] * src_row_pitch;
> +
> + if(dst_row_pitch == 0)
> + dst_row_pitch = region[0];
> + if(dst_slice_pitch == 0)
> + dst_slice_pitch = region[1] * dst_row_pitch;
> +
> + if (src_row_pitch < region[0] ||
> + dst_row_pitch < region[0]) {
> + err = CL_INVALID_VALUE;
> + goto error;
> + }
> +
> + if ((src_slice_pitch < region[1] * src_row_pitch || src_slice_pitch % src_row_pitch != 0 ) ||
> + (dst_slice_pitch < region[1] * dst_row_pitch || dst_slice_pitch % dst_row_pitch != 0 )) {
> + err = CL_INVALID_VALUE;
> + goto error;
> + }
> +
> + if ((src_origin[2]+region[2])*src_slice_pitch + (src_origin[1]+region[1])*src_row_pitch + src_origin[0] + region[0] > src_buffer->size ||
> + (dst_origin[2]+region[2])*dst_slice_pitch + (dst_origin[1]+region[1])*dst_row_pitch + dst_origin[0] + region[0] > dst_buffer->size) {
> + err = CL_INVALID_VALUE;
> + goto error;
> + }
> +
> + if (src_buffer == dst_buffer && (src_row_pitch != dst_row_pitch || src_slice_pitch != dst_slice_pitch)) {
> + err = CL_INVALID_VALUE;
> + goto error;
> + }
> +
> + if (src_buffer == dst_buffer &&
> + check_copy_overlap(src_origin, dst_origin, region, src_row_pitch, src_slice_pitch)) {
> + err = CL_MEM_COPY_OVERLAP;
> + goto error;
> + }
> +
> + cl_mem_copy_buffer_rect(command_queue, src_buffer, dst_buffer, src_origin, dst_origin, region,
> + src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch);
> +
> + TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_buffer->ctx);
> +
> + data = &no_wait_data;
> + data->type = EnqueueCopyBufferRect;
> + data->queue = command_queue;
> +
> + if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
> + event, data, CL_COMMAND_COPY_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) {
> + err = cl_command_queue_flush(command_queue);
> + }
> +
> +error:
> + return err;
> }
>
> cl_int
> diff --git a/src/cl_context.h b/src/cl_context.h
> index b1ef479..8b63104 100644
> --- a/src/cl_context.h
> +++ b/src/cl_context.h
> @@ -39,6 +39,18 @@ enum _cl_gl_context_type {
> CL_GL_CGL_SHAREGROUP
> };
>
> +enum _cl_internal_ker_type {
> + CL_ENQUEUE_COPY_BUFFER = 0,
> + CL_ENQUEUE_COPY_BUFFER_RECT = 1,
> + CL_ENQUEUE_COPY_IMAGE_0 = 2, //copy image 2d to image 2d
> + CL_ENQUEUE_COPY_IMAGE_1 = 3, //copy image 2d to image 2d
> + CL_ENQUEUE_COPY_IMAGE_2 = 4, //copy image 2d to image 2d
> + CL_ENQUEUE_COPY_IMAGE_3 = 5, //copy image 2d to image 2d
> + CL_ENQUEUE_COPY_IMAGE_TO_BUFFER = 6,
> + CL_ENQUEUE_COPY_BUFFER_TO_IMAGE = 7,
> + CL_INVALID = 8
The above enum name CL_INVALID is really not a good name.
I prefer to use CL_INTERNAL_KERNEL_MAX or CL_ENQUEUE_KERNEL_TYPE_MAX.
-- zhigang.
> +};
> +
> struct _cl_context_prop {
> cl_context_properties platform_id;
> enum _cl_gl_context_type gl_type;
> @@ -68,6 +80,8 @@ struct _cl_context {
> pthread_mutex_t buffer_lock; /* To allocate and deallocate buffers */
> pthread_mutex_t sampler_lock; /* To allocate and deallocate samplers */
> pthread_mutex_t event_lock; /* To allocate and deallocate events */
> + cl_program internal_prgs[CL_INVALID]; /* All programs internal used, for example clEnqueuexxx api use */
> + cl_kernel internel_kernels[CL_INVALID]; /* All kernels for clenqueuexxx api, for example clEnqueuexxx api use */
> uint32_t ver; /* Gen version */
> struct _cl_context_prop props;
> cl_context_properties * prop_user; /* a copy of user passed context properties when create context */
> diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c
> index 156ea8c..3446ac3 100644
> --- a/src/cl_enqueue.c
> +++ b/src/cl_enqueue.c
> @@ -372,6 +372,7 @@ cl_int cl_enqueue_handle(enqueue_data* data)
> return cl_enqueue_map_image(data);
> case EnqueueUnmapMemObject:
> return cl_enqueue_unmap_mem_object(data);
> + case EnqueueCopyBufferRect:
> case EnqueueNDRangeKernel:
> cl_gpgpu_event_resume((cl_gpgpu_event)data->ptr); //goto default
> default:
> diff --git a/src/cl_mem.c b/src/cl_mem.c
> index 8df2f89..fb6dc90 100644
> --- a/src/cl_mem.c
> +++ b/src/cl_mem.c
> @@ -25,6 +25,9 @@
> #include "cl_device_id.h"
> #include "cl_driver.h"
> #include "cl_khr_icd.h"
> +#include "cl_program.h"
> +#include "cl_kernel.h"
> +#include "cl_command_queue.h"
>
> #include "CL/cl.h"
> #include "CL/cl_intel.h"
> @@ -537,6 +540,89 @@ cl_mem_add_ref(cl_mem mem)
> atomic_inc(&mem->ref_n);
> }
>
> +#define LOCAL_SZ_0 16
> +#define LOCAL_SZ_1 4
> +#define LOCAL_SZ_2 4
> +
> +LOCAL cl_int
> +cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
> + const size_t *src_origin, const size_t *dst_origin, const size_t *region,
> + size_t src_row_pitch, size_t src_slice_pitch,
> + size_t dst_row_pitch, size_t dst_slice_pitch) {
> + cl_int ret;
> + cl_kernel ker;
> + size_t global_off[] = {0,0,0};
> + size_t global_sz[] = {1,1,1};
> + size_t local_sz[] = {LOCAL_SZ_2,LOCAL_SZ_1,LOCAL_SZ_0};
> + if(region[1] == 1) local_sz[1] = 1;
> + if(region[2] == 1) local_sz[2] = 1;
> + global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
> + global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
> + global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
> + cl_int index = CL_ENQUEUE_COPY_BUFFER_RECT;
> + cl_int src_offset = src_origin[2]*src_slice_pitch + src_origin[1]*src_row_pitch + src_origin[0];
> + cl_int dst_offset = dst_origin[2]*dst_slice_pitch + dst_origin[1]*dst_row_pitch + dst_origin[0];
> +
> + static const char *kernel_str =
> + "kernel void __cl_cpy_buffer_rect ( \n"
> + " global char* src, global char* dst, \n"
> + " unsigned int region0, unsigned int region1, unsigned int region2, \n"
> + " unsigned int src_offset, unsigned int dst_offset, \n"
> + " unsigned int src_row_pitch, unsigned int src_slice_pitch, \n"
> + " unsigned int dst_row_pitch, unsigned int dst_slice_pitch) { \n"
> + " int i = get_global_id(0); \n"
> + " int j = get_global_id(1); \n"
> + " int k = get_global_id(2); \n"
> + " if((i >= region0) || (j>= region1) || (k>=region2)) \n"
> + " return; \n"
> + " src_offset += k * src_slice_pitch + j * src_row_pitch + i; \n"
> + " dst_offset += k * dst_slice_pitch + j * dst_row_pitch + i; \n"
> + " dst[dst_offset] = src[src_offset]; \n"
> + "}";
> +
> +
> + /* We use one kernel to copy the data. The kernel is lazily created. */
> + assert(src_buf->ctx == dst_buf->ctx);
> + if (!src_buf->ctx->internal_prgs[index])
> + {
> + size_t length = strlen(kernel_str) + 1;
> + src_buf->ctx->internal_prgs[index] = cl_program_create_from_source(src_buf->ctx, 1, &kernel_str, &length, NULL);
> +
> + if (!src_buf->ctx->internal_prgs[index])
> + return CL_OUT_OF_RESOURCES;
> +
> + ret = cl_program_build(src_buf->ctx->internal_prgs[index], NULL);
> + if (ret != CL_SUCCESS)
> + return CL_OUT_OF_RESOURCES;
> +
> + src_buf->ctx->internal_prgs[index]->is_built = 1;
> +
> + src_buf->ctx->internel_kernels[index] = cl_kernel_dup(src_buf->ctx->internal_prgs[index]->ker[0]);
> + }
> +
> + /* setup the kernel and run. */
> + ker = src_buf->ctx->internel_kernels[index];
> + if (!ker)
> + return CL_OUT_OF_RESOURCES;
> +
> + cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
> + cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &dst_buf);
> + cl_kernel_set_arg(ker, 2, sizeof(cl_int), ®ion[0]);
> + cl_kernel_set_arg(ker, 3, sizeof(cl_int), ®ion[1]);
> + cl_kernel_set_arg(ker, 4, sizeof(cl_int), ®ion[2]);
> + cl_kernel_set_arg(ker, 5, sizeof(cl_int), &src_offset);
> + cl_kernel_set_arg(ker, 6, sizeof(cl_int), &dst_offset);
> + cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_row_pitch);
> + cl_kernel_set_arg(ker, 8, sizeof(cl_int), &src_slice_pitch);
> + cl_kernel_set_arg(ker, 9, sizeof(cl_int), &dst_row_pitch);
> + cl_kernel_set_arg(ker, 10, sizeof(cl_int), &dst_slice_pitch);
> +
> + ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
> + cl_command_queue_finish(queue);
> +
> + return ret;
> +}
> +
> LOCAL void*
> cl_mem_map(cl_mem mem)
> {
> diff --git a/src/cl_mem.h b/src/cl_mem.h
> index c0d5503..cf05252 100644
> --- a/src/cl_mem.h
> +++ b/src/cl_mem.h
> @@ -166,6 +166,10 @@ extern void cl_mem_gl_delete(struct _cl_mem_image *);
> /* Add one more reference to this object */
> extern void cl_mem_add_ref(cl_mem);
>
> +/* api clEnqueueCopy buffer rect help function */
> +extern cl_int cl_mem_copy_buffer_rect(cl_command_queue, cl_mem, cl_mem,
> + const size_t *, const size_t *, const size_t *,
> + size_t, size_t, size_t, size_t);
> /* Directly map a memory object */
> extern void *cl_mem_map(cl_mem);
>
> --
> 1.8.1.2
>
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet
More information about the Beignet
mailing list