[Beignet] [PATCH 1/2] Add clEnqueueCopyBufferRect api.
Yang Rong
rong.r.yang at intel.com
Thu Sep 5 19:37:32 PDT 2013
Using enqueue ND range to copy two buffers. Now compile the kernel string, after
load binary ready, should using static binary.
Signed-off-by: Yang Rong <rong.r.yang at intel.com>
---
src/cl_api.c | 134 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
src/cl_context.h | 14 ++++++
src/cl_enqueue.c | 1 +
src/cl_mem.c | 86 +++++++++++++++++++++++++++++++++++
src/cl_mem.h | 4 ++
5 files changed, 237 insertions(+), 2 deletions(-)
diff --git a/src/cl_api.c b/src/cl_api.c
index 3630b48..fda5c11 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -79,6 +79,65 @@ handle_events(cl_command_queue queue, cl_int num, const cl_event *wait_list,
return status;
}
+inline cl_bool check_copy_overlap(const size_t src_offset[3],
+ const size_t dst_offset[3],
+ const size_t region[3],
+ size_t row_pitch, size_t slice_pitch)
+{
+ const size_t src_min[] = {src_offset[0], src_offset[1], src_offset[2]};
+ const size_t src_max[] = {src_offset[0] + region[0],
+ src_offset[1] + region[1],
+ src_offset[2] + region[2]};
+ const size_t dst_min[] = {dst_offset[0], dst_offset[1], dst_offset[2]};
+ const size_t dst_max[] = {dst_offset[0] + region[0],
+ dst_offset[1] + region[1],
+ dst_offset[2] + region[2]};
+ // Check for overlap
+ cl_bool overlap = CL_TRUE;
+ unsigned i;
+ size_t dst_start = dst_offset[2] * slice_pitch +
+ dst_offset[1] * row_pitch + dst_offset[0];
+ size_t dst_end = dst_start + (region[2] * slice_pitch +
+ region[1] * row_pitch + region[0]);
+ size_t src_start = src_offset[2] * slice_pitch +
+ src_offset[1] * row_pitch + src_offset[0];
+ size_t src_end = src_start + (region[2] * slice_pitch +
+ region[1] * row_pitch + region[0]);
+
+ for (i=0; i != 3; ++i) {
+ overlap = overlap && (src_min[i] < dst_max[i])
+ && (src_max[i] > dst_min[i]);
+ }
+
+ if (!overlap) {
+ size_t delta_src_x = (src_offset[0] + region[0] > row_pitch) ?
+ src_offset[0] + region[0] - row_pitch : 0;
+ size_t delta_dst_x = (dst_offset[0] + region[0] > row_pitch) ?
+ dst_offset[0] + region[0] - row_pitch : 0;
+ if ( (delta_src_x > 0 && delta_src_x > dst_offset[0]) ||
+ (delta_dst_x > 0 && delta_dst_x > src_offset[0]) ) {
+ if ( (src_start <= dst_start && dst_start < src_end) ||
+ (dst_start <= src_start && src_start < dst_end) )
+ overlap = CL_TRUE;
+ }
+ if (region[2] > 1) {
+ size_t src_height = slice_pitch / row_pitch;
+ size_t dst_height = slice_pitch / row_pitch;
+ size_t delta_src_y = (src_offset[1] + region[1] > src_height) ?
+ src_offset[1] + region[1] - src_height : 0;
+ size_t delta_dst_y = (dst_offset[1] + region[1] > dst_height) ?
+ dst_offset[1] + region[1] - dst_height : 0;
+ if ( (delta_src_y > 0 && delta_src_y > dst_offset[1]) ||
+ (delta_dst_y > 0 && delta_dst_y > src_offset[1]) ) {
+ if ( (src_start <= dst_start && dst_start < src_end) ||
+ (dst_start <= src_start && src_start < dst_end) )
+ overlap = CL_TRUE;
+ }
+ }
+ }
+ return overlap;
+}
+
static cl_int
cl_check_device_type(cl_device_type device_type)
{
@@ -1483,8 +1542,79 @@ clEnqueueCopyBufferRect(cl_command_queue command_queue,
const cl_event * event_wait_list,
cl_event * event)
{
- NOT_IMPLEMENTED;
- return 0;
+ cl_int err = CL_SUCCESS;
+ enqueue_data *data, no_wait_data = { 0 };
+
+ CHECK_QUEUE(command_queue);
+ CHECK_MEM(src_buffer);
+ CHECK_MEM(dst_buffer);
+
+ if ((command_queue->ctx != src_buffer->ctx) ||
+ (command_queue->ctx != dst_buffer->ctx)) {
+ err = CL_INVALID_CONTEXT;
+ goto error;
+ }
+
+ if (!region || region[0] == 0 || region[1] == 0 || region[2] == 0) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if(src_row_pitch == 0)
+ src_row_pitch = region[0];
+ if(src_slice_pitch == 0)
+ src_slice_pitch = region[1] * src_row_pitch;
+
+ if(dst_row_pitch == 0)
+ dst_row_pitch = region[0];
+ if(dst_slice_pitch == 0)
+ dst_slice_pitch = region[1] * dst_row_pitch;
+
+ if (src_row_pitch < region[0] ||
+ dst_row_pitch < region[0]) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if ((src_slice_pitch < region[1] * src_row_pitch || src_slice_pitch % src_row_pitch != 0 ) ||
+ (dst_slice_pitch < region[1] * dst_row_pitch || dst_slice_pitch % dst_row_pitch != 0 )) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if ((src_origin[2]+region[2])*src_slice_pitch + (src_origin[1]+region[1])*src_row_pitch + src_origin[0] + region[0] > src_buffer->size ||
+ (dst_origin[2]+region[2])*dst_slice_pitch + (dst_origin[1]+region[1])*dst_row_pitch + dst_origin[0] + region[0] > dst_buffer->size) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (src_buffer == dst_buffer && (src_row_pitch != dst_row_pitch || src_slice_pitch != dst_slice_pitch)) {
+ err = CL_INVALID_VALUE;
+ goto error;
+ }
+
+ if (src_buffer == dst_buffer &&
+ check_copy_overlap(src_origin, dst_origin, region, src_row_pitch, src_slice_pitch)) {
+ err = CL_MEM_COPY_OVERLAP;
+ goto error;
+ }
+
+ cl_mem_copy_buffer_rect(command_queue, src_buffer, dst_buffer, src_origin, dst_origin, region,
+ src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch);
+
+ TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_buffer->ctx);
+
+ data = &no_wait_data;
+ data->type = EnqueueCopyBufferRect;
+ data->queue = command_queue;
+
+ if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+ event, data, CL_COMMAND_COPY_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) {
+ err = cl_command_queue_flush(command_queue);
+ }
+
+error:
+ return err;
}
cl_int
diff --git a/src/cl_context.h b/src/cl_context.h
index b1ef479..8b63104 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -39,6 +39,18 @@ enum _cl_gl_context_type {
CL_GL_CGL_SHAREGROUP
};
+enum _cl_internal_ker_type {
+ CL_ENQUEUE_COPY_BUFFER = 0,
+ CL_ENQUEUE_COPY_BUFFER_RECT = 1,
+ CL_ENQUEUE_COPY_IMAGE_0 = 2, //copy image 2d to image 2d
+ CL_ENQUEUE_COPY_IMAGE_1 = 3, //copy image 2d to image 2d
+ CL_ENQUEUE_COPY_IMAGE_2 = 4, //copy image 2d to image 2d
+ CL_ENQUEUE_COPY_IMAGE_3 = 5, //copy image 2d to image 2d
+ CL_ENQUEUE_COPY_IMAGE_TO_BUFFER = 6,
+ CL_ENQUEUE_COPY_BUFFER_TO_IMAGE = 7,
+ CL_INVALID = 8
+};
+
struct _cl_context_prop {
cl_context_properties platform_id;
enum _cl_gl_context_type gl_type;
@@ -68,6 +80,8 @@ struct _cl_context {
pthread_mutex_t buffer_lock; /* To allocate and deallocate buffers */
pthread_mutex_t sampler_lock; /* To allocate and deallocate samplers */
pthread_mutex_t event_lock; /* To allocate and deallocate events */
+ cl_program internal_prgs[CL_INVALID]; /* All programs internal used, for example clEnqueuexxx api use */
+ cl_kernel internel_kernels[CL_INVALID]; /* All kernels for clenqueuexxx api, for example clEnqueuexxx api use */
uint32_t ver; /* Gen version */
struct _cl_context_prop props;
cl_context_properties * prop_user; /* a copy of user passed context properties when create context */
diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c
index 156ea8c..3446ac3 100644
--- a/src/cl_enqueue.c
+++ b/src/cl_enqueue.c
@@ -372,6 +372,7 @@ cl_int cl_enqueue_handle(enqueue_data* data)
return cl_enqueue_map_image(data);
case EnqueueUnmapMemObject:
return cl_enqueue_unmap_mem_object(data);
+ case EnqueueCopyBufferRect:
case EnqueueNDRangeKernel:
cl_gpgpu_event_resume((cl_gpgpu_event)data->ptr); //goto default
default:
diff --git a/src/cl_mem.c b/src/cl_mem.c
index 8df2f89..fb6dc90 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -25,6 +25,9 @@
#include "cl_device_id.h"
#include "cl_driver.h"
#include "cl_khr_icd.h"
+#include "cl_program.h"
+#include "cl_kernel.h"
+#include "cl_command_queue.h"
#include "CL/cl.h"
#include "CL/cl_intel.h"
@@ -537,6 +540,89 @@ cl_mem_add_ref(cl_mem mem)
atomic_inc(&mem->ref_n);
}
+#define LOCAL_SZ_0 16
+#define LOCAL_SZ_1 4
+#define LOCAL_SZ_2 4
+
+LOCAL cl_int
+cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
+ const size_t *src_origin, const size_t *dst_origin, const size_t *region,
+ size_t src_row_pitch, size_t src_slice_pitch,
+ size_t dst_row_pitch, size_t dst_slice_pitch) {
+ cl_int ret;
+ cl_kernel ker;
+ size_t global_off[] = {0,0,0};
+ size_t global_sz[] = {1,1,1};
+ size_t local_sz[] = {LOCAL_SZ_2,LOCAL_SZ_1,LOCAL_SZ_0};
+ if(region[1] == 1) local_sz[1] = 1;
+ if(region[2] == 1) local_sz[2] = 1;
+ global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+ global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
+ global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
+ cl_int index = CL_ENQUEUE_COPY_BUFFER_RECT;
+ cl_int src_offset = src_origin[2]*src_slice_pitch + src_origin[1]*src_row_pitch + src_origin[0];
+ cl_int dst_offset = dst_origin[2]*dst_slice_pitch + dst_origin[1]*dst_row_pitch + dst_origin[0];
+
+ static const char *kernel_str =
+ "kernel void __cl_cpy_buffer_rect ( \n"
+ " global char* src, global char* dst, \n"
+ " unsigned int region0, unsigned int region1, unsigned int region2, \n"
+ " unsigned int src_offset, unsigned int dst_offset, \n"
+ " unsigned int src_row_pitch, unsigned int src_slice_pitch, \n"
+ " unsigned int dst_row_pitch, unsigned int dst_slice_pitch) { \n"
+ " int i = get_global_id(0); \n"
+ " int j = get_global_id(1); \n"
+ " int k = get_global_id(2); \n"
+ " if((i >= region0) || (j>= region1) || (k>=region2)) \n"
+ " return; \n"
+ " src_offset += k * src_slice_pitch + j * src_row_pitch + i; \n"
+ " dst_offset += k * dst_slice_pitch + j * dst_row_pitch + i; \n"
+ " dst[dst_offset] = src[src_offset]; \n"
+ "}";
+
+
+ /* We use one kernel to copy the data. The kernel is lazily created. */
+ assert(src_buf->ctx == dst_buf->ctx);
+ if (!src_buf->ctx->internal_prgs[index])
+ {
+ size_t length = strlen(kernel_str) + 1;
+ src_buf->ctx->internal_prgs[index] = cl_program_create_from_source(src_buf->ctx, 1, &kernel_str, &length, NULL);
+
+ if (!src_buf->ctx->internal_prgs[index])
+ return CL_OUT_OF_RESOURCES;
+
+ ret = cl_program_build(src_buf->ctx->internal_prgs[index], NULL);
+ if (ret != CL_SUCCESS)
+ return CL_OUT_OF_RESOURCES;
+
+ src_buf->ctx->internal_prgs[index]->is_built = 1;
+
+ src_buf->ctx->internel_kernels[index] = cl_kernel_dup(src_buf->ctx->internal_prgs[index]->ker[0]);
+ }
+
+ /* setup the kernel and run. */
+ ker = src_buf->ctx->internel_kernels[index];
+ if (!ker)
+ return CL_OUT_OF_RESOURCES;
+
+ cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
+ cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &dst_buf);
+ cl_kernel_set_arg(ker, 2, sizeof(cl_int), ®ion[0]);
+ cl_kernel_set_arg(ker, 3, sizeof(cl_int), ®ion[1]);
+ cl_kernel_set_arg(ker, 4, sizeof(cl_int), ®ion[2]);
+ cl_kernel_set_arg(ker, 5, sizeof(cl_int), &src_offset);
+ cl_kernel_set_arg(ker, 6, sizeof(cl_int), &dst_offset);
+ cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_row_pitch);
+ cl_kernel_set_arg(ker, 8, sizeof(cl_int), &src_slice_pitch);
+ cl_kernel_set_arg(ker, 9, sizeof(cl_int), &dst_row_pitch);
+ cl_kernel_set_arg(ker, 10, sizeof(cl_int), &dst_slice_pitch);
+
+ ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+ cl_command_queue_finish(queue);
+
+ return ret;
+}
+
LOCAL void*
cl_mem_map(cl_mem mem)
{
diff --git a/src/cl_mem.h b/src/cl_mem.h
index c0d5503..cf05252 100644
--- a/src/cl_mem.h
+++ b/src/cl_mem.h
@@ -166,6 +166,10 @@ extern void cl_mem_gl_delete(struct _cl_mem_image *);
/* Add one more reference to this object */
extern void cl_mem_add_ref(cl_mem);
+/* api clEnqueueCopy buffer rect help function */
+extern cl_int cl_mem_copy_buffer_rect(cl_command_queue, cl_mem, cl_mem,
+ const size_t *, const size_t *, const size_t *,
+ size_t, size_t, size_t, size_t);
/* Directly map a memory object */
extern void *cl_mem_map(cl_mem);
--
1.8.1.2
More information about the Beignet
mailing list