[Beignet] [PATCH 2/3] enable USE_HOST_PTR for cl image with userptr to avoid extra copying
Guo Yejun
yejun.guo at intel.com
Thu Sep 24 13:27:51 PDT 2015
The pointer must be 64-byte aligned, and this applies only when w and h are
equal to their aligned values; otherwise, fall back to the old method with extra copying.
Signed-off-by: Guo Yejun <yejun.guo at intel.com>
---
src/cl_command_queue.c | 7 ++-----
src/cl_enqueue.c | 10 ++++++----
src/cl_mem.c | 45 ++++++++++++++++++++++++++++++++++++---------
3 files changed, 44 insertions(+), 18 deletions(-)
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 4b92311..f506a87 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -135,19 +135,16 @@ cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
struct _cl_mem_image *image;
assert(interp_kernel_get_arg_type(k->opaque, id) == GBE_ARG_IMAGE);
- //currently, user ptr is not supported for cl image, so offset should be always zero
- assert(k->args[id].mem->offset == 0);
-
image = cl_mem_image(k->args[id].mem);
set_image_info(k->curbe, &k->images[i], image);
- cl_gpgpu_bind_image(gpgpu, k->images[i].idx, image->base.bo, image->offset,
+ cl_gpgpu_bind_image(gpgpu, k->images[i].idx, image->base.bo, image->offset + k->args[id].mem->offset,
image->intel_fmt, image->image_type, image->bpp,
image->w, image->h, image->depth,
image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling);
// TODO, this workaround is for GEN7/GEN75 only, we may need to do it in the driver layer
// on demand.
if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
- cl_gpgpu_bind_image(gpgpu, k->images[i].idx + BTI_WORKAROUND_IMAGE_OFFSET, image->base.bo, image->offset,
+ cl_gpgpu_bind_image(gpgpu, k->images[i].idx + BTI_WORKAROUND_IMAGE_OFFSET, image->base.bo, image->offset + k->args[id].mem->offset,
image->intel_fmt, image->image_type, image->bpp,
image->w, image->h, image->depth,
image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling);
diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c
index 9e34bb8..cec368c 100644
--- a/src/cl_enqueue.c
+++ b/src/cl_enqueue.c
@@ -316,8 +316,9 @@ cl_int cl_enqueue_map_image(enqueue_data *data)
if(mem->flags & CL_MEM_USE_HOST_PTR) {
assert(mem->host_ptr);
- //src and dst need add offset in function cl_mem_copy_image_region
- cl_mem_copy_image_region(data->origin, data->region,
+ if (!mem->is_userptr)
+ //src and dst need add offset in function cl_mem_copy_image_region
+ cl_mem_copy_image_region(data->origin, data->region,
mem->host_ptr, image->host_row_pitch, image->host_slice_pitch,
data->ptr, row_pitch, image->slice_pitch, image, CL_TRUE, CL_TRUE);
}
@@ -374,8 +375,9 @@ cl_int cl_enqueue_unmap_mem_object(enqueue_data *data)
row_pitch = image->slice_pitch;
else
row_pitch = image->row_pitch;
- //v_ptr have added offset, host_ptr have not added offset.
- cl_mem_copy_image_region(origin, region, v_ptr, row_pitch, image->slice_pitch,
+ if (!memobj->is_userptr)
+ //v_ptr have added offset, host_ptr have not added offset.
+ cl_mem_copy_image_region(origin, region, v_ptr, row_pitch, image->slice_pitch,
memobj->host_ptr, image->host_row_pitch, image->host_slice_pitch,
image, CL_FALSE, CL_TRUE);
}
diff --git a/src/cl_mem.c b/src/cl_mem.c
index 80c9064..c512355 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -288,7 +288,6 @@ cl_mem_allocate(enum cl_mem_type type,
int cacheline_size = 0;
cl_get_device_info(ctx->device, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(cacheline_size), &cacheline_size, NULL);
- /* currently only cl buf is supported, will add cl image support later */
if (type == CL_MEM_BUFFER_TYPE) {
if (flags & CL_MEM_USE_HOST_PTR) {
assert(host_ptr != NULL);
@@ -312,6 +311,18 @@ cl_mem_allocate(enum cl_mem_type type,
mem->bo = cl_buffer_alloc_userptr(bufmgr, "CL userptr memory object", internal_host_ptr, alignedSZ, 0);
bufCreated = 1;
}
+ } else if (type == CL_MEM_IMAGE_TYPE) {
+ if (host_ptr != NULL) {
+ assert(flags & CL_MEM_USE_HOST_PTR);
+ assert(!is_tiled);
+ assert(ALIGN((unsigned long)host_ptr, cacheline_size) == (unsigned long)host_ptr);
+ void* aligned_host_ptr = (void*)(((unsigned long)host_ptr) & (~(page_size - 1)));
+ mem->offset = host_ptr - aligned_host_ptr;
+ mem->is_userptr = 1;
+ size_t aligned_sz = ALIGN((mem->offset + sz), page_size);
+ mem->bo = cl_buffer_alloc_userptr(bufmgr, "CL userptr memory object", aligned_host_ptr, aligned_sz, 0);
+ bufCreated = 1;
+ }
}
}
@@ -823,6 +834,16 @@ _cl_mem_new_image(cl_context ctx,
#undef DO_IMAGE_ERROR
+ uint8_t enableUserptr = 0;
+ if (ctx->device->host_unified_memory && data != NULL && (flags & CL_MEM_USE_HOST_PTR)) {
+ int cacheline_size = 0;
+ cl_get_device_info(ctx->device, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(cacheline_size), &cacheline_size, NULL);
+ if (ALIGN((unsigned long)data, cacheline_size) == (unsigned long)data) { //might more conditions here
+ tiling = CL_NO_TILE;
+ enableUserptr = 1;
+ }
+ }
+
/* Tiling requires to align both pitch and height */
if (tiling == CL_NO_TILE) {
aligned_pitch = w * bpp;
@@ -861,8 +882,12 @@ _cl_mem_new_image(cl_context ctx,
if (image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER) {
if (image_type == CL_MEM_OBJECT_IMAGE2D && buffer != NULL)
mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, NULL, buffer, &err);
- else
- mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, data, NULL, &err);
+ else {
+ if (enableUserptr)
+ mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, data, NULL, &err);
+ else
+ mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, NULL, NULL, &err);
+ }
} else {
mem = cl_mem_allocate(CL_MEM_BUFFER1D_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, NULL, NULL, &err);
if (mem != NULL && err == CL_SUCCESS) {
@@ -892,13 +917,15 @@ _cl_mem_new_image(cl_context ctx,
0, 0, 0);
/* Copy the data if required */
- if (flags & (CL_MEM_COPY_HOST_PTR | CL_MEM_USE_HOST_PTR)) {
+ if (flags & CL_MEM_COPY_HOST_PTR)
cl_mem_copy_image(cl_mem_image(mem), pitch, slice_pitch, data);
- if (flags & CL_MEM_USE_HOST_PTR) {
- mem->host_ptr = data;
- cl_mem_image(mem)->host_row_pitch = pitch;
- cl_mem_image(mem)->host_slice_pitch = slice_pitch;
- }
+
+ if (flags & CL_MEM_USE_HOST_PTR) {
+ mem->host_ptr = data;
+ cl_mem_image(mem)->host_row_pitch = pitch;
+ cl_mem_image(mem)->host_slice_pitch = slice_pitch;
+ if (!enableUserptr)
+ cl_mem_copy_image(cl_mem_image(mem), pitch, slice_pitch, data);
}
exit:
--
1.9.1
More information about the Beignet
mailing list