[Beignet] [PATCH v3 5/8] Still create images with TILE_Y mode when image size > 128MB, for performance.
yan.wang at linux.intel.com
Tue May 16 11:03:33 UTC 2017
From: Yan Wang <yan.wang at linux.intel.com>
Copying data from a host pointer into a large TILE_Y image may fail.
So use clEnqueueCopyBufferToImage to perform the copy on the GPU side instead.
Signed-off-by: Yan Wang <yan.wang at linux.intel.com>
---
src/cl_context.c | 6 ++++
src/cl_context.h | 2 +-
src/cl_mem.c | 109 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
src/cl_mem.h | 2 ++
4 files changed, 113 insertions(+), 6 deletions(-)
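For reviewers, the new path boils down to the following standalone sketch
(a minimal illustration only; the helper name upload_via_gpu and its
parameter list are invented for this note, not taken from the patch):
wrap the application's host pointer in a CL_MEM_USE_HOST_PTR buffer, then
let the GPU scatter it into the tiled image with clEnqueueCopyBufferToImage,
so no CPU-side GTT mapping of the large TILE_Y surface is needed.

#include <CL/cl.h>

static cl_int upload_via_gpu(cl_context ctx, cl_command_queue queue,
                             cl_mem image, void *host_ptr, size_t sz,
                             size_t w, size_t h, size_t depth)
{
  cl_int err;
  size_t origin[3] = {0, 0, 0};
  size_t region[3] = {w, h, depth};

  /* No CPU copy here: the buffer aliases the application's memory. */
  cl_mem buf = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR, sz, host_ptr, &err);
  if (err != CL_SUCCESS)
    return err;

  /* The GPU performs the linear-to-TILE_Y rearrangement. */
  err = clEnqueueCopyBufferToImage(queue, buf, image, 0, origin, region,
                                   0, NULL, NULL);
  if (err == CL_SUCCESS)
    err = clFinish(queue);

  clReleaseMemObject(buf);
  return err;
}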
diff --git a/src/cl_context.c b/src/cl_context.c
index 1ba2302..4b8281c 100644
--- a/src/cl_context.c
+++ b/src/cl_context.c
@@ -342,6 +342,7 @@ cl_context_new(struct _cl_context_prop *props, cl_uint dev_num, cl_device_id* al
TRY_ALLOC_NO_ERR (ctx->drv, cl_driver_new(props));
ctx->props = *props;
ctx->ver = cl_driver_get_ver(ctx->drv);
+ ctx->image_queue = NULL;
exit:
return ctx;
@@ -362,6 +363,11 @@ cl_context_delete(cl_context ctx)
if (CL_OBJECT_DEC_REF(ctx) > 1)
return;
+ if (ctx->image_queue) {
+ clReleaseCommandQueue(ctx->image_queue);
+ ctx->image_queue = NULL;
+ }
+
/* delete the internal programs. */
for (i = CL_INTERNAL_KERNEL_MIN; i < CL_INTERNAL_KERNEL_MAX; i++) {
if (ctx->internal_kernels[i]) {
diff --git a/src/cl_context.h b/src/cl_context.h
index 4812afd..8ba499f 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -129,7 +129,7 @@ struct _cl_context {
void (CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *);
/* User's callback when error occur in context */
void *user_data; /* A pointer to user supplied data */
-
+ cl_command_queue image_queue; /* An internal command queue for copying image data */
};
#define CL_OBJECT_CONTEXT_MAGIC 0x20BBCADE993134AALL
diff --git a/src/cl_mem.c b/src/cl_mem.c
index 4a7bec8..0c49c3d 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -153,6 +153,8 @@ cl_mem_allocate(enum cl_mem_type type,
if (mem->type == CL_MEM_IMAGE_TYPE) {
cl_mem_image(mem)->is_image_from_buffer = 0;
cl_mem_image(mem)->is_image_from_nv12_image = 0;
+ cl_mem_image(mem)->is_ker_copy = 0;
+ cl_mem_image(mem)->tmp_ker_buf = NULL;
}
if (sz != 0) {
@@ -751,6 +753,82 @@ cl_image_tiling_t cl_get_default_tiling(cl_driver drv)
}
static cl_mem
+_cl_new_image_copy_from_host_ptr(cl_context ctx,
+ cl_mem_flags flags,
+ const cl_image_format *fmt,
+ const cl_mem_object_type image_type,
+ size_t w,
+ size_t h,
+ size_t depth,
+ size_t pitch,
+ size_t slice_pitch,
+ size_t sz,
+ size_t aligned_pitch,
+ uint32_t intel_fmt,
+ uint32_t bpp,
+ cl_image_tiling_t tiling,
+ void *data, //pointer from application
+ cl_int *errcode_ret)
+{
+ cl_int err = CL_SUCCESS;
+ cl_mem mem = NULL;
+ size_t origin[3] = {0, 0, 0};
+ size_t region[3] = {w, h, depth};
+ size_t aligned_slice_pitch = 0;
+
+ if (ctx->image_queue == NULL) {
+ ctx->image_queue = clCreateCommandQueueWithProperties(ctx, ctx->devices[0], 0, &err);
+ if (err != CL_SUCCESS || !ctx->image_queue) {
+ *errcode_ret = err;
+ ctx->image_queue = NULL;
+ return NULL;
+ }
+ }
+
+ // Wrap the host ptr in an OCL buffer; CL_MEM_USE_HOST_PTR avoids a CPU-side copy
+ cl_mem buf = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR, sz, data, &err);
+ if (err != CL_SUCCESS) {
+ *errcode_ret = err;
+ return NULL;
+ }
+
+ mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, NULL, NULL, &err);
+ if (mem == NULL || err != CL_SUCCESS) {
+ *errcode_ret = err;
+ clReleaseMemObject(buf);
+ return NULL;
+ }
+
+ cl_buffer_set_tiling(mem->bo, tiling, aligned_pitch);
+
+ if (image_type == CL_MEM_OBJECT_IMAGE2D)
+ aligned_slice_pitch = 0;
+ else
+ // SKL needs the tiling mode's aligned_h to compute slice_pitch, while IVB through BDW need CL_NO_TILE's aligned_h.
+ aligned_slice_pitch = aligned_pitch * ALIGN(h, cl_buffer_get_tiling_align(ctx, tiling, 2));
+
+ cl_mem_image_init(cl_mem_image(mem), w, h, image_type, depth, *fmt,
+ intel_fmt, bpp, aligned_pitch, aligned_slice_pitch, tiling,
+ 0, 0, 0);
+
+ err = clEnqueueCopyBufferToImage(ctx->image_queue, buf, mem, 0, origin, region, 0, NULL, NULL);
+ if (err != CL_SUCCESS) {
+ *errcode_ret = err;
+ clReleaseMemObject(buf);
+ clReleaseMemObject(mem);
+ return NULL;
+ }
+
+ clReleaseMemObject(buf);
+ if ((flags & CL_MEM_USE_HOST_PTR) && data) {
+ mem->host_ptr = data;
+ cl_mem_image(mem)->host_row_pitch = pitch;
+ cl_mem_image(mem)->host_slice_pitch = slice_pitch;
+ }
+ return mem;
+}
+
+static cl_mem
_cl_mem_new_image(cl_context ctx,
cl_mem_flags flags,
const cl_image_format *fmt,
@@ -765,6 +841,7 @@ _cl_mem_new_image(cl_context ctx,
cl_int *errcode_ret)
{
cl_int err = CL_SUCCESS;
+ cl_bool is_ker_copy = 0;
cl_mem mem = NULL;
cl_mem_object_type image_type = orig_image_type;
uint32_t bpp = 0, intel_fmt = INTEL_UNSUPPORTED_FORMAT;
@@ -931,11 +1008,25 @@ _cl_mem_new_image(cl_context ctx,
/* If sz is larger than 128MB, mapping the GTT may fail on some systems.
Because there is no obvious performance drop, disable tiling. */
- if(tiling != CL_NO_TILE && sz > MAX_TILING_SIZE) {
- tiling = CL_NO_TILE;
- aligned_pitch = w * bpp;
- aligned_h = ALIGN(h, cl_buffer_get_tiling_align(ctx, CL_NO_TILE, 1));
- sz = aligned_pitch * aligned_h * depth;
+ if (tiling != CL_NO_TILE && sz > MAX_TILING_SIZE) {
+ if ((image_type == CL_MEM_OBJECT_IMAGE2D || image_type == CL_MEM_OBJECT_IMAGE3D) &&
+ buffer == NULL) {
+ if (flags & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR)) {
+ mem = _cl_new_image_copy_from_host_ptr(ctx, flags, fmt, image_type, w, h, depth, pitch,
+ slice_pitch, sz, aligned_pitch, intel_fmt, bpp, tiling, data, &err);
+ if (mem != NULL) {
+ cl_mem_image(mem)->is_ker_copy = 1;
+ goto exit;
+ } else
+ goto error;
+ } else
+ is_ker_copy = 1;
+ } else {
+ tiling = CL_NO_TILE;
+ aligned_pitch = w * bpp;
+ aligned_h = ALIGN(h, cl_buffer_get_tiling_align(ctx, CL_NO_TILE, 1));
+ sz = aligned_pitch * aligned_h * depth;
+ }
}
if (image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER) {
@@ -992,6 +1083,8 @@ _cl_mem_new_image(cl_context ctx,
cl_mem_copy_image(cl_mem_image(mem), pitch, slice_pitch, data);
}
+ cl_mem_image(mem)->is_ker_copy = is_ker_copy;
+
exit:
if (errcode_ret)
*errcode_ret = err;
@@ -1389,6 +1482,10 @@ cl_mem_delete(cl_mem mem)
mem->bo = NULL;
}
}
+ if (cl_mem_image(mem)->tmp_ker_buf) {
+ cl_mem_delete(cl_mem_image(mem)->tmp_ker_buf);
+ cl_mem_image(mem)->tmp_ker_buf = NULL;
+ }
}
/* Someone still mapped, unmap */
diff --git a/src/cl_mem.h b/src/cl_mem.h
index edfd043..0b33c31 100644
--- a/src/cl_mem.h
+++ b/src/cl_mem.h
@@ -145,6 +145,8 @@ struct _cl_mem_image {
uint8_t is_image_from_buffer; /* IMAGE from Buffer*/
cl_mem nv12_image; /* if the image is created from nv12 Image, it point to the image.*/
uint8_t is_image_from_nv12_image; /* IMAGE from NV12 Image*/
+ cl_bool is_ker_copy; /* this object is copied by OCL kernel */
+ cl_mem tmp_ker_buf; /* this object is tmp buffer for OCL kernel copying */
};
struct _cl_mem_gl_image {
--
2.7.4
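From the application's point of view, an image created as in the sketch
below (sizes illustrative; the function name create_large_tiled_image is
invented for this example) now keeps TILE_Y mode and is populated through
the new GPU copy path instead of silently falling back to linear:

#include <CL/cl.h>
#include <stdlib.h>

cl_mem create_large_tiled_image(cl_context ctx, cl_int *err)
{
  cl_image_format fmt = { CL_RGBA, CL_UNSIGNED_INT8 };
  cl_image_desc desc = { 0 };
  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
  desc.image_width = 8192;  /* 8192 * 8192 * 4 bytes = 256MB, */
  desc.image_height = 8192; /* above the 128MB tiling threshold */

  void *pixels = malloc(8192ul * 8192ul * 4ul);
  /* ... fill pixels with application data ... */

  /* >128MB with CL_MEM_COPY_HOST_PTR: takes the new kernel copy path */
  cl_mem img = clCreateImage(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                             &fmt, &desc, pixels, err);
  free(pixels); /* safe: COPY_HOST_PTR copies before returning */
  return img;
}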