[Beignet] [PATCH v3 5/8] Create image with TILE_Y mode still when image size>128MB for performance.

yan.wang at linux.intel.com yan.wang at linux.intel.com
Tue May 16 11:03:33 UTC 2017


From: Yan Wang <yan.wang at linux.intel.com>

It may fail to copy data from the host pointer to a large TILE_Y image.
So use clEnqueueCopyBufferToImage to do this copy on the GPU side instead.

Signed-off-by: Yan Wang <yan.wang at linux.intel.com>
---
 src/cl_context.c |   6 ++++
 src/cl_context.h |   2 +-
 src/cl_mem.c     | 107 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
 src/cl_mem.h     |   2 ++
 4 files changed, 111 insertions(+), 6 deletions(-)

diff --git a/src/cl_context.c b/src/cl_context.c
index 1ba2302..4b8281c 100644
--- a/src/cl_context.c
+++ b/src/cl_context.c
@@ -342,6 +342,7 @@ cl_context_new(struct _cl_context_prop *props, cl_uint dev_num, cl_device_id* al
   TRY_ALLOC_NO_ERR (ctx->drv, cl_driver_new(props));
   ctx->props = *props;
   ctx->ver = cl_driver_get_ver(ctx->drv);
+  ctx->image_queue = NULL;
 
 exit:
   return ctx;
@@ -362,6 +363,11 @@ cl_context_delete(cl_context ctx)
   if (CL_OBJECT_DEC_REF(ctx) > 1)
     return;
 
+  if (ctx->image_queue) {
+    clReleaseCommandQueue(ctx->image_queue);
+    ctx->image_queue = NULL;
+  }
+
   /* delete the internal programs. */
   for (i = CL_INTERNAL_KERNEL_MIN; i < CL_INTERNAL_KERNEL_MAX; i++) {
     if (ctx->internal_kernels[i]) {
diff --git a/src/cl_context.h b/src/cl_context.h
index 4812afd..8ba499f 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -129,7 +129,7 @@ struct _cl_context {
   void (CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *);
                                      /* User's callback when error occur in context */
   void *user_data;                   /* A pointer to user supplied data */
-
+  cl_command_queue image_queue;      /* An internal command queue for image data copying */
 };
 
 #define CL_OBJECT_CONTEXT_MAGIC 0x20BBCADE993134AALL
diff --git a/src/cl_mem.c b/src/cl_mem.c
index 4a7bec8..0c49c3d 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -153,6 +153,8 @@ cl_mem_allocate(enum cl_mem_type type,
   if (mem->type == CL_MEM_IMAGE_TYPE) {
     cl_mem_image(mem)->is_image_from_buffer = 0;
     cl_mem_image(mem)->is_image_from_nv12_image = 0;
+    cl_mem_image(mem)->is_ker_copy = 0;
+    cl_mem_image(mem)->tmp_ker_buf = NULL;
   }
 
   if (sz != 0) {
@@ -751,6 +753,80 @@ cl_image_tiling_t cl_get_default_tiling(cl_driver drv)
 }
 
 static cl_mem
+_cl_new_image_copy_from_host_ptr(cl_context ctx,
+                  cl_mem_flags flags,
+                  const cl_image_format *fmt,
+                  const cl_mem_object_type image_type,
+                  size_t w,
+                  size_t h,
+                  size_t depth,
+                  size_t pitch,
+                  size_t slice_pitch,
+                  size_t sz,
+                  size_t aligned_pitch,
+                  uint32_t intel_fmt,
+                  uint32_t bpp,
+                  cl_image_tiling_t tiling,
+                  void *data,           //pointer from application
+                  cl_int *errcode_ret)
+{
+  cl_int err = CL_SUCCESS;
+  cl_mem mem = NULL;
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {w, h, depth};
+  size_t aligned_slice_pitch = 0;
+
+  if (ctx->image_queue == NULL) {
+    ctx->image_queue = clCreateCommandQueueWithProperties(ctx, ctx->devices[0], 0, &err);
+    if (err != CL_SUCCESS || !ctx->image_queue) {
+      *errcode_ret = err;
+      ctx->image_queue = NULL;
+      return NULL;
+    }
+  }
+
+  // Map host ptr to OCL buffer
+  cl_mem buf = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR, sz, data, &err);
+  if (err != CL_SUCCESS) {
+    *errcode_ret = err;
+    return NULL;
+  }
+
+  mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, NULL, NULL, &err);
+  if (mem == NULL || err != CL_SUCCESS) {
+    clReleaseMemObject(buf);
+    return NULL;
+  }
+
+  cl_buffer_set_tiling(mem->bo, tiling, aligned_pitch);
+
+  if (image_type == CL_MEM_OBJECT_IMAGE2D)
+    aligned_slice_pitch = 0;
+  else
+    //SKL needs tiling's aligned_h to calculate slice_pitch, while IVB through BDW need CL_NO_TILE's aligned_h.
+    aligned_slice_pitch = aligned_pitch * ALIGN(h, cl_buffer_get_tiling_align(ctx, tiling, 2));
+
+  cl_mem_image_init(cl_mem_image(mem), w, h, image_type, depth, *fmt,
+                    intel_fmt, bpp, aligned_pitch, aligned_slice_pitch, tiling,
+                    0, 0, 0);
+
+  err = clEnqueueCopyBufferToImage(ctx->image_queue, buf, mem, 0, origin, region, 0, NULL, NULL);
+  if(err != CL_SUCCESS) {
+    clReleaseMemObject(buf);
+    clReleaseMemObject(mem);
+    return NULL;
+  }
+
+  clReleaseMemObject(buf);
+  if (flags & CL_MEM_USE_HOST_PTR && data) {
+    mem->host_ptr = data;
+    cl_mem_image(mem)->host_row_pitch = pitch;
+    cl_mem_image(mem)->host_slice_pitch = slice_pitch;
+  }
+  return mem;
+}
+
+static cl_mem
 _cl_mem_new_image(cl_context ctx,
                   cl_mem_flags flags,
                   const cl_image_format *fmt,
@@ -765,6 +841,7 @@ _cl_mem_new_image(cl_context ctx,
                   cl_int *errcode_ret)
 {
   cl_int err = CL_SUCCESS;
+  cl_bool is_ker_copy = 0;
   cl_mem mem = NULL;
   cl_mem_object_type image_type = orig_image_type;
   uint32_t bpp = 0, intel_fmt = INTEL_UNSUPPORTED_FORMAT;
@@ -931,11 +1008,25 @@ _cl_mem_new_image(cl_context ctx,
 
   /* If sz is large than 128MB, map gtt may fail in some system.
      Because there is no obviours performance drop, disable tiling. */
-  if(tiling != CL_NO_TILE && sz > MAX_TILING_SIZE) {
-    tiling = CL_NO_TILE;
-    aligned_pitch = w * bpp;
-    aligned_h     = ALIGN(h, cl_buffer_get_tiling_align(ctx, CL_NO_TILE, 1));
-    sz = aligned_pitch * aligned_h * depth;
+  if (tiling != CL_NO_TILE && sz > MAX_TILING_SIZE) {
+    if ((image_type == CL_MEM_OBJECT_IMAGE2D || image_type == CL_MEM_OBJECT_IMAGE3D) &&
+      buffer == NULL) {
+      if (flags & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR)) {
+        mem = _cl_new_image_copy_from_host_ptr(ctx, flags, fmt, image_type, w, h, depth, pitch,
+          slice_pitch, sz, aligned_pitch, intel_fmt, bpp, tiling, data, &err);
+        if (mem != NULL) {
+          cl_mem_image(mem)->is_ker_copy = 1;
+          goto exit;
+        } else
+          goto error;
+      } else
+        is_ker_copy = 1;
+    } else {
+      tiling = CL_NO_TILE;
+      aligned_pitch = w * bpp;
+      aligned_h     = ALIGN(h, cl_buffer_get_tiling_align(ctx, CL_NO_TILE, 1));
+      sz = aligned_pitch * aligned_h * depth;
+    }
   }
 
   if (image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER) {
@@ -992,6 +1083,8 @@ _cl_mem_new_image(cl_context ctx,
       cl_mem_copy_image(cl_mem_image(mem), pitch, slice_pitch, data);
   }
 
+  cl_mem_image(mem)->is_ker_copy = is_ker_copy;
+
 exit:
   if (errcode_ret)
     *errcode_ret = err;
@@ -1389,6 +1482,10 @@ cl_mem_delete(cl_mem mem)
           mem->bo = NULL;
         }
     }
+    if (cl_mem_image(mem)->tmp_ker_buf) {
+      cl_mem_delete(cl_mem_image(mem)->tmp_ker_buf);
+      cl_mem_image(mem)->tmp_ker_buf = NULL;
+    }
   }
 
   /* Someone still mapped, unmap */
diff --git a/src/cl_mem.h b/src/cl_mem.h
index edfd043..0b33c31 100644
--- a/src/cl_mem.h
+++ b/src/cl_mem.h
@@ -145,6 +145,8 @@ struct _cl_mem_image {
   uint8_t is_image_from_buffer;       /* IMAGE from Buffer*/
   cl_mem nv12_image;               /* if the image is created from nv12 Image, it point to the image.*/
   uint8_t is_image_from_nv12_image;       /* IMAGE from NV12 Image*/
+  cl_bool is_ker_copy;      /* this object's data is copied by an OCL kernel */
+  cl_mem tmp_ker_buf;       /* this object is tmp buffer for OCL kernel copying */
 };
 
 struct _cl_mem_gl_image {
-- 
2.7.4



More information about the Beignet mailing list