[Beignet] [PATCH 3/6] Create image with TILE_Y mode still when image size > 128MB for performance.

yan.wang at linux.intel.com yan.wang at linux.intel.com
Tue May 9 10:02:51 UTC 2017


From: Yan Wang <yan.wang at linux.intel.com>

Copying data from the host pointer into a large TILE_Y image may fail.
So use clEnqueueCopyBufferToImage to do the copy on the GPU side.

Signed-off-by: Yan Wang <yan.wang at linux.intel.com>
---
 src/cl_mem.c | 100 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 src/cl_mem.h |   2 ++
 2 files changed, 97 insertions(+), 5 deletions(-)

diff --git a/src/cl_mem.c b/src/cl_mem.c
index 4a7bec8..fe0dd2f 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -149,6 +149,8 @@ cl_mem_allocate(enum cl_mem_type type,
   mem->is_userptr = 0;
   mem->offset = 0;
   mem->is_svm = 0;
+  mem->is_ker_copy = 0;
+  mem->tmp_ker_buf = NULL;
   mem->cmrt_mem = NULL;
   if (mem->type == CL_MEM_IMAGE_TYPE) {
     cl_mem_image(mem)->is_image_from_buffer = 0;
@@ -750,6 +752,77 @@ cl_image_tiling_t cl_get_default_tiling(cl_driver drv)
   return tiling;
 }
 
+cl_command_queue image_queue = NULL;
+
+/* Create a tiled image and fill it from host memory with a GPU-side copy:
+ * wrap `data` in a CL_MEM_USE_HOST_PTR buffer of `sz` bytes, allocate the
+ * image, then enqueue clEnqueueCopyBufferToImage on the shared image_queue.
+ * Returns the image, or NULL; *errcode_ret is written on every path and is
+ * never CL_SUCCESS when NULL is returned. pitch/slice_pitch are unused. */
+static cl_mem
+_cl_new_image_copy_from_host_ptr(cl_context ctx, cl_mem_flags flags,
+                  const cl_image_format *fmt,
+                  const cl_mem_object_type image_type,
+                  size_t w, size_t h, size_t depth,
+                  size_t pitch, size_t slice_pitch,
+                  size_t sz, size_t aligned_pitch,
+                  uint32_t intel_fmt, uint32_t bpp,
+                  cl_image_tiling_t tiling,
+                  void *data,           //pointer from application
+                  cl_int *errcode_ret)
+{
+  cl_int err = CL_SUCCESS;
+  cl_mem mem = NULL, buf = NULL;
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {w, h, depth};
+  size_t aligned_slice_pitch = 0;   /* stays 0 for 2D images */
+
+  /* NOTE(review): the queue is created once, for the first ctx seen, and is
+   * reused by later calls whatever their ctx is; it is not released here. */
+  if (image_queue == NULL) {
+    image_queue = clCreateCommandQueueWithProperties(ctx, ctx->devices[0], 0, &err);
+    if (err != CL_SUCCESS)
+      goto exit;
+  }
+
+  // Map host ptr to OCL buffer
+  buf = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR, sz, data, &err);
+  if (err != CL_SUCCESS)
+    goto exit;
+
+  mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, NULL, NULL, &err);
+  if (mem == NULL || err != CL_SUCCESS) {
+    if (err == CL_SUCCESS)   /* NULL without an error code: report one */
+      err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
+    mem = NULL;              /* failure is reported as NULL, as before */
+    goto exit;
+  }
+
+  cl_buffer_set_tiling(mem->bo, tiling, aligned_pitch);
+
+  //SKL need use tiling's aligned_h to calc slice_pitch and IVB to BDW need CL_NO_TILE's aligned_h to calc.
+  if (image_type != CL_MEM_OBJECT_IMAGE2D)
+    aligned_slice_pitch = aligned_pitch * ALIGN(h, cl_buffer_get_tiling_align(ctx, tiling, 2));
+
+  cl_mem_image_init(cl_mem_image(mem), w, h, image_type, depth, *fmt,
+                    intel_fmt, bpp, aligned_pitch, aligned_slice_pitch, tiling,
+                    0, 0, 0);
+
+  err = clEnqueueCopyBufferToImage(image_queue, buf, mem, 0, origin, region, 0, NULL, NULL);
+  if (err != CL_SUCCESS) {
+    clReleaseMemObject(mem);
+    mem = NULL;
+    goto exit;
+  }
+  mem->is_ker_copy = 1;
+
+exit:
+  if (buf != NULL)           /* the temporary host-ptr wrapper is always dropped */
+    clReleaseMemObject(buf);
+  *errcode_ret = err;
+  return mem;
+}
+
 static cl_mem
 _cl_mem_new_image(cl_context ctx,
                   cl_mem_flags flags,
@@ -765,6 +838,7 @@ _cl_mem_new_image(cl_context ctx,
                   cl_int *errcode_ret)
 {
   cl_int err = CL_SUCCESS;
+  cl_bool is_ker_copy = 0;
   cl_mem mem = NULL;
   cl_mem_object_type image_type = orig_image_type;
   uint32_t bpp = 0, intel_fmt = INTEL_UNSUPPORTED_FORMAT;
@@ -931,11 +1005,22 @@ _cl_mem_new_image(cl_context ctx,
 
   /* If sz is large than 128MB, map gtt may fail in some system.
      Because there is no obviours performance drop, disable tiling. */
-  if(tiling != CL_NO_TILE && sz > MAX_TILING_SIZE) {
-    tiling = CL_NO_TILE;
-    aligned_pitch = w * bpp;
-    aligned_h     = ALIGN(h, cl_buffer_get_tiling_align(ctx, CL_NO_TILE, 1));
-    sz = aligned_pitch * aligned_h * depth;
+  if (tiling != CL_NO_TILE && sz > MAX_TILING_SIZE) {
+    if ((image_type == CL_MEM_OBJECT_IMAGE2D || image_type == CL_MEM_OBJECT_IMAGE3D) &&
+        (flags & CL_MEM_COPY_HOST_PTR)) {
+      mem = _cl_new_image_copy_from_host_ptr(ctx, flags, fmt, image_type, w, h, depth, pitch,
+          slice_pitch, sz, aligned_pitch, intel_fmt, bpp, tiling, data, &err);
+      if (mem != NULL)
+        goto exit;
+    }
+
+    if (flags & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR)) {
+      tiling = CL_NO_TILE;
+      aligned_pitch = w * bpp;
+      aligned_h     = ALIGN(h, cl_buffer_get_tiling_align(ctx, CL_NO_TILE, 1));
+      sz = aligned_pitch * aligned_h * depth;
+    } else
+      is_ker_copy = 1;
   }
 
   if (image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER) {
@@ -992,6 +1077,8 @@ _cl_mem_new_image(cl_context ctx,
       cl_mem_copy_image(cl_mem_image(mem), pitch, slice_pitch, data);
   }
 
+  mem->is_ker_copy = is_ker_copy;
+
 exit:
   if (errcode_ret)
     *errcode_ret = err;
@@ -1368,6 +1455,9 @@ cl_mem_delete(cl_mem mem)
     cl_free(cb);
   }
 
+  if (mem->tmp_ker_buf)
+    cl_mem_delete(mem->tmp_ker_buf);
+
   /* iff we are a image, delete the 1d buffer if has. */
   if (IS_IMAGE(mem)) {
     if (cl_mem_image(mem)->buffer_1d) {
diff --git a/src/cl_mem.h b/src/cl_mem.h
index edfd043..20afe23 100644
--- a/src/cl_mem.h
+++ b/src/cl_mem.h
@@ -98,6 +98,8 @@ typedef  struct _cl_mem {
   list_head dstr_cb_head;   /* All destroy callbacks. */
   uint8_t is_userptr;       /* CL_MEM_USE_HOST_PTR is enabled */
   cl_bool is_svm;           /* This object  is svm */
+  cl_bool is_ker_copy;      /* this object is copied by OCL kernel */
+  cl_mem tmp_ker_buf;       /* this object is tmp buffer for OCL kernel copying */
   size_t offset;            /* offset of host_ptr to the page beginning, only for CL_MEM_USE_HOST_PTR*/
 
   uint8_t cmrt_mem_type;    /* CmBuffer, CmSurface2D, ... */
-- 
2.7.4



More information about the Beignet mailing list