[Beignet] [PATCH 2/3] enable USE_HOST_PTR for cl image with userptr to avoid extra copying

Guo Yejun yejun.guo at intel.com
Thu Sep 24 13:27:51 PDT 2015


The host pointer must be 64-byte aligned, and this zero-copy path is only taken
when w and h already equal their aligned values; otherwise we roll back to the
old method with an extra copy.
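
As a usage illustration (a hypothetical application-side sketch, not part of
this patch), an image created along these lines can take the zero-copy path,
provided the driver-side alignment and size checks added below also pass:

#include <stdlib.h>
#include <string.h>
#include <CL/cl.h>

/* Allocate a page-aligned (and therefore 64-byte aligned) host buffer and
 * hand it to clCreateImage with CL_MEM_USE_HOST_PTR; when the checks in
 * cl_mem.c succeed, the image is backed by a userptr BO instead of a copy. */
static cl_mem create_zero_copy_image(cl_context ctx, size_t w, size_t h, cl_int *err)
{
  void *host = NULL;
  if (posix_memalign(&host, 4096, w * h * 4) != 0) /* RGBA8: 4 bytes per pixel */
    return NULL;

  cl_image_format fmt = { CL_RGBA, CL_UNORM_INT8 };
  cl_image_desc desc;
  memset(&desc, 0, sizeof(desc));
  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
  desc.image_width = w;
  desc.image_height = h;
  /* leaving image_row_pitch 0 lets the runtime use w * bpp, the untiled pitch */

  return clCreateImage(ctx, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                       &fmt, &desc, host, err);
}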

Signed-off-by: Guo Yejun <yejun.guo at intel.com>
---
 src/cl_command_queue.c |  7 ++-----
 src/cl_enqueue.c       | 10 ++++++----
 src/cl_mem.c           | 45 ++++++++++++++++++++++++++++++++++++---------
 3 files changed, 44 insertions(+), 18 deletions(-)
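
Side note, not part of the commit itself: the page-alignment arithmetic added
to cl_mem_allocate below reduces to the following standalone sketch (the
pointer value is made up purely for illustration):

#include <stdint.h>
#include <stdio.h>

#define ALIGN(v, a) (((v) + (a) - 1) & ~((uintptr_t)(a) - 1))

int main(void)
{
  uintptr_t page_size = 4096;
  uintptr_t host_ptr  = 0x7f1234567040UL; /* 64-byte aligned, but not page aligned */
  size_t    sz        = 640 * 480 * 4;    /* image size in bytes */

  uintptr_t aligned_host_ptr = host_ptr & ~(page_size - 1);   /* userptr BO base */
  uintptr_t offset           = host_ptr - aligned_host_ptr;   /* becomes mem->offset */
  size_t    aligned_sz       = ALIGN(offset + sz, page_size); /* userptr BO size */

  printf("bo base=%#lx offset=%lu size=%zu\n",
         (unsigned long)aligned_host_ptr, (unsigned long)offset, aligned_sz);
  return 0;
}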

diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 4b92311..f506a87 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -135,19 +135,16 @@ cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
     struct _cl_mem_image *image;
     assert(interp_kernel_get_arg_type(k->opaque, id) == GBE_ARG_IMAGE);
 
-    //currently, user ptr is not supported for cl image, so offset should be always zero
-    assert(k->args[id].mem->offset == 0);
-
     image = cl_mem_image(k->args[id].mem);
     set_image_info(k->curbe, &k->images[i], image);
-    cl_gpgpu_bind_image(gpgpu, k->images[i].idx, image->base.bo, image->offset,
+    cl_gpgpu_bind_image(gpgpu, k->images[i].idx, image->base.bo, image->offset + k->args[id].mem->offset,
                         image->intel_fmt, image->image_type, image->bpp,
                         image->w, image->h, image->depth,
                         image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling);
     // TODO, this workaround is for GEN7/GEN75 only, we may need to do it in the driver layer
     // on demand.
     if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
-      cl_gpgpu_bind_image(gpgpu, k->images[i].idx + BTI_WORKAROUND_IMAGE_OFFSET, image->base.bo, image->offset,
+      cl_gpgpu_bind_image(gpgpu, k->images[i].idx + BTI_WORKAROUND_IMAGE_OFFSET, image->base.bo, image->offset + k->args[id].mem->offset,
                           image->intel_fmt, image->image_type, image->bpp,
                           image->w, image->h, image->depth,
                           image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling);
diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c
index 9e34bb8..cec368c 100644
--- a/src/cl_enqueue.c
+++ b/src/cl_enqueue.c
@@ -316,8 +316,9 @@ cl_int cl_enqueue_map_image(enqueue_data *data)
 
   if(mem->flags & CL_MEM_USE_HOST_PTR) {
     assert(mem->host_ptr);
-    //src and dst need add offset in function cl_mem_copy_image_region
-    cl_mem_copy_image_region(data->origin, data->region,
+    if (!mem->is_userptr)
+      //src and dst need the offset added, which is done in cl_mem_copy_image_region
+      cl_mem_copy_image_region(data->origin, data->region,
                              mem->host_ptr, image->host_row_pitch, image->host_slice_pitch,
                              data->ptr, row_pitch, image->slice_pitch, image, CL_TRUE, CL_TRUE);
   }
@@ -374,8 +375,9 @@ cl_int cl_enqueue_unmap_mem_object(enqueue_data *data)
         row_pitch = image->slice_pitch;
       else
         row_pitch = image->row_pitch;
-      //v_ptr have added offset, host_ptr have not added offset.
-      cl_mem_copy_image_region(origin, region, v_ptr, row_pitch, image->slice_pitch,
+      if (!memobj->is_userptr)
+        //v_ptr already has the offset added, host_ptr does not.
+        cl_mem_copy_image_region(origin, region, v_ptr, row_pitch, image->slice_pitch,
                                memobj->host_ptr, image->host_row_pitch, image->host_slice_pitch,
                                image, CL_FALSE, CL_TRUE);
     }
diff --git a/src/cl_mem.c b/src/cl_mem.c
index 80c9064..c512355 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -288,7 +288,6 @@ cl_mem_allocate(enum cl_mem_type type,
       int cacheline_size = 0;
       cl_get_device_info(ctx->device, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(cacheline_size), &cacheline_size, NULL);
 
-      /* currently only cl buf is supported, will add cl image support later */
       if (type == CL_MEM_BUFFER_TYPE) {
         if (flags & CL_MEM_USE_HOST_PTR) {
           assert(host_ptr != NULL);
@@ -312,6 +311,18 @@ cl_mem_allocate(enum cl_mem_type type,
           mem->bo = cl_buffer_alloc_userptr(bufmgr, "CL userptr memory object", internal_host_ptr, alignedSZ, 0);
           bufCreated = 1;
         }
+      } else if (type == CL_MEM_IMAGE_TYPE) {
+        if (host_ptr != NULL) {
+          assert(flags & CL_MEM_USE_HOST_PTR);
+          assert(!is_tiled);
+          assert(ALIGN((unsigned long)host_ptr, cacheline_size) == (unsigned long)host_ptr);
+          void* aligned_host_ptr = (void*)(((unsigned long)host_ptr) & (~(page_size - 1)));
+          mem->offset = host_ptr - aligned_host_ptr;
+          mem->is_userptr = 1;
+          size_t aligned_sz = ALIGN((mem->offset + sz), page_size);
+          mem->bo = cl_buffer_alloc_userptr(bufmgr, "CL userptr memory object", aligned_host_ptr, aligned_sz, 0);
+          bufCreated = 1;
+        }
       }
     }
 
@@ -823,6 +834,16 @@ _cl_mem_new_image(cl_context ctx,
 
 #undef DO_IMAGE_ERROR
 
+  uint8_t enableUserptr = 0;
+  if (ctx->device->host_unified_memory && data != NULL && (flags & CL_MEM_USE_HOST_PTR)) {
+    int cacheline_size = 0;
+    cl_get_device_info(ctx->device, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(cacheline_size), &cacheline_size, NULL);
+    if (ALIGN((unsigned long)data, cacheline_size) == (unsigned long)data) {  //might need more conditions here
+      tiling = CL_NO_TILE;
+      enableUserptr = 1;
+    }
+  }
+
   /* Tiling requires to align both pitch and height */
   if (tiling == CL_NO_TILE) {
     aligned_pitch = w * bpp;
@@ -861,8 +882,12 @@ _cl_mem_new_image(cl_context ctx,
   if (image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER) {
     if (image_type == CL_MEM_OBJECT_IMAGE2D && buffer != NULL)
       mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, NULL, buffer, &err);
-    else
-      mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, data, NULL, &err);
+    else {
+      if (enableUserptr)
+        mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, data, NULL, &err);
+      else
+        mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, NULL, NULL, &err);
+    }
   } else {
     mem = cl_mem_allocate(CL_MEM_BUFFER1D_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, NULL, NULL, &err);
     if (mem != NULL && err == CL_SUCCESS) {
@@ -892,13 +917,15 @@ _cl_mem_new_image(cl_context ctx,
                     0, 0, 0);
 
   /* Copy the data if required */
-  if (flags & (CL_MEM_COPY_HOST_PTR | CL_MEM_USE_HOST_PTR)) {
+  if (flags & CL_MEM_COPY_HOST_PTR)
     cl_mem_copy_image(cl_mem_image(mem), pitch, slice_pitch, data);
-    if (flags & CL_MEM_USE_HOST_PTR) {
-      mem->host_ptr = data;
-      cl_mem_image(mem)->host_row_pitch = pitch;
-      cl_mem_image(mem)->host_slice_pitch = slice_pitch;
-    }
+
+  if (flags & CL_MEM_USE_HOST_PTR) {
+    mem->host_ptr = data;
+    cl_mem_image(mem)->host_row_pitch = pitch;
+    cl_mem_image(mem)->host_slice_pitch = slice_pitch;
+    if (!enableUserptr)
+      cl_mem_copy_image(cl_mem_image(mem), pitch, slice_pitch, data);
   }
 
 exit:
-- 
1.9.1


