[Beignet] [PATCH V3 1/3] support CL_MEM_USE_HOST_PTR with userptr for cl buffer

Guo Yejun yejun.guo at intel.com
Fri Nov 7 00:18:54 PST 2014


userptr wraps a page-aligned memory pointer supplied by user space
into a buffer object that the GPU can access directly, so no extra
copy is needed. It is supported starting from Linux kernel 3.16
and libdrm 2.4.58.

This patch was originally written by Zhenyu Wang <zhenyuw at linux.intel.com>;
I made a few small changes and did some code cleanup.

No regressions were found on IVB + Ubuntu 14.10 (with libdrm upgraded) when running these tests:
beignet/utests, piglit, OpenCV/test&perf, conformance/basic&mem_host_flags&buffers

V2: add a page-alignment requirement for the data size; add comments for kernels without MMU_NOTIFIER
V3: add a runtime check of host_unified_memory; return CL_MEM_OBJECT_ALLOCATION_FAILURE if the allocation fails
Signed-off-by: Guo Yejun <yejun.guo at intel.com>
---
 CMakeLists.txt           | 11 +++++++++--
 src/CMakeLists.txt       |  5 +++++
 src/cl_api.c             | 10 +++++++---
 src/cl_driver.h          |  3 +++
 src/cl_driver_defs.c     |  1 +
 src/cl_enqueue.c         | 19 ++++++++++++-------
 src/cl_mem.c             | 37 ++++++++++++++++++++++++++++++++-----
 src/cl_mem.h             |  2 ++
 src/cl_mem_gl.c          |  2 +-
 src/intel/intel_driver.c | 15 +++++++++++++++
 10 files changed, 87 insertions(+), 18 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 40cb74c..15386f9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -108,7 +108,7 @@ ENDIF(X11_FOUND)
 # DRM
 pkg_check_modules(DRM REQUIRED libdrm)
 IF(DRM_FOUND)
-  MESSAGE(STATUS "Looking for DRM - found at ${DRM_PREFIX}")
+  MESSAGE(STATUS "Looking for DRM - found at ${DRM_PREFIX} ${DRM_VERSION}")
   INCLUDE_DIRECTORIES(${DRM_INCLUDE_DIRS})
 ELSE(DRM_FOUND)
   MESSAGE(STATUS "Looking for DRM - not found")
@@ -118,7 +118,14 @@ ENDIF(DRM_FOUND)
 pkg_check_modules(DRM_INTEL libdrm_intel>=2.4.52)
 IF(DRM_INTEL_FOUND)
   INCLUDE_DIRECTORIES(${DRM_INTEL_INCLUDE_DIRS})
-  MESSAGE(STATUS "Looking for DRM Intel - found at ${DRM_INTEL_PREFIX}")
+  MESSAGE(STATUS "Looking for DRM Intel - found at ${DRM_INTEL_PREFIX} ${DRM_INTEL_VERSION}")
+  #userptr support starts from 2.4.57, but 2.4.58 is the actual stable release
+  IF(DRM_INTEL_VERSION VERSION_GREATER 2.4.57)
+    MESSAGE(STATUS "Enable userptr support")
+    SET(DRM_INTEL_USERPTR "enable")
+  ELSE(DRM_INTEL_VERSION VERSION_GREATER 2.4.57)
+    MESSAGE(STATUS "Disable userptr support")
+  ENDIF(DRM_INTEL_VERSION VERSION_GREATER 2.4.57)
 ELSE(DRM_INTEL_FOUND)
   MESSAGE(FATAL_ERROR "Looking for DRM Intel (>= 2.4.52) - not found")
 ENDIF(DRM_INTEL_FOUND)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index fc5de89..7182bad 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -109,6 +109,11 @@ SET(CMAKE_CXX_FLAGS "-DHAS_OCLIcd ${CMAKE_CXX_FLAGS}")
 SET(CMAKE_C_FLAGS "-DHAS_OCLIcd ${CMAKE_C_FLAGS}")
 endif (OCLIcd_FOUND)
 
+if (DRM_INTEL_USERPTR)
+SET(CMAKE_CXX_FLAGS "-DHAS_USERPTR ${CMAKE_CXX_FLAGS}")
+SET(CMAKE_C_FLAGS "-DHAS_USERPTR ${CMAKE_C_FLAGS}")
+endif (DRM_INTEL_USERPTR)
+
 set(GIT_SHA1 "git_sha1.h")
 add_custom_target(${GIT_SHA1} ALL
   COMMAND chmod +x ${CMAKE_CURRENT_SOURCE_DIR}/git_sha1.sh
diff --git a/src/cl_api.c b/src/cl_api.c
index 05d3093..1f24638 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -2665,9 +2665,13 @@ clEnqueueMapBuffer(cl_command_queue  command_queue,
     ptr = data->ptr;
     if(event) cl_event_set_status(*event, CL_COMPLETE);
   } else {
-    if ((ptr = cl_mem_map_gtt_unsync(buffer)) == NULL) {
-      err = CL_MAP_FAILURE;
-      goto error;
+    if (buffer->is_userptr)
+      ptr = buffer->host_ptr;
+    else {
+      if ((ptr = cl_mem_map_gtt_unsync(buffer)) == NULL) {
+        err = CL_MAP_FAILURE;
+        goto error;
+      }
     }
   }
   err = _cl_map_mem(buffer, ptr, &mem_ptr, offset, size, NULL, NULL);
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 638b791..8697ff2 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -285,6 +285,9 @@ extern cl_gpgpu_walker_cb *cl_gpgpu_walker;
 typedef cl_buffer (cl_buffer_alloc_cb)(cl_buffer_mgr, const char*, size_t, size_t);
 extern cl_buffer_alloc_cb *cl_buffer_alloc;
 
+typedef cl_buffer (cl_buffer_alloc_userptr_cb)(cl_buffer_mgr, const char*, void *, size_t, unsigned long);
+extern cl_buffer_alloc_userptr_cb *cl_buffer_alloc_userptr;
+
 /* Set a buffer's tiling mode */
 typedef cl_buffer (cl_buffer_set_tiling_cb)(cl_buffer, int tiling, size_t stride);
 extern cl_buffer_set_tiling_cb *cl_buffer_set_tiling;
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index c31b6fc..1335c20 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -29,6 +29,7 @@ LOCAL cl_driver_get_device_id_cb *cl_driver_get_device_id = NULL;
 
 /* Buffer */
 LOCAL cl_buffer_alloc_cb *cl_buffer_alloc = NULL;
+LOCAL cl_buffer_alloc_userptr_cb *cl_buffer_alloc_userptr = NULL;
 LOCAL cl_buffer_set_tiling_cb *cl_buffer_set_tiling = NULL;
 LOCAL cl_buffer_alloc_from_texture_cb *cl_buffer_alloc_from_texture = NULL;
 LOCAL cl_buffer_release_from_texture_cb *cl_buffer_release_from_texture = NULL;
diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c
index db0bce7..5bdb7cd 100644
--- a/src/cl_enqueue.c
+++ b/src/cl_enqueue.c
@@ -234,11 +234,15 @@ cl_int cl_enqueue_map_buffer(enqueue_data *data)
          mem->type == CL_MEM_SUBBUFFER_TYPE);
   struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
 
-  if(data->unsync_map == 1)
-    //because using unsync map in clEnqueueMapBuffer, so force use map_gtt here
-    ptr = cl_mem_map_gtt(mem);
-  else
-    ptr = cl_mem_map_auto(mem, data->write_map ? 1 : 0);
+  if (mem->is_userptr)
+    ptr = mem->host_ptr;
+  else {
+    if(data->unsync_map == 1)
+      //because using unsync map in clEnqueueMapBuffer, so force use map_gtt here
+      ptr = cl_mem_map_gtt(mem);
+    else
+      ptr = cl_mem_map_auto(mem, data->write_map ? 1 : 0);
+  }
 
   if (ptr == NULL) {
     err = CL_MAP_FAILURE;
@@ -246,7 +250,7 @@ cl_int cl_enqueue_map_buffer(enqueue_data *data)
   }
   data->ptr = ptr;
 
-  if(mem->flags & CL_MEM_USE_HOST_PTR) {
+  if((mem->flags & CL_MEM_USE_HOST_PTR) && !mem->is_userptr) {
     assert(mem->host_ptr);
     ptr = (char*)ptr + data->offset + buffer->sub_offset;
     memcpy(mem->host_ptr + data->offset + buffer->sub_offset, ptr, data->size);
@@ -331,7 +335,8 @@ cl_int cl_enqueue_unmap_mem_object(enqueue_data *data)
       assert(mapped_ptr >= memobj->host_ptr &&
         mapped_ptr + mapped_size <= memobj->host_ptr + memobj->size);
       /* Sync the data. */
-      memcpy(v_ptr, mapped_ptr, mapped_size);
+      if (!memobj->is_userptr)
+        memcpy(v_ptr, mapped_ptr, mapped_size);
     } else {
       CHECK_IMAGE(memobj, image);
 
diff --git a/src/cl_mem.c b/src/cl_mem.c
index 16bd613..9e38670 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -33,6 +33,7 @@
 #include <assert.h>
 #include <stdio.h>
 #include <string.h>
+#include <unistd.h>
 
 #define FIELD_SIZE(CASE,TYPE)               \
   case JOIN(CL_,CASE):                      \
@@ -223,6 +224,7 @@ cl_mem_allocate(enum cl_mem_type type,
                 cl_mem_flags flags,
                 size_t sz,
                 cl_int is_tiled,
+                void *host_ptr,
                 cl_int *errcode)
 {
   cl_buffer_mgr bufmgr = NULL;
@@ -251,6 +253,7 @@ cl_mem_allocate(enum cl_mem_type type,
   mem->ref_n = 1;
   mem->magic = CL_MAGIC_MEM_HEADER;
   mem->flags = flags;
+  mem->is_userptr = 0;
 
   if (sz != 0) {
     /* Pinning will require stricter alignment rules */
@@ -260,7 +263,28 @@ cl_mem_allocate(enum cl_mem_type type,
     /* Allocate space in memory */
     bufmgr = cl_context_get_bufmgr(ctx);
     assert(bufmgr);
+
+#ifdef HAS_USERPTR
+    if (ctx->device->host_unified_memory) {
+      /* currently only cl buf is supported, will add cl image support later */
+      if ((flags & CL_MEM_USE_HOST_PTR) && host_ptr != NULL) {
+          /* userptr not support tiling */
+          if (!is_tiled) {
+              int page_size = getpagesize();
+              if ((((unsigned long)host_ptr | sz) & (page_size - 1)) == 0) {
+                mem->is_userptr = 1;
+                mem->bo = cl_buffer_alloc_userptr(bufmgr, "CL userptr memory object", host_ptr, sz, 0);
+              }
+          }
+      }
+    }
+
+    if (!mem->is_userptr)
+      mem->bo = cl_buffer_alloc(bufmgr, "CL memory object", sz, alignment);
+#else
     mem->bo = cl_buffer_alloc(bufmgr, "CL memory object", sz, alignment);
+#endif
+
     if (UNLIKELY(mem->bo == NULL)) {
       err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
       goto error;
@@ -387,12 +411,15 @@ cl_mem_new_buffer(cl_context ctx,
   sz = ALIGN(sz, 4);
 
   /* Create the buffer in video memory */
-  mem = cl_mem_allocate(CL_MEM_BUFFER_TYPE, ctx, flags, sz, CL_FALSE, &err);
+  mem = cl_mem_allocate(CL_MEM_BUFFER_TYPE, ctx, flags, sz, CL_FALSE, data, &err);
   if (mem == NULL || err != CL_SUCCESS)
     goto error;
 
   /* Copy the data if required */
-  if (flags & CL_MEM_COPY_HOST_PTR || flags & CL_MEM_USE_HOST_PTR)
+  if (flags & CL_MEM_COPY_HOST_PTR)
+    cl_buffer_subdata(mem->bo, 0, sz, data);
+
+  if ((flags & CL_MEM_USE_HOST_PTR) && !mem->is_userptr)
     cl_buffer_subdata(mem->bo, 0, sz, data);
 
   if (flags & CL_MEM_USE_HOST_PTR || flags & CL_MEM_COPY_HOST_PTR)
@@ -762,7 +789,7 @@ _cl_mem_new_image(cl_context ctx,
     sz = aligned_pitch * aligned_h * depth;
   }
 
-  mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, &err);
+  mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, NULL, &err);
   if (mem == NULL || err != CL_SUCCESS)
     goto error;
 
@@ -1834,7 +1861,7 @@ LOCAL cl_mem cl_mem_new_libva_buffer(cl_context ctx,
   cl_int err = CL_SUCCESS;
   cl_mem mem = NULL;
 
-  mem = cl_mem_allocate(CL_MEM_BUFFER_TYPE, ctx, 0, 0, CL_FALSE, &err);
+  mem = cl_mem_allocate(CL_MEM_BUFFER_TYPE, ctx, 0, 0, CL_FALSE, NULL, &err);
   if (mem == NULL || err != CL_SUCCESS)
     goto error;
 
@@ -1875,7 +1902,7 @@ LOCAL cl_mem cl_mem_new_libva_image(cl_context ctx,
     goto error;
   }
 
-  mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, 0, 0, 0, &err);
+  mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, 0, 0, 0, NULL, &err);
   if (mem == NULL || err != CL_SUCCESS) {
     err = CL_OUT_OF_HOST_MEMORY;
     goto error;
diff --git a/src/cl_mem.h b/src/cl_mem.h
index 95c5f05..2e9dd5a 100644
--- a/src/cl_mem.h
+++ b/src/cl_mem.h
@@ -92,6 +92,7 @@ typedef  struct _cl_mem {
   int map_ref;              /* The mapped count. */
   uint8_t mapped_gtt;       /* This object has mapped gtt, for unmap. */
   cl_mem_dstr_cb *dstr_cb;  /* The destroy callback. */
+  uint8_t is_userptr;    /* CL_MEM_USE_HOST_PTR is enabled*/
 } _cl_mem;
 
 struct _cl_mem_image {
@@ -262,6 +263,7 @@ cl_mem_allocate(enum cl_mem_type type,
                 cl_mem_flags flags,
                 size_t sz,
                 cl_int is_tiled,
+                void *host_ptr,
                 cl_int *errcode);
 
 void
diff --git a/src/cl_mem_gl.c b/src/cl_mem_gl.c
index 28d2ac6..3640908 100644
--- a/src/cl_mem_gl.c
+++ b/src/cl_mem_gl.c
@@ -63,7 +63,7 @@ cl_mem_new_gl_texture(cl_context ctx,
     goto error;
   }
 
-  mem = cl_mem_allocate(CL_MEM_GL_IMAGE_TYPE, ctx, flags, 0, 0, &err);
+  mem = cl_mem_allocate(CL_MEM_GL_IMAGE_TYPE, ctx, flags, 0, 0, NULL, &err);
   if (mem == NULL || err != CL_SUCCESS)
     goto error;
 
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
index bb97220..fc037cc 100644
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -690,6 +690,20 @@ cl_buffer intel_share_image_from_libva(cl_context ctx,
   return (cl_buffer)intel_bo;
 }
 
+static cl_buffer intel_buffer_alloc_userptr(cl_buffer_mgr bufmgr, const char* name, void *data,size_t size, unsigned long flags)
+{
+#ifdef HAS_USERPTR
+  drm_intel_bo *bo;
+  bo = drm_intel_bo_alloc_userptr((drm_intel_bufmgr *)bufmgr, name, data, I915_TILING_NONE, 0, size, flags);
+  /* If the first attempt fails, fall back to an unsynchronized userptr allocation (e.g. when the kernel has no MMU notifier enabled); the result may still be NULL. */
+  if (bo == NULL)
+    bo = drm_intel_bo_alloc_userptr((drm_intel_bufmgr *)bufmgr, name, data, I915_TILING_NONE, 0, size, flags | I915_USERPTR_UNSYNCHRONIZED);
+  return (cl_buffer)bo;
+#else
+  return NULL;
+#endif
+}
+
 static int32_t get_intel_tiling(cl_int tiling, uint32_t *intel_tiling)
 {
   switch (tiling) {
@@ -734,6 +748,7 @@ intel_setup_callbacks(void)
   cl_driver_get_bufmgr = (cl_driver_get_bufmgr_cb *) intel_driver_get_bufmgr;
   cl_driver_get_device_id = (cl_driver_get_device_id_cb *) intel_get_device_id;
   cl_buffer_alloc = (cl_buffer_alloc_cb *) drm_intel_bo_alloc;
+  cl_buffer_alloc_userptr = (cl_buffer_alloc_userptr_cb*) intel_buffer_alloc_userptr;
   cl_buffer_set_tiling = (cl_buffer_set_tiling_cb *) intel_buffer_set_tiling;
 #if defined(HAS_EGL)
   cl_buffer_alloc_from_texture = (cl_buffer_alloc_from_texture_cb *) intel_alloc_buffer_from_texture;
-- 
2.1.0



More information about the Beignet mailing list