[Beignet] [PATCH 4/4] Use pread/pwrite for buffer enqueue read/write

Zhenyu Wang zhenyuw at linux.intel.com
Tue Oct 21 20:27:58 PDT 2014


Instead of mmap, use pread/write interface for bo read/write with
optimized operations.

Result on one mem bandwidth benchmark for buffer enqueue read/write on HSW.

**** Host to device copy - workgroup_size=512

                     old code with mmap       new code with pread/pwrite

WG=512  SZ=  1 KiB     2325 MB/s               3336 MB/s
WG=512  SZ=  2 KiB     4479 MB/s	       6270 MB/s
WG=512  SZ=  4 KiB     8215 MB/s	      11808 MB/s
WG=512  SZ=  8 KiB    14271 MB/s	      19493 MB/s
WG=512  SZ= 16 KiB    16456 MB/s	      20079 MB/s
WG=512  SZ= 32 KiB    21136 MB/s	      22334 MB/s
WG=512  SZ= 64 KiB    24785 MB/s	      24792 MB/s
WG=512  SZ=128 KiB    24590 MB/s	      24908 MB/s
WG=512  SZ=256 KiB    17928 MB/s	      21435 MB/s
WG=512  SZ=512 KiB    18346 MB/s	      20583 MB/s
WG=512  SZ=  1 MiB    18558 MB/s	      20808 MB/s
WG=512  SZ=  2 MiB    18582 MB/s	      20939 MB/s
WG=512  SZ=  4 MiB    15382 MB/s	      18230 MB/s
WG=512  SZ=  8 MiB     7737 MB/s	      11558 MB/s
WG=512  SZ= 16 MiB     7073 MB/s	       8962 MB/s
WG=512  SZ= 32 MiB     6984 MB/s	       8302 MB/s
WG=512  SZ= 64 MiB     6938 MB/s	       8308 MB/s
WG=512  SZ=128 MiB     6950 MB/s	       8271 MB/s
WG=512  SZ=256 MiB     6941 MB/s	       8034 MB/s

**** Device to host copy - workgroup_size=512

                     old code with mmap       new code with pread/pwrite

WG=512  SZ=  1 KiB     2165 MB/s               2596 MB/s
WG=512  SZ=  2 KiB     4453 MB/s	       6154 MB/s
WG=512  SZ=  4 KiB     8211 MB/s	      11198 MB/s
WG=512  SZ=  8 KiB    14001 MB/s	      11319 MB/s
WG=512  SZ= 16 KiB    16218 MB/s	      14394 MB/s
WG=512  SZ= 32 KiB    21310 MB/s	      22438 MB/s
WG=512  SZ= 64 KiB    24738 MB/s	      25237 MB/s
WG=512  SZ=128 KiB    22784 MB/s	      25113 MB/s
WG=512  SZ=256 KiB    18018 MB/s	      22265 MB/s
WG=512  SZ=512 KiB    18429 MB/s	      21687 MB/s
WG=512  SZ=  1 MiB    18654 MB/s	      21856 MB/s
WG=512  SZ=  2 MiB    18655 MB/s	      21998 MB/s
WG=512  SZ=  4 MiB    15098 MB/s	      17563 MB/s
WG=512  SZ=  8 MiB     7737 MB/s	      11170 MB/s
WG=512  SZ= 16 MiB     6989 MB/s	       8812 MB/s
WG=512  SZ= 32 MiB     6891 MB/s	       7597 MB/s
WG=512  SZ= 64 MiB     6868 MB/s	       7798 MB/s
WG=512  SZ=128 MiB     6838 MB/s	       7630 MB/s
WG=512  SZ=256 MiB     6842 MB/s	       7466 MB/s

Signed-off-by: Zhenyu Wang <zhenyuw at linux.intel.com>
---
 src/cl_driver.h          |  4 ++++
 src/cl_driver_defs.c     |  1 +
 src/cl_enqueue.c         | 30 ++++--------------------------
 src/intel/intel_driver.c |  1 +
 4 files changed, 10 insertions(+), 26 deletions(-)

diff --git a/src/cl_driver.h b/src/cl_driver.h
index e973ba5..53d2917 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -353,6 +353,10 @@ extern cl_buffer_unpin_cb *cl_buffer_unpin;
 typedef int (cl_buffer_subdata_cb)(cl_buffer, unsigned long, unsigned long, const void*);
 extern cl_buffer_subdata_cb *cl_buffer_subdata;
 
+/* Get data from buffer */
+typedef int (cl_buffer_get_subdata_cb)(cl_buffer, unsigned long, unsigned long, void*);
+extern cl_buffer_get_subdata_cb *cl_buffer_get_subdata;
+
 /* Wait for all pending rendering for this buffer to complete */
 typedef int (cl_buffer_wait_rendering_cb) (cl_buffer);
 extern cl_buffer_wait_rendering_cb *cl_buffer_wait_rendering;
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index 72f25d9..241101f 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -44,6 +44,7 @@ LOCAL cl_buffer_get_size_cb *cl_buffer_get_size = NULL;
 LOCAL cl_buffer_pin_cb *cl_buffer_pin = NULL;
 LOCAL cl_buffer_unpin_cb *cl_buffer_unpin = NULL;
 LOCAL cl_buffer_subdata_cb *cl_buffer_subdata = NULL;
+LOCAL cl_buffer_get_subdata_cb *cl_buffer_get_subdata = NULL;
 LOCAL cl_buffer_wait_rendering_cb *cl_buffer_wait_rendering = NULL;
 LOCAL cl_buffer_get_buffer_from_libva_cb *cl_buffer_get_buffer_from_libva = NULL;
 LOCAL cl_buffer_get_image_from_libva_cb *cl_buffer_get_image_from_libva = NULL;
diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c
index 2e43122..db0bce7 100644
--- a/src/cl_enqueue.c
+++ b/src/cl_enqueue.c
@@ -31,24 +31,13 @@
 
 cl_int cl_enqueue_read_buffer(enqueue_data* data)
 {
-  cl_int err = CL_SUCCESS;
   cl_mem mem = data->mem_obj;
   assert(mem->type == CL_MEM_BUFFER_TYPE ||
          mem->type == CL_MEM_SUBBUFFER_TYPE);
-  void* src_ptr;
   struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
 
-  if (!(src_ptr = cl_mem_map_auto(data->mem_obj, 0))) {
-    err = CL_MAP_FAILURE;
-    goto error;
-  }
-
-  memcpy(data->ptr, (char*)src_ptr + data->offset + buffer->sub_offset, data->size);
-
-  err = cl_mem_unmap_auto(data->mem_obj);
-
-error:
-  return err;
+  return cl_buffer_get_subdata(mem->bo, data->offset + buffer->sub_offset,
+			       data->size, data->ptr);
 }
 
 cl_int cl_enqueue_read_buffer_rect(enqueue_data* data)
@@ -105,24 +94,13 @@ error:
 
 cl_int cl_enqueue_write_buffer(enqueue_data *data)
 {
-  cl_int err = CL_SUCCESS;
   cl_mem mem = data->mem_obj;
   assert(mem->type == CL_MEM_BUFFER_TYPE ||
          mem->type == CL_MEM_SUBBUFFER_TYPE);
   struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
-  void* dst_ptr;
-
-  if (!(dst_ptr = cl_mem_map_auto(data->mem_obj, 1))) {
-    err = CL_MAP_FAILURE;
-    goto error;
-  }
-
-  memcpy((char*)dst_ptr + data->offset + buffer->sub_offset, data->const_ptr, data->size);
-
-  err = cl_mem_unmap_auto(data->mem_obj);
 
-error:
-  return err;
+  return cl_buffer_subdata(mem->bo, data->offset + buffer->sub_offset,
+			   data->size, data->const_ptr);
 }
 
 cl_int cl_enqueue_write_buffer_rect(enqueue_data *data)
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
index 2c2ed5f..e47109c 100644
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -739,6 +739,7 @@ intel_setup_callbacks(void)
   cl_buffer_pin = (cl_buffer_pin_cb *) drm_intel_bo_pin;
   cl_buffer_unpin = (cl_buffer_unpin_cb *) drm_intel_bo_unpin;
   cl_buffer_subdata = (cl_buffer_subdata_cb *) drm_intel_bo_subdata;
+  cl_buffer_get_subdata = (cl_buffer_get_subdata_cb *) drm_intel_bo_get_subdata;
   cl_buffer_wait_rendering = (cl_buffer_wait_rendering_cb *) drm_intel_bo_wait_rendering;
   cl_buffer_get_fd = (cl_buffer_get_fd_cb *) drm_intel_bo_gem_export_to_prime;
   intel_set_gpgpu_callbacks(intel_get_device_id());
-- 
2.1.1



More information about the Beignet mailing list