[Beignet] [PATCH 4/4] Use pread/pwrite for buffer enqueue read/write
Zhenyu Wang
zhenyuw at linux.intel.com
Tue Oct 21 20:27:58 PDT 2014
Instead of mmap, use pread/write interface for bo read/write with
optimized operations.
Result on one mem bandwidth benchmark for buffer enqueue read/write on HSW.
**** Host to device copy - workgroup_size=512
old code with mmap new code with pread/pwrite
WG=512 SZ= 1 KiB 2325 MB/s 3336 MB/s
WG=512 SZ= 2 KiB 4479 MB/s 6270 MB/s
WG=512 SZ= 4 KiB 8215 MB/s 11808 MB/s
WG=512 SZ= 8 KiB 14271 MB/s 19493 MB/s
WG=512 SZ= 16 KiB 16456 MB/s 20079 MB/s
WG=512 SZ= 32 KiB 21136 MB/s 22334 MB/s
WG=512 SZ= 64 KiB 24785 MB/s 24792 MB/s
WG=512 SZ=128 KiB 24590 MB/s 24908 MB/s
WG=512 SZ=256 KiB 17928 MB/s 21435 MB/s
WG=512 SZ=512 KiB 18346 MB/s 20583 MB/s
WG=512 SZ= 1 MiB 18558 MB/s 20808 MB/s
WG=512 SZ= 2 MiB 18582 MB/s 20939 MB/s
WG=512 SZ= 4 MiB 15382 MB/s 18230 MB/s
WG=512 SZ= 8 MiB 7737 MB/s 11558 MB/s
WG=512 SZ= 16 MiB 7073 MB/s 8962 MB/s
WG=512 SZ= 32 MiB 6984 MB/s 8302 MB/s
WG=512 SZ= 64 MiB 6938 MB/s 8308 MB/s
WG=512 SZ=128 MiB 6950 MB/s 8271 MB/s
WG=512 SZ=256 MiB 6941 MB/s 8034 MB/s
**** Device to host copy - workgroup_size=512
old code with mmap new code with pread/pwrite
WG=512 SZ= 1 KiB 2165 MB/s 2596 MB/s
WG=512 SZ= 2 KiB 4453 MB/s 6154 MB/s
WG=512 SZ= 4 KiB 8211 MB/s 11198 MB/s
WG=512 SZ= 8 KiB 14001 MB/s 11319 MB/s
WG=512 SZ= 16 KiB 16218 MB/s 14394 MB/s
WG=512 SZ= 32 KiB 21310 MB/s 22438 MB/s
WG=512 SZ= 64 KiB 24738 MB/s 25237 MB/s
WG=512 SZ=128 KiB 22784 MB/s 25113 MB/s
WG=512 SZ=256 KiB 18018 MB/s 22265 MB/s
WG=512 SZ=512 KiB 18429 MB/s 21687 MB/s
WG=512 SZ= 1 MiB 18654 MB/s 21856 MB/s
WG=512 SZ= 2 MiB 18655 MB/s 21998 MB/s
WG=512 SZ= 4 MiB 15098 MB/s 17563 MB/s
WG=512 SZ= 8 MiB 7737 MB/s 11170 MB/s
WG=512 SZ= 16 MiB 6989 MB/s 8812 MB/s
WG=512 SZ= 32 MiB 6891 MB/s 7597 MB/s
WG=512 SZ= 64 MiB 6868 MB/s 7798 MB/s
WG=512 SZ=128 MiB 6838 MB/s 7630 MB/s
WG=512 SZ=256 MiB 6842 MB/s 7466 MB/s
Signed-off-by: Zhenyu Wang <zhenyuw at linux.intel.com>
---
src/cl_driver.h | 4 ++++
src/cl_driver_defs.c | 1 +
src/cl_enqueue.c | 30 ++++--------------------------
src/intel/intel_driver.c | 1 +
4 files changed, 10 insertions(+), 26 deletions(-)
diff --git a/src/cl_driver.h b/src/cl_driver.h
index e973ba5..53d2917 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -353,6 +353,10 @@ extern cl_buffer_unpin_cb *cl_buffer_unpin;
typedef int (cl_buffer_subdata_cb)(cl_buffer, unsigned long, unsigned long, const void*);
extern cl_buffer_subdata_cb *cl_buffer_subdata;
+/* Get data from buffer */
+typedef int (cl_buffer_get_subdata_cb)(cl_buffer, unsigned long, unsigned long, void*);
+extern cl_buffer_get_subdata_cb *cl_buffer_get_subdata;
+
/* Wait for all pending rendering for this buffer to complete */
typedef int (cl_buffer_wait_rendering_cb) (cl_buffer);
extern cl_buffer_wait_rendering_cb *cl_buffer_wait_rendering;
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index 72f25d9..241101f 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -44,6 +44,7 @@ LOCAL cl_buffer_get_size_cb *cl_buffer_get_size = NULL;
LOCAL cl_buffer_pin_cb *cl_buffer_pin = NULL;
LOCAL cl_buffer_unpin_cb *cl_buffer_unpin = NULL;
LOCAL cl_buffer_subdata_cb *cl_buffer_subdata = NULL;
+LOCAL cl_buffer_get_subdata_cb *cl_buffer_get_subdata = NULL;
LOCAL cl_buffer_wait_rendering_cb *cl_buffer_wait_rendering = NULL;
LOCAL cl_buffer_get_buffer_from_libva_cb *cl_buffer_get_buffer_from_libva = NULL;
LOCAL cl_buffer_get_image_from_libva_cb *cl_buffer_get_image_from_libva = NULL;
diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c
index 2e43122..db0bce7 100644
--- a/src/cl_enqueue.c
+++ b/src/cl_enqueue.c
@@ -31,24 +31,13 @@
cl_int cl_enqueue_read_buffer(enqueue_data* data)
{
- cl_int err = CL_SUCCESS;
cl_mem mem = data->mem_obj;
assert(mem->type == CL_MEM_BUFFER_TYPE ||
mem->type == CL_MEM_SUBBUFFER_TYPE);
- void* src_ptr;
struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
- if (!(src_ptr = cl_mem_map_auto(data->mem_obj, 0))) {
- err = CL_MAP_FAILURE;
- goto error;
- }
-
- memcpy(data->ptr, (char*)src_ptr + data->offset + buffer->sub_offset, data->size);
-
- err = cl_mem_unmap_auto(data->mem_obj);
-
-error:
- return err;
+ return cl_buffer_get_subdata(mem->bo, data->offset + buffer->sub_offset,
+ data->size, data->ptr);
}
cl_int cl_enqueue_read_buffer_rect(enqueue_data* data)
@@ -105,24 +94,13 @@ error:
cl_int cl_enqueue_write_buffer(enqueue_data *data)
{
- cl_int err = CL_SUCCESS;
cl_mem mem = data->mem_obj;
assert(mem->type == CL_MEM_BUFFER_TYPE ||
mem->type == CL_MEM_SUBBUFFER_TYPE);
struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
- void* dst_ptr;
-
- if (!(dst_ptr = cl_mem_map_auto(data->mem_obj, 1))) {
- err = CL_MAP_FAILURE;
- goto error;
- }
-
- memcpy((char*)dst_ptr + data->offset + buffer->sub_offset, data->const_ptr, data->size);
-
- err = cl_mem_unmap_auto(data->mem_obj);
-error:
- return err;
+ return cl_buffer_subdata(mem->bo, data->offset + buffer->sub_offset,
+ data->size, data->const_ptr);
}
cl_int cl_enqueue_write_buffer_rect(enqueue_data *data)
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
index 2c2ed5f..e47109c 100644
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -739,6 +739,7 @@ intel_setup_callbacks(void)
cl_buffer_pin = (cl_buffer_pin_cb *) drm_intel_bo_pin;
cl_buffer_unpin = (cl_buffer_unpin_cb *) drm_intel_bo_unpin;
cl_buffer_subdata = (cl_buffer_subdata_cb *) drm_intel_bo_subdata;
+ cl_buffer_get_subdata = (cl_buffer_get_subdata_cb *) drm_intel_bo_get_subdata;
cl_buffer_wait_rendering = (cl_buffer_wait_rendering_cb *) drm_intel_bo_wait_rendering;
cl_buffer_get_fd = (cl_buffer_get_fd_cb *) drm_intel_bo_gem_export_to_prime;
intel_set_gpgpu_callbacks(intel_get_device_id());
--
2.1.1
More information about the Beignet
mailing list