[Beignet] [PATCH] Benchmark 3 ways of uchar/uchar4 access.

Chuanbo Weng chuanbo.weng at intel.com
Thu Nov 27 18:47:30 PST 2014


Add three benchmarks that compare the performance of different ways of
accessing uchar/uchar4 data in global memory: direct uchar4 access,
per-element uchar access, and vload4/vstore4.

Signed-off-by: Chuanbo Weng <chuanbo.weng at intel.com>
---
 benchmark/CMakeLists.txt                  |  5 +++-
 benchmark/uchar4_direct_access.cpp        | 47 +++++++++++++++++++++++++++++++
 benchmark/uchar_direct_access.cpp         | 47 +++++++++++++++++++++++++++++++
 benchmark/uchar_vload4_vstore4_access.cpp | 47 +++++++++++++++++++++++++++++++
 kernels/uchar4_direct_access.cl           |  9 ++++++
 kernels/uchar_direct_access.cl            | 18 ++++++++++++
 kernels/uchar_vload4_vstore4_access.cl    | 10 +++++++
 utests/utest.hpp                          |  2 +-
 8 files changed, 183 insertions(+), 2 deletions(-)
 create mode 100644 benchmark/uchar4_direct_access.cpp
 create mode 100644 benchmark/uchar_direct_access.cpp
 create mode 100644 benchmark/uchar_vload4_vstore4_access.cpp
 create mode 100644 kernels/uchar4_direct_access.cl
 create mode 100644 kernels/uchar_direct_access.cl
 create mode 100644 kernels/uchar_vload4_vstore4_access.cl

diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index 0a959c8..82476c6 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -11,7 +11,10 @@ set (benchmark_sources
   ../utests/utest_file_map.cpp
   ../utests/utest_helper.cpp
   ../utests/vload_bench.cpp
-  enqueue_copy_buf.cpp)
+  enqueue_copy_buf.cpp
+  uchar_direct_access.cpp
+  uchar4_direct_access.cpp
+  uchar_vload4_vstore4_access.cpp)
 
 
 SET(CMAKE_CXX_FLAGS "-DBUILD_BENCHMARK ${CMAKE_CXX_FLAGS}")
diff --git a/benchmark/uchar4_direct_access.cpp b/benchmark/uchar4_direct_access.cpp
new file mode 100644
index 0000000..59db794
--- /dev/null
+++ b/benchmark/uchar4_direct_access.cpp
@@ -0,0 +1,57 @@
+#include "utests/utest_helper.hpp"
+#include <sys/time.h>
+#include <vector>
+
+// Size in bytes of the source and destination buffers. Parenthesized so the
+// macro expands safely inside any expression (e.g. after a cast or a division).
+#define BUF_SZ (2048*1024)
+
+// Benchmark for the "uchar4_direct_access" kernel: BUF_SZ/4 work-items are
+// launched over two BUF_SZ-byte buffers and the output is checked against
+// 255 - input for every byte. Returns the wall-clock time, in milliseconds
+// (truncated to int), measured around OCL_NDRANGE(1) and OCL_FINISH().
+static int uchar4_direct_access(void)
+{
+  // Host-side copies live on the heap: two BUF_SZ-byte automatic arrays would
+  // put 4 MiB on the stack, which can overflow it.
+  std::vector<unsigned char> cpu_dst(BUF_SZ), cpu_src(BUF_SZ);
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("uchar4_direct_access");
+  OCL_CREATE_BUFFER(buf[0], 0, BUF_SZ, NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, BUF_SZ, NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = BUF_SZ/4;  // each work-item covers 4 bytes
+  locals[0] = 1;
+
+  // Fill the source buffer with random bytes, keeping a host-side copy
+  OCL_MAP_BUFFER(0);
+  for (int32_t i = 0; i < (int32_t) BUF_SZ; ++i)
+    cpu_src[i] = ((unsigned char*)buf_data[0])[i] = (rand() & 255);
+  OCL_UNMAP_BUFFER(0);
+
+  struct timeval start,stop;
+  gettimeofday(&start,0);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  OCL_FINISH();
+  gettimeofday(&stop,0);
+
+  // Run on CPU
+  for (int32_t i = 0; i < (int32_t) BUF_SZ; ++i)
+    cpu_dst[i] = 255 - cpu_src[i];
+
+  // Compare
+  OCL_MAP_BUFFER(1);
+  for (int32_t i = 0; i < (int32_t) BUF_SZ; ++i)
+    OCL_ASSERT(((unsigned char *)buf_data[1])[i] == cpu_dst[i]);
+  OCL_UNMAP_BUFFER(1);
+
+  int msec = 1000.0*(stop.tv_sec - start.tv_sec) + (stop.tv_usec - start.tv_usec)/1000.0;
+  return msec;
+}
+
+MAKE_BENCHMARK_FROM_FUNCTION(uchar4_direct_access);
diff --git a/benchmark/uchar_direct_access.cpp b/benchmark/uchar_direct_access.cpp
new file mode 100644
index 0000000..5363e13
--- /dev/null
+++ b/benchmark/uchar_direct_access.cpp
@@ -0,0 +1,57 @@
+#include "utests/utest_helper.hpp"
+#include <sys/time.h>
+#include <vector>
+
+// Size in bytes of the source and destination buffers. Parenthesized so the
+// macro expands safely inside any expression (e.g. after a cast or a division).
+#define BUF_SZ (2048*1024)
+
+// Benchmark for the "uchar_direct_access" kernel: BUF_SZ/4 work-items are
+// launched over two BUF_SZ-byte buffers and the output is checked against
+// 255 - input for every byte. Returns the wall-clock time, in milliseconds
+// (truncated to int), measured around OCL_NDRANGE(1) and OCL_FINISH().
+static int uchar_direct_access(void)
+{
+  // Host-side copies live on the heap: two BUF_SZ-byte automatic arrays would
+  // put 4 MiB on the stack, which can overflow it.
+  std::vector<unsigned char> cpu_dst(BUF_SZ), cpu_src(BUF_SZ);
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("uchar_direct_access");
+  OCL_CREATE_BUFFER(buf[0], 0, BUF_SZ, NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, BUF_SZ, NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = BUF_SZ/4;  // each work-item covers 4 bytes
+  locals[0] = 1;
+
+  // Fill the source buffer with random bytes, keeping a host-side copy
+  OCL_MAP_BUFFER(0);
+  for (int32_t i = 0; i < (int32_t) BUF_SZ; ++i)
+    cpu_src[i] = ((unsigned char*)buf_data[0])[i] = (rand() & 255);
+  OCL_UNMAP_BUFFER(0);
+
+  struct timeval start,stop;
+  gettimeofday(&start,0);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  OCL_FINISH();
+  gettimeofday(&stop,0);
+
+  // Run on CPU
+  for (int32_t i = 0; i < (int32_t) BUF_SZ; ++i)
+    cpu_dst[i] = 255 - cpu_src[i];
+
+  // Compare
+  OCL_MAP_BUFFER(1);
+  for (int32_t i = 0; i < (int32_t) BUF_SZ; ++i)
+    OCL_ASSERT(((unsigned char *)buf_data[1])[i] == cpu_dst[i]);
+  OCL_UNMAP_BUFFER(1);
+
+  int msec = 1000.0*(stop.tv_sec - start.tv_sec) + (stop.tv_usec - start.tv_usec)/1000.0;
+  return msec;
+}
+
+MAKE_BENCHMARK_FROM_FUNCTION(uchar_direct_access);
diff --git a/benchmark/uchar_vload4_vstore4_access.cpp b/benchmark/uchar_vload4_vstore4_access.cpp
new file mode 100644
index 0000000..82a6aa9
--- /dev/null
+++ b/benchmark/uchar_vload4_vstore4_access.cpp
@@ -0,0 +1,57 @@
+#include "utests/utest_helper.hpp"
+#include <sys/time.h>
+#include <vector>
+
+// Size in bytes of the source and destination buffers. Parenthesized so the
+// macro expands safely inside any expression (e.g. after a cast or a division).
+#define BUF_SZ (2048*1024)
+
+// Benchmark for the "uchar_vload4_vstore4_access" kernel: BUF_SZ/4 work-items
+// are launched over two BUF_SZ-byte buffers and the output is checked against
+// 255 - input for every byte. Returns the wall-clock time, in milliseconds
+// (truncated to int), measured around OCL_NDRANGE(1) and OCL_FINISH().
+static int uchar_vload4_vstore4_access(void)
+{
+  // Host-side copies live on the heap: two BUF_SZ-byte automatic arrays would
+  // put 4 MiB on the stack, which can overflow it.
+  std::vector<unsigned char> cpu_dst(BUF_SZ), cpu_src(BUF_SZ);
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("uchar_vload4_vstore4_access");
+  OCL_CREATE_BUFFER(buf[0], 0, BUF_SZ, NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, BUF_SZ, NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = BUF_SZ/4;  // each work-item covers 4 bytes
+  locals[0] = 1;
+
+  // Fill the source buffer with random bytes, keeping a host-side copy
+  OCL_MAP_BUFFER(0);
+  for (int32_t i = 0; i < (int32_t) BUF_SZ; ++i)
+    cpu_src[i] = ((unsigned char*)buf_data[0])[i] = (rand() & 255);
+  OCL_UNMAP_BUFFER(0);
+
+  struct timeval start,stop;
+  gettimeofday(&start,0);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  OCL_FINISH();
+  gettimeofday(&stop,0);
+
+  // Run on CPU
+  for (int32_t i = 0; i < (int32_t) BUF_SZ; ++i)
+    cpu_dst[i] = 255 - cpu_src[i];
+
+  // Compare
+  OCL_MAP_BUFFER(1);
+  for (int32_t i = 0; i < (int32_t) BUF_SZ; ++i)
+    OCL_ASSERT(((unsigned char *)buf_data[1])[i] == cpu_dst[i]);
+  OCL_UNMAP_BUFFER(1);
+
+  int msec = 1000.0*(stop.tv_sec - start.tv_sec) + (stop.tv_usec - start.tv_usec)/1000.0;
+  return msec;
+}
+
+MAKE_BENCHMARK_FROM_FUNCTION(uchar_vload4_vstore4_access);
diff --git a/kernels/uchar4_direct_access.cl b/kernels/uchar4_direct_access.cl
new file mode 100644
index 0000000..8d367bf
--- /dev/null
+++ b/kernels/uchar4_direct_access.cl
@@ -0,0 +1,9 @@
+/* Each work-item loads one uchar4 from src, subtracts every component
+ * from 255 and stores the result at the same index in dst. */
+__kernel
+void uchar4_direct_access(__global uchar4 *src, __global uchar4 *dst){
+  const int gid = (int)get_global_id(0);
+  const uchar4 loaded = src[gid];
+
+  dst[gid] = (uchar4)255 - loaded;
+}
diff --git a/kernels/uchar_direct_access.cl b/kernels/uchar_direct_access.cl
new file mode 100644
index 0000000..f64a8df
--- /dev/null
+++ b/kernels/uchar_direct_access.cl
@@ -0,0 +1,24 @@
+/* Each work-item processes the four consecutive bytes starting at element
+ * 4 * get_global_id(0): they are read one uchar at a time, subtracted from
+ * 255, and written back one uchar at a time. */
+__kernel
+void uchar_direct_access(__global uchar *src, __global uchar *dst){
+  const int first = (int)get_global_id(0) * 4;
+  __global uchar* in = src + first;
+  __global uchar* out = dst + first;
+
+  /* Four separate scalar loads. */
+  uchar4 v;
+  v.x = in[0];
+  v.y = in[1];
+  v.z = in[2];
+  v.w = in[3];
+
+  v = (uchar4)255 - v;
+
+  /* Four separate scalar stores. */
+  out[0] = v.x;
+  out[1] = v.y;
+  out[2] = v.z;
+  out[3] = v.w;
+}
diff --git a/kernels/uchar_vload4_vstore4_access.cl b/kernels/uchar_vload4_vstore4_access.cl
new file mode 100644
index 0000000..235b5a9
--- /dev/null
+++ b/kernels/uchar_vload4_vstore4_access.cl
@@ -0,0 +1,14 @@
+/* Each work-item reads its four bytes with vload4, subtracts every
+ * component from 255, and writes the result back with vstore4. The uchar4
+ * pointers are advanced to the work-item's element and then viewed as
+ * uchar pointers, so both calls use offset 0. */
+__kernel
+void uchar_vload4_vstore4_access(__global uchar4 *src, __global uchar4 *dst){
+  const int gid = (int)get_global_id(0);
+  __global uchar* in = (__global uchar*)(src + gid);
+  __global uchar* out = (__global uchar*)(dst + gid);
+
+  const uchar4 loaded = vload4(0, in);
+
+  vstore4((uchar4)255 - loaded, 0, out);
+}
diff --git a/utests/utest.hpp b/utests/utest.hpp
index 0dc611d..5d259f9 100644
--- a/utests/utest.hpp
+++ b/utests/utest.hpp
@@ -137,7 +137,7 @@ struct UTest
     int ret = 0;\
     try { \
       ret = EXPR; \
-      std::cout << "    [Result: " << ret << "]    [SUCCESS]" << std::endl; \
+      std::cout << "    [Result: " << ret << " ms]    [SUCCESS]" << std::endl; \
       UTest::retStatistics.passCount += 1; \
     } \
     catch (Exception e) { \
-- 
1.9.1



More information about the Beignet mailing list