[Beignet] [PATCH 1/2] Add new vload benchmark/test case.

Zhigang Gong zhigang.gong at intel.com
Tue Aug 26 21:12:43 PDT 2014


Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
 kernels/vload_bench.cl  | 33 +++++++++++++++++++
 utests/CMakeLists.txt   |  1 +
 utests/utest_helper.cpp |  5 +--
 utests/vload_bench.cpp  | 85 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 122 insertions(+), 2 deletions(-)
 create mode 100644 kernels/vload_bench.cl
 create mode 100644 utests/vload_bench.cpp

diff --git a/kernels/vload_bench.cl b/kernels/vload_bench.cl
new file mode 100644
index 0000000..c906c75
--- /dev/null
+++ b/kernels/vload_bench.cl
@@ -0,0 +1,33 @@
+#define VLOAD_BENCH(T, N, M) /* Defines kernel vload_bench_<M><T><N>: M vector loads of width N over elements of type T. */ \
+__kernel void \
+vload_bench_##M ##T ##N(__global T* src, __global uint* dst, uint offset) \
+{ \
+  int id = (int)get_global_id(0); \
+  uint ##N srcV = 0; /* accumulator: one uint lane per loaded component */ \
+  for(int i = 0; i < M; i++) \
+  { /* iteration i loads vector number id + i counted from src + offset; the mask keeps the low 16 bits of i */ \
+    srcV += convert_uint ##N(vload ##N(id + (i & 0xFFFF), src + offset)); \
+  } \
+  vstore ##N(srcV, id, dst);\
+  /* Result: dst receives, at vector index id, the per-lane uint sum of the M loaded vectors. */ \
+  /* For M <= 0x10000 the loads reach element N * (id + M - 1) + N - 1 past src + offset. */ \
+}
+
+#define VLOAD_BENCH_ALL_VECTOR(ELEM, ITERS) /* one kernel per vector width: 2, 3, 4, 8, 16 */ \
+  VLOAD_BENCH(ELEM,  2, ITERS) \
+  VLOAD_BENCH(ELEM,  3, ITERS) \
+  VLOAD_BENCH(ELEM,  4, ITERS) \
+  VLOAD_BENCH(ELEM,  8, ITERS) \
+  VLOAD_BENCH(ELEM, 16, ITERS)
+
+#define VLOAD_BENCH_ALL_TYPES(ITERS) /* every vector width for each element type below */ \
+  VLOAD_BENCH_ALL_VECTOR(uchar,  ITERS) \
+  VLOAD_BENCH_ALL_VECTOR(char,   ITERS) \
+  VLOAD_BENCH_ALL_VECTOR(ushort, ITERS) \
+  VLOAD_BENCH_ALL_VECTOR(short,  ITERS) \
+  VLOAD_BENCH_ALL_VECTOR(uint,   ITERS) \
+  VLOAD_BENCH_ALL_VECTOR(int,    ITERS) \
+  VLOAD_BENCH_ALL_VECTOR(float,  ITERS)
+
+VLOAD_BENCH_ALL_TYPES(1)      // kernels named vload_bench_1<type><width>
+VLOAD_BENCH_ALL_TYPES(10000)  // kernels named vload_bench_10000<type><width>
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 721e6f7..b30e6f9 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -184,6 +184,7 @@ set (utests_sources
   image_1D_buffer.cpp
   compare_image_2d_and_1d_array.cpp
   compiler_constant_expr.cpp
+  vload_bench.cpp
   utest_assert.cpp
   utest.cpp
   utest_file_map.cpp
diff --git a/utests/utest_helper.cpp b/utests/utest_helper.cpp
index cb4dd66..e4badea 100644
--- a/utests/utest_helper.cpp
+++ b/utests/utest_helper.cpp
@@ -262,9 +262,10 @@ cl_kernel_init(const char *file_name, const char *kernel_name, int format, const
       goto error;
     }
     prevFileName = file_name;
+
+    /* OCL requires to build the program even if it is created from a binary */
+    OCL_CALL (clBuildProgram, program, 1, &device, build_opt, NULL, NULL);
   }
-  /* OCL requires to build the program even if it is created from a binary */
-  OCL_CALL (clBuildProgram, program, 1, &device, build_opt, NULL, NULL);
 
   /* Create a kernel from the program */
   if (kernel)
diff --git a/utests/vload_bench.cpp b/utests/vload_bench.cpp
new file mode 100644
index 0000000..2acc063
--- /dev/null
+++ b/utests/vload_bench.cpp
@@ -0,0 +1,85 @@
+#include "utest_helper.hpp"
+#include <sys/time.h>
+
+#define N_ITERATIONS 10000
+
+// vload_bench<T>: build kernel "<kernelFunc><N>", run it on T data read `offset` elements in, then verify dst (or print GB/s when benchMode).
+template <typename T>
+static void vload_bench(const char *kernelFunc, uint32_t N, uint32_t offset, bool benchMode)
+{
+  const size_t n = benchMode ? (512 * 1024) : (8 * 1024);
+  struct timeval start, end;
+  const size_t srcN = n + offset + (benchMode ? (size_t)N * (N_ITERATIONS - 1) : 0); // kernel reads src[offset + N*(id+i) ..]; pad so no load leaves buf[0]
+  // Setup kernel and buffers
+  std::string kernelName = kernelFunc + std::to_string(N);
+  OCL_CALL (cl_kernel_init, "vload_bench.cl", kernelName.c_str(), SOURCE, NULL);
+  buf_data[0] = (T*) malloc(sizeof(T) * srcN);
+  OCL_ASSERT(buf_data[0] != NULL);
+  for (uint32_t i = 0; i < srcN; ++i) ((T*)buf_data[0])[i] = i; // element i holds i converted to T
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, srcN * sizeof(T), buf_data[0]);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(uint32_t), &offset);
+  globals[0] = n / ((N + 1) & ~0x1); // one work-item per vector; N is rounded up to an even divisor (3 -> 4)
+  locals[0] = 256;
+  if (benchMode)
+    gettimeofday(&start, NULL);
+  OCL_NDRANGE(1);
+  if (benchMode) {
+    OCL_FINISH();
+    gettimeofday(&end, NULL);
+    double elapsed = (end.tv_sec - start.tv_sec) * 1e6 + (end.tv_usec - start.tv_usec); // microseconds
+    double bandwidth = (globals[0] * (N_ITERATIONS) * sizeof(T) * N) / elapsed; // bytes per microsecond; assumes the kernel loops N_ITERATIONS times
+    printf("\t%2.1fGB/S\n", bandwidth/1000.);
+  } else {
+    // Check result: with a single load per work-item, dst[i] must equal src[i + offset]
+    OCL_MAP_BUFFER(0);
+    OCL_MAP_BUFFER(1);
+    for (uint32_t i = 0; i < globals[0]; ++i) {
+      OCL_ASSERT(((T*)buf_data[0])[i + offset] == ((uint32_t*)buf_data[1])[i]);
+    }
+  }
+}
+
+#define VLOAD_TEST(T, kT) /* Defines and registers utest vload_test_<kT>: checks the 1-load kernels for host type T and kernel type kT. */ \
+static void vload_test_ ##kT(void) \
+{ \
+  uint8_t vectorSize[] = {2, 3, 4, 8, 16}; \
+  for(uint32_t i = 0; i < sizeof(vectorSize) / sizeof(vectorSize[0]); i++) { /* element count, independent of element width */ \
+    for(uint32_t offset = 0; offset < vectorSize[i]; offset++) { /* every element offset below the vector width */ \
+      vload_bench<T>("vload_bench_1" #kT, vectorSize[i], offset, false); \
+    }\
+  } \
+}\
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(vload_test_ ##kT, true)
+
+VLOAD_TEST(uint8_t, uchar)
+VLOAD_TEST(int8_t, char)
+VLOAD_TEST(uint16_t, ushort)
+VLOAD_TEST(int16_t, short)
+VLOAD_TEST(uint32_t, uint)
+VLOAD_TEST(int32_t, int)
+VLOAD_TEST(float, float)
+
+#define VLOAD_BENCH(T, kT) /* Defines and registers utest vload_bench_<kT>: times the 10000-load kernels for host type T and kernel type kT. */ \
+static void vload_bench_ ##kT(void) \
+{ \
+  uint8_t vectorSize[] = {2, 3, 4, 8, 16}; \
+  printf("\n");\
+  for(uint32_t i = 0; i < sizeof(vectorSize) / sizeof(vectorSize[0]); i++) { /* element count, independent of element width */ \
+    printf("  Vector size %d:\n", vectorSize[i]); \
+    for(uint32_t offset = 0; offset < vectorSize[i]; offset++) {\
+      printf("\tOffset %u :", offset); /* offset is unsigned, so %u */ \
+      vload_bench<T>("vload_bench_10000"  #kT, vectorSize[i], offset, true); \
+    }\
+  } \
+}\
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(vload_bench_ ##kT, true)
+VLOAD_BENCH(uint8_t, uchar)
+VLOAD_BENCH(uint16_t, ushort)
+VLOAD_BENCH(uint32_t, uint)
-- 
1.8.3.2



More information about the Beignet mailing list