[Beignet] [PATCH 01/10] Benchmark: Add performance tests for various workgroup functions

Thu Mar 31 15:26:33 UTC 2016

From: Grigore Lupescu <grigore.lupescu at intel.com>

Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com>
---
 benchmark/CMakeLists.txt          |   3 +-
 benchmark/benchmark_workgroup.cpp | 289 ++++++++++++++++++++++++++++++++++++++
 kernels/bench_workgroup.cl        | 180 ++++++++++++++++++++++++
 3 files changed, 471 insertions(+), 1 deletion(-)
 create mode 100644 benchmark/benchmark_workgroup.cpp
 create mode 100644 kernels/bench_workgroup.cl

diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index 3e43a21..a89e411 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -16,7 +16,8 @@ set (benchmark_sources
   benchmark_read_buffer.cpp
   benchmark_read_image.cpp
   benchmark_copy_buffer_to_image.cpp
-  benchmark_copy_image_to_buffer.cpp)
+  benchmark_copy_image_to_buffer.cpp
+  benchmark_workgroup.cpp)
 
 
 SET(CMAKE_CXX_FLAGS "-DBUILD_BENCHMARK ${CMAKE_CXX_FLAGS}")
diff --git a/benchmark/benchmark_workgroup.cpp b/benchmark/benchmark_workgroup.cpp
new file mode 100644
index 0000000..9baf2ad
--- /dev/null
+++ b/benchmark/benchmark_workgroup.cpp
@@ -0,0 +1,289 @@
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+#include <sys/time.h>
+#include <iomanip>
+#include <algorithm>
+
+using namespace std;
+
+/* NDRANGE */
+#define WG_GLOBAL_SIZE  (512 * 256)
+#define WG_LOCAL_SIZE   128
+#define WG_LOOP_COUNT   10000
+
+enum WG_FUNCTION
+{
+  WG_REDUCE_ADD,
+  WG_REDUCE_MIN,
+  WG_REDUCE_MAX,
+  WG_SCAN_EXCLUSIVE_ADD,
+  WG_SCAN_EXCLUSIVE_MAX,
+  WG_SCAN_EXCLUSIVE_MIN,
+  WG_SCAN_INCLUSIVE_ADD,
+  WG_SCAN_INCLUSIVE_MAX,
+  WG_SCAN_INCLUSIVE_MIN
+};
+
+/*
+ * Generic compute-expected on CPU function for any workgroup type
+ * and any variable type
+ */
+template<class T>
+static void benchmark_expected(WG_FUNCTION wg_func,
+                    T* input,
+                    T* expected)
+{
+  if(wg_func == WG_REDUCE_ADD)
+  {
+    T wg_sum = input[0];
+    for(uint32_t i = 1; i < WG_LOCAL_SIZE; i++)
+      wg_sum += input[i];
+    for(uint32_t i = 0; i < WG_LOCAL_SIZE; i++)
+      expected[i] = wg_sum;
+  }
+  else if(wg_func == WG_REDUCE_MAX)
+  {
+    T wg_max = input[0];
+    for(uint32_t i = 1; i < WG_LOCAL_SIZE; i++)
+      wg_max = max(input[i], wg_max);
+    for(uint32_t i = 0; i < WG_LOCAL_SIZE; i++)
+      expected[i] = wg_max;
+  }
+  else if(wg_func == WG_REDUCE_MIN)
+  {
+    T wg_min = input[0];
+    for(uint32_t i = 1; i < WG_LOCAL_SIZE; i++)
+      wg_min = min(input[i], wg_min);
+    for(uint32_t i = 0; i < WG_LOCAL_SIZE; i++)
+      expected[i] = wg_min;
+  }
+  else if(wg_func == WG_SCAN_INCLUSIVE_ADD)
+  {
+    expected[0] = input[0];
+    for(uint32_t i = 1; i < WG_LOCAL_SIZE; i++)
+      expected[i] = input[i] + expected[i - 1];
+  }
+  else if(wg_func == WG_SCAN_INCLUSIVE_MAX)
+  {
+    expected[0] = input[0];
+    for(uint32_t i = 1; i < WG_LOCAL_SIZE; i++)
+      expected[i] = max(input[i], expected[i - 1]);
+  }
+  else if(wg_func == WG_SCAN_INCLUSIVE_MIN)
+  {
+    expected[0] = input[0];
+    for(uint32_t i = 1; i < WG_LOCAL_SIZE; i++)
+      expected[i] = min(input[i], expected[i - 1]);
+  }
+}
+
+/*
+ * Generic input-expected generate function for any workgroup type
+ * and any variable type
+ */
+template<class T>
+static void benchmark_data(WG_FUNCTION wg_func,
+                   T* &input,
+                   T* &expected)
+{
+  input = new T[WG_GLOBAL_SIZE];
+  expected = new T[WG_GLOBAL_SIZE];
+
+  /* seed for random inputs */
+  srand (time(NULL));
+
+  /* generate inputs and expected values */
+  for(uint32_t gid = 0; gid < WG_GLOBAL_SIZE; gid += WG_LOCAL_SIZE)
+  {
+    /* input values */
+    for(uint32_t lid = 0; lid < WG_LOCAL_SIZE; lid++)
+      input[gid + lid] = (rand() % 112) / 3.1415f;
+
+    /* expected values */
+    benchmark_expected(wg_func, input + gid, expected + gid);
+  }
+}
+
+/*
+ * Generic benchmark function for any workgroup type
+ * and any variable type
+ */
+template<class T>
+static double benchmark_generic(WG_FUNCTION wg_func,
+                       T* input,
+                       T* expected)
+{
+  double elapsed = 0;
+  const uint32_t reduce_loop = 10000;
+  struct timeval start,stop;
+
+  /* input and expected data */
+  benchmark_data(wg_func, input, expected);
+
+  /* prepare input for datatype */
+  OCL_CREATE_BUFFER(buf[0], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_uint), &reduce_loop);
+
+  /* set input data for GPU */
+  OCL_MAP_BUFFER(0);
+  memcpy(buf_data[0], input, WG_GLOBAL_SIZE * sizeof(T));
+  OCL_UNMAP_BUFFER(0);
+
+  /* run the kernel on GPU */
+  globals[0] = WG_GLOBAL_SIZE;
+  locals[0] = WG_LOCAL_SIZE;
+
+  gettimeofday(&start,0);
+  OCL_NDRANGE(1);
+  clFinish(queue);
+  gettimeofday(&stop,0);
+  elapsed = time_subtract(&stop, &start, 0);
+
+  /* check if mistmatch, display execution time */
+  OCL_MAP_BUFFER(1);
+  uint32_t mistmatches = 0;
+  for (uint32_t i = 0; i < WG_GLOBAL_SIZE; i++)
+    if(((T *)buf_data[1])[i] != *(expected + i)){
+      cout << "Err at " << i << ", " <<
+        ((T *)buf_data[1])[i] << " != " << *(expected + i) << endl;
+      mistmatches++;
+    }
+  cout << endl << endl << "Mistmatches " << mistmatches << endl;
+  cout << "Exec time " << elapsed << endl << endl;
+  OCL_UNMAP_BUFFER(1);
+
+  return BANDWIDTH(WG_GLOBAL_SIZE * WG_LOOP_COUNT, elapsed);
+}
+
+/*
+ * Benchmark workgroup reduce add
+ */
+double benchmark_workgroup_reduce_add_short(void)
+{
+  cl_short *input = NULL;
+  cl_short *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup",
+                  "bench_workgroup_reduce_add_short");
+  return benchmark_generic(WG_REDUCE_ADD, input, expected);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_reduce_add_short);
+double benchmark_workgroup_reduce_add_int(void)
+{
+  cl_int *input = NULL;
+  cl_int *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup",
+                  "bench_workgroup_reduce_add_int");
+  return benchmark_generic(WG_REDUCE_ADD, input, expected);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_reduce_add_int);
+double benchmark_workgroup_reduce_add_long(void)
+{
+  cl_long *input = NULL;
+  cl_long *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup",
+                  "bench_workgroup_reduce_add_long");
+  return benchmark_generic(WG_REDUCE_ADD, input, expected);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_reduce_add_long);
+
+/*
+ * Benchmark workgroup reduce min
+ */
+double benchmark_workgroup_reduce_min_short(void)
+{
+  cl_short *input = NULL;
+  cl_short *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup",
+                  "bench_workgroup_reduce_min_short");
+  return benchmark_generic(WG_REDUCE_MIN, input, expected);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_reduce_min_short);
+double benchmark_workgroup_reduce_min_int(void)
+{
+  cl_int *input = NULL;
+  cl_int *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup",
+                  "bench_workgroup_reduce_min_int");
+  return benchmark_generic(WG_REDUCE_MIN, input, expected);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_reduce_min_int);
+double benchmark_workgroup_reduce_min_long(void)
+{
+  cl_long *input = NULL;
+  cl_long *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup",
+                  "bench_workgroup_reduce_min_long");
+  return benchmark_generic(WG_REDUCE_MIN, input, expected);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_reduce_min_long);
+
+/*
+ * Benchmark workgroup scan inclusive add
+ */
+double benchmark_workgroup_scan_inclusive_add_short(void)
+{
+  cl_short *input = NULL;
+  cl_short *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup",
+                  "bench_workgroup_scan_inclusive_add_short");
+  return benchmark_generic(WG_SCAN_INCLUSIVE_ADD, input, expected);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_scan_inclusive_add_short);
+double benchmark_workgroup_scan_inclusive_add_int(void)
+{
+  cl_int *input = NULL;
+  cl_int *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup",
+                  "bench_workgroup_scan_inclusive_add_int");
+  return benchmark_generic(WG_SCAN_INCLUSIVE_ADD, input, expected);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_scan_inclusive_add_int);
+double benchmark_workgroup_scan_inclusive_add_long(void)
+{
+  cl_long *input = NULL;
+  cl_long *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup",
+                  "bench_workgroup_scan_inclusive_add_long");
+  return benchmark_generic(WG_SCAN_INCLUSIVE_ADD, input, expected);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_scan_inclusive_add_long);
+
+/*
+ * Benchmark workgroup scan inclusive min
+ */
+double benchmark_workgroup_scan_inclusive_min_short(void)
+{
+  cl_short *input = NULL;
+  cl_short *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup",
+                  "bench_workgroup_scan_inclusive_min_short");
+  return benchmark_generic(WG_SCAN_INCLUSIVE_MIN, input, expected);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_scan_inclusive_min_short);
+double benchmark_workgroup_scan_inclusive_min_int(void)
+{
+  cl_int *input = NULL;
+  cl_int *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup",
+                  "bench_workgroup_scan_inclusive_min_int");
+  return benchmark_generic(WG_SCAN_INCLUSIVE_MIN, input, expected);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_scan_inclusive_min_int);
+double benchmark_workgroup_scan_inclusive_min_long(void)
+{
+  cl_long *input = NULL;
+  cl_long *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup",
+                  "bench_workgroup_scan_inclusive_min_long");
+  return benchmark_generic(WG_SCAN_INCLUSIVE_MIN, input, expected);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_scan_inclusive_min_long);
+
+
+
diff --git a/kernels/bench_workgroup.cl b/kernels/bench_workgroup.cl
new file mode 100644
index 0000000..596e936
--- /dev/null
+++ b/kernels/bench_workgroup.cl
@@ -0,0 +1,180 @@
+/*
+ * Benchmark workgroup reduce add
+ */
+kernel void bench_workgroup_reduce_add_short(
+  global short *src,
+  global short *dst,
+  int reduce_loop)
+{
+  short val = src[get_global_id(0)];
+  short result;
+
+  for(int i = 0; i < reduce_loop; i++)
+    result = work_group_reduce_add(val);
+
+  dst[get_global_id(0)] = result;
+}
+
+kernel void bench_workgroup_reduce_add_int(
+  global int *src,
+  global int *dst,
+  int reduce_loop)
+{
+  int val = src[get_global_id(0)];
+  int result;
+
+  for(int i = 0; i < reduce_loop; i++)
+    result = work_group_reduce_add(val);
+
+  dst[get_global_id(0)] = result;
+}
+
+kernel void bench_workgroup_reduce_add_long(
+  global long *src,
+  global long *dst,
+  int reduce_loop)
+{
+  long val = src[get_global_id(0)];
+  long result;
+
+  for(int i = 0; i < reduce_loop; i++)
+    result = work_group_reduce_add(val);
+
+  dst[get_global_id(0)] = result;
+}
+
+/*
+ * Benchmark workgroup reduce min
+ */
+kernel void bench_workgroup_reduce_min_short(
+  global short *src,
+  global short *dst,
+  int reduce_loop)
+{
+  short val = src[get_global_id(0)];
+  short result;
+
+  for(int i = 0; i < reduce_loop; i++)
+    result = work_group_reduce_min(val);
+
+  dst[get_global_id(0)] = result;
+}
+
+kernel void bench_workgroup_reduce_min_int(
+  global int *src,
+  global int *dst,
+  int reduce_loop)
+{
+  int val = src[get_global_id(0)];
+  int result;
+
+  for(int i = 0; i < reduce_loop; i++)
+    result = work_group_reduce_min(val);
+
+  dst[get_global_id(0)] = result;
+}
+
+kernel void bench_workgroup_reduce_min_long(
+  global long *src,
+  global long *dst,
+  int reduce_loop)
+{
+  long val = src[get_global_id(0)];
+  long result;
+
+  for(int i = 0; i < reduce_loop; i++)
+    result = work_group_reduce_min(val);
+
+  dst[get_global_id(0)] = result;
+}
+
+/*
+ * Benchmark workgroup scan inclusive add
+ */
+kernel void bench_workgroup_scan_inclusive_add_short(
+  global short *src,
+  global short *dst,
+  int reduce_loop)
+{
+  short val = src[get_global_id(0)];
+  short result;
+
+  for(int i = 0; i < reduce_loop; i++)
+    result = work_group_scan_inclusive_add(val);
+
+  dst[get_global_id(0)] = result;
+}
+
+kernel void bench_workgroup_scan_inclusive_add_int(
+  global int *src,
+  global int *dst,
+  int reduce_loop)
+{
+  int val = src[get_global_id(0)];
+  int result;
+
+  for(int i = 0; i < reduce_loop; i++)
+    result = work_group_scan_inclusive_add(val);
+
+  dst[get_global_id(0)] = result;
+}
+
+kernel void bench_workgroup_scan_inclusive_add_long(
+  global long *src,
+  global long *dst,
+  int reduce_loop)
+{
+  long val = src[get_global_id(0)];
+  long result;
+
+  for(int i = 0; i < reduce_loop; i++)
+    result = work_group_scan_inclusive_add(val);
+
+  dst[get_global_id(0)] = result;
+}
+
+/*
+ * Benchmark workgroup scan inclusive min
+ */
+kernel void bench_workgroup_scan_inclusive_min_short(
+  global short *src,
+  global short *dst,
+  int reduce_loop)
+{
+  short val = src[get_global_id(0)];
+  short result;
+
+  for(int i = 0; i < reduce_loop; i++)
+    result = work_group_scan_inclusive_min(val);
+
+  dst[get_global_id(0)] = result;
+}
+
+kernel void bench_workgroup_scan_inclusive_min_int(
+  global int *src,
+  global int *dst,
+  int reduce_loop)
+{
+  int val = src[get_global_id(0)];
+  int result;
+
+  for(int i = 0; i < reduce_loop; i++)
+    result = work_group_scan_inclusive_min(val);
+
+  dst[get_global_id(0)] = result;
+}
+
+kernel void bench_workgroup_scan_inclusive_min_long(
+  global long *src,
+  global long *dst,
+  int reduce_loop)
+{
+  long val = src[get_global_id(0)];
+  long result;
+
+  for(int i = 0; i < reduce_loop; i++)
+    result = work_group_scan_inclusive_min(val);
+
+  dst[get_global_id(0)] = result;
+}
+
-- 
2.5.0