[Beignet] [PATCH V2 17/17] Benchmark: Add performance tests for workgroup broadcast

Mon Apr 11 14:41:30 UTC 2016

From: Grigore Lupescu <grigore.lupescu at intel.com>

Added the following performance tests:
benchmark_workgroup_broadcast_1D_int
benchmark_workgroup_broadcast_1D_long
benchmark_workgroup_broadcast_2D_int
benchmark_workgroup_broadcast_2D_long

Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com>
---
 benchmark/benchmark_workgroup.cpp | 187 +++++++++++++++++++++++++++++++-------
 kernels/bench_workgroup.cl        |  99 ++++++++++++++++++++
 2 files changed, 251 insertions(+), 35 deletions(-)

diff --git a/benchmark/benchmark_workgroup.cpp b/benchmark/benchmark_workgroup.cpp
index 9baf2ad..ee56a45 100644
--- a/benchmark/benchmark_workgroup.cpp
+++ b/benchmark/benchmark_workgroup.cpp
@@ -9,13 +9,26 @@
 
 using namespace std;
 
-/* NDRANGE */
-#define WG_GLOBAL_SIZE  (512 * 256)
-#define WG_LOCAL_SIZE   128
-#define WG_LOOP_COUNT   10000
+/* work-group general settings */
+#define WG_GLOBAL_SIZE          (512 * 256)
+#define WG_LOCAL_SIZE           128
+#define WG_LOOP_COUNT           1000
+
+/* work-group broadcast only */
+#define WG_GLOBAL_SIZE_X        1024
+#define WG_GLOBAL_SIZE_Y        1024
+
+#define WG_LOCAL_SIZE_X         32
+#define WG_LOCAL_SIZE_Y         2
+
+#define WG_LOCAL_X    5
+#define WG_LOCAL_Y    0
+
 
 enum WG_FUNCTION
 {
+  WG_BROADCAST_1D,
+  WG_BROADCAST_2D,
   WG_REDUCE_ADD,
   WG_REDUCE_MIN,
   WG_REDUCE_MAX,
@@ -34,48 +47,62 @@ enum WG_FUNCTION
 template<class T>
 static void benchmark_expected(WG_FUNCTION wg_func,
                     T* input,
-                    T* expected)
+                    T* expected,
+                    uint32_t wg_global_size,
+                    uint32_t wg_local_size)
 {
-  if(wg_func == WG_REDUCE_ADD)
+  if(wg_func == WG_BROADCAST_1D)
+  {
+    for(uint32_t i = 0; i < wg_local_size; i++)
+      expected[i] = input[WG_LOCAL_X];
+  }
+  else if(wg_func == WG_BROADCAST_2D)
+  {
+    for(uint32_t i = 0; i < wg_local_size; i++)
+      expected[i] =
+          input[WG_LOCAL_X +
+                WG_LOCAL_Y * WG_LOCAL_SIZE_X];
+  }
+  else if(wg_func == WG_REDUCE_ADD)
   {
     T wg_sum = input[0];
-    for(uint32_t i = 1; i < WG_LOCAL_SIZE; i++)
+    for(uint32_t i = 1; i < wg_local_size; i++)
       wg_sum += input[i];
-    for(uint32_t i = 0; i < WG_LOCAL_SIZE; i++)
+    for(uint32_t i = 0; i < wg_local_size; i++)
       expected[i] = wg_sum;
   }
   else if(wg_func == WG_REDUCE_MAX)
   {
     T wg_max = input[0];
-    for(uint32_t i = 1; i < WG_LOCAL_SIZE; i++)
+    for(uint32_t i = 1; i < wg_local_size; i++)
       wg_max = max(input[i], wg_max);
-    for(uint32_t i = 0; i < WG_LOCAL_SIZE; i++)
+    for(uint32_t i = 0; i < wg_local_size; i++)
       expected[i] = wg_max;
   }
   else if(wg_func == WG_REDUCE_MIN)
   {
     T wg_min = input[0];
-    for(uint32_t i = 1; i < WG_LOCAL_SIZE; i++)
+    for(uint32_t i = 1; i < wg_local_size; i++)
       wg_min = min(input[i], wg_min);
-    for(uint32_t i = 0; i < WG_LOCAL_SIZE; i++)
+    for(uint32_t i = 0; i < wg_local_size; i++)
       expected[i] = wg_min;
   }
   else if(wg_func == WG_SCAN_INCLUSIVE_ADD)
   {
     expected[0] = input[0];
-    for(uint32_t i = 1; i < WG_LOCAL_SIZE; i++)
+    for(uint32_t i = 1; i < wg_local_size; i++)
       expected[i] = input[i] + expected[i - 1];
   }
   else if(wg_func == WG_SCAN_INCLUSIVE_MAX)
   {
     expected[0] = input[0];
-    for(uint32_t i = 1; i < WG_LOCAL_SIZE; i++)
+    for(uint32_t i = 1; i < wg_local_size; i++)
       expected[i] = max(input[i], expected[i - 1]);
   }
   else if(wg_func == WG_SCAN_INCLUSIVE_MIN)
   {
     expected[0] = input[0];
-    for(uint32_t i = 1; i < WG_LOCAL_SIZE; i++)
+    for(uint32_t i = 1; i < wg_local_size; i++)
       expected[i] = min(input[i], expected[i - 1]);
   }
 }
@@ -87,23 +114,42 @@ static void benchmark_expected(WG_FUNCTION wg_func,
 template<class T>
 static void benchmark_data(WG_FUNCTION wg_func,
                    T* &input,
-                   T* &expected)
+                   T* &expected,
+                   uint32_t &wg_global_size,
+                   uint32_t &wg_local_size)
 {
-  input = new T[WG_GLOBAL_SIZE];
-  expected = new T[WG_GLOBAL_SIZE];
+  if(wg_func == WG_BROADCAST_1D)
+  {
+    wg_global_size = WG_GLOBAL_SIZE_X;
+    wg_local_size = WG_LOCAL_SIZE_X;
+  }
+  else if(wg_func == WG_BROADCAST_2D)
+  {
+    wg_global_size = WG_GLOBAL_SIZE_X * WG_GLOBAL_SIZE_Y;
+    wg_local_size = WG_LOCAL_SIZE_X * WG_LOCAL_SIZE_Y;
+  }
+  else
+  {
+    wg_global_size = WG_GLOBAL_SIZE;
+    wg_local_size = WG_LOCAL_SIZE;
+  }
+
+  input = new T[wg_global_size];
+  expected = new T[wg_global_size];
 
   /* seed for random inputs */
   srand (time(NULL));
 
   /* generate inputs and expected values */
-  for(uint32_t gid = 0; gid < WG_GLOBAL_SIZE; gid += WG_LOCAL_SIZE)
+  for(uint32_t gid = 0; gid < wg_global_size; gid += wg_local_size)
   {
     /* input values */
-    for(uint32_t lid = 0; lid < WG_LOCAL_SIZE; lid++)
-      input[gid + lid] = (rand() % 112) / 3.1415f;
+    for(uint32_t lid = 0; lid < wg_local_size; lid++)
+      input[gid + lid] = (rand() % 512) / 3.1415f;
 
     /* expected values */
-    benchmark_expected(wg_func, input + gid, expected + gid);
+    benchmark_expected(wg_func, input + gid, expected + gid,
+                       wg_global_size, wg_local_size);
   }
 }
 
@@ -117,30 +163,60 @@ static double benchmark_generic(WG_FUNCTION wg_func,
                        T* expected)
 {
   double elapsed = 0;
-  const uint32_t reduce_loop = 10000;
+  const uint32_t reduce_loop = WG_LOOP_COUNT;
   struct timeval start,stop;
 
+  uint32_t wg_global_size = 0;
+  uint32_t wg_local_size = 0;
+
   /* input and expected data */
-  benchmark_data(wg_func, input, expected);
+  benchmark_data(wg_func, input, expected, wg_global_size, wg_local_size);
 
   /* prepare input for datatype */
-  OCL_CREATE_BUFFER(buf[0], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
-  OCL_CREATE_BUFFER(buf[1], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
+  OCL_CREATE_BUFFER(buf[0], 0, wg_global_size * sizeof(T), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, wg_global_size * sizeof(T), NULL);
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
   OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
   OCL_SET_ARG(2, sizeof(cl_uint), &reduce_loop);
 
+  if(wg_func == WG_BROADCAST_1D ||
+      wg_func == WG_BROADCAST_2D)
+  {
+    cl_uint wg_local_x = WG_LOCAL_X;
+    cl_uint wg_local_y = WG_LOCAL_Y;
+    OCL_SET_ARG(3, sizeof(cl_uint), &wg_local_x);
+    OCL_SET_ARG(4, sizeof(cl_uint), &wg_local_y);
+  }
+
   /* set input data for GPU */
   OCL_MAP_BUFFER(0);
-  memcpy(buf_data[0], input, WG_GLOBAL_SIZE * sizeof(T));
+  memcpy(buf_data[0], input, wg_global_size * sizeof(T));
   OCL_UNMAP_BUFFER(0);
 
   /* run the kernel on GPU */
-  globals[0] = WG_GLOBAL_SIZE;
-  locals[0] = WG_LOCAL_SIZE;
-
   gettimeofday(&start,0);
-  OCL_NDRANGE(1);
+
+  if(wg_func == WG_BROADCAST_1D)
+  {
+    globals[0] = WG_GLOBAL_SIZE_X;
+    locals[0] = WG_LOCAL_SIZE_X;
+    OCL_NDRANGE(1);
+  }
+  else if(wg_func == WG_BROADCAST_2D)
+  {
+    globals[0] = WG_GLOBAL_SIZE_X;
+    locals[0] = WG_LOCAL_SIZE_X;
+    globals[1] = WG_GLOBAL_SIZE_Y;
+    locals[1] = WG_LOCAL_SIZE_Y;
+    OCL_NDRANGE(2);
+  }
+  else
+  { /* reduce, scan inclulsive, scan exclusive */
+    globals[0] = WG_GLOBAL_SIZE;
+    locals[0] = WG_LOCAL_SIZE;
+    OCL_NDRANGE(1);
+  }
+
   clFinish(queue);
   gettimeofday(&stop,0);
   elapsed = time_subtract(&stop, &start, 0);
@@ -148,18 +224,59 @@ static double benchmark_generic(WG_FUNCTION wg_func,
   /* check if mistmatch, display execution time */
   OCL_MAP_BUFFER(1);
   uint32_t mistmatches = 0;
-  for (uint32_t i = 0; i < WG_GLOBAL_SIZE; i++)
+  for (uint32_t i = 0; i < wg_global_size; i++)
     if(((T *)buf_data[1])[i] != *(expected + i)){
-      cout << "Err at " << i << ", " <<
-        ((T *)buf_data[1])[i] << " != " << *(expected + i) << endl;
+      /* uncomment bellow for DEBUG */
+      /* cout << "Err at " << i << ", " <<
+        ((T *)buf_data[1])[i] << " != " << *(expected + i) << endl; */
       mistmatches++;
     }
   cout << endl << endl << "Mistmatches " << mistmatches << endl;
   cout << "Exec time " << elapsed << endl << endl;
   OCL_UNMAP_BUFFER(1);
 
-  return BANDWIDTH(WG_GLOBAL_SIZE * WG_LOOP_COUNT, elapsed);
+  return BANDWIDTH(sizeof(T) * wg_global_size * reduce_loop, elapsed);
+}
+
+/*
+ * Benchmark workgroup broadcast
+ */
+double benchmark_workgroup_broadcast_1D_int(void)
+{
+  cl_int *input = NULL;
+  cl_int *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup",
+                  "bench_workgroup_broadcast_1D_int");
+  return benchmark_generic(WG_BROADCAST_1D, input, expected);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_broadcast_1D_int);
+double benchmark_workgroup_broadcast_1D_long(void)
+{
+  cl_long *input = NULL;
+  cl_long *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup",
+                  "bench_workgroup_broadcast_1D_long");
+  return benchmark_generic(WG_BROADCAST_1D, input, expected);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_broadcast_1D_long);
+double benchmark_workgroup_broadcast_2D_int(void)
+{
+  cl_int *input = NULL;
+  cl_int *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup",
+                  "bench_workgroup_broadcast_2D_int");
+  return benchmark_generic(WG_BROADCAST_2D, input, expected);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_broadcast_2D_int);
+double benchmark_workgroup_broadcast_2D_long(void)
+{
+  cl_long *input = NULL;
+  cl_long *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup",
+                  "bench_workgroup_broadcast_2D_long");
+  return benchmark_generic(WG_BROADCAST_2D, input, expected);
 }
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_broadcast_2D_long);
 
 /*
  * Benchmark workgroup reduce add
diff --git a/kernels/bench_workgroup.cl b/kernels/bench_workgroup.cl
index f26537b..5343231 100644
--- a/kernels/bench_workgroup.cl
+++ b/kernels/bench_workgroup.cl
@@ -1,4 +1,103 @@
 /*
+ * Benchmark broadcast 1D
+ */
+kernel void bench_workgroup_broadcast_1D_int(global int *src,
+                                  global int *dst,
+                                  int reduce_loop,
+                                  uint wg_local_x,
+                                  uint wg_local_y)
+{
+  uint offset = 0;
+  uint index = offset + get_global_id(0);
+
+  int val = src[index];
+  /* depending on generated ASM, volatile may be removed */
+  volatile int result;
+
+  for(; reduce_loop > 0; reduce_loop--){
+    result = work_group_broadcast(val,
+                                  wg_local_x);
+  }
+
+  dst[index] = result;
+}
+
+kernel void bench_workgroup_broadcast_1D_long(global long *src,
+                                  global long *dst,
+                                  int reduce_loop,
+                                  uint wg_local_x,
+                                  uint wg_local_y)
+{
+  uint offset = 0;
+  uint index = offset + get_global_id(0);
+
+  long val = src[index];
+  /* depending on generated ASM, volatile may be removed */
+  volatile long result;
+
+  for(; reduce_loop > 0; reduce_loop--){
+    result = work_group_broadcast(val,
+                                  wg_local_x);
+  }
+
+  dst[index] = result;
+}
+
+
+/*
+ * Benchmark broadcast 2D
+ */
+kernel void bench_workgroup_broadcast_2D_int(global int *src,
+                                  global int *dst,
+                                  int reduce_loop,
+                                  uint wg_local_x,
+                                  uint wg_local_y)
+{
+  uint lsize = get_local_size(0) * get_local_size(1);
+  uint offset = get_group_id(0) * lsize +
+      get_group_id(1) * get_num_groups(0) * lsize;
+  uint index = offset + get_local_id(0) +
+      get_local_id(1) * get_local_size(0);
+
+  int val = src[index];
+  /* depending on generated ASM, volatile may be removed */
+  int result;
+
+  for(; reduce_loop > 0; reduce_loop--){
+    result = work_group_broadcast(val,
+                                  wg_local_x,
+                                  wg_local_y);
+  }
+
+  dst[index] = result;
+}
+
+kernel void bench_workgroup_broadcast_2D_long(global long *src,
+                                  global long *dst,
+                                  int reduce_loop,
+                                  uint wg_local_x,
+                                  uint wg_local_y)
+{
+  uint lsize = get_local_size(0) * get_local_size(1);
+  uint offset = get_group_id(0) * lsize +
+      get_group_id(1) * get_num_groups(0) * lsize;
+  uint index = offset + get_local_id(0) +
+      get_local_id(1) * get_local_size(0);
+
+  long val = src[index];
+  /* depending on generated ASM, volatile may be removed */
+  long result;
+
+  for(; reduce_loop > 0; reduce_loop--){
+    result = work_group_broadcast(val,
+                                  wg_local_x,
+                                  wg_local_y);
+  }
+
+  dst[index] = result;
+}
+
+/*
  * Benchmark workgroup reduce add
  */
 kernel void bench_workgroup_reduce_add_short(
-- 
2.5.0