[Beignet] [PATCH] Math benchmark for evaluating internal pow performance

Wed Feb 24 16:07:43 UTC 2016

Each work-item computes A = A ** B 1000 times in a loop, with
A = (1.000002, 1.000004, ...) across work-items and B = 1.0102003.
Performance results on Intel Core i7 5600U BDW (GPU HD5500 GT2), each followed by a sample of the output values:
INTERNAL [Result: 0.388 Mpow/sec]  ... 1.189, 1.229, 1.277, 1.337, 1.412, ...
NATIVE   [Result: 13.306 Mpow/sec] ... 1.000, 1.000, 1.000, 1.000, 1.306, ...

Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com>
---
 benchmark/benchmark_math.cpp | 28 +++++++++++++++-------------
 kernels/bench_math.cl        | 21 ++++++++++++++++-----
 2 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/benchmark/benchmark_math.cpp b/benchmark/benchmark_math.cpp
index b93a4f3..a92b39e 100644
--- a/benchmark/benchmark_math.cpp
+++ b/benchmark/benchmark_math.cpp
@@ -1,6 +1,5 @@
 #include "utests/utest_helper.hpp"
 #include <sys/time.h>
-
 #include <cstdint>
 #include <cstdlib>
 #include <cstring>
@@ -8,23 +7,27 @@
 #include "utest_helper.hpp"
 #include <sys/time.h>
 
-double benchmark_math_exp(void)
+double benchmark_math_pow(void)
 {
   double elapsed = 0;
   struct timeval start,stop;
   const size_t global_size = 1024 * 1024;
-  const size_t local_size = 128;
-  const uint32_t reduce_loop = 10000;
+  const size_t local_size = 64;
+
+  /* computes recursive base = base ** pwr, loop times */
+  cl_float base = 1.000002;
+  cl_float pwr = 1.0102003;
+  uint32_t loop = 1000;
 
   /* Input set will be generated */
   float* src = (float*)calloc(sizeof(float), global_size);
   OCL_ASSERT(src != NULL);
   for(uint32_t i = 0; i < global_size; i++)
-    src[i] = i % local_size;
+    src[i] = base + i * (base - 1);
 
   /* Setup kernel and buffers */
   OCL_CREATE_KERNEL_FROM_FILE("bench_math",
-    "bench_math_exp");
+    "bench_math_pow");
 
   OCL_CREATE_BUFFER(buf[0], 0, (global_size) * sizeof(float), NULL);
   OCL_CREATE_BUFFER(buf[1], 0, (global_size) * sizeof(float), NULL);
@@ -38,7 +41,8 @@ double benchmark_math_exp(void)
 
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
   OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
-  OCL_SET_ARG(2, sizeof(cl_uint), &reduce_loop);
+  OCL_SET_ARG(2, sizeof(cl_float), &pwr);
+  OCL_SET_ARG(3, sizeof(cl_uint), &loop);
 
   /* Measure performance */
   gettimeofday(&start,0);
@@ -49,12 +53,10 @@ double benchmark_math_exp(void)
 
   /* Check results */
   OCL_MAP_BUFFER(1);
-  for(uint32_t i = 0; i < global_size; i += local_size){
-    //printf(" %f", ((float*)buf_data[1])[i]);
-    //OCL_ASSERT( ((float*)buf_data[1])[i] == (float)i );
-  }
+  for(uint32_t i = 0; i < local_size; i ++)
+    printf("\t%.3f", ((float*)buf_data[1])[i]);
   OCL_UNMAP_BUFFER(1);
 
-  return BANDWIDTH(global_size * reduce_loop, elapsed);
+  return BANDWIDTH(global_size * loop, elapsed);
 }
-MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_exp, "Mflops/sec");
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_pow, "Mpow/sec");
diff --git a/kernels/bench_math.cl b/kernels/bench_math.cl
index 75da4d2..1174387 100644
--- a/kernels/bench_math.cl
+++ b/kernels/bench_math.cl
@@ -1,13 +1,24 @@
-kernel void bench_math_exp(
+#define USE_GEN_NATIVE_POW 0
+
+/* computes recursive base = base ** pwr, loop times
+ * base = src[get_global_id(0)] */
+kernel void bench_math_pow(
   global float *src,
   global float *dst,
-  uint reduce_loop)
+  float pwr,
+  uint loop)
 {
   float val = src[get_global_id(0)];
-  float result = exp(result);
+  float result = pow(val, pwr);
 
-  for(; reduce_loop > 0; reduce_loop--)
-    result = exp(result);
+  for(; loop > 0; loop--){
+#if USE_GEN_NATIVE_POW
+    result = native_powr(result, pwr);
+#else
+    result = pow(result, pwr);
+#endif
+  }
 
   dst[get_global_id(0)] = result;
 }
+
-- 
2.5.0