[Beignet] [PATCH] (1) Refine the benchmark CMake file so that the build passes. (2) Refine the math benchmark. The original algorithm ran tests of the form result = sin(result), which narrowed the data down to a small range. Now the data range runs from the smallest value to the biggest: the range is divided into 128 steps, the base value and the step are sent to the kernel, and in the kernel the step is multiplied by the loop counter and added to the base to reach the biggest value.

Fri Jan 6 02:53:59 UTC 2017

Signed-off-by: rander <rander.wang at intel.com>
---
 benchmark/CMakeLists.txt     |   6 +-
 benchmark/benchmark_math.cpp |  69 ++++++++++++-----------
 kernels/bench_math.cl        | 127 ++++++++++++++++++++++---------------------
 3 files changed, 106 insertions(+), 96 deletions(-)

diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index f9b246b..e92b269 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -1,5 +1,7 @@
+cmake_minimum_required(VERSION 3.1)
 INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}
                     ${CMAKE_CURRENT_SOURCE_DIR}/../utests
+                    ${CMAKE_CURRENT_SOURCE_DIR}/../
                     ${CMAKE_CURRENT_SOURCE_DIR}/../include)
 
 
@@ -23,13 +25,13 @@ set (benchmark_sources
   benchmark_math.cpp)
 
 
-SET(CMAKE_CXX_FLAGS "-DBUILD_BENCHMARK ${CMAKE_CXX_FLAGS}")
+SET(CMAKE_CXX_FLAGS "-DBUILD_BENCHMARK -std=c++11 ${CMAKE_CXX_FLAGS}")
 SET(CMAKE_C_FLAGS "-DBUILD_BENCHMARK ${CMAKE_C_FLAGS}")
 
 ADD_LIBRARY(benchmarks SHARED ${ADDMATHFUNC} ${benchmark_sources})
 
 #TARGET_LINK_LIBRARIES(benchmarks cl m ${OPENGL_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
-TARGET_LINK_LIBRARIES(benchmarks cl m)
+TARGET_LINK_LIBRARIES(benchmarks OpenCL pthread m)
 
 ADD_EXECUTABLE(benchmark_run benchmark_run.cpp)
 TARGET_LINK_LIBRARIES(benchmark_run benchmarks)
diff --git a/benchmark/benchmark_math.cpp b/benchmark/benchmark_math.cpp
index 72bc316..b47814a 100644
--- a/benchmark/benchmark_math.cpp
+++ b/benchmark/benchmark_math.cpp
@@ -8,119 +8,122 @@
 #include <sys/time.h>
 
 double benchmark_generic_math(const char* str_filename,
-                              const char* str_kernel)
+                              const char* str_kernel,
+                              float base,
+                              float max)
 {
   double elapsed = 0;
   struct timeval start,stop;
   const size_t global_size = 1024 * 1024;
-  const size_t local_size = 64;
+  const size_t local_size = 256;
 
   /* Compute math OP, loop times on global size */
-  cl_float base = 1.000002;
   cl_float pwr = 1.0102003;
-  uint32_t loop = 1000;
+  uint32_t loop = 128;
 
-  /* Input set will be generated */
-  float* src = (float*)calloc(sizeof(float), global_size);
-  OCL_ASSERT(src != NULL);
-  for(uint32_t i = 0; i < global_size; i++)
-    src[i] = base + i * (base - 1);
+  float step;
+
+  step = (max - base) / loop;
 
   /* Setup kernel and buffers */
   OCL_CALL(cl_kernel_init, str_filename, str_kernel, SOURCE, "");
 
-  OCL_CREATE_BUFFER(buf[0], 0, (global_size) * sizeof(float), NULL);
   OCL_CREATE_BUFFER(buf[1], 0, (global_size) * sizeof(float), NULL);
 
-  OCL_MAP_BUFFER(0);
-  memcpy(buf_data[0], src, global_size * sizeof(float));
-  OCL_UNMAP_BUFFER(0);
-
   globals[0] = global_size;
   locals[0] = local_size;
 
-  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
-  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
-  OCL_SET_ARG(2, sizeof(cl_float), &pwr);
-  OCL_SET_ARG(3, sizeof(cl_uint), &loop);
+  OCL_SET_ARG(0, sizeof(float), &base);
+  OCL_SET_ARG(1, sizeof(float), &step);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(3, sizeof(cl_float), &pwr);
+  OCL_SET_ARG(4, sizeof(cl_uint), &loop);
+
+  OCL_NDRANGE(1);
+  clFinish(queue);
 
   /* Measure performance */
   gettimeofday(&start,0);
   OCL_NDRANGE(1);
+  OCL_NDRANGE(1);
+  OCL_NDRANGE(1);
+  OCL_NDRANGE(1);
   clFinish(queue);
   gettimeofday(&stop,0);
   elapsed = time_subtract(&stop, &start, 0);
 
   /* Show compute results */
+#if SHOWRESULT
   OCL_MAP_BUFFER(1);
   for(uint32_t i = 0; i < global_size; i += 8192)
     printf("\t%.3f", ((float*)buf_data[1])[i]);
   OCL_UNMAP_BUFFER(1);
-
-  return BANDWIDTH(global_size * loop, elapsed);
+#endif
+  printf("-----------------------");
+  return BANDWIDTH(global_size * loop, elapsed / 4.0);
 }
 
 double benchmark_math_pow(void){
-  return benchmark_generic_math("bench_math.cl", "bench_math_pow");
+  return benchmark_generic_math("bench_math.cl", "bench_math_pow", 1.0, 128.0);
 }
 MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_pow, "Mop/s");
 
 double benchmark_math_exp2(void){
-  return benchmark_generic_math("bench_math.cl", "bench_math_exp2");
+  return benchmark_generic_math("bench_math.cl", "bench_math_exp2", 0.1, 8.0);
 }
 MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_exp2, "Mop/s");
 
 double benchmark_math_exp(void){
-  return benchmark_generic_math("bench_math.cl", "bench_math_exp");
+  return benchmark_generic_math("bench_math.cl", "bench_math_exp", 0.1, 4.0);
 }
 MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_exp, "Mop/s");
 
 double benchmark_math_exp10(void){
-  return benchmark_generic_math("bench_math.cl", "bench_math_exp10");
+  return benchmark_generic_math("bench_math.cl", "bench_math_exp10", 0.1, 4.0);
 }
 MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_exp10, "Mop/s");
 
 double benchmark_math_log2(void){
-  return benchmark_generic_math("bench_math.cl", "bench_math_log2");
+  return benchmark_generic_math("bench_math.cl", "bench_math_log2", 0.008, 1.0);
 }
 MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_log2, "Mop/s");
 
 double benchmark_math_log(void){
-  return benchmark_generic_math("bench_math.cl", "bench_math_log");
+  return benchmark_generic_math("bench_math.cl", "bench_math_log", 0.008, 1.0);
 }
 MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_log, "Mop/s");
 
 double benchmark_math_log10(void){
-  return benchmark_generic_math("bench_math.cl", "bench_math_log10");
+  return benchmark_generic_math("bench_math.cl", "bench_math_log10", 0.008, 0.01);
 }
 MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_log10, "Mop/s");
 
 double benchmark_math_sqrt(void){
-  return benchmark_generic_math("bench_math.cl", "bench_math_sqrt");
+  return benchmark_generic_math("bench_math.cl", "bench_math_sqrt", 0.1, 65537.0);
 }
 MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_sqrt, "Mop/s");
 
 double benchmark_math_sin(void){
-  return benchmark_generic_math("bench_math.cl", "bench_math_sin");
+  return benchmark_generic_math("bench_math.cl", "bench_math_sin", 0.001, 5000.0);
 }
 MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_sin, "Mop/s");
 
 double benchmark_math_cos(void){
-  return benchmark_generic_math("bench_math.cl", "bench_math_cos");
+  return benchmark_generic_math("bench_math.cl", "bench_math_cos", 0.001, 5000.0);
 }
 MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_cos, "Mop/s");
 
 double benchmark_math_tan(void){
-  return benchmark_generic_math("bench_math.cl", "bench_math_tan");
+  return benchmark_generic_math("bench_math.cl", "bench_math_tan", 0.001, 5000.0);
 }
 MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_tan, "Mop/s");
 
 double benchmark_math_asin(void){
-  return benchmark_generic_math("bench_math.cl", "bench_math_asin");
+  return benchmark_generic_math("bench_math.cl", "bench_math_asin", 0.001, 1.0);
 }
 MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_asin, "Mop/s");
 
 double benchmark_math_acos(void){
-  return benchmark_generic_math("bench_math.cl", "bench_math_acos");
+  return benchmark_generic_math("bench_math.cl", "bench_math_acos", 0.001, 1.0);
 }
 MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_acos, "Mop/s");
diff --git a/kernels/bench_math.cl b/kernels/bench_math.cl
index 8d85d51..f0c1a81 100644
--- a/kernels/bench_math.cl
+++ b/kernels/bench_math.cl
@@ -3,19 +3,20 @@
 
 /* benchmark pow performance */
 kernel void bench_math_pow(
-  global float *src,
+  float src,
+  float step,
   global float *dst,
   float pwr,
   uint loop)
 {
-  float result = src[get_global_id(0)];
+  float result = 0;
 
   for(; loop > 0; loop--)
   {
 #if defined(BENCHMARK_NATIVE)
-    result = native_powr(result, pwr); /* calls native */
+    result += native_powr((src + step*loop), pwr); /* calls native */
 #else
-    result = pow(result, pwr); /* calls internal slow */
+    result += pow((src + step*loop), pwr); /* calls internal slow */
 #endif
   }
   dst[get_global_id(0)] = result;
@@ -23,15 +24,16 @@ kernel void bench_math_pow(
 
 /* benchmark exp2 performance, exp2 is native */
 kernel void bench_math_exp2(
-  global float *src,
+  float src,
+  float step,
   global float *dst,
   float pwr,
   uint loop)
 {
-  float result = src[get_global_id(0)];
+  float result = 0;
 
   for(; loop > 0; loop--)
-    result = exp2(result) * 0.1f;
+    result += exp2((src + step*loop));
 
   dst[get_global_id(0)] = result;
 }
@@ -39,21 +41,22 @@ kernel void bench_math_exp2(
 /* benchmark exp performance */
 /* calls internal fast (native) if (x > -0x1.6p1 && x < 0x1.6p1) */
 kernel void bench_math_exp(
-  global float *src,
+  float src,
+  float step,
   global float *dst,
   float pwr,
   uint loop)
 {
-  float result = src[get_global_id(0)];
+  float result = 0;
 
   for(; loop > 0; loop--)
   {
 #if defined(BENCHMARK_NATIVE)
-    result = native_exp((float)-0x1.6p1 - result * 0.1f); /* calls native */
+    result += native_exp((float)-0x1.6p1 - (src + step*loop)); /* calls native */
 #elif defined(BENCHMARK_INTERNAL_FAST)
-    result = exp((float)-0x1.6p1 + result * 0.1f); /* calls internal fast */
+    result += exp((float)-0x1.6p1 + (src + step*loop)); /* calls internal fast */
 #else
-    result = exp((float)-0x1.6p1 - result * 0.1f); /* calls internal slow */
+    result += exp((float)-0x1.6p1 - (src + step*loop)); /* calls internal slow */
 #endif
   }
 
@@ -63,21 +66,22 @@ kernel void bench_math_exp(
 /* benchmark exp10 performance */
 /* calls internal fast (native) if (x < -0x1.4p+5) || (x > +0x1.4p+5)  */
 kernel void bench_math_exp10(
-  global float *src,
+  float src,
+  float step,
   global float *dst,
   float pwr,
   uint loop)
 {
-  float result = src[get_global_id(0)];
+  float result = 0;
 
   for(; loop > 0; loop--)
   {
 #if defined(BENCHMARK_NATIVE)
-    result = native_exp10((float)0x1.4p+5 + result * 0.1f); /* calls native */
+    result += native_exp10((float)0x1.4p+5 + (src +  step*loop)); /* calls native */
 #elif defined(BENCHMARK_INTERNAL_FAST)
-    result = exp10((float)-0x1.4p+5 - result * 0.1f); /* calls internal fast */
+    result += exp10((float)-0x1.4p+5 - (src +  step*loop)); /* calls internal fast */
 #else
-    result = exp10((float)-0x1.2p+5 - result * 0.1f); /* calls internal slow */
+    result += exp10((float)-0x1.2p+5 + (src +  step*loop)); /* calls internal slow */
 #endif
   }
 
@@ -87,21 +91,22 @@ kernel void bench_math_exp10(
 /* benchmark log2 performance */
 /* calls internal fast (native) if (x > 0x1.1p0)  */
 kernel void bench_math_log2(
-  global float *src,
+  float src,
+  float step,
   global float *dst,
   float pwr,
   uint loop)
 {
-  float result = src[get_global_id(0)];
+  float result = 0;
 
   for(; loop > 0; loop--)
   {
 #if defined(BENCHMARK_NATIVE)
-    result = native_log2((float)0x1.1p0 + result * 0.0001f); /* calls native */
+    result += native_log2((float)0x1.1p0 + (src +  step*loop)); /* calls native */
 #elif defined(BENCHMARK_INTERNAL_FAST)
-    result = log2((float)0x1.1p0 + result * 0.0001f); /* calls internal fast */
+    result += log2((float)0x1.1p0 + (src +  step*loop)); /* calls internal fast */
 #else
-    result = log2((float)0x1.1p0 - result * 0.0001f); /* calls internal slow */
+    result += log2((float)0x1.1p0 - (src +  step*loop)); /* calls internal slow */
 #endif
   }
 
@@ -111,21 +116,22 @@ kernel void bench_math_log2(
 /* benchmark log performance */
 /* calls internal fast (native) if (x > 0x1.1p0)  */
 kernel void bench_math_log(
-  global float *src,
+  float src,
+  float step,
   global float *dst,
   float pwr,
   uint loop)
 {
-  float result = src[get_global_id(0)];
+  float result = 0;
 
   for(; loop > 0; loop--)
   {
 #if defined(BENCHMARK_NATIVE)
-    result = native_log((float)0x1.1p0 + result * 0.0001f); /* calls native */
+    result += native_log((float)0x1.1p0 + (src +  step*loop)); /* calls native */
 #elif defined(BENCHMARK_INTERNAL_FAST)
-    result = log((float)0x1.1p0 + result * 0.0001f); /* calls internal fast */
+    result += log((float)0x1.1p0 + (src +  step*loop)); /* calls internal fast */
 #else
-    result = log((float)0x1.1p0 - result * 0.0001f); /* calls internal slow */
+    result += log((float)0x1.1p0 - (src +  step*loop)); /* calls internal slow */
 #endif
   }
 
@@ -135,21 +141,22 @@ kernel void bench_math_log(
 /* benchmark log10 performance */
 /* calls internal fast (native) if (x > 0x1.1p0)  */
 kernel void bench_math_log10(
-  global float *src,
+  float src,
+  float step,
   global float *dst,
   float pwr,
   uint loop)
 {
-  float result = src[get_global_id(0)];
+  float result = 0;
 
   for(; loop > 0; loop--)
   {
 #if defined(BENCHMARK_NATIVE)
-    result = native_log10((float)0x1.1p0 + result * 0.0001f); /* calls native */
+    result += native_log10((float)0x1.1p0 + (src +  step*loop)); /* calls native */
 #elif defined(BENCHMARK_INTERNAL_FAST)
-    result = log10((float)0x1.1p0 + result * 0.0001f); /* calls internal fast */
+    result += log10((float)0x1.1p0 + (src +  step*loop)); /* calls internal fast */
 #else
-    result = log10((float)0x1.1p0 - result * 0.0001f); /* calls internal slow */
+    result += log10((float)0x1.1p0 - (src +  step*loop)); /* calls internal slow */
 #endif
   }
 
@@ -158,38 +165,36 @@ kernel void bench_math_log10(
 
 /* benchmark sqrt performance */
 kernel void bench_math_sqrt(
-  global float *src,
+  float src,
+  float step,
   global float *dst,
   float pwr,
   uint loop)
 {
-  float result = src[get_global_id(0)];
+  float result = 0;
 
   for(; loop > 0; loop--)
-    result = sqrt(result) + sqrt(pwr + result);
+    result += sqrt((src +  step*loop));
 
   dst[get_global_id(0)] = result;
 }
 
 /* benchmark sin performance */
 kernel void bench_math_sin(
-  global float *src,
+  float src,
+  float step,
   global float *dst,
   float pwr,
   uint loop)
 {
-  float result = src[get_global_id(0)];
+  float result = 0;
 
   for(; loop > 0; loop--)
   {
 #if defined(BENCHMARK_NATIVE)
-    result = native_sin(result); /* calls native */
+    result += native_sin((src +  step*loop)); /* calls native */
 #else
-    result = sin(result);	/* calls internal, random complexity */
-    //result = sin(0.1f + result); /* calls internal, (1) no reduction */
-    //result = sin(2.f + result); /* calls internal, (2) fast reduction */
-    //result = sin(4001 + result); /* calls internal, (3) slow reduction */
-    result *= 0x1p-16;
+    result += sin((src +  step*loop));	/* calls internal, random complexity */
 #endif
   }
 
@@ -198,23 +203,20 @@ kernel void bench_math_sin(
 
 /* benchmark cos performance */
 kernel void bench_math_cos(
-  global float *src,
+  float src,
+  float step,
   global float *dst,
   float pwr,
   uint loop)
 {
-  float result = src[get_global_id(0)];
+  float result = 0;
 
   for(; loop > 0; loop--)
   {
 #if defined(BENCHMARK_NATIVE)
-    result = native_cos(result); /* calls native */
+    result += native_cos((src +  step*loop)); /* calls native */
 #else
-    result = cos(result);	/* calls internal, random complexity */
-    //result = cos(0.1f + result); /* calls internal, (1) no reduction */
-    //result = cos(2.f + result); /* calls internal, (2) fast reduction */
-    //result = cos(4001.f + result); /* calls internal, (3) slow reduction */
-    result *= 0x1p-16;
+    result += cos((src +  step*loop));	/* calls internal, random complexity */
 #endif
   }
   dst[get_global_id(0)] = result;
@@ -222,19 +224,20 @@ kernel void bench_math_cos(
 
 /* benchmark native tan performance */
 kernel void bench_math_tan(
-  global float *src,
+  float src,
+  float step,
   global float *dst,
   float pwr,
   uint loop)
 {
-  float result = src[get_global_id(0)];
+  float result = 0;
 
   for(; loop > 0; loop--)
   {
 #if defined(BENCHMARK_NATIVE)
-    result = native_tan(result); /* calls native */
+    result += native_tan((src +  step*loop)); /* calls native */
 #else
-    result = tan(result); /* calls internal slow */
+    result += tan((src +  step*loop)); /* calls internal slow */
 #endif
   }
 
@@ -243,30 +246,32 @@ kernel void bench_math_tan(
 
 /* benchmark asin performance */
 kernel void bench_math_asin(
-  global float *src,
+  float src,
+  float step,
   global float *dst,
   float pwr,
   uint loop)
 {
-  float result = src[get_global_id(0)];
+  float result = 0;
 
   for(; loop > 0; loop--)
-    result = asin(pwr - 1);
+    result += asin((src +  step*loop));
 
   dst[get_global_id(0)] = result;
 }
 
 /* benchmark acos performance */
 kernel void bench_math_acos(
-  global float *src,
+  float src,
+  float step,
   global float *dst,
   float pwr,
   uint loop)
 {
-  float result = src[get_global_id(0)];
+  float result = 0;
 
   for(; loop > 0; loop--)
-    result = acos(pwr - 1);
+    result += acos((src +  step*loop));
 
   dst[get_global_id(0)] = result;
 }
-- 
2.7.4