[Beignet] [PATCH 1/4] Benchmark: Evaluate math performance on intervals

Mon Jun 27 19:03:45 UTC 2016

From: Grigore Lupescu <grigore.lupescu at intel.com>

Add benchmarks that measure math function throughput on chosen input intervals.
Tests: pow, exp2, exp, exp10, log2, log, log10, sqrt, sin, cos, tan, asin, acos

Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com>
---
 benchmark/CMakeLists.txt     |   3 +-
 benchmark/benchmark_math.cpp | 126 ++++++++++++++++++++
 kernels/bench_math.cl        | 272 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 400 insertions(+), 1 deletion(-)
 create mode 100644 benchmark/benchmark_math.cpp
 create mode 100644 kernels/bench_math.cl

diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index 4319ccc..85416e0 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -19,7 +19,8 @@ set (benchmark_sources
   benchmark_copy_image_to_buffer.cpp
   benchmark_copy_buffer.cpp
   benchmark_copy_image.cpp
-  benchmark_workgroup.cpp)
+  benchmark_workgroup.cpp
+  benchmark_math.cpp)
 
 
 SET(CMAKE_CXX_FLAGS "-DBUILD_BENCHMARK ${CMAKE_CXX_FLAGS}")
diff --git a/benchmark/benchmark_math.cpp b/benchmark/benchmark_math.cpp
new file mode 100644
index 0000000..72bc316
--- /dev/null
+++ b/benchmark/benchmark_math.cpp
@@ -0,0 +1,126 @@
+#include "utests/utest_helper.hpp"
+#include <sys/time.h>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+#include <sys/time.h>
+
+/* Times one launch of kernel str_kernel (built from str_filename) over a
+ * generated input set and returns BANDWIDTH(global_size * loop, elapsed). */
+double benchmark_generic_math(const char* str_filename,
+                              const char* str_kernel)
+{
+  double elapsed = 0;
+  struct timeval start,stop;
+  const size_t global_size = 1024 * 1024;
+  const size_t local_size = 64;
+
+  /* Compute math OP, loop times on global size */
+  cl_float base = 1.000002;
+  cl_float pwr = 1.0102003;
+  uint32_t loop = 1000;
+
+  /* Setup kernel and buffers */
+  OCL_CALL(cl_kernel_init, str_filename, str_kernel, SOURCE, "");
+
+  OCL_CREATE_BUFFER(buf[0], 0, (global_size) * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, (global_size) * sizeof(float), NULL);
+
+  /* Generate the input set in place in the mapped buffer (element i is
+   * base + i * (base - 1)), so no temporary host array has to be freed. */
+  OCL_MAP_BUFFER(0);
+  float* input = (float*)buf_data[0];
+  for(uint32_t i = 0; i < global_size; i++)
+    input[i] = base + i * (base - 1);
+  OCL_UNMAP_BUFFER(0);
+
+  globals[0] = global_size;
+  locals[0] = local_size;
+
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_float), &pwr);
+  OCL_SET_ARG(3, sizeof(cl_uint), &loop);
+
+  /* Measure performance: wall-clock time of one launch, including clFinish */
+  gettimeofday(&start,0);
+  OCL_NDRANGE(1);
+  clFinish(queue);
+  gettimeofday(&stop,0);
+  elapsed = time_subtract(&stop, &start, 0);
+
+  /* Show compute results: print every 8192th output element */
+  OCL_MAP_BUFFER(1);
+  for(uint32_t i = 0; i < global_size; i += 8192)
+    printf("\t%.3f", ((float*)buf_data[1])[i]);
+  OCL_UNMAP_BUFFER(1);
+
+  return BANDWIDTH(global_size * loop, elapsed);
+}
+
+/* Benchmarks kernel "bench_math_pow" through benchmark_generic_math. */
+double benchmark_math_pow(void)
+{ return benchmark_generic_math("bench_math.cl", "bench_math_pow"); }
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_pow, "Mop/s");
+
+/* Benchmarks kernel "bench_math_exp2" through benchmark_generic_math. */
+double benchmark_math_exp2(void)
+{ return benchmark_generic_math("bench_math.cl", "bench_math_exp2"); }
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_exp2, "Mop/s");
+
+/* Benchmarks kernel "bench_math_exp" through benchmark_generic_math. */
+double benchmark_math_exp(void)
+{ return benchmark_generic_math("bench_math.cl", "bench_math_exp"); }
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_exp, "Mop/s");
+
+/* Benchmarks kernel "bench_math_exp10" through benchmark_generic_math. */
+double benchmark_math_exp10(void)
+{ return benchmark_generic_math("bench_math.cl", "bench_math_exp10"); }
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_exp10, "Mop/s");
+
+/* Benchmarks kernel "bench_math_log2" through benchmark_generic_math. */
+double benchmark_math_log2(void)
+{ return benchmark_generic_math("bench_math.cl", "bench_math_log2"); }
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_log2, "Mop/s");
+
+/* Benchmarks kernel "bench_math_log" through benchmark_generic_math. */
+double benchmark_math_log(void)
+{ return benchmark_generic_math("bench_math.cl", "bench_math_log"); }
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_log, "Mop/s");
+
+/* Benchmarks kernel "bench_math_log10" through benchmark_generic_math. */
+double benchmark_math_log10(void)
+{ return benchmark_generic_math("bench_math.cl", "bench_math_log10"); }
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_log10, "Mop/s");
+
+/* Benchmarks kernel "bench_math_sqrt" through benchmark_generic_math. */
+double benchmark_math_sqrt(void)
+{ return benchmark_generic_math("bench_math.cl", "bench_math_sqrt"); }
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_sqrt, "Mop/s");
+
+/* Benchmarks kernel "bench_math_sin" through benchmark_generic_math. */
+double benchmark_math_sin(void)
+{ return benchmark_generic_math("bench_math.cl", "bench_math_sin"); }
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_sin, "Mop/s");
+
+/* Benchmarks kernel "bench_math_cos" through benchmark_generic_math. */
+double benchmark_math_cos(void)
+{ return benchmark_generic_math("bench_math.cl", "bench_math_cos"); }
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_cos, "Mop/s");
+
+/* Benchmarks kernel "bench_math_tan" through benchmark_generic_math. */
+double benchmark_math_tan(void)
+{ return benchmark_generic_math("bench_math.cl", "bench_math_tan"); }
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_tan, "Mop/s");
+
+/* Benchmarks kernel "bench_math_asin" through benchmark_generic_math. */
+double benchmark_math_asin(void)
+{ return benchmark_generic_math("bench_math.cl", "bench_math_asin"); }
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_asin, "Mop/s");
+
+/* Benchmarks kernel "bench_math_acos" through benchmark_generic_math. */
+double benchmark_math_acos(void)
+{ return benchmark_generic_math("bench_math.cl", "bench_math_acos"); }
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_acos, "Mop/s");
diff --git a/kernels/bench_math.cl b/kernels/bench_math.cl
new file mode 100644
index 0000000..8d85d51
--- /dev/null
+++ b/kernels/bench_math.cl
@@ -0,0 +1,272 @@
+//#define BENCHMARK_NATIVE 1
+//#define BENCHMARK_INTERNAL_FAST 2
+
+/* pow benchmark: starting from src[gid], raises the running value to pwr
+ * loop times and stores the last value in dst[gid]. */
+kernel void bench_math_pow(
+  global float *src,
+  global float *dst,
+  float pwr,
+  uint loop)
+{
+  const size_t gid = get_global_id(0);
+  float acc = src[gid];
+  for(uint n = loop; n != 0; --n) {
+#if defined(BENCHMARK_NATIVE)
+    acc = native_powr(acc, pwr); /* calls native */
+#else
+    acc = pow(acc, pwr); /* calls internal slow */
+#endif
+  }
+  dst[gid] = acc;
+}
+
+/* exp2 benchmark (exp2 is native): iterates acc = exp2(acc) * 0.1f loop
+ * times from src[gid] and stores the result in dst[gid]; pwr is unused. */
+kernel void bench_math_exp2(
+  global float *src,
+  global float *dst,
+  float pwr,
+  uint loop)
+{
+  const size_t gid = get_global_id(0);
+  float acc = src[gid];
+  for(uint n = loop; n != 0; --n)
+    acc = exp2(acc) * 0.1f;
+  dst[gid] = acc;
+}
+
+/* exp benchmark: feeds the running value back into exp loop times, with the
+ * argument offset from -0x1.6p1; pwr is unused.
+ * Note: calls internal fast (native) if (x > -0x1.6p1 && x < 0x1.6p1) */
+kernel void bench_math_exp(
+  global float *src,
+  global float *dst,
+  float pwr,
+  uint loop)
+{
+  const size_t gid = get_global_id(0);
+  float acc = src[gid];
+  for(uint n = loop; n != 0; --n) {
+#if defined(BENCHMARK_NATIVE)
+    acc = native_exp((float)-0x1.6p1 - acc * 0.1f); /* calls native */
+#elif defined(BENCHMARK_INTERNAL_FAST)
+    acc = exp((float)-0x1.6p1 + acc * 0.1f); /* calls internal fast */
+#else
+    acc = exp((float)-0x1.6p1 - acc * 0.1f); /* calls internal slow */
+#endif
+  }
+
+  dst[gid] = acc;
+}
+
+/* exp10 benchmark: feeds the running value back into exp10 loop times.
+ * Note: calls internal fast (native) if (x < -0x1.4p+5) || (x > +0x1.4p+5).
+ * Every variant uses a negative argument: 10^(+0x1.4p+5) = 1e40 is not finite in float. */
+kernel void bench_math_exp10(
+  global float *src,
+  global float *dst,
+  float pwr,
+  uint loop)
+{
+  const size_t gid = get_global_id(0);
+  float acc = src[gid];
+  for(uint n = loop; n != 0; --n) {
+#if defined(BENCHMARK_NATIVE)
+    acc = native_exp10((float)-0x1.4p+5 - acc * 0.1f); /* calls native */
+#elif defined(BENCHMARK_INTERNAL_FAST)
+    acc = exp10((float)-0x1.4p+5 - acc * 0.1f); /* calls internal fast */
+#else
+    acc = exp10((float)-0x1.2p+5 - acc * 0.1f); /* calls internal slow */
+#endif
+  }
+
+  dst[gid] = acc;
+}
+
+/* log2 benchmark: feeds the running value back into log2 loop times, with
+ * the argument offset from 0x1.1p0; pwr is unused.
+ * Note: calls internal fast (native) if (x > 0x1.1p0) */
+kernel void bench_math_log2(
+  global float *src,
+  global float *dst,
+  float pwr,
+  uint loop)
+{
+  const size_t gid = get_global_id(0);
+  float acc = src[gid];
+  for(uint n = loop; n != 0; --n) {
+#if defined(BENCHMARK_NATIVE)
+    acc = native_log2((float)0x1.1p0 + acc * 0.0001f); /* calls native */
+#elif defined(BENCHMARK_INTERNAL_FAST)
+    acc = log2((float)0x1.1p0 + acc * 0.0001f); /* calls internal fast */
+#else
+    acc = log2((float)0x1.1p0 - acc * 0.0001f); /* calls internal slow */
+#endif
+  }
+
+  dst[gid] = acc;
+}
+
+/* log benchmark: feeds the running value back into log loop times, with
+ * the argument offset from 0x1.1p0; pwr is unused.
+ * Note: calls internal fast (native) if (x > 0x1.1p0) */
+kernel void bench_math_log(
+  global float *src,
+  global float *dst,
+  float pwr,
+  uint loop)
+{
+  const size_t gid = get_global_id(0);
+  float acc = src[gid];
+  for(uint n = loop; n != 0; --n) {
+#if defined(BENCHMARK_NATIVE)
+    acc = native_log((float)0x1.1p0 + acc * 0.0001f); /* calls native */
+#elif defined(BENCHMARK_INTERNAL_FAST)
+    acc = log((float)0x1.1p0 + acc * 0.0001f); /* calls internal fast */
+#else
+    acc = log((float)0x1.1p0 - acc * 0.0001f); /* calls internal slow */
+#endif
+  }
+
+  dst[gid] = acc;
+}
+
+/* log10 benchmark: feeds the running value back into log10 loop times, with
+ * the argument offset from 0x1.1p0; pwr is unused.
+ * Note: calls internal fast (native) if (x > 0x1.1p0) */
+kernel void bench_math_log10(
+  global float *src,
+  global float *dst,
+  float pwr,
+  uint loop)
+{
+  const size_t gid = get_global_id(0);
+  float acc = src[gid];
+  for(uint n = loop; n != 0; --n) {
+#if defined(BENCHMARK_NATIVE)
+    acc = native_log10((float)0x1.1p0 + acc * 0.0001f); /* calls native */
+#elif defined(BENCHMARK_INTERNAL_FAST)
+    acc = log10((float)0x1.1p0 + acc * 0.0001f); /* calls internal fast */
+#else
+    acc = log10((float)0x1.1p0 - acc * 0.0001f); /* calls internal slow */
+#endif
+  }
+
+  dst[gid] = acc;
+}
+
+/* sqrt benchmark: each of the loop passes evaluates sqrt twice, on the
+ * running value and on pwr + running value, and sums the two results. */
+kernel void bench_math_sqrt(
+  global float *src,
+  global float *dst,
+  float pwr,
+  uint loop)
+{
+  const size_t gid = get_global_id(0);
+  float acc = src[gid];
+  for(uint n = loop; n != 0; --n)
+    acc = sqrt(acc) + sqrt(pwr + acc);
+  dst[gid] = acc;
+}
+
+/* sin benchmark: applies sin to the running value loop times. The 0x1p-16
+ * rescale after each call is done only in the non-native branch. */
+kernel void bench_math_sin(
+  global float *src,
+  global float *dst,
+  float pwr,
+  uint loop)
+{
+  const size_t gid = get_global_id(0);
+  float acc = src[gid];
+  for(uint n = loop; n != 0; --n) {
+#if defined(BENCHMARK_NATIVE)
+    acc = native_sin(acc); /* calls native */
+#else
+    acc = sin(acc); /* calls internal, random complexity */
+    //acc = sin(0.1f + acc); /* calls internal, (1) no reduction */
+    //acc = sin(2.f + acc); /* calls internal, (2) fast reduction */
+    //acc = sin(4001 + acc); /* calls internal, (3) slow reduction */
+    acc *= 0x1p-16;
+#endif
+  }
+
+  dst[gid] = acc;
+}
+
+/* cos benchmark: applies cos to the running value loop times. The 0x1p-16
+ * rescale after each call is done only in the non-native branch. */
+kernel void bench_math_cos(
+  global float *src,
+  global float *dst,
+  float pwr,
+  uint loop)
+{
+  const size_t gid = get_global_id(0);
+  float acc = src[gid];
+  for(uint n = loop; n != 0; --n) {
+#if defined(BENCHMARK_NATIVE)
+    acc = native_cos(acc); /* calls native */
+#else
+    acc = cos(acc); /* calls internal, random complexity */
+    //acc = cos(0.1f + acc); /* calls internal, (1) no reduction */
+    //acc = cos(2.f + acc); /* calls internal, (2) fast reduction */
+    //acc = cos(4001.f + acc); /* calls internal, (3) slow reduction */
+    acc *= 0x1p-16;
+#endif
+  }
+  dst[gid] = acc;
+}
+
+/* tan benchmark: applies tan (native_tan when BENCHMARK_NATIVE is defined)
+ * to the running value loop times; pwr is unused. */
+kernel void bench_math_tan(
+  global float *src,
+  global float *dst,
+  float pwr,
+  uint loop)
+{
+  const size_t gid = get_global_id(0);
+  float acc = src[gid];
+  for(uint n = loop; n != 0; --n) {
+#if defined(BENCHMARK_NATIVE)
+    acc = native_tan(acc); /* calls native */
+#else
+    acc = tan(acc); /* calls internal slow */
+#endif
+  }
+
+  dst[gid] = acc;
+}
+
+/* asin benchmark: iterates result = asin(pwr - 1 + result * 0.1f) loop times.
+ * Feeding result back keeps the call from being loop-invariant. The argument
+ * stays within [-1, 1] for the host's pwr (about 1.01) and inputs below 4. */
+kernel void bench_math_asin(
+  global float *src,
+  global float *dst,
+  float pwr,
+  uint loop)
+{
+  float result = src[get_global_id(0)];
+  for(; loop > 0; loop--)
+    result = asin(pwr - 1 + result * 0.1f);
+  dst[get_global_id(0)] = result;
+}
+
+/* acos benchmark: iterates result = acos(pwr - 1 + result * 0.1f) loop times.
+ * Feeding result back keeps the call from being loop-invariant. The argument
+ * stays within [-1, 1] for the host's pwr (about 1.01) and inputs below 4. */
+kernel void bench_math_acos(
+  global float *src,
+  global float *dst,
+  float pwr,
+  uint loop)
+{
+  float result = src[get_global_id(0)];
+  for(; loop > 0; loop--)
+    result = acos(pwr - 1 + result * 0.1f);
+  dst[get_global_id(0)] = result;
+}
-- 
2.5.0