[Beignet] [PATCH 1/4] Benchmark: Add performance tests for workgroup reduce add/min/max
grigore.lupescu at intel.com
grigore.lupescu at intel.com
Wed Mar 9 13:00:46 UTC 2016
From: Grigore Lupescu <grigore.lupescu at intel.com>
Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com>
---
benchmark/CMakeLists.txt | 3 +-
benchmark/benchmark_workgroup_reduce.cpp | 212 +++++++++++++++++++++++++++++++
kernels/bench_workgroup_reduce.cl | 55 ++++++++
3 files changed, 269 insertions(+), 1 deletion(-)
create mode 100644 benchmark/benchmark_workgroup_reduce.cpp
create mode 100644 kernels/bench_workgroup_reduce.cl
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index dd33829..a6539d9 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -18,7 +18,8 @@ set (benchmark_sources
benchmark_copy_buffer_to_image.cpp
benchmark_copy_image_to_buffer.cpp
benchmark_copy_buffer.cpp
- benchmark_copy_image.cpp)
+ benchmark_copy_image.cpp
+ benchmark_workgroup_reduce.cpp)
SET(CMAKE_CXX_FLAGS "-DBUILD_BENCHMARK ${CMAKE_CXX_FLAGS}")
diff --git a/benchmark/benchmark_workgroup_reduce.cpp b/benchmark/benchmark_workgroup_reduce.cpp
new file mode 100644
index 0000000..c93ef26
--- /dev/null
+++ b/benchmark/benchmark_workgroup_reduce.cpp
@@ -0,0 +1,212 @@
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+#include <sys/time.h>
+
+/* Benchmark work_group_reduce_add() on uint data: times one NDRange, validates
+ * the output, and returns BANDWIDTH(global_size * reduce_loop, elapsed). */
+double benchmark_workgroup_add_uint(void)
+{
+  double elapsed = 0;
+  struct timeval start, stop;
+  const size_t global_size = 512 * 256;
+  const size_t local_size = 128;
+  const uint32_t reduce_loop = 10000;
+
+  /* Every work-item of a group carries its group index as input */
+  uint32_t *src = (uint32_t *)calloc(global_size, sizeof(uint32_t));
+  OCL_ASSERT(src != NULL);
+  for (uint32_t i = 0; i < global_size; i++) {
+    src[i] = (i / local_size);
+  }
+
+  /* Setup kernel and buffers */
+  OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup_reduce",
+                              "bench_workgroup_reduce_add_uint");
+  OCL_CREATE_BUFFER(buf[0], 0, global_size * sizeof(uint32_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, global_size * sizeof(uint32_t), NULL);
+
+  OCL_MAP_BUFFER(0);
+  memcpy(buf_data[0], src, global_size * sizeof(uint32_t));
+  OCL_UNMAP_BUFFER(0);
+  free(src); /* input now lives in buf[0]; the host copy used to be leaked */
+
+  globals[0] = global_size;
+  locals[0] = local_size;
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_uint), &reduce_loop);
+
+  /* Measure performance: wall-clock time around kernel launch + clFinish */
+  gettimeofday(&start, 0);
+  OCL_NDRANGE(1);
+  clFinish(queue);
+  gettimeofday(&stop, 0);
+  elapsed = time_subtract(&stop, &start, 0);
+
+  /* Check results: group g sums to g * local_size, its first element's index */
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < global_size; i += local_size) {
+    OCL_ASSERT(((uint32_t *)buf_data[1])[i] == i);
+  }
+  OCL_UNMAP_BUFFER(1);
+
+  return BANDWIDTH(global_size * reduce_loop, elapsed);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_add_uint, "M/sec");
+
+/* Benchmark work_group_reduce_min() on uint data: times one NDRange, validates
+ * the output, and returns BANDWIDTH(global_size * reduce_loop, elapsed). */
+double benchmark_workgroup_min_uint(void)
+{
+  double elapsed = 0;
+  struct timeval start, stop;
+  const size_t global_size = 512 * 256;
+  const size_t local_size = 128;
+  const uint32_t reduce_loop = 10000;
+
+  /* Input is the global index, so each group's minimum is its first element */
+  uint32_t *src = (uint32_t *)calloc(global_size, sizeof(uint32_t));
+  OCL_ASSERT(src != NULL);
+  for (uint32_t i = 0; i < global_size; i++) {
+    src[i] = i;
+  }
+
+  /* Setup kernel and buffers */
+  OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup_reduce",
+                              "bench_workgroup_reduce_min_uint");
+  OCL_CREATE_BUFFER(buf[0], 0, global_size * sizeof(uint32_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, global_size * sizeof(uint32_t), NULL);
+
+  OCL_MAP_BUFFER(0);
+  memcpy(buf_data[0], src, global_size * sizeof(uint32_t));
+  OCL_UNMAP_BUFFER(0);
+
+  globals[0] = global_size;
+  locals[0] = local_size;
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_uint), &reduce_loop);
+
+  /* Measure performance: wall-clock time around kernel launch + clFinish */
+  gettimeofday(&start, 0);
+  OCL_NDRANGE(1);
+  clFinish(queue);
+  gettimeofday(&stop, 0);
+  elapsed = time_subtract(&stop, &start, 0);
+
+  /* Check results at each group's midpoint: the min sits local_size/2 below it */
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = local_size / 2; i < global_size; i += local_size) {
+    OCL_ASSERT(((uint32_t *)buf_data[1])[i] == (src[i] - (local_size / 2)));
+  }
+  OCL_UNMAP_BUFFER(1);
+  free(src); /* still read by the check above; used to be leaked */
+
+  return BANDWIDTH(global_size * reduce_loop, elapsed);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_min_uint, "M/sec");
+
+/* Benchmark work_group_reduce_add() on float data: times one NDRange, validates
+ * the output, and returns BANDWIDTH(global_size * reduce_loop, elapsed). */
+double benchmark_workgroup_add_float(void)
+{
+  double elapsed = 0;
+  struct timeval start, stop;
+  const size_t global_size = 512 * 256;
+  const size_t local_size = 128;
+  const uint32_t reduce_loop = 10000;
+
+  /* Every work-item of a group carries its group index as input */
+  float *src = (float *)calloc(global_size, sizeof(float));
+  OCL_ASSERT(src != NULL);
+  for (uint32_t i = 0; i < global_size; i++)
+    src[i] = (i / local_size);
+
+  /* Setup kernel and buffers */
+  OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup_reduce",
+                              "bench_workgroup_reduce_add_float");
+  OCL_CREATE_BUFFER(buf[0], 0, global_size * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, global_size * sizeof(float), NULL);
+
+  OCL_MAP_BUFFER(0);
+  memcpy(buf_data[0], src, global_size * sizeof(float));
+  OCL_UNMAP_BUFFER(0);
+  free(src); /* input now lives in buf[0]; the host copy used to be leaked */
+
+  globals[0] = global_size;
+  locals[0] = local_size;
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_uint), &reduce_loop);
+
+  /* Measure performance: wall-clock time around kernel launch + clFinish */
+  gettimeofday(&start, 0);
+  OCL_NDRANGE(1);
+  clFinish(queue);
+  gettimeofday(&stop, 0);
+  elapsed = time_subtract(&stop, &start, 0);
+
+  /* Check results: sums are integers below 2^24, so exact float compare is safe */
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < global_size; i += local_size) {
+    OCL_ASSERT(((float *)buf_data[1])[i] == (float)i);
+  }
+  OCL_UNMAP_BUFFER(1);
+
+  return BANDWIDTH(global_size * reduce_loop, elapsed);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_add_float, "M/sec");
+
+/* Benchmark work_group_reduce_min() on float data: times one NDRange, validates
+ * the output, and returns BANDWIDTH(global_size * reduce_loop, elapsed). */
+double benchmark_workgroup_min_float(void)
+{
+  double elapsed = 0;
+  struct timeval start, stop;
+  const size_t global_size = 512 * 256;
+  const size_t local_size = 128;
+  const uint32_t reduce_loop = 10000;
+
+  /* Input is global index + 1, so each group's minimum is its first element */
+  float *src = (float *)calloc(global_size, sizeof(float));
+  OCL_ASSERT(src != NULL);
+  for (uint32_t i = 0; i < global_size; i++)
+    src[i] = 1.0f * i + 1;
+
+  /* Setup kernel and buffers */
+  OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup_reduce",
+                              "bench_workgroup_reduce_min_float");
+  OCL_CREATE_BUFFER(buf[0], 0, global_size * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, global_size * sizeof(float), NULL);
+
+  OCL_MAP_BUFFER(0);
+  memcpy(buf_data[0], src, global_size * sizeof(float));
+  OCL_UNMAP_BUFFER(0);
+
+  globals[0] = global_size;
+  locals[0] = local_size;
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_uint), &reduce_loop);
+
+  /* Measure performance: wall-clock time around kernel launch + clFinish */
+  gettimeofday(&start, 0);
+  OCL_NDRANGE(1);
+  clFinish(queue);
+  gettimeofday(&stop, 0);
+  elapsed = time_subtract(&stop, &start, 0);
+
+  /* Check results at each group's midpoint: the min sits local_size/2 below it */
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = local_size / 2; i < global_size; i += local_size) {
+    OCL_ASSERT(((float *)buf_data[1])[i] == (src[i] - (local_size / 2)));
+  }
+  OCL_UNMAP_BUFFER(1);
+  free(src); /* still read by the check above; used to be leaked */
+
+  return BANDWIDTH(global_size * reduce_loop, elapsed);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_min_float, "M/sec");
diff --git a/kernels/bench_workgroup_reduce.cl b/kernels/bench_workgroup_reduce.cl
new file mode 100644
index 0000000..ba1c709
--- /dev/null
+++ b/kernels/bench_workgroup_reduce.cl
@@ -0,0 +1,55 @@
+/* Benchmark kernel: repeats a work-group add reduction of one uint per work-item. */
+kernel void bench_workgroup_reduce_add_uint(global uint *src, global uint *dst,
+                                            uint reduce_loop)
+{
+  const size_t gid = get_global_id(0);
+  const uint val = src[gid];
+  uint total = work_group_reduce_add(val);
+
+  /* One reduction up front, then reduce_loop more; the last result is stored */
+  for (uint iter = 0; iter < reduce_loop; iter++)
+    total = work_group_reduce_add(val);
+  dst[gid] = total;
+}
+
+/* Benchmark kernel: repeats a work-group min reduction of one uint per work-item. */
+kernel void bench_workgroup_reduce_min_uint(global uint *src, global uint *dst,
+                                            uint reduce_loop)
+{
+  const size_t gid = get_global_id(0);
+  const uint val = src[gid];
+  uint lowest = work_group_reduce_min(val);
+
+  /* One reduction up front, then reduce_loop more; the last result is stored */
+  for (uint iter = 0; iter < reduce_loop; iter++)
+    lowest = work_group_reduce_min(val);
+  dst[gid] = lowest;
+}
+
+/* Benchmark kernel: repeats a work-group add reduction of one float per work-item. */
+kernel void bench_workgroup_reduce_add_float(global float *src, global float *dst,
+                                             uint reduce_loop)
+{
+  const size_t gid = get_global_id(0);
+  const float val = src[gid];
+  float total = work_group_reduce_add(val);
+
+  /* One reduction up front, then reduce_loop more; the last result is stored */
+  for (uint iter = 0; iter < reduce_loop; iter++)
+    total = work_group_reduce_add(val);
+  dst[gid] = total;
+}
+
+/* Benchmark kernel: repeats a work-group min reduction of one float per work-item. */
+kernel void bench_workgroup_reduce_min_float(global float *src, global float *dst,
+                                             uint reduce_loop)
+{
+  const size_t gid = get_global_id(0);
+  const float val = src[gid];
+  float lowest = work_group_reduce_min(val);
+
+  /* One reduction up front, then reduce_loop more; the last result is stored */
+  for (uint iter = 0; iter < reduce_loop; iter++)
+    lowest = work_group_reduce_min(val);
+  dst[gid] = lowest;
+}
--
2.5.0
More information about the Beignet
mailing list