[Beignet] [PATCH] Add benchmark for workgroup functions

Grigore Lupescu grigore.lupescu at intel.com
Mon Dec 14 00:17:04 PST 2015


Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com>
---
 benchmark/CMakeLists.txt                 |   3 +-
 benchmark/benchmark_workgroup_reduce.cpp | 108 +++++++++++++++++++++++++++++++
 kernels/bench_workgroup_reduce.cl        |  27 ++++++++
 3 files changed, 137 insertions(+), 1 deletion(-)
 create mode 100644 benchmark/benchmark_workgroup_reduce.cpp
 create mode 100644 kernels/bench_workgroup_reduce.cl

diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index dd33829..a6539d9 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -18,7 +18,8 @@ set (benchmark_sources
   benchmark_copy_buffer_to_image.cpp
   benchmark_copy_image_to_buffer.cpp
   benchmark_copy_buffer.cpp
-  benchmark_copy_image.cpp)
+  benchmark_copy_image.cpp
+  benchmark_workgroup_reduce.cpp)
 
 
 SET(CMAKE_CXX_FLAGS "-DBUILD_BENCHMARK ${CMAKE_CXX_FLAGS}")
diff --git a/benchmark/benchmark_workgroup_reduce.cpp b/benchmark/benchmark_workgroup_reduce.cpp
new file mode 100644
index 0000000..b802768
--- /dev/null
+++ b/benchmark/benchmark_workgroup_reduce.cpp
@@ -0,0 +1,108 @@
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+#include <sys/time.h>
+
+/* Times the bench_workgroup_reduce_add_uint kernel and returns its BANDWIDTH() score. */
+double benchmark_workgroup_add_uint(void)
+{
+	double elapsed = 0;
+	struct timeval start,stop;
+	const size_t set_size = 128;		/* used as locals[0]; also the expected sum */
+	const size_t set_num = 512;		/* number of sets; globals[0] = set_num * set_size */
+	const uint32_t reduce_loop = 10000;	/* kernel argument 2: in-kernel repeat count */
+
+	/* Generate the input set: every element is 1 */
+	uint32_t* src = (uint32_t*)calloc(set_num * set_size, sizeof(uint32_t));
+	OCL_ASSERT(src != NULL);
+	for(size_t i = 0; i < set_num * set_size; i++)
+		src[i] = 1;
+
+	/* Setup kernel and buffers */
+	OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup_reduce", "bench_workgroup_reduce_add_uint");
+
+	OCL_CREATE_BUFFER(buf[0], 0, (set_num * set_size) * sizeof(uint32_t), NULL);
+	OCL_CREATE_BUFFER(buf[1], 0, (set_num * set_size) * sizeof(uint32_t), NULL);
+
+	OCL_MAP_BUFFER(0);
+	memcpy(buf_data[0], src, set_num * set_size * sizeof(uint32_t));
+	OCL_UNMAP_BUFFER(0);
+	free(src);	/* host staging copy: its contents were copied into buf_data[0] above */
+
+	globals[0] = set_num * set_size;
+	locals[0] = set_size;
+
+	OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+	OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+	OCL_SET_ARG(2, sizeof(cl_uint), &reduce_loop);
+
+	/* Measure performance: wall-clock time from the enqueue until clFinish() returns */
+	gettimeofday(&start,0);
+	OCL_NDRANGE(1);
+	clFinish(queue);
+	gettimeofday(&stop,0);
+	elapsed = time_subtract(&stop, &start, 0);
+
+	/* Check results: the element at offset 1 of each set must equal set_size */
+	OCL_MAP_BUFFER(1);
+	for(size_t i = 1; i < set_num * set_size; i += set_size){
+		OCL_ASSERT(((uint32_t*)buf_data[1])[i] == set_size);
+	}
+	OCL_UNMAP_BUFFER(1);
+
+	return BANDWIDTH(set_num * set_size * reduce_loop, elapsed);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_add_uint, "Msum/S");
+
+/* Times the bench_workgroup_reduce_add_float kernel and returns its BANDWIDTH() score. */
+double benchmark_workgroup_add_float(void)
+{
+	double elapsed = 0;
+	struct timeval start,stop;
+	const size_t set_size = 128;		/* used as locals[0]; also the expected sum */
+	const size_t set_num = 512;		/* number of sets; globals[0] = set_num * set_size */
+	const uint32_t reduce_loop = 10000;	/* kernel argument 2: in-kernel repeat count */
+
+	/* Generate the input set: every element is 1.0f */
+	float* src = (float*)calloc(set_num * set_size, sizeof(float));
+	OCL_ASSERT(src != NULL);
+	for(size_t i = 0; i < set_num * set_size; i++)
+		src[i] = 1.0f;
+
+	/* Setup kernel and buffers */
+	OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup_reduce", "bench_workgroup_reduce_add_float");
+
+	OCL_CREATE_BUFFER(buf[0], 0, (set_num * set_size) * sizeof(float), NULL);
+	OCL_CREATE_BUFFER(buf[1], 0, (set_num * set_size) * sizeof(float), NULL);
+
+	OCL_MAP_BUFFER(0);
+	memcpy(buf_data[0], src, set_num * set_size * sizeof(float));
+	OCL_UNMAP_BUFFER(0);
+	free(src);	/* host staging copy: its contents were copied into buf_data[0] above */
+
+	globals[0] = set_num * set_size;
+	locals[0] = set_size;
+
+	OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+	OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+	OCL_SET_ARG(2, sizeof(cl_uint), &reduce_loop);
+
+	/* Measure performance: wall-clock time from the enqueue until clFinish() returns */
+	gettimeofday(&start,0);
+	OCL_NDRANGE(1);
+	clFinish(queue);
+	gettimeofday(&stop,0);
+	elapsed = time_subtract(&stop, &start, 0);
+
+	/* Check results: the element at offset 1 of each set must equal set_size */
+	OCL_MAP_BUFFER(1);
+	for(size_t i = 1; i < set_num * set_size; i += set_size){
+		OCL_ASSERT(((float*)buf_data[1])[i] == set_size);
+	}
+	OCL_UNMAP_BUFFER(1);
+
+	return BANDWIDTH(set_num * set_size * reduce_loop, elapsed);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_add_float, "Msum/S");
diff --git a/kernels/bench_workgroup_reduce.cl b/kernels/bench_workgroup_reduce.cl
new file mode 100644
index 0000000..5901076
--- /dev/null
+++ b/kernels/bench_workgroup_reduce.cl
@@ -0,0 +1,27 @@
+/* Reduces one uint per work-item across the work-group, repeatedly; stores the last sum. */
+kernel void bench_workgroup_reduce_add_uint(global uint *src, global uint *dst, uint reduce_loop)
+{
+	/* The input is indexed by local id, while the output below is indexed by global id. */
+	const uint item = src[get_local_id(0)];
+	uint total = work_group_reduce_add(item);	/* one reduction before the loop */
+	/* Then reduce_loop further reductions of the same value. */
+	while(reduce_loop > 0){
+		total = work_group_reduce_add(item);
+		reduce_loop--;
+	}
+	dst[get_global_id(0)] = total;
+}
+
+/* Reduces one float per work-item across the work-group, repeatedly; stores the last sum. */
+kernel void bench_workgroup_reduce_add_float(global float *src, global float *dst, uint reduce_loop)
+{
+	/* The input is indexed by local id, while the output below is indexed by global id. */
+	const float item = src[get_local_id(0)];
+	float total = work_group_reduce_add(item);	/* one reduction before the loop */
+	/* Then reduce_loop further reductions of the same value. */
+	while(reduce_loop > 0){
+		total = work_group_reduce_add(item);
+		reduce_loop--;
+	}
+	dst[get_global_id(0)] = total;
+}
-- 
2.1.4



More information about the Beignet mailing list