[Beignet] [PATCH] Add benchmark for workgroup functions

He Junyan junyan.he at inbox.com
Sun Dec 6 22:25:54 PST 2015


Hi Grigore,

I notice that you just reuse the kernel from the utest as the benchmark kernel.
In this kernel, the workgroup function is called only once, while the time
diff calculated by your benchmark here includes the whole process of executing
a kernel on the GPU.
The OCL_NDRANGE itself and the LOAD and STORE in the kernel may take more
time than the workgroup function. So I think it is hard for us to judge the
performance based on this time diff.
I think maybe you could rewrite the kernel so that it calls the workgroup
function, for example _add, more than 100 times within one kernel; then the
time diff would be more meaningful.


On Fri, Dec 04, 2015 at 03:37:28PM +0200, Grigore Lupescu wrote:
> Date: Fri,  4 Dec 2015 15:37:28 +0200
> From: Grigore Lupescu <grigore.lupescu at intel.com>
> To: beignet at lists.freedesktop.org
> Subject: [Beignet]  [PATCH] Add benchmark for workgroup functions
> X-Mailer: git-send-email 2.1.4
> 
> Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com>
> ---
>  benchmark/CMakeLists.txt                    |   3 +-
>  benchmark/benchmark_workgroup_functions.cpp | 176 ++++++++++++++++++++++++++++
>  2 files changed, 178 insertions(+), 1 deletion(-)
>  create mode 100644 benchmark/benchmark_workgroup_functions.cpp
> 
> diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
> index dd33829..fd7fd7d 100644
> --- a/benchmark/CMakeLists.txt
> +++ b/benchmark/CMakeLists.txt
> @@ -18,7 +18,8 @@ set (benchmark_sources
>    benchmark_copy_buffer_to_image.cpp
>    benchmark_copy_image_to_buffer.cpp
>    benchmark_copy_buffer.cpp
> -  benchmark_copy_image.cpp)
> +  benchmark_copy_image.cpp
> +  benchmark_workgroup_functions.cpp)
>  
>  
>  SET(CMAKE_CXX_FLAGS "-DBUILD_BENCHMARK ${CMAKE_CXX_FLAGS}")
> diff --git a/benchmark/benchmark_workgroup_functions.cpp b/benchmark/benchmark_workgroup_functions.cpp
> new file mode 100644
> index 0000000..81403a0
> --- /dev/null
> +++ b/benchmark/benchmark_workgroup_functions.cpp
> @@ -0,0 +1,176 @@
> +#include <cstdint>
> +#include <cstdlib>
> +#include <cstring>
> +#include <iostream>
> +#include "utest_helper.hpp"
> +#include <sys/time.h>
> +
> +double benchmark_workgroup_add_uint(void)
> +{
> +	cl_int ret;
> +	struct timeval start,stop;
> +	const size_t set_size = 256;
> +	const size_t set_num = set_size * set_size;
> +	size_t set_num_work = set_num;
> +	uint32_t* src = NULL; /* input set will be generated */
> +
> +	cl_mem sub_buf_in;
> +	cl_mem sub_buf_out;
> +	cl_buffer_region buf_region_in;
> +	cl_buffer_region buf_region_out;
> +
> +	buf_region_in.size = set_size * sizeof(uint32_t);
> +	buf_region_in.origin = 0;
> +	buf_region_out.size = set_size * sizeof(uint32_t);
> +	buf_region_out.origin = 0;
> +
> +	/* Each set is of the form (1, 0, 0, ..0) */
> +	src = (uint32_t*)calloc(sizeof(uint32_t), set_num * set_size);
> +	OCL_ASSERT(src != NULL);
> +	for(uint32_t i = 0; i < set_num * set_size; i++)
> +		if((i % set_size) == 0)
> +			src[i] = 1;
> +
> +	/* Setup kernel and buffers */
> +	OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
> +			"compiler_workgroup_reduce_add_uint");
> +	OCL_CREATE_BUFFER(buf[0], 0, (set_num * set_size) * sizeof(uint32_t), NULL);
> +	OCL_CREATE_BUFFER(buf[1], 0, (set_num * set_size) * sizeof(uint32_t), NULL);
> +
> +	OCL_MAP_BUFFER(0);
> +	memcpy(buf_data[0], src, set_num* set_size * sizeof(uint32_t));
> +	OCL_UNMAP_BUFFER(0);
> +
> +	globals[0] = set_size;
> +	locals[0] = set_size;
> +
> +	/* Measure performance */
> +	gettimeofday(&start,0);
> +	while(set_num_work > 0){
> +		/* Perform reductions, subBuffers with offsets */
> +		for(uint32_t i = 0; i < set_num; i++){
> +			sub_buf_in = clCreateSubBuffer(buf[0], 0,
> +					CL_BUFFER_CREATE_TYPE_REGION, &buf_region_in, &ret);
> +			OCL_ASSERT(ret == 0);
> +			sub_buf_out = clCreateSubBuffer(buf[1], 0,
> +					CL_BUFFER_CREATE_TYPE_REGION, &buf_region_out, &ret);
> +			OCL_ASSERT(ret == 0);
> +
> +			OCL_SET_ARG(0, sizeof(cl_mem), &sub_buf_in);
> +			OCL_SET_ARG(1, sizeof(cl_mem), &sub_buf_out);
> +			OCL_NDRANGE(1);
> +
> +			buf_region_in.origin += set_size * sizeof(uint32_t);
> +			buf_region_out.origin += set_size * sizeof(uint32_t);
> +		}
> +		/* Prepare memory for next set of reductions */
> +		OCL_MAP_BUFFER(0);
> +		OCL_MAP_BUFFER(1);
> +		for (uint32_t i = 0; i < set_num_work; i++) {
> +			((uint32_t *)buf_data[0])[i] =
> +					((uint32_t *)buf_data[1])[i * set_size];
> +		}
> +		OCL_UNMAP_BUFFER(0);
> +		OCL_UNMAP_BUFFER(1);
> +
> +		set_num_work /= set_size;
> +		buf_region_in.origin = 0;
> +		buf_region_out.origin = 0;
> +	}
> +	gettimeofday(&stop,0);
> +	double elapsed = time_subtract(&stop, &start, 0);
> +
> +	/* Check result, final sum */
> +	OCL_MAP_BUFFER(1);
> +	//printf("%u ", ((uint32_t *)buf_data[1])[0]);
> +	OCL_ASSERT(((uint32_t *)buf_data[1])[0] == set_num);
> +	OCL_UNMAP_BUFFER(1);
> +
> +	return BANDWIDTH(set_num * set_size * sizeof(uint32_t) * 100, elapsed);
> +}
> +MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_add_uint, "Mops/S");
> +
> +double benchmark_workgroup_add_float(void)
> +{
> +	cl_int ret;
> +	struct timeval start,stop;
> +	const size_t set_size = 256;
> +	const size_t set_num = set_size * set_size;
> +	size_t set_num_work = set_num;
> +	float* src = NULL; /* input set will be generated */
> +
> +	cl_mem sub_buf_in;
> +	cl_mem sub_buf_out;
> +	cl_buffer_region buf_region_in;
> +	cl_buffer_region buf_region_out;
> +
> +	buf_region_in.size = set_size * sizeof(float);
> +	buf_region_in.origin = 0;
> +	buf_region_out.size = set_size * sizeof(float);
> +	buf_region_out.origin = 0;
> +
> +	/* Each set is of the form (1, 0, 0, ..0) */
> +	src = (float*)calloc(sizeof(float), set_num * set_size);
> +	OCL_ASSERT(src != NULL);
> +	for(uint32_t i = 0; i < set_num * set_size; i++)
> +		if((i % set_size) == 0)
> +			src[i] = 1;
> +
> +	/* Setup kernel and buffers */
> +	OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
> +			"compiler_workgroup_reduce_add_float");
> +	OCL_CREATE_BUFFER(buf[0], 0, (set_num * set_size) * sizeof(float), NULL);
> +	OCL_CREATE_BUFFER(buf[1], 0, (set_num * set_size) * sizeof(float), NULL);
> +
> +	OCL_MAP_BUFFER(0);
> +	memcpy(buf_data[0], src, set_num* set_size * sizeof(float));
> +	OCL_UNMAP_BUFFER(0);
> +
> +	globals[0] = set_size;
> +	locals[0] = set_size;
> +
> +	/* Measure performance */
> +	gettimeofday(&start,0);
> +	while(set_num_work > 0){
> +		/* Perform reductions, subBuffers with offsets */
> +		for(uint32_t i = 0; i < set_num; i++){
> +			sub_buf_in = clCreateSubBuffer(buf[0], 0,
> +					CL_BUFFER_CREATE_TYPE_REGION, &buf_region_in, &ret);
> +			OCL_ASSERT(ret == 0);
> +			sub_buf_out = clCreateSubBuffer(buf[1], 0,
> +					CL_BUFFER_CREATE_TYPE_REGION, &buf_region_out, &ret);
> +			OCL_ASSERT(ret == 0);
> +
> +			OCL_SET_ARG(0, sizeof(cl_mem), &sub_buf_in);
> +			OCL_SET_ARG(1, sizeof(cl_mem), &sub_buf_out);
> +			OCL_NDRANGE(1);
> +
> +			buf_region_in.origin += set_size * sizeof(float);
> +			buf_region_out.origin += set_size * sizeof(float);
> +		}
> +		/* Prepare memory for next set of reductions */
> +		OCL_MAP_BUFFER(0);
> +		OCL_MAP_BUFFER(1);
> +		for (uint32_t i = 0; i < set_num_work; i++) {
> +			((float *)buf_data[0])[i] =
> +					((float *)buf_data[1])[i * set_size];
> +		}
> +		OCL_UNMAP_BUFFER(0);
> +		OCL_UNMAP_BUFFER(1);
> +
> +		set_num_work /= set_size;
> +		buf_region_in.origin = 0;
> +		buf_region_out.origin = 0;
> +	}
> +	gettimeofday(&stop,0);
> +	double elapsed = time_subtract(&stop, &start, 0);
> +
> +	/* Check result, final sum */
> +	OCL_MAP_BUFFER(1);
> +	//printf("%f ", ((float *)buf_data[1])[0]);
> +	OCL_ASSERT(((float *)buf_data[1])[0] == set_num);
> +	OCL_UNMAP_BUFFER(1);
> +
> +	return BANDWIDTH(set_num * set_size * sizeof(float) * 100, elapsed);
> +}
> +MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_add_float, "Mflops/S");
> -- 
> 2.1.4
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet




More information about the Beignet mailing list