[Beignet] [PATCH] benchmark test for global read and write bandwidth
Yang, Rong R
rong.r.yang at intel.com
Mon Aug 24 00:49:53 PDT 2015
Some comments, thanks.
> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> Meng Mengmeng
> Sent: Saturday, August 15, 2015 22:48
> To: beignet at lists.freedesktop.org
> Cc: Meng, Mengmeng
> Subject: [Beignet] [PATCH] benchmark test for global read and write
> bandwidth
>
> ---
> benchmark/CMakeLists.txt | 3 ++-
> benchmark/benchmark_io_test.cpp | 48
> +++++++++++++++++++++++++++++++++++++++++
> kernels/benchmark_io.cl | 26 ++++++++++++++++++++++
> 3 files changed, 76 insertions(+), 1 deletion(-) create mode 100644
> benchmark/benchmark_io_test.cpp create mode 100644
> kernels/benchmark_io.cl
>
> diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index
> 3e43a21..1b9fe54 100644
> --- a/benchmark/CMakeLists.txt
> +++ b/benchmark/CMakeLists.txt
> @@ -16,7 +16,8 @@ set (benchmark_sources
> benchmark_read_buffer.cpp
> benchmark_read_image.cpp
> benchmark_copy_buffer_to_image.cpp
> - benchmark_copy_image_to_buffer.cpp)
> + benchmark_copy_image_to_buffer.cpp
> + benchmark_io_test)
>
>
> SET(CMAKE_CXX_FLAGS "-DBUILD_BENCHMARK ${CMAKE_CXX_FLAGS}")
> diff --git a/benchmark/benchmark_io_test.cpp
> b/benchmark/benchmark_io_test.cpp new file mode 100644 index
> 0000000..5c95ce3
> --- /dev/null
> +++ b/benchmark/benchmark_io_test.cpp
> @@ -0,0 +1,48 @@
> +#include "utests/utest_helper.hpp"
> +#include <sys/time.h>
> +
> +struct timeval start,stop;
> +const size_t n = 1024 * 1024;
> +int count = 16;
> +const size_t sz = 4 * n * count;
Why is the size 4 * n * count?
> +
> +#define BENCH_address(V,T) \
> +static double benchmark ##V(void) \
> +{ \
> + \
> + OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_ONLY, sz * sizeof(float),
> +NULL); \
The CPU also accesses buf[0], so the cl_mem_flags should not be CL_MEM_READ_ONLY, right?
> + OCL_CREATE_BUFFER(buf[1], CL_MEM_READ_ONLY, sz * sizeof(float),
> +NULL); \ \
Should this be CL_MEM_WRITE_ONLY?
> + OCL_CREATE_KERNEL_FROM_FILE("benchmark_io",T); \ \
> + OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);\
> + OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]); \ \
> + OCL_MAP_BUFFER(0); \
> + for (size_t i = 0; i < sz; i ++) { \
> + ((float *)(buf_data[0]))[i] = rand(); \
> + } \
> + OCL_UNMAP_BUFFER(0);\
> + \
> + globals[0] = n; \
> + locals[0] = 256; \
> + \
> + gettimeofday(&start,0); \
> + for (size_t i=0; i<100; i++) { \
> + OCL_NDRANGE(1);\
> + } \
> + OCL_FINISH();\
> + gettimeofday(&stop,0);\
> + \
> + clReleaseMemObject(buf[0]);\
> + free(buf_data[0]);\
> + buf_data[0] = NULL; \
buf[1] also needs to be released.
> +\
> + double elapsed = time_subtract(&stop, &start, 0);\ \
> + return BANDWIDTH(sz * sizeof(float) * 1 * 100, elapsed);\ } \
> +MAKE_BENCHMARK_FROM_FUNCTION_KEEP_PROGRAM(benchmark
> ##V,true);
> +
> +BENCH_address(_global_write,"benchmark_global_write")
> +BENCH_address(_global_read,"benchmark_global_read")
> diff --git a/kernels/benchmark_io.cl b/kernels/benchmark_io.cl new file
> mode 100644 index 0000000..6a3150b
> --- /dev/null
> +++ b/kernels/benchmark_io.cl
> @@ -0,0 +1,26 @@
> +#define COUNT 100
> +__kernel void
> +benchmark_global_write(__global float * src, __global float* dst) {
> + float sum = 0 ;
> + int id = (int)get_global_id(0);
> +
> + if (id%10 == 1)
> + dst[id] = src[id]/2 + 1;
> + else
> + dst[id] = src[id]/2 - 1;
Each write is paired with a read, so this does not measure write performance alone.
> +}
> +__kernel void
> +benchmark_global_read(__global float * src, __global float* dst) {
> + float sum = 0 ;
> + int id = (int)get_global_id(0);
> +
> + for (int i=1; i<COUNT; i++) {
> + sum = sum + src[id%i];
This only reads elements 0 to i-1 of memory, so there will be a lot of cache hits; I'm afraid it does not reflect realistic read performance. It would be better to read from different memory locations.
> + }
> +
> + if (id%10 == 1)
> + dst[id] = sum;
> +}
> +
> --
> 1.9.1
>
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet
More information about the Beignet
mailing list