[Beignet] [PATCH 1/2] Add new vload benchmark/test case.
Zhigang Gong
zhigang.gong at intel.com
Tue Aug 26 21:12:43 PDT 2014
Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
kernels/vload_bench.cl | 33 +++++++++++++++++++
utests/CMakeLists.txt | 1 +
utests/utest_helper.cpp | 5 +--
utests/vload_bench.cpp | 85 +++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 122 insertions(+), 2 deletions(-)
create mode 100644 kernels/vload_bench.cl
create mode 100644 utests/vload_bench.cpp
diff --git a/kernels/vload_bench.cl b/kernels/vload_bench.cl
new file mode 100644
index 0000000..c906c75
--- /dev/null
+++ b/kernels/vload_bench.cl
@@ -0,0 +1,33 @@
+#define VLOAD_BENCH(T, N, M) /* T: source element type, N: vector width, M: number of loads per work item */ \
+__kernel void \
+vload_bench_##M ##T ##N(__global T* src, __global uint* dst, uint offset) \
+{ \
+ int id = (int)get_global_id(0); \
+ uint ##N srcV = 0; /* running sum of every vector loaded by this work item */ \
+ for(int i = 0; i < M; i++) \
+ { \
+ srcV += convert_uint ##N(vload ##N(id + (i & 0xFFFF), src + offset)); /* the mask leaves i unchanged while i < 65536 */ \
+ } \
+ vstore ##N(srcV, id, dst); /* one N-wide uint result per work item */ \
+ /*if (id < 16)*/ \
+ /*printf("id %d %d %d\n", id, srcV.s0, srcV.s1);*/ \
+}
+
+#define VLOAD_BENCH_ALL_VECTOR(T, N_ITERATIONS) /* one kernel per vector width for element type T */ \
+ VLOAD_BENCH(T, 2, N_ITERATIONS) \
+ VLOAD_BENCH(T, 3, N_ITERATIONS) \
+ VLOAD_BENCH(T, 4, N_ITERATIONS) \
+ VLOAD_BENCH(T, 8, N_ITERATIONS) \
+ VLOAD_BENCH(T, 16, N_ITERATIONS)
+
+#define VLOAD_BENCH_ALL_TYPES(N_ITERATIONS) /* every width above for each listed element type */ \
+ VLOAD_BENCH_ALL_VECTOR(uchar, N_ITERATIONS) \
+ VLOAD_BENCH_ALL_VECTOR(char, N_ITERATIONS) \
+ VLOAD_BENCH_ALL_VECTOR(ushort, N_ITERATIONS) \
+ VLOAD_BENCH_ALL_VECTOR(short, N_ITERATIONS) \
+ VLOAD_BENCH_ALL_VECTOR(uint, N_ITERATIONS) \
+ VLOAD_BENCH_ALL_VECTOR(int, N_ITERATIONS) \
+ VLOAD_BENCH_ALL_VECTOR(float, N_ITERATIONS)
+
+VLOAD_BENCH_ALL_TYPES(1) /* vload_bench_1<type><N>: single load, run by the vload_test_* unit tests */
+VLOAD_BENCH_ALL_TYPES(10000) /* vload_bench_10000<type><N>: run by the vload_bench_* benchmarks; matches N_ITERATIONS on the host */
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 721e6f7..b30e6f9 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -184,6 +184,7 @@ set (utests_sources
image_1D_buffer.cpp
compare_image_2d_and_1d_array.cpp
compiler_constant_expr.cpp
+ vload_bench.cpp
utest_assert.cpp
utest.cpp
utest_file_map.cpp
diff --git a/utests/utest_helper.cpp b/utests/utest_helper.cpp
index cb4dd66..e4badea 100644
--- a/utests/utest_helper.cpp
+++ b/utests/utest_helper.cpp
@@ -262,9 +262,10 @@ cl_kernel_init(const char *file_name, const char *kernel_name, int format, const
goto error;
}
prevFileName = file_name;
+
+ /* OCL requires to build the program even if it is created from a binary */
+ OCL_CALL (clBuildProgram, program, 1, &device, build_opt, NULL, NULL);
}
- /* OCL requires to build the program even if it is created from a binary */
- OCL_CALL (clBuildProgram, program, 1, &device, build_opt, NULL, NULL);
/* Create a kernel from the program */
if (kernel)
diff --git a/utests/vload_bench.cpp b/utests/vload_bench.cpp
new file mode 100644
index 0000000..2acc063
--- /dev/null
+++ b/utests/vload_bench.cpp
@@ -0,0 +1,85 @@
+#include "utest_helper.hpp"
+#include <sys/time.h>
+
+#define N_ITERATIONS 10000 // loads per work item in the benchmark kernels; keep in sync with kernels/vload_bench.cl
+
+// Builds kernel <kernelFunc><N> from vload_bench.cl and runs it: benchMode prints the bandwidth, otherwise dst is checked against src.
+template <typename T>
+static void vload_bench(const char *kernelFunc, uint32_t N, uint32_t offset, bool benchMode)
+{
+ const size_t n = benchMode ? (512 * 1024) : (8 * 1024);
+ struct timeval start, end;
+
+ // Setup kernel and buffers. Work item id reads src[offset + (id + i) * N ...] on iteration i, so src is padded past n.
+ std::string kernelName = kernelFunc + std::to_string(N);
+ OCL_CALL (cl_kernel_init, "vload_bench.cl", kernelName.c_str(), SOURCE, NULL);
+ const size_t srcN = n + offset + (benchMode ? (size_t)(N_ITERATIONS - 1) * N : 0);
+ buf_data[0] = (T*) malloc(sizeof(T) * srcN);
+ for (uint32_t i = 0; i < srcN; ++i) ((T*)buf_data[0])[i] = i; //rand() & ((1LL << N) - 1);
+ OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, srcN * sizeof(T), buf_data[0]);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+ free(buf_data[0]);
+ buf_data[0] = NULL;
+
+ // Run the kernel
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(uint32_t), &offset);
+ globals[0] = n / ((N + 1) & ~0x1); // N is rounded up to an even value (3 -> 4) before dividing
+ locals[0] = 256;
+ if (benchMode)
+ gettimeofday(&start, NULL);
+ OCL_NDRANGE(1);
+ if (benchMode) {
+ OCL_FINISH();
+ gettimeofday(&end, NULL);
+ double elapsed = (end.tv_sec - start.tv_sec) * 1e6 + (end.tv_usec - start.tv_usec); // microseconds
+ double bandwidth = (globals[0] * (N_ITERATIONS) * sizeof(T) * N) / elapsed; // bytes per microsecond
+ printf("\t%2.1fGB/S\n", bandwidth/1000.);
+ } else {
+ // Check result
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ for (uint32_t i = 0; i < globals[0]; ++i) {
+ OCL_ASSERT(((T*)buf_data[0])[i + offset] == ((uint32_t*)buf_data[1])[i]);
+ }
+ }
+}
+
+#define VLOAD_TEST(T, kT) /* T: host element type, kT: kernel type name used in the kernel function name */ \
+static void vload_test_ ##kT(void) \
+{ \
+ uint8_t vectorSize[] = {2, 3, 4, 8, 16}; \
+ for(uint32_t i = 0; i < sizeof(vectorSize) / sizeof(vectorSize[0]); i++) { /* element count, not byte size */ \
+ for(uint32_t offset = 0; offset < vectorSize[i]; offset++) {\
+ vload_bench<T>("vload_bench_1" #kT, vectorSize[i], offset, false); /* single-load kernel, output is checked */ \
+ }\
+ } \
+}\
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(vload_test_ ##kT, true)
+
+VLOAD_TEST(uint8_t, uchar)
+VLOAD_TEST(int8_t, char)
+VLOAD_TEST(uint16_t, ushort)
+VLOAD_TEST(int16_t, short)
+VLOAD_TEST(uint32_t, uint)
+VLOAD_TEST(int32_t, int)
+VLOAD_TEST(float, float)
+
+#define VLOAD_BENCH(T, kT) /* T: host element type, kT: kernel type name used in the kernel function name */ \
+static void vload_bench_ ##kT(void) \
+{ \
+ uint8_t vectorSize[] = {2, 3, 4, 8, 16}; \
+ printf("\n");\
+ for(uint32_t i = 0; i < sizeof(vectorSize) / sizeof(vectorSize[0]); i++) { /* element count, not byte size */ \
+ printf(" Vector size %d:\n", vectorSize[i]); \
+ for(uint32_t offset = 0; offset < vectorSize[i]; offset++) {\
+ printf("\tOffset %u :", offset); /* offset is unsigned */ \
+ vload_bench<T>("vload_bench_10000" #kT, vectorSize[i], offset, true); /* timed run, prints the bandwidth */ \
+ }\
+ } \
+}\
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(vload_bench_ ##kT, true)
+VLOAD_BENCH(uint8_t, uchar)
+VLOAD_BENCH(uint16_t, ushort)
+VLOAD_BENCH(uint32_t, uint)
--
1.8.3.2
More information about the Beignet
mailing list