[Beignet] [PATCH 2/2] Utest: Add tset case for block read/write buffer
Xiuli Pan
xiuli.pan at intel.com
Thu May 19 21:55:55 UTC 2016
From: Pan Xiuli <xiuli.pan at intel.com>
Signed-off-by: Pan Xiuli <xiuli.pan at intel.com>
---
kernels/compiler_subgroup_block_read.cl | 31 +++++
kernels/compiler_subgroup_block_write.cl | 27 +++++
utests/CMakeLists.txt | 2 +
utests/compiler_subgroup_block_read.cpp | 197 +++++++++++++++++++++++++++++++
utests/compiler_subgroup_block_write.cpp | 197 +++++++++++++++++++++++++++++++
5 files changed, 454 insertions(+)
create mode 100644 kernels/compiler_subgroup_block_read.cl
create mode 100644 kernels/compiler_subgroup_block_write.cl
create mode 100644 utests/compiler_subgroup_block_read.cpp
create mode 100644 utests/compiler_subgroup_block_write.cpp
diff --git a/kernels/compiler_subgroup_block_read.cl b/kernels/compiler_subgroup_block_read.cl
new file mode 100644
index 0000000..e68facb
--- /dev/null
+++ b/kernels/compiler_subgroup_block_read.cl
@@ -0,0 +1,31 @@
+__kernel void compiler_subgroup_block_read1(global uint *src, global uint *dst)
+{
+ int id = get_global_id(0);
+ global uint * p = src + get_sub_group_id() * get_max_sub_group_size();
+ uint tmp = intel_sub_group_block_read(p);
+ dst[id] = tmp;
+}
+
+__kernel void compiler_subgroup_block_read2(global uint *src, global uint2 *dst)
+{
+ int id = get_global_id(0);
+ global uint * p = src + get_sub_group_id() * get_max_sub_group_size()*2;
+ uint2 tmp = intel_sub_group_block_read2(p);
+ dst[id] = tmp;
+}
+
+__kernel void compiler_subgroup_block_read4(global uint *src, global uint4 *dst)
+{
+ int id = get_global_id(0);
+ global uint * p = src + get_sub_group_id() * get_max_sub_group_size()*4;
+ uint4 tmp = intel_sub_group_block_read4(p);
+ dst[id] = tmp;
+}
+
+__kernel void compiler_subgroup_block_read8(global uint *src, global uint8 *dst)
+{
+ int id = get_global_id(0);
+ global uint * p = src + get_sub_group_id() * get_max_sub_group_size()*8;
+ uint8 tmp = intel_sub_group_block_read8(p);
+ dst[id] = tmp;
+}
diff --git a/kernels/compiler_subgroup_block_write.cl b/kernels/compiler_subgroup_block_write.cl
new file mode 100644
index 0000000..fcf4d53
--- /dev/null
+++ b/kernels/compiler_subgroup_block_write.cl
@@ -0,0 +1,27 @@
+__kernel void compiler_subgroup_block_write1(global uint *src, global uint *dst)
+{
+ int id = get_global_id(0);
+ global uint * p = dst + get_sub_group_id() * get_max_sub_group_size();
+ intel_sub_group_block_write(p,src[id]);
+}
+
+__kernel void compiler_subgroup_block_write2(global uint2 *src, global uint *dst)
+{
+ int id = get_global_id(0);
+ global uint * p = dst + get_sub_group_id() * get_max_sub_group_size()*2;
+ intel_sub_group_block_write2(p,src[id]);
+}
+
+__kernel void compiler_subgroup_block_write4(global uint4 *src, global uint *dst)
+{
+ int id = get_global_id(0);
+ global uint * p = dst + get_sub_group_id() * get_max_sub_group_size()*4;
+ intel_sub_group_block_write4(p,src[id]);
+}
+
+__kernel void compiler_subgroup_block_write8(global uint8 *src, global uint *dst)
+{
+ int id = get_global_id(0);
+ global uint * p = dst + get_sub_group_id() * get_max_sub_group_size()*8;
+ intel_sub_group_block_write8(p,src[id]);
+}
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index e721179..53c3074 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -170,6 +170,8 @@ set (utests_sources
compiler_subgroup_reduce.cpp
compiler_subgroup_scan_exclusive.cpp
compiler_subgroup_scan_inclusive.cpp
+ compiler_subgroup_block_read.cpp
+ compiler_subgroup_block_write.cpp
compiler_async_stride_copy.cpp
compiler_insn_selection_min.cpp
compiler_insn_selection_max.cpp
diff --git a/utests/compiler_subgroup_block_read.cpp b/utests/compiler_subgroup_block_read.cpp
new file mode 100644
index 0000000..7cb4c17
--- /dev/null
+++ b/utests/compiler_subgroup_block_read.cpp
@@ -0,0 +1,197 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+using namespace std;
+
+/* set to 1 for debug, output of input-expected data */
+#define DEBUG_STDOUT 0
+
+/* NDRANGE */
+#define WG_GLOBAL_SIZE 32
+#define WG_LOCAL_SIZE 32
+/*
+ * Generic compute-expected function for op BROADCAST type
+ * and any variable type
+ */
+template<class T>
+static void compute_expected(T* input,
+ T* expected,
+ size_t VEC_SIZE,
+ size_t SIMD_SIZE)
+{
+ for(uint32_t i = 0; i < SIMD_SIZE; i++)
+ for(uint32_t j = 0; j < VEC_SIZE; j++)
+ expected[i * VEC_SIZE + j] = input[SIMD_SIZE * j + i];
+}
+
+/*
+ * Generic input-expected generate function for op BROADCAST type
+ * and any variable type
+ */
+template<class T>
+static void generate_data(T* &input,
+ T* &expected,
+ size_t VEC_SIZE,
+ size_t SIMD_SIZE)
+{
+ /* allocate input and expected arrays */
+ input = new T[WG_GLOBAL_SIZE * VEC_SIZE];
+ expected = new T[WG_GLOBAL_SIZE * VEC_SIZE];
+
+ /* base value for all data types */
+ T base_val = (long)7 << (sizeof(T) * 5 - 3);
+
+ /* seed for random inputs */
+ srand (time(NULL));
+
+ /* generate inputs and expected values */
+ for(uint32_t gid = 0; gid < WG_GLOBAL_SIZE; gid += SIMD_SIZE)
+ {
+#if DEBUG_STDOUT
+ cout << endl << "IN: " << endl;
+#endif
+ SIMD_SIZE = (gid + SIMD_SIZE) > WG_GLOBAL_SIZE ? WG_GLOBAL_SIZE - gid : SIMD_SIZE;
+
+ /* input values */
+ for(uint32_t lid = 0; lid < SIMD_SIZE; lid++)
+ {
+ for(uint32_t vsz = 0; vsz < VEC_SIZE; vsz++)
+ {
+ /* initially 0, augment after */
+ input[(gid + lid)*VEC_SIZE + vsz] = 0;
+
+ /* check all data types, test ideal for QWORD types */
+ input[(gid + lid)*VEC_SIZE + vsz] += ((rand() % 2 - 1) * base_val);
+ /* add trailing random bits, tests GENERAL cases */
+ input[(gid + lid)*VEC_SIZE + vsz] += (rand() % 112);
+
+#if DEBUG_STDOUT
+ /* output generated input */
+ cout << setw(4) << input[(gid + lid)*VEC_SIZE + vsz] << ", " ;
+ if((lid + 1) % 8 == 0)
+ cout << endl;
+#endif
+ }
+ }
+
+ /* expected values */
+ compute_expected(input + gid * VEC_SIZE, expected + gid * VEC_SIZE, VEC_SIZE, SIMD_SIZE);
+
+#if DEBUG_STDOUT
+ /* output expected input */
+ cout << endl << "EXP: " << endl;
+ for(uint32_t lid = 0; lid < SIMD_SIZE ; lid++){
+ for(uint32_t vsz = 0; vsz < VEC_SIZE; vsz++)
+ cout << setw(4) << expected[(gid + lid)*VEC_SIZE + vsz] << ", " ;
+ if((lid + 1) % 8 == 0)
+ cout << endl;
+ }
+ cout << endl;
+#endif
+
+ }
+}
+
+/*
+ * Generic subgroup utest function for op BROADCAST type
+ * and any variable type
+ */
+template<class T>
+static void subgroup_generic(T* input,
+ T* expected,
+ size_t VEC_SIZE)
+{
+ /* get simd size */
+ globals[0] = WG_GLOBAL_SIZE;
+ locals[0] = WG_LOCAL_SIZE;
+ size_t SIMD_SIZE = 0;
+ OCL_CALL(clGetKernelSubGroupInfoKHR,kernel,device,CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,sizeof(size_t)*1,locals,sizeof(size_t),&SIMD_SIZE,NULL);
+
+ size_t buf_sz = VEC_SIZE * WG_GLOBAL_SIZE;
+ /* input and expected data */
+ generate_data(input, expected, VEC_SIZE, SIMD_SIZE);
+
+ /* prepare input for datatype */
+ OCL_CREATE_BUFFER(buf[0], 0, buf_sz * sizeof(T), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, buf_sz * sizeof(T), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+ /* set input data for GPU */
+ OCL_MAP_BUFFER(0);
+ memcpy(buf_data[0], input, buf_sz* sizeof(T));
+ OCL_UNMAP_BUFFER(0);
+
+ /* run the kernel on GPU */
+ OCL_NDRANGE(1);
+
+ /* check if mismatch */
+ OCL_MAP_BUFFER(1);
+ uint32_t mismatches = 0;
+
+ for (uint32_t i = 0; i < buf_sz; i++)
+ if(((T *)buf_data[1])[i] != *(expected + i))
+ {
+ /* found mismatch, increment */
+ mismatches++;
+
+#if DEBUG_STDOUT
+ /* output mismatch */
+ cout << "Err at " << i << ", " <<
+ ((T *)buf_data[1])[i] << " != " << *(expected + i) << endl;
+#endif
+ }
+
+#if DEBUG_STDOUT
+ /* output mismatch count */
+ cout << "mismatches " << mismatches << endl;
+#endif
+
+ OCL_UNMAP_BUFFER(1);
+
+ OCL_ASSERT(mismatches == 0);
+ free(input);
+ free(expected);
+}
+
+/*
+ * Workgroup broadcast 1D functions
+ */
+void compiler_subgroup_block_read1(void)
+{
+ cl_uint *input = NULL;
+ cl_uint *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_block_read",
+ "compiler_subgroup_block_read1");
+ subgroup_generic(input, expected, 1);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_block_read1);
+void compiler_subgroup_block_read2(void)
+{
+ cl_uint *input = NULL;
+ cl_uint *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_block_read",
+ "compiler_subgroup_block_read2");
+ subgroup_generic(input, expected, 2);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_block_read2);
+void compiler_subgroup_block_read4(void)
+{
+ cl_uint *input = NULL;
+ cl_uint *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_block_read",
+ "compiler_subgroup_block_read4");
+ subgroup_generic(input, expected, 4);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_block_read4);
+void compiler_subgroup_block_read8(void)
+{
+ cl_uint *input = NULL;
+ cl_uint *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_block_read",
+ "compiler_subgroup_block_read8");
+ subgroup_generic(input, expected, 8);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_block_read8);
diff --git a/utests/compiler_subgroup_block_write.cpp b/utests/compiler_subgroup_block_write.cpp
new file mode 100644
index 0000000..6491686
--- /dev/null
+++ b/utests/compiler_subgroup_block_write.cpp
@@ -0,0 +1,197 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+using namespace std;
+
+/* set to 1 for debug, output of input-expected data */
+#define DEBUG_STDOUT 0
+
+/* NDRANGE */
+#define WG_GLOBAL_SIZE 32
+#define WG_LOCAL_SIZE 32
+/*
+ * Generic compute-expected function for op BROADCAST type
+ * and any variable type
+ */
+template<class T>
+static void compute_expected(T* input,
+ T* expected,
+ size_t VEC_SIZE,
+ size_t SIMD_SIZE)
+{
+ for(uint32_t i = 0; i < SIMD_SIZE; i++)
+ for(uint32_t j = 0; j < VEC_SIZE; j++)
+ expected[SIMD_SIZE * j + i] = input[i * VEC_SIZE + j];
+}
+
+/*
+ * Generic input-expected generate function for op BROADCAST type
+ * and any variable type
+ */
+template<class T>
+static void generate_data(T* &input,
+ T* &expected,
+ size_t VEC_SIZE,
+ size_t SIMD_SIZE)
+{
+ /* allocate input and expected arrays */
+ input = new T[WG_GLOBAL_SIZE * VEC_SIZE];
+ expected = new T[WG_GLOBAL_SIZE * VEC_SIZE];
+
+ /* base value for all data types */
+ T base_val = (long)7 << (sizeof(T) * 5 - 3);
+
+ /* seed for random inputs */
+ srand (time(NULL));
+
+ /* generate inputs and expected values */
+ for(uint32_t gid = 0; gid < WG_GLOBAL_SIZE; gid += SIMD_SIZE)
+ {
+#if DEBUG_STDOUT
+ cout << endl << "IN: " << endl;
+#endif
+ SIMD_SIZE = (gid + SIMD_SIZE) > WG_GLOBAL_SIZE ? WG_GLOBAL_SIZE - gid : SIMD_SIZE;
+
+ /* input values */
+ for(uint32_t lid = 0; lid < SIMD_SIZE; lid++)
+ {
+ for(uint32_t vsz = 0; vsz < VEC_SIZE; vsz++)
+ {
+ /* initially 0, augment after */
+ input[(gid + lid)*VEC_SIZE + vsz] = 0;
+
+ /* check all data types, test ideal for QWORD types */
+ input[(gid + lid)*VEC_SIZE + vsz] += ((rand() % 2 - 1) * base_val);
+ /* add trailing random bits, tests GENERAL cases */
+ input[(gid + lid)*VEC_SIZE + vsz] += (rand() % 112);
+
+#if DEBUG_STDOUT
+ /* output generated input */
+ cout << setw(4) << input[(gid + lid)*VEC_SIZE + vsz] << ", " ;
+ if((lid + 1) % 8 == 0)
+ cout << endl;
+#endif
+ }
+ }
+
+ /* expected values */
+ compute_expected(input + gid * VEC_SIZE, expected + gid * VEC_SIZE, VEC_SIZE, SIMD_SIZE);
+
+#if DEBUG_STDOUT
+ /* output expected input */
+ cout << endl << "EXP: " << endl;
+ for(uint32_t lid = 0; lid < SIMD_SIZE ; lid++){
+ for(uint32_t vsz = 0; vsz < VEC_SIZE; vsz++)
+ cout << setw(4) << expected[(gid + lid)*VEC_SIZE + vsz] << ", " ;
+ if((lid + 1) % 8 == 0)
+ cout << endl;
+ }
+ cout << endl;
+#endif
+
+ }
+}
+
+/*
+ * Generic subgroup utest function for op BROADCAST type
+ * and any variable type
+ */
+template<class T>
+static void subgroup_generic(T* input,
+ T* expected,
+ size_t VEC_SIZE)
+{
+ /* get simd size */
+ globals[0] = WG_GLOBAL_SIZE;
+ locals[0] = WG_LOCAL_SIZE;
+ size_t SIMD_SIZE = 0;
+ OCL_CALL(clGetKernelSubGroupInfoKHR,kernel,device,CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,sizeof(size_t)*1,locals,sizeof(size_t),&SIMD_SIZE,NULL);
+
+ size_t buf_sz = VEC_SIZE * WG_GLOBAL_SIZE;
+ /* input and expected data */
+ generate_data(input, expected, VEC_SIZE, SIMD_SIZE);
+
+ /* prepare input for datatype */
+ OCL_CREATE_BUFFER(buf[0], 0, buf_sz * sizeof(T), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, buf_sz * sizeof(T), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+ /* set input data for GPU */
+ OCL_MAP_BUFFER(0);
+ memcpy(buf_data[0], input, buf_sz* sizeof(T));
+ OCL_UNMAP_BUFFER(0);
+
+ /* run the kernel on GPU */
+ OCL_NDRANGE(1);
+
+ /* check if mismatch */
+ OCL_MAP_BUFFER(1);
+ uint32_t mismatches = 0;
+
+ for (uint32_t i = 0; i < buf_sz; i++)
+ if(((T *)buf_data[1])[i] != *(expected + i))
+ {
+ /* found mismatch, increment */
+ mismatches++;
+
+#if DEBUG_STDOUT
+ /* output mismatch */
+ cout << "Err at " << i << ", " <<
+ ((T *)buf_data[1])[i] << " != " << *(expected + i) << endl;
+#endif
+ }
+
+#if DEBUG_STDOUT
+ /* output mismatch count */
+ cout << "mismatches " << mismatches << endl;
+#endif
+
+ OCL_UNMAP_BUFFER(1);
+
+ OCL_ASSERT(mismatches == 0);
+ free(input);
+ free(expected);
+}
+
+/*
+ * Workgroup broadcast 1D functions
+ */
+void compiler_subgroup_block_write1(void)
+{
+ cl_uint *input = NULL;
+ cl_uint *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_block_write",
+ "compiler_subgroup_block_write1");
+ subgroup_generic(input, expected, 1);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_block_write1);
+void compiler_subgroup_block_write2(void)
+{
+ cl_uint *input = NULL;
+ cl_uint *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_block_write",
+ "compiler_subgroup_block_write2");
+ subgroup_generic(input, expected, 2);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_block_write2);
+void compiler_subgroup_block_write4(void)
+{
+ cl_uint *input = NULL;
+ cl_uint *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_block_write",
+ "compiler_subgroup_block_write4");
+ subgroup_generic(input, expected, 4);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_block_write4);
+void compiler_subgroup_block_write8(void)
+{
+ cl_uint *input = NULL;
+ cl_uint *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_block_write",
+ "compiler_subgroup_block_write8");
+ subgroup_generic(input, expected, 8);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_block_write8);
--
2.7.4
More information about the Beignet
mailing list