[Beignet] [PATCH 08/12] Utest: Add test case for sub_group functions
Xiuli Pan
xiuli.pan at intel.com
Thu May 26 03:14:23 UTC 2016
From: Pan Xiuli <xiuli.pan at intel.com>
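The long types still need to be fixed on platforms before Gen8, so the long/ulong test cases are hidden (registered with MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE) for now.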
Signed-off-by: Pan Xiuli <xiuli.pan at intel.com>
---
kernels/compiler_subgroup_broadcast.cl | 34 +++
kernels/compiler_subgroup_reduce.cl | 136 ++++++++++
kernels/compiler_subgroup_scan_exclusive.cl | 98 +++++++
kernels/compiler_subgroup_scan_inclusive.cl | 98 +++++++
utests/CMakeLists.txt | 4 +
utests/compiler_subgroup_broadcast.cpp | 181 +++++++++++++
utests/compiler_subgroup_reduce.cpp | 390 ++++++++++++++++++++++++++++
utests/compiler_subgroup_scan_exclusive.cpp | 350 +++++++++++++++++++++++++
utests/compiler_subgroup_scan_inclusive.cpp | 341 ++++++++++++++++++++++++
9 files changed, 1632 insertions(+)
create mode 100644 kernels/compiler_subgroup_broadcast.cl
create mode 100644 kernels/compiler_subgroup_reduce.cl
create mode 100644 kernels/compiler_subgroup_scan_exclusive.cl
create mode 100644 kernels/compiler_subgroup_scan_inclusive.cl
create mode 100644 utests/compiler_subgroup_broadcast.cpp
create mode 100644 utests/compiler_subgroup_reduce.cpp
create mode 100644 utests/compiler_subgroup_scan_exclusive.cpp
create mode 100644 utests/compiler_subgroup_scan_inclusive.cpp
diff --git a/kernels/compiler_subgroup_broadcast.cl b/kernels/compiler_subgroup_broadcast.cl
new file mode 100644
index 0000000..4f21cf5
--- /dev/null
+++ b/kernels/compiler_subgroup_broadcast.cl
@@ -0,0 +1,34 @@
+/*
+ * Subgroup broadcast 1D functions
+ */
+
+kernel void compiler_subgroup_broadcast_imm_int(global int *src,
+    global int *dst,
+    uint simd_id)
+{
+  /* simd_id is not read in this variant: the source lane is the literal 10,
+   * so every work-item stores the value held by lane 10 of its sub-group. */
+  const uint gid = get_global_id(0);
+
+  dst[gid] = sub_group_broadcast(src[gid], 10);
+}
+kernel void compiler_subgroup_broadcast_int(global int *src,
+    global int *dst,
+    uint simd_id)
+{
+  /* Every work-item stores the int value held by lane simd_id of its
+   * sub-group; the lane index is supplied by the host as a kernel argument. */
+  const uint gid = get_global_id(0);
+
+  dst[gid] = sub_group_broadcast(src[gid], simd_id);
+}
+kernel void compiler_subgroup_broadcast_long(global long *src,
+    global long *dst,
+    uint simd_id)
+{
+  /* 64-bit variant: every work-item stores the long value held by lane
+   * simd_id of its sub-group. */
+  const uint gid = get_global_id(0);
+
+  dst[gid] = sub_group_broadcast(src[gid], simd_id);
+}
diff --git a/kernels/compiler_subgroup_reduce.cl b/kernels/compiler_subgroup_reduce.cl
new file mode 100644
index 0000000..77ffb07
--- /dev/null
+++ b/kernels/compiler_subgroup_reduce.cl
@@ -0,0 +1,136 @@
+/*
+ * Subgroup any/all functions
+ */
+kernel void compiler_subgroup_any(global int *src, global int *dst) {
+  /* every work-item stores whether any lane of its sub-group is non-zero */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_any(src[gid]);
+}
+kernel void compiler_subgroup_all(global int *src, global int *dst) {
+  /* every work-item stores whether all lanes of its sub-group are non-zero */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_all(src[gid]);
+}
+
+/*
+ * Subgroup reduce add functions
+ */
+kernel void compiler_subgroup_reduce_add_char(global char *src, global char *dst) {
+  /* every work-item stores the char sum over its sub-group */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_reduce_add(src[gid]);
+}
+
+kernel void compiler_subgroup_reduce_add_uchar(global uchar *src, global uchar *dst) {
+  /* every work-item stores the uchar sum over its sub-group */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_reduce_add(src[gid]);
+}
+
+kernel void compiler_subgroup_reduce_add_short(global short *src, global short *dst) {
+  /* every work-item stores the short sum over its sub-group */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_reduce_add(src[gid]);
+}
+
+kernel void compiler_subgroup_reduce_add_ushort(global ushort *src, global ushort *dst) {
+  /* every work-item stores the ushort sum over its sub-group */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_reduce_add(src[gid]);
+}
+
+kernel void compiler_subgroup_reduce_add_int(global int *src, global int *dst) {
+  /* every work-item stores the int sum over its sub-group */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_reduce_add(src[gid]);
+}
+
+kernel void compiler_subgroup_reduce_add_uint(global uint *src, global uint *dst) {
+  /* every work-item stores the uint sum over its sub-group */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_reduce_add(src[gid]);
+}
+
+kernel void compiler_subgroup_reduce_add_long(global long *src, global long *dst) {
+  /* every work-item stores the long sum over its sub-group */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_reduce_add(src[gid]);
+}
+
+kernel void compiler_subgroup_reduce_add_ulong(global ulong *src, global ulong *dst) {
+  /* every work-item stores the ulong sum over its sub-group */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_reduce_add(src[gid]);
+}
+
+kernel void compiler_subgroup_reduce_add_float(global float *src, global float *dst) {
+  /* every work-item stores the float sum over its sub-group */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_reduce_add(src[gid]);
+}
+
+/*
+ * Subgroup reduce max functions
+ */
+kernel void compiler_subgroup_reduce_max_int(global int *src, global int *dst) {
+  /* every work-item stores the largest int of its sub-group */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_reduce_max(src[gid]);
+}
+
+kernel void compiler_subgroup_reduce_max_uint(global uint *src, global uint *dst) {
+  /* every work-item stores the largest uint of its sub-group */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_reduce_max(src[gid]);
+}
+
+kernel void compiler_subgroup_reduce_max_long(global long *src, global long *dst) {
+  /* every work-item stores the largest long of its sub-group */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_reduce_max(src[gid]);
+}
+
+kernel void compiler_subgroup_reduce_max_ulong(global ulong *src, global ulong *dst) {
+  /* every work-item stores the largest ulong of its sub-group */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_reduce_max(src[gid]);
+}
+
+kernel void compiler_subgroup_reduce_max_float(global float *src, global float *dst) {
+  /* every work-item stores the largest float of its sub-group */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_reduce_max(src[gid]);
+}
+
+/*
+ * Subgroup reduce min functions
+ */
+kernel void compiler_subgroup_reduce_min_int(global int *src, global int *dst) {
+  /* every work-item stores the smallest int of its sub-group */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_reduce_min(src[gid]);
+}
+
+kernel void compiler_subgroup_reduce_min_uint(global uint *src, global uint *dst) {
+  /* every work-item stores the smallest uint of its sub-group */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_reduce_min(src[gid]);
+}
+
+kernel void compiler_subgroup_reduce_min_long(global long *src, global long *dst) {
+  /* every work-item stores the smallest long of its sub-group */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_reduce_min(src[gid]);
+}
+
+kernel void compiler_subgroup_reduce_min_ulong(global ulong *src, global ulong *dst) {
+  /* every work-item stores the smallest ulong of its sub-group */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_reduce_min(src[gid]);
+}
+
+kernel void compiler_subgroup_reduce_min_float(global float *src, global float *dst) {
+  /* every work-item stores the smallest float of its sub-group */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_reduce_min(src[gid]);
+}
diff --git a/kernels/compiler_subgroup_scan_exclusive.cl b/kernels/compiler_subgroup_scan_exclusive.cl
new file mode 100644
index 0000000..afc00d0
--- /dev/null
+++ b/kernels/compiler_subgroup_scan_exclusive.cl
@@ -0,0 +1,98 @@
+/*
+ * Subgroup scan exclusive add functions
+ */
+kernel void compiler_subgroup_scan_exclusive_add_int(global int *src, global int *dst) {
+  /* exclusive add-scan: stores the int sum of the lanes before this one */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_scan_exclusive_add(src[gid]);
+}
+
+kernel void compiler_subgroup_scan_exclusive_add_uint(global uint *src, global uint *dst) {
+  /* exclusive add-scan: stores the uint sum of the lanes before this one */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_scan_exclusive_add(src[gid]);
+}
+
+kernel void compiler_subgroup_scan_exclusive_add_long(global long *src, global long *dst) {
+  /* exclusive add-scan: stores the long sum of the lanes before this one */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_scan_exclusive_add(src[gid]);
+}
+
+kernel void compiler_subgroup_scan_exclusive_add_ulong(global ulong *src, global ulong *dst) {
+  /* exclusive add-scan: stores the ulong sum of the lanes before this one */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_scan_exclusive_add(src[gid]);
+}
+
+kernel void compiler_subgroup_scan_exclusive_add_float(global float *src, global float *dst) {
+  /* exclusive add-scan: stores the float sum of the lanes before this one */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_scan_exclusive_add(src[gid]);
+}
+
+/*
+ * Subgroup scan exclusive max functions
+ */
+kernel void compiler_subgroup_scan_exclusive_max_int(global int *src, global int *dst) {
+  /* exclusive max-scan: stores the largest int among the lanes before this one */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_scan_exclusive_max(src[gid]);
+}
+
+kernel void compiler_subgroup_scan_exclusive_max_uint(global uint *src, global uint *dst) {
+  /* exclusive max-scan: stores the largest uint among the lanes before this one */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_scan_exclusive_max(src[gid]);
+}
+
+kernel void compiler_subgroup_scan_exclusive_max_long(global long *src, global long *dst) {
+  /* exclusive max-scan: stores the largest long among the lanes before this one */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_scan_exclusive_max(src[gid]);
+}
+
+kernel void compiler_subgroup_scan_exclusive_max_ulong(global ulong *src, global ulong *dst) {
+  /* exclusive max-scan: stores the largest ulong among the lanes before this one */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_scan_exclusive_max(src[gid]);
+}
+
+kernel void compiler_subgroup_scan_exclusive_max_float(global float *src, global float *dst) {
+  /* exclusive max-scan: stores the largest float among the lanes before this one */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_scan_exclusive_max(src[gid]);
+}
+
+/*
+ * Subgroup scan exclusive min functions
+ */
+kernel void compiler_subgroup_scan_exclusive_min_int(global int *src, global int *dst) {
+  /* exclusive min-scan: stores the smallest int among the lanes before this one */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_scan_exclusive_min(src[gid]);
+}
+
+kernel void compiler_subgroup_scan_exclusive_min_uint(global uint *src, global uint *dst) {
+  /* exclusive min-scan: stores the smallest uint among the lanes before this one */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_scan_exclusive_min(src[gid]);
+}
+
+kernel void compiler_subgroup_scan_exclusive_min_long(global long *src, global long *dst) {
+  /* exclusive min-scan: stores the smallest long among the lanes before this one */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_scan_exclusive_min(src[gid]);
+}
+
+kernel void compiler_subgroup_scan_exclusive_min_ulong(global ulong *src, global ulong *dst) {
+  /* exclusive min-scan: stores the smallest ulong among the lanes before this one */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_scan_exclusive_min(src[gid]);
+}
+
+kernel void compiler_subgroup_scan_exclusive_min_float(global float *src, global float *dst) {
+  /* exclusive min-scan: stores the smallest float among the lanes before this one */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_scan_exclusive_min(src[gid]);
+}
diff --git a/kernels/compiler_subgroup_scan_inclusive.cl b/kernels/compiler_subgroup_scan_inclusive.cl
new file mode 100644
index 0000000..da1a6e6
--- /dev/null
+++ b/kernels/compiler_subgroup_scan_inclusive.cl
@@ -0,0 +1,98 @@
+/*
+ * Subgroup scan inclusive add functions
+ */
+kernel void compiler_subgroup_scan_inclusive_add_int(global int *src, global int *dst) {
+  /* each work-item stores its int result of the sub-group inclusive add-scan */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_scan_inclusive_add(src[gid]);
+}
+
+kernel void compiler_subgroup_scan_inclusive_add_uint(global uint *src, global uint *dst) {
+  /* each work-item stores its uint result of the sub-group inclusive add-scan */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_scan_inclusive_add(src[gid]);
+}
+
+kernel void compiler_subgroup_scan_inclusive_add_long(global long *src, global long *dst) {
+  /* each work-item stores its long result of the sub-group inclusive add-scan */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_scan_inclusive_add(src[gid]);
+}
+
+kernel void compiler_subgroup_scan_inclusive_add_ulong(global ulong *src, global ulong *dst) {
+  /* each work-item stores its ulong result of the sub-group inclusive add-scan */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_scan_inclusive_add(src[gid]);
+}
+
+kernel void compiler_subgroup_scan_inclusive_add_float(global float *src, global float *dst) {
+  /* each work-item stores its float result of the sub-group inclusive add-scan */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_scan_inclusive_add(src[gid]);
+}
+
+/*
+ * Subgroup scan inclusive max functions
+ */
+kernel void compiler_subgroup_scan_inclusive_max_int(global int *src, global int *dst) {
+  /* each work-item stores its int result of the sub-group inclusive max-scan */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_scan_inclusive_max(src[gid]);
+}
+
+kernel void compiler_subgroup_scan_inclusive_max_uint(global uint *src, global uint *dst) {
+  /* each work-item stores its uint result of the sub-group inclusive max-scan */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_scan_inclusive_max(src[gid]);
+}
+
+kernel void compiler_subgroup_scan_inclusive_max_long(global long *src, global long *dst) {
+  /* each work-item stores its long result of the sub-group inclusive max-scan */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_scan_inclusive_max(src[gid]);
+}
+
+kernel void compiler_subgroup_scan_inclusive_max_ulong(global ulong *src, global ulong *dst) {
+  /* each work-item stores its ulong result of the sub-group inclusive max-scan */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_scan_inclusive_max(src[gid]);
+}
+
+kernel void compiler_subgroup_scan_inclusive_max_float(global float *src, global float *dst) {
+  /* each work-item stores its float result of the sub-group inclusive max-scan */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_scan_inclusive_max(src[gid]);
+}
+
+/*
+ * Subgroup scan inclusive min functions
+ */
+kernel void compiler_subgroup_scan_inclusive_min_int(global int *src, global int *dst) {
+  /* each work-item stores its int result of the sub-group inclusive min-scan */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_scan_inclusive_min(src[gid]);
+}
+
+kernel void compiler_subgroup_scan_inclusive_min_uint(global uint *src, global uint *dst) {
+  /* each work-item stores its uint result of the sub-group inclusive min-scan */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_scan_inclusive_min(src[gid]);
+}
+
+kernel void compiler_subgroup_scan_inclusive_min_long(global long *src, global long *dst) {
+  /* each work-item stores its long result of the sub-group inclusive min-scan */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_scan_inclusive_min(src[gid]);
+}
+
+kernel void compiler_subgroup_scan_inclusive_min_ulong(global ulong *src, global ulong *dst) {
+  /* each work-item stores its ulong result of the sub-group inclusive min-scan */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_scan_inclusive_min(src[gid]);
+}
+
+kernel void compiler_subgroup_scan_inclusive_min_float(global float *src, global float *dst) {
+  /* each work-item stores its float result of the sub-group inclusive min-scan */
+  const size_t gid = get_global_id(0);
+  dst[gid] = sub_group_scan_inclusive_min(src[gid]);
+}
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 7ea10e0..e721179 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -166,6 +166,10 @@ set (utests_sources
compiler_workgroup_reduce.cpp
compiler_workgroup_scan_exclusive.cpp
compiler_workgroup_scan_inclusive.cpp
+ compiler_subgroup_broadcast.cpp
+ compiler_subgroup_reduce.cpp
+ compiler_subgroup_scan_exclusive.cpp
+ compiler_subgroup_scan_inclusive.cpp
compiler_async_stride_copy.cpp
compiler_insn_selection_min.cpp
compiler_insn_selection_max.cpp
diff --git a/utests/compiler_subgroup_broadcast.cpp b/utests/compiler_subgroup_broadcast.cpp
new file mode 100644
index 0000000..f029483
--- /dev/null
+++ b/utests/compiler_subgroup_broadcast.cpp
@@ -0,0 +1,181 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+using namespace std;
+
+/* set to 1 for debug, output of input-expected data */
+#define DEBUG_STDOUT 0
+
+/* NDRANGE */
+#define WG_GLOBAL_SIZE 30
+#define WG_LOCAL_SIZE 30
+/*
+ * Generic compute-expected function for op BROADCAST type
+ * and any variable type
+ */
+template<class T>
+static void compute_expected(T* input, T* expected,
+                             size_t SIMD_ID, size_t SIMD_SIZE)
+{
+  /* Host reference for a broadcast: each of the SIMD_SIZE lanes ends up
+   * with the element at index SIMD_ID of this sub-group's input slice. */
+  for (uint32_t lane = 0; lane < SIMD_SIZE; ++lane)
+    expected[lane] = input[SIMD_ID];
+}
+
+/*
+ * Generic input-expected generate function for op BROADCAST type
+ * and any variable type
+ */
+template<class T>
+static void generate_data(T* &input, T* &expected,
+                          size_t SIMD_ID, size_t SIMD_SIZE)
+{
+  /* Allocates both arrays (returned through the reference parameters) and
+   * fills them one sub-group-sized slice at a time. */
+  input = new T[WG_GLOBAL_SIZE];
+  expected = new T[WG_GLOBAL_SIZE];
+
+  /* base value for all data types; shifted as a 64-bit value because
+   * sizeof(T) == 8 shifts by 37, which is undefined for a 32-bit long */
+  T base_val = (int64_t)7 << (sizeof(T) * 5 - 3);
+
+  /* seed for random inputs */
+  srand (time(NULL));
+
+  /* generate inputs and expected values; the last slice may be partial */
+  for(uint32_t gid = 0; gid < WG_GLOBAL_SIZE; gid += SIMD_SIZE)
+  {
+#if DEBUG_STDOUT
+    cout << endl << "IN: " << endl;
+#endif
+    SIMD_SIZE = (gid + SIMD_SIZE) > WG_GLOBAL_SIZE ? WG_GLOBAL_SIZE - gid : SIMD_SIZE;
+
+    /* input values */
+    for(uint32_t lid = 0; lid < SIMD_SIZE; lid++)
+    {
+      /* initially 0, augment after */
+      input[gid + lid] = 0;
+
+      /* check all data types, test ideal for QWORD types */
+      input[gid + lid] += ((rand() % 2 - 1) * base_val);
+      /* add trailing random bits, tests GENERAL cases */
+      input[gid + lid] += (rand() % 112);
+
+#if DEBUG_STDOUT
+      /* output generated input */
+      cout << setw(4) << input[gid + lid] << ", " ;
+      if((lid + 1) % 8 == 0)
+        cout << endl;
+#endif
+    }
+
+    /* expected values; NOTE(review): this reads input[gid + SIMD_ID], which
+     * is outside the current slice whenever SIMD_ID >= SIMD_SIZE */
+    compute_expected(input + gid, expected + gid, SIMD_ID, SIMD_SIZE);
+
+#if DEBUG_STDOUT
+    /* output expected input */
+    cout << endl << "EXP: " << endl;
+    for(uint32_t lid = 0; lid < SIMD_SIZE; lid++){
+      cout << setw(4) << expected[gid + lid] << ", " ;
+      if((lid + 1) % 8 == 0)
+        cout << endl;
+    }
+    cout << endl;
+#endif
+  }
+}
+
+/*
+ * Generic subgroup utest function for op BROADCAST type
+ * and any variable type
+ */
+template<class T>
+static void subgroup_generic(T* input, T* expected)
+{
+  /* The incoming pointer values are ignored: generate_data() allocates both
+   * arrays, so they are owned (and released) by this function. */
+  globals[0] = WG_GLOBAL_SIZE;
+  locals[0] = WG_LOCAL_SIZE;
+  size_t SIMD_SIZE = 0; /* filled in by the sub-group info query below */
+  OCL_CALL(clGetKernelSubGroupInfoKHR,kernel,device,CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,sizeof(size_t)*1,locals,sizeof(size_t),&SIMD_SIZE,NULL);
+
+  cl_uint SIMD_ID = 10;
+  /* input and expected data */
+  generate_data(input, expected, SIMD_ID, SIMD_SIZE);
+
+  /* prepare input for datatype */
+  OCL_CREATE_BUFFER(buf[0], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_uint), &SIMD_ID);
+
+  /* set input data for GPU */
+  OCL_MAP_BUFFER(0);
+  memcpy(buf_data[0], input, WG_GLOBAL_SIZE* sizeof(T));
+  OCL_UNMAP_BUFFER(0);
+
+  /* run the kernel on GPU */
+  OCL_NDRANGE(1);
+
+  /* check if mismatch */
+  OCL_MAP_BUFFER(1);
+  uint32_t mismatches = 0;
+  for (uint32_t i = 0; i < WG_GLOBAL_SIZE; i++)
+    if(((T *)buf_data[1])[i] != *(expected + i))
+    {
+      /* found mismatch, increment */
+      mismatches++;
+#if DEBUG_STDOUT
+      /* output mismatch */
+      cout << "Err at " << i << ", " <<
+        ((T *)buf_data[1])[i] << " != " << *(expected + i) << endl;
+#endif
+    }
+
+#if DEBUG_STDOUT
+  /* output mismatch count */
+  cout << "mismatches " << mismatches << endl;
+#endif
+
+  OCL_UNMAP_BUFFER(1);
+  /* free the host arrays before OCL_ASSERT, in case it does not return */
+  delete[] input;
+  delete[] expected;
+  OCL_ASSERT(mismatches == 0);
+}
+
+/*
+ * Subgroup broadcast 1D utest functions
+ */
+void compiler_subgroup_broadcast_imm_int(void)
+{
+  /* Builds the immediate-lane kernel, then lets subgroup_generic() generate
+   * cl_int data and compare the device output with the host reference. */
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_broadcast",
+                              "compiler_subgroup_broadcast_imm_int");
+  subgroup_generic<cl_int>(NULL, NULL);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_broadcast_imm_int);
+void compiler_subgroup_broadcast_int(void)
+{
+  /* Builds the argument-lane kernel, then lets subgroup_generic() generate
+   * cl_int data and compare the device output with the host reference. */
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_broadcast",
+                              "compiler_subgroup_broadcast_int");
+  subgroup_generic<cl_int>(NULL, NULL);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_broadcast_int);
+void compiler_subgroup_broadcast_long(void)
+{
+  /* Host element type must match the kernel's `global long *` buffers. */
+  cl_long *input = NULL, *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_broadcast",
+                              "compiler_subgroup_broadcast_long");
+  subgroup_generic(input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_broadcast_long);
diff --git a/utests/compiler_subgroup_reduce.cpp b/utests/compiler_subgroup_reduce.cpp
new file mode 100644
index 0000000..54863f6
--- /dev/null
+++ b/utests/compiler_subgroup_reduce.cpp
@@ -0,0 +1,390 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include <cstdlib>
+#include <iomanip>
+#include <algorithm>
+
+#include "utest_helper.hpp"
+
+using namespace std;
+
+/* set to 1 for debug, output of input-expected data */
+#define DEBUG_STDOUT 0
+
+/* NDRANGE */
+#define WG_GLOBAL_SIZE 30
+#define WG_LOCAL_SIZE 30
+enum WG_FUNCTION /* operation selector for compute_expected() and the utests */
+{
+ WG_ANY, /* logical OR of all lanes */
+ WG_ALL, /* logical AND of all lanes */
+ WG_REDUCE_ADD, /* sum of all lanes */
+ WG_REDUCE_MIN, /* minimum over all lanes */
+ WG_REDUCE_MAX /* maximum over all lanes */
+};
+
+/*
+ * Generic compute-expected function for op REDUCE/ANY/ALL
+ * and any variable type
+ */
+template<class T>
+static void compute_expected(WG_FUNCTION wg_func,
+                             T* input,
+                             T* expected,
+                             size_t SIMD_SIZE)
+{
+  /* Host reference for one sub-group: combines the SIMD_SIZE values of input
+   * according to wg_func and stores the combined value in every lane of
+   * expected. Assumes SIMD_SIZE >= 1, since each branch reads input[0]. */
+  if(wg_func == WG_ANY) {
+    /* seed normalised to 0/1 so a one-lane sub-group also yields a boolean */
+    T wg_predicate = ((int)input[0] != 0);
+    for(uint32_t i = 1; i < SIMD_SIZE; i++)
+      wg_predicate = (int)wg_predicate || (int)input[i];
+    for(uint32_t i = 0; i < SIMD_SIZE; i++)
+      expected[i] = wg_predicate;
+  }
+  else if(wg_func == WG_ALL) {
+    /* same 0/1 normalisation of the seed as for WG_ANY */
+    T wg_predicate = ((int)input[0] != 0);
+    for(uint32_t i = 1; i < SIMD_SIZE; i++)
+      wg_predicate = (int)wg_predicate && (int)input[i];
+    for(uint32_t i = 0; i < SIMD_SIZE; i++)
+      expected[i] = wg_predicate;
+  }
+  else if(wg_func == WG_REDUCE_ADD) {
+    T wg_sum = input[0];
+    for(uint32_t i = 1; i < SIMD_SIZE; i++)
+      wg_sum += input[i];
+    for(uint32_t i = 0; i < SIMD_SIZE; i++)
+      expected[i] = wg_sum;
+  }
+  else if(wg_func == WG_REDUCE_MAX) {
+    T wg_max = input[0];
+    for(uint32_t i = 1; i < SIMD_SIZE; i++)
+      wg_max = max(input[i], wg_max);
+    for(uint32_t i = 0; i < SIMD_SIZE; i++)
+      expected[i] = wg_max;
+  }
+  else if(wg_func == WG_REDUCE_MIN) {
+    T wg_min = input[0];
+    for(uint32_t i = 1; i < SIMD_SIZE; i++)
+      wg_min = min(input[i], wg_min);
+    for(uint32_t i = 0; i < SIMD_SIZE; i++)
+      expected[i] = wg_min;
+  }
+}
+
+/*
+ * Generic input-expected generate function for op REDUCE/ANY/ALL
+ * and any variable type
+ */
+template<class T>
+static void generate_data(WG_FUNCTION wg_func,
+                          T* &input,
+                          T* &expected,
+                          size_t SIMD_SIZE)
+{
+  /* Allocates both arrays (returned through the reference parameters) and
+   * fills them one sub-group-sized slice at a time. */
+  input = new T[WG_GLOBAL_SIZE];
+  expected = new T[WG_GLOBAL_SIZE];
+
+  /* base value for all data types; shifted as a 64-bit value because
+   * sizeof(T) == 8 shifts by 37, which is undefined for a 32-bit long */
+  T base_val = (int64_t)7 << (sizeof(T) * 5 - 3);
+
+  /* seed for random inputs */
+  srand (time(NULL));
+
+  /* generate inputs and expected values; the last slice may be partial */
+  for(uint32_t gid = 0; gid < WG_GLOBAL_SIZE; gid += SIMD_SIZE)
+  {
+#if DEBUG_STDOUT
+    cout << endl << "IN: " << endl;
+#endif
+    SIMD_SIZE = (gid + SIMD_SIZE) > WG_GLOBAL_SIZE ? WG_GLOBAL_SIZE - gid : SIMD_SIZE;
+
+    /* input values */
+    for (uint32_t lid = 0; lid < SIMD_SIZE; lid++) {
+      /* initially 0, augment after */
+      input[gid + lid] = 0;
+      if (numeric_limits<T>::is_integer) {
+        /* check all data types, test ideal for QWORD types */
+        input[gid + lid] += ((rand() % 2 - 1) * base_val);
+        /* add trailing random bits, tests GENERAL cases */
+        input[gid + lid] += (rand() % 112);
+      } else {
+        input[gid + lid] += rand(); /* non-integer T: whole part ... */
+        input[gid + lid] += rand() / ((float)RAND_MAX + 1); /* ... plus a fraction */
+      }
+
+#if DEBUG_STDOUT
+      /* output generated input */
+      cout << setw(4) << input[gid + lid] << ", " ;
+      if((lid + 1) % 8 == 0)
+        cout << endl;
+#endif
+    }
+
+    /* expected values */
+    compute_expected(wg_func, input + gid, expected + gid, SIMD_SIZE);
+
+#if DEBUG_STDOUT
+    /* output expected input */
+    cout << endl << "EXP: " << endl;
+    for(uint32_t lid = 0; lid < SIMD_SIZE; lid++) {
+      cout << setw(4) << expected[gid + lid] << ", " ;
+      if((lid + 1) % 8 == 0)
+        cout << endl;
+    }
+    cout << endl;
+#endif
+  }
+}
+
+/*
+ * Generic subgroup utest function for op REDUCE/ANY/ALL
+ * and any variable type
+ */
+template<class T>
+static void subgroup_generic(WG_FUNCTION wg_func,
+                             T* input, T* expected)
+{
+  /* The incoming pointer values are ignored: generate_data() allocates both
+   * arrays, so they are owned (and released) by this function. */
+  globals[0] = WG_GLOBAL_SIZE;
+  locals[0] = WG_LOCAL_SIZE;
+  size_t SIMD_SIZE = 0; /* filled in by the sub-group info query below */
+  OCL_CALL(clGetKernelSubGroupInfoKHR,kernel,device,CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,sizeof(size_t)*1,locals,sizeof(size_t),&SIMD_SIZE,NULL);
+
+  /* input and expected data */
+  generate_data(wg_func, input, expected, SIMD_SIZE);
+
+  /* prepare input for data type */
+  OCL_CREATE_BUFFER(buf[0], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
+
+  /* set input data for GPU */
+  OCL_MAP_BUFFER(0);
+  memcpy(buf_data[0], input, WG_GLOBAL_SIZE * sizeof(T));
+  OCL_UNMAP_BUFFER(0);
+
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+  /* run the kernel on GPU */
+  OCL_NDRANGE(1);
+
+  /* check if mismatch */
+  OCL_MAP_BUFFER(1);
+  uint32_t mismatches = 0;
+  for (uint32_t i = 0; i < WG_GLOBAL_SIZE; i++)
+    if(((T *)buf_data[1])[i] != *(expected + i))
+    {
+      /* found mismatch on integer, increment */
+      if (numeric_limits<T>::is_integer) {
+        mismatches++;
+#if DEBUG_STDOUT
+        /* output mismatch */
+        cout << "Err at " << i << ", " << ((T *)buf_data[1])[i]
+          << " != " << *(expected + i) << endl;
+#endif
+      }
+      /* float error is tolerable though */
+      else {
+        float num_computed = ((T *)buf_data[1])[i];
+        float num_expected = *(expected + i);
+        float num_diff = abs(num_computed - num_expected) / abs(num_expected);
+        /* negated <= so that a NaN difference counts as a mismatch */
+        if (!(num_diff <= 0.01f)) {
+          mismatches++;
+#if DEBUG_STDOUT
+          /* output mismatch */
+          cout << "Err at " << i << ", " << ((T *)buf_data[1])[i]
+            << " != " << *(expected + i) << endl;
+#endif
+        }
+      }
+    }
+
+#if DEBUG_STDOUT
+  /* output mismatch count */
+  cout << "mismatches " << mismatches << endl;
+#endif
+
+  OCL_UNMAP_BUFFER(1);
+  /* free the host arrays before OCL_ASSERT, in case it does not return */
+  delete[] input;
+  delete[] expected;
+  OCL_ASSERT(mismatches == 0);
+}
+
+/*
+ * Subgroup any/all utest functions
+ */
+void compiler_subgroup_any(void)
+{
+  /* Builds the kernel, then lets subgroup_generic() generate cl_int data and
+   * check the device output against the WG_ANY host reference. */
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_any");
+  subgroup_generic<cl_int>(WG_ANY, NULL, NULL);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_any);
+void compiler_subgroup_all(void)
+{
+  /* Builds the kernel, then lets subgroup_generic() generate cl_int data and
+   * check the device output against the WG_ALL host reference. */
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_all");
+  subgroup_generic<cl_int>(WG_ALL, NULL, NULL);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_all);
+/*
+ * Subgroup reduce add utest functions
+ */
+void compiler_subgroup_reduce_add_int(void)
+{
+  /* Builds the kernel, then lets subgroup_generic() generate cl_int data and
+   * check the device output against the WG_REDUCE_ADD host reference. */
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_add_int");
+  subgroup_generic<cl_int>(WG_REDUCE_ADD, NULL, NULL);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_add_int);
+void compiler_subgroup_reduce_add_uint(void)
+{
+  /* Builds the kernel, then lets subgroup_generic() generate cl_uint data and
+   * check the device output against the WG_REDUCE_ADD host reference. */
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_add_uint");
+  subgroup_generic<cl_uint>(WG_REDUCE_ADD, NULL, NULL);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_add_uint);
+void compiler_subgroup_reduce_add_long(void)
+{
+  /* Builds the kernel, then lets subgroup_generic() generate cl_long data and
+   * check the device output against the WG_REDUCE_ADD host reference. */
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_add_long");
+  subgroup_generic<cl_long>(WG_REDUCE_ADD, NULL, NULL);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_reduce_add_long);
+void compiler_subgroup_reduce_add_ulong(void)
+{
+  /* Builds the kernel, then lets subgroup_generic() generate cl_ulong data and
+   * check the device output against the WG_REDUCE_ADD host reference. */
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_add_ulong");
+  subgroup_generic<cl_ulong>(WG_REDUCE_ADD, NULL, NULL);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_reduce_add_ulong);
+void compiler_subgroup_reduce_add_float(void)
+{
+  /* Builds the kernel, then lets subgroup_generic() generate cl_float data and
+   * check the device output against the WG_REDUCE_ADD host reference. */
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_add_float");
+  subgroup_generic<cl_float>(WG_REDUCE_ADD, NULL, NULL);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_add_float);
+
+/*
+ * Subgroup reduce max utest functions
+ */
+void compiler_subgroup_reduce_max_int(void)
+{
+  /* Builds the kernel, then lets subgroup_generic() generate cl_int data and
+   * check the device output against the WG_REDUCE_MAX host reference. */
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_max_int");
+  subgroup_generic<cl_int>(WG_REDUCE_MAX, NULL, NULL);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_max_int);
+void compiler_subgroup_reduce_max_uint(void)
+{
+  /* Builds the kernel, then lets subgroup_generic() generate cl_uint data and
+   * check the device output against the WG_REDUCE_MAX host reference. */
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_max_uint");
+  subgroup_generic<cl_uint>(WG_REDUCE_MAX, NULL, NULL);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_max_uint);
+void compiler_subgroup_reduce_max_long(void)
+{
+  /* Builds the kernel, then lets subgroup_generic() generate cl_long data and
+   * check the device output against the WG_REDUCE_MAX host reference. */
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_max_long");
+  subgroup_generic<cl_long>(WG_REDUCE_MAX, NULL, NULL);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_reduce_max_long);
+void compiler_subgroup_reduce_max_ulong(void)
+{
+  /* Builds the kernel, then lets subgroup_generic() generate cl_ulong data and
+   * check the device output against the WG_REDUCE_MAX host reference. */
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_max_ulong");
+  subgroup_generic<cl_ulong>(WG_REDUCE_MAX, NULL, NULL);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_reduce_max_ulong);
+void compiler_subgroup_reduce_max_float(void)
+{
+  /* Builds the kernel, then lets subgroup_generic() generate cl_float data and
+   * check the device output against the WG_REDUCE_MAX host reference. */
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_max_float");
+  subgroup_generic<cl_float>(WG_REDUCE_MAX, NULL, NULL);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_max_float);
+
+/*
+ * Subgroup reduce min utest functions
+ */
+void compiler_subgroup_reduce_min_int(void)
+{
+  /* Builds the kernel, then lets subgroup_generic() generate cl_int data and
+   * check the device output against the WG_REDUCE_MIN host reference. */
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_min_int");
+  subgroup_generic<cl_int>(WG_REDUCE_MIN, NULL, NULL);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_min_int);
+void compiler_subgroup_reduce_min_uint(void)
+{
+  /* Builds the kernel, then lets subgroup_generic() generate cl_uint data and
+   * check the device output against the WG_REDUCE_MIN host reference. */
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_min_uint");
+  subgroup_generic<cl_uint>(WG_REDUCE_MIN, NULL, NULL);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_min_uint);
+void compiler_subgroup_reduce_min_long(void)
+{
+  /* Builds the kernel, then lets subgroup_generic() generate cl_long data and
+   * check the device output against the WG_REDUCE_MIN host reference. */
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_min_long");
+  subgroup_generic<cl_long>(WG_REDUCE_MIN, NULL, NULL);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_reduce_min_long);
+void compiler_subgroup_reduce_min_ulong(void)
+{
+  /* Builds the kernel, then lets subgroup_generic() generate cl_ulong data and
+   * check the device output against the WG_REDUCE_MIN host reference. */
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_min_ulong");
+  subgroup_generic<cl_ulong>(WG_REDUCE_MIN, NULL, NULL);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_reduce_min_ulong);
+void compiler_subgroup_reduce_min_float(void)
+{
+  /* Builds the kernel, then lets subgroup_generic() generate cl_float data and
+   * check the device output against the WG_REDUCE_MIN host reference. */
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_min_float");
+  subgroup_generic<cl_float>(WG_REDUCE_MIN, NULL, NULL);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_min_float);
diff --git a/utests/compiler_subgroup_scan_exclusive.cpp b/utests/compiler_subgroup_scan_exclusive.cpp
new file mode 100644
index 0000000..abcec6e
--- /dev/null
+++ b/utests/compiler_subgroup_scan_exclusive.cpp
@@ -0,0 +1,350 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include <cstdlib>
+#include <iomanip>
+#include <algorithm>
+
+#include "utest_helper.hpp"
+
+using namespace std;
+
+/* set to 1 to print generated inputs, expected values and mismatches to stdout */
+#define DEBUG_STDOUT 0
+
+/* NDRANGE: assigned to globals[0] and locals[0] by subgroup_generic() */
+#define WG_GLOBAL_SIZE 30
+#define WG_LOCAL_SIZE 30
+
+enum WG_FUNCTION /* exclusive-scan operator selected by each utest */
+{
+ WG_SCAN_EXCLUSIVE_ADD, /* running sum; lane 0 expects 0 */
+ WG_SCAN_EXCLUSIVE_MAX, /* running maximum; lane 0 expects min() or -infinity */
+ WG_SCAN_EXCLUSIVE_MIN /* running minimum; lane 0 expects max() or +infinity */
+};
+
+/*
+ * Reference model of an exclusive scan over one subgroup: expected[0] is the
+ * identity of the operator and expected[i] combines input[0] .. input[i - 1].
+ * wg_func selects add / max / min; any other value leaves expected untouched.
+ * SIMD_SIZE is the lane count of this (possibly partial) subgroup; sizes 0 and
+ * 1 are handled without writing past the end of expected.
+ */
+template<class T>
+static void compute_expected(WG_FUNCTION wg_func,
+                             T* input,
+                             T* expected,
+                             size_t SIMD_SIZE)
+{
+  T identity;
+
+  if(wg_func == WG_SCAN_EXCLUSIVE_ADD)
+    identity = 0;
+  else if(wg_func == WG_SCAN_EXCLUSIVE_MAX)
+    identity = numeric_limits<T>::is_integer ? numeric_limits<T>::min()
+                                             : - numeric_limits<T>::infinity();
+  else if(wg_func == WG_SCAN_EXCLUSIVE_MIN)
+    identity = numeric_limits<T>::is_integer ? numeric_limits<T>::max()
+                                             : numeric_limits<T>::infinity();
+  else
+    return;
+
+  /* an empty subgroup has no lane 0 to seed */
+  if(SIMD_SIZE == 0)
+    return;
+
+  /* lane 1 folds input[0] with the identity, which yields input[0] itself */
+  expected[0] = identity;
+  for(size_t i = 1; i < SIMD_SIZE; i++)
+    if(wg_func == WG_SCAN_EXCLUSIVE_ADD)
+      expected[i] = input[i - 1] + expected[i - 1];
+    else if(wg_func == WG_SCAN_EXCLUSIVE_MAX)
+      expected[i] = max(input[i - 1], expected[i - 1]);
+    else
+      expected[i] = min(input[i - 1], expected[i - 1]);
+}
+
+/*
+ * Generic input-expected generate function for op SCAN EXCLUSIVE type
+ * and any variable type
+ *
+ * input and expected are allocated here with new[] (WG_GLOBAL_SIZE elements)
+ * and returned through the reference parameters. expected is computed per
+ * subgroup of SIMD_SIZE lanes; the last subgroup is clamped to what is left.
+ */
+template<class T>
+static void generate_data(WG_FUNCTION wg_func,
+                          T* &input,
+                          T* &expected,
+                          size_t SIMD_SIZE)
+{
+  input = new T[WG_GLOBAL_SIZE];
+  expected = new T[WG_GLOBAL_SIZE];
+
+  /* base value for all data types; built as int64_t because the shift count
+   * is 37 for 8-byte T, which is undefined on a 32-bit long */
+  T base_val = (int64_t)7 << (sizeof(T) * 5 - 3);
+
+  /* seed for random inputs */
+  srand (time(NULL));
+  /* generate inputs and expected values */
+  for(uint32_t gid = 0; gid < WG_GLOBAL_SIZE; gid += SIMD_SIZE)
+  {
+#if DEBUG_STDOUT
+    cout << endl << "IN: " << endl;
+#endif
+    /* shrink the trailing subgroup so it stops at WG_GLOBAL_SIZE */
+    SIMD_SIZE = (gid + SIMD_SIZE) > WG_GLOBAL_SIZE ? WG_GLOBAL_SIZE - gid : SIMD_SIZE;
+    /* input values */
+    for(uint32_t lid = 0; lid < SIMD_SIZE; lid++)
+    {
+      /* initially 0, augment after */
+      input[gid + lid] = 0;
+      /* adds either 0 or -base_val; test ideal for QWORD types */
+      input[gid + lid] += ((rand() % 2 - 1) * base_val);
+      /* add trailing random bits, tests GENERAL cases */
+      input[gid + lid] += (rand() % 112);
+#if DEBUG_STDOUT
+      /* output generated input */
+      cout << setw(4) << input[gid + lid] << ", " ;
+      if((lid + 1) % 8 == 0)
+        cout << endl;
+#endif
+    }
+    /* expected values */
+    compute_expected(wg_func, input + gid, expected + gid, SIMD_SIZE);
+#if DEBUG_STDOUT
+    /* output expected input */
+    cout << endl << "EXP: " << endl;
+    for(uint32_t lid = 0; lid < SIMD_SIZE; lid++) {
+      cout << setw(4) << expected[gid + lid] << ", " ;
+      if((lid + 1) % 8 == 0)
+        cout << endl;
+    }
+    cout << endl;
+#endif
+  }
+}
+
+/*
+ * Generic subgroup utest function for op SCAN EXCLUSIVE type
+ * and any variable type
+ *
+ * input/expected are overwritten by generate_data(); the arrays it allocates
+ * are released here before the final assertion.
+ */
+template<class T>
+static void subgroup_generic(WG_FUNCTION wg_func,
+                             T* input,
+                             T* expected)
+{
+  /* get simd size */
+  globals[0] = WG_GLOBAL_SIZE;
+  locals[0] = WG_LOCAL_SIZE;
+  size_t SIMD_SIZE = 0;
+  OCL_CALL(clGetKernelSubGroupInfoKHR,kernel,device,CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,sizeof(size_t)*1,locals,sizeof(size_t),&SIMD_SIZE,NULL);
+
+  /* input and expected data */
+  generate_data(wg_func, input, expected, SIMD_SIZE);
+
+  /* prepare input for data type */
+  OCL_CREATE_BUFFER(buf[0], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+  /* set input data for GPU */
+  OCL_MAP_BUFFER(0);
+  memcpy(buf_data[0], input, WG_GLOBAL_SIZE * sizeof(T));
+  OCL_UNMAP_BUFFER(0);
+
+  /* run the kernel on GPU */
+  OCL_NDRANGE(1);
+
+  /* check if mismatch */
+  OCL_MAP_BUFFER(1);
+  T *computed = (T *)buf_data[1];
+  uint32_t mismatches = 0;
+
+  for (uint32_t i = 0; i < WG_GLOBAL_SIZE; i++)
+  {
+    if(computed[i] == expected[i])
+      continue;
+    /* any difference on integer is a mismatch */
+    bool mismatch = true;
+    /* float error is tolerable though */
+    if(!numeric_limits<T>::is_integer){
+      float num_expected = expected[i];
+      float num_diff = (float)computed[i] - num_expected;
+      /* magnitudes by hand: the abs() overload found depends on the headers */
+      float num_scale = num_expected < 0 ? -num_expected : num_expected;
+      num_diff = num_diff < 0 ? -num_diff : num_diff;
+      /* negated test so a NaN ratio (infinite expected, NaN output) fails */
+      mismatch = !(num_diff / num_scale <= 0.01f);
+    }
+    if(mismatch){
+      mismatches++;
+#if DEBUG_STDOUT
+      /* output mismatch */
+      cout << "Err at " << i << ", " << computed[i] << " != " << expected[i] << endl;
+#endif
+    }
+  }
+
+#if DEBUG_STDOUT
+  /* output mismatch count */
+  cout << "mismatches " << mismatches << endl;
+#endif
+  OCL_UNMAP_BUFFER(1);
+  /* release the arrays generate_data() allocated with new[] */
+  delete[] input;
+  delete[] expected;
+  OCL_ASSERT(mismatches == 0);
+}
+
+/*
+ * Subgroup scan_exclusive add utest functions
+ */
+/* Utest: cl_int subgroup exclusive scan add, checked through subgroup_generic(). */
+void compiler_subgroup_scan_exclusive_add_int(void)
+{
+  cl_int *src_data = NULL;
+  cl_int *ref_data = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive", "compiler_subgroup_scan_exclusive_add_int");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_ADD, src_data, ref_data);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_add_int);
+/* Utest: cl_uint subgroup exclusive scan add, checked through subgroup_generic(). */
+void compiler_subgroup_scan_exclusive_add_uint(void)
+{
+  cl_uint *src_data = NULL;
+  cl_uint *ref_data = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive", "compiler_subgroup_scan_exclusive_add_uint");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_ADD, src_data, ref_data);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_add_uint);
+/* Utest: cl_long subgroup exclusive scan add; registered through the _WITH_ISSUE macro variant. */
+void compiler_subgroup_scan_exclusive_add_long(void)
+{
+  cl_long *src_data = NULL;
+  cl_long *ref_data = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive", "compiler_subgroup_scan_exclusive_add_long");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_ADD, src_data, ref_data);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_exclusive_add_long);
+/* Utest: cl_ulong subgroup exclusive scan add; registered through the _WITH_ISSUE macro variant. */
+void compiler_subgroup_scan_exclusive_add_ulong(void)
+{
+  cl_ulong *src_data = NULL;
+  cl_ulong *ref_data = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive", "compiler_subgroup_scan_exclusive_add_ulong");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_ADD, src_data, ref_data);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_exclusive_add_ulong);
+/* Utest: cl_float subgroup exclusive scan add, checked through subgroup_generic(). */
+void compiler_subgroup_scan_exclusive_add_float(void)
+{
+  cl_float *src_data = NULL;
+  cl_float *ref_data = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive", "compiler_subgroup_scan_exclusive_add_float");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_ADD, src_data, ref_data);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_add_float);
+
+/*
+ * Subgroup scan_exclusive max utest functions
+ */
+/* Utest: cl_int subgroup exclusive scan max, checked through subgroup_generic(). */
+void compiler_subgroup_scan_exclusive_max_int(void)
+{
+  cl_int *src_data = NULL;
+  cl_int *ref_data = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive", "compiler_subgroup_scan_exclusive_max_int");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_MAX, src_data, ref_data);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_max_int);
+/* Utest: cl_uint subgroup exclusive scan max, checked through subgroup_generic(). */
+void compiler_subgroup_scan_exclusive_max_uint(void)
+{
+  cl_uint *src_data = NULL;
+  cl_uint *ref_data = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive", "compiler_subgroup_scan_exclusive_max_uint");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_MAX, src_data, ref_data);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_max_uint);
+/* Utest: cl_long subgroup exclusive scan max; registered through the _WITH_ISSUE macro variant. */
+void compiler_subgroup_scan_exclusive_max_long(void)
+{
+  cl_long *src_data = NULL;
+  cl_long *ref_data = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive", "compiler_subgroup_scan_exclusive_max_long");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_MAX, src_data, ref_data);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_exclusive_max_long);
+/* Utest: cl_ulong subgroup exclusive scan max; registered through the _WITH_ISSUE macro variant. */
+void compiler_subgroup_scan_exclusive_max_ulong(void)
+{
+  cl_ulong *src_data = NULL;
+  cl_ulong *ref_data = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive", "compiler_subgroup_scan_exclusive_max_ulong");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_MAX, src_data, ref_data);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_exclusive_max_ulong);
+/* Utest: cl_float subgroup exclusive scan max, checked through subgroup_generic(). */
+void compiler_subgroup_scan_exclusive_max_float(void)
+{
+  cl_float *src_data = NULL;
+  cl_float *ref_data = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive", "compiler_subgroup_scan_exclusive_max_float");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_MAX, src_data, ref_data);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_max_float);
+
+/*
+ * Subgroup scan_exclusive min utest functions
+ */
+/* Utest: cl_int subgroup exclusive scan min, checked through subgroup_generic(). */
+void compiler_subgroup_scan_exclusive_min_int(void)
+{
+  cl_int *src_data = NULL;
+  cl_int *ref_data = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive", "compiler_subgroup_scan_exclusive_min_int");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_MIN, src_data, ref_data);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_min_int);
+/* Utest: cl_uint subgroup exclusive scan min, checked through subgroup_generic(). */
+void compiler_subgroup_scan_exclusive_min_uint(void)
+{
+  cl_uint *src_data = NULL;
+  cl_uint *ref_data = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive", "compiler_subgroup_scan_exclusive_min_uint");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_MIN, src_data, ref_data);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_min_uint);
+/* Utest: cl_long subgroup exclusive scan min; registered through the _WITH_ISSUE macro variant. */
+void compiler_subgroup_scan_exclusive_min_long(void)
+{
+  cl_long *src_data = NULL;
+  cl_long *ref_data = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive", "compiler_subgroup_scan_exclusive_min_long");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_MIN, src_data, ref_data);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_exclusive_min_long);
+/* Utest: cl_ulong subgroup exclusive scan min; registered through the _WITH_ISSUE macro variant. */
+void compiler_subgroup_scan_exclusive_min_ulong(void)
+{
+  cl_ulong *src_data = NULL;
+  cl_ulong *ref_data = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive", "compiler_subgroup_scan_exclusive_min_ulong");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_MIN, src_data, ref_data);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_exclusive_min_ulong);
+/* Utest: cl_float subgroup exclusive scan min, checked through subgroup_generic(). */
+void compiler_subgroup_scan_exclusive_min_float(void)
+{
+  cl_float *src_data = NULL;
+  cl_float *ref_data = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive", "compiler_subgroup_scan_exclusive_min_float");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_MIN, src_data, ref_data);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_min_float);
diff --git a/utests/compiler_subgroup_scan_inclusive.cpp b/utests/compiler_subgroup_scan_inclusive.cpp
new file mode 100644
index 0000000..1528f09
--- /dev/null
+++ b/utests/compiler_subgroup_scan_inclusive.cpp
@@ -0,0 +1,341 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include <cstdlib>
+#include <iomanip>
+#include <algorithm>
+
+#include "utest_helper.hpp"
+
+using namespace std;
+
+/* set to 1 to print generated inputs, expected values and mismatches to stdout */
+#define DEBUG_STDOUT 0
+
+/* NDRANGE: assigned to globals[0] and locals[0] by subgroup_generic() */
+#define WG_GLOBAL_SIZE 30
+#define WG_LOCAL_SIZE 30
+
+enum WG_FUNCTION /* inclusive-scan operator selected by each utest */
+{
+ WG_SCAN_INCLUSIVE_ADD, /* running sum up to and including the current lane */
+ WG_SCAN_INCLUSIVE_MAX, /* running maximum up to and including the current lane */
+ WG_SCAN_INCLUSIVE_MIN /* running minimum up to and including the current lane */
+};
+
+/*
+ * Reference model of an inclusive scan over one subgroup: expected[i]
+ * combines input[0] .. input[i] with the operator selected by wg_func.
+ * For each supported operator lane 0 is copied before the loop starts.
+ */
+template<class T>
+static void compute_expected(WG_FUNCTION wg_func,
+                             T* input,
+                             T* expected,
+                             size_t SIMD_SIZE)
+{
+  switch(wg_func) {
+    case WG_SCAN_INCLUSIVE_ADD:
+      expected[0] = input[0];
+      for(uint32_t lane = 1; lane < SIMD_SIZE; lane++)
+        expected[lane] = input[lane] + expected[lane - 1];
+      break;
+    case WG_SCAN_INCLUSIVE_MAX:
+      expected[0] = input[0];
+      for(uint32_t lane = 1; lane < SIMD_SIZE; lane++)
+        expected[lane] = max(input[lane], expected[lane - 1]);
+      break;
+    case WG_SCAN_INCLUSIVE_MIN:
+      expected[0] = input[0];
+      for(uint32_t lane = 1; lane < SIMD_SIZE; lane++)
+        expected[lane] = min(input[lane], expected[lane - 1]);
+      break;
+  }
+}
+
+/*
+ * Generic input-expected generate function for op SCAN INCLUSIVE type
+ * and any variable type
+ *
+ * input and expected are allocated here with new[] (WG_GLOBAL_SIZE elements)
+ * and returned through the reference parameters. expected is computed per
+ * subgroup of SIMD_SIZE lanes; the last subgroup is clamped to what is left.
+ */
+template<class T>
+static void generate_data(WG_FUNCTION wg_func,
+                          T* &input,
+                          T* &expected,
+                          size_t SIMD_SIZE)
+{
+  input = new T[WG_GLOBAL_SIZE];
+  expected = new T[WG_GLOBAL_SIZE];
+
+  /* base value for all data types; built as int64_t because the shift count
+   * is 37 for 8-byte T, which is undefined on a 32-bit long */
+  T base_val = (int64_t)7 << (sizeof(T) * 5 - 3);
+
+  /* seed for random inputs */
+  srand (time(NULL));
+
+  /* generate inputs and expected values */
+  for(uint32_t gid = 0; gid < WG_GLOBAL_SIZE; gid += SIMD_SIZE)
+  {
+#if DEBUG_STDOUT
+    cout << endl << "IN: " << endl;
+#endif
+    /* shrink the trailing subgroup so it stops at WG_GLOBAL_SIZE */
+    SIMD_SIZE = (gid + SIMD_SIZE) > WG_GLOBAL_SIZE ? WG_GLOBAL_SIZE - gid : SIMD_SIZE;
+    /* input values */
+    for(uint32_t lid = 0; lid < SIMD_SIZE; lid++)
+    {
+      /* initially 0, augment after */
+      input[gid + lid] = 0;
+      /* adds either 0 or -base_val; test ideal for QWORD types */
+      input[gid + lid] += ((rand() % 2 - 1) * base_val);
+      /* add trailing random bits, tests GENERAL cases */
+      input[gid + lid] += (rand() % 112);
+#if DEBUG_STDOUT
+      /* output generated input */
+      cout << setw(4) << input[gid + lid] << ", " ;
+      if((lid + 1) % 8 == 0)
+        cout << endl;
+#endif
+    }
+    /* expected values */
+    compute_expected(wg_func, input + gid, expected + gid, SIMD_SIZE);
+#if DEBUG_STDOUT
+    /* output expected input */
+    cout << endl << "EXP: " << endl;
+    for(uint32_t lid = 0; lid < SIMD_SIZE; lid++) {
+      cout << setw(4) << expected[gid + lid] << ", " ;
+      if((lid + 1) % 8 == 0)
+        cout << endl;
+    }
+    cout << endl;
+#endif
+  }
+}
+
+/*
+ * Generic subgroup utest function for op SCAN INCLUSIVE type
+ * and any variable type
+ *
+ * input/expected are overwritten by generate_data(); the arrays it allocates
+ * are released here before the final assertion.
+ */
+template<class T>
+static void subgroup_generic(WG_FUNCTION wg_func,
+                             T* input,
+                             T* expected)
+{
+  /* get simd size */
+  globals[0] = WG_GLOBAL_SIZE;
+  locals[0] = WG_LOCAL_SIZE;
+  size_t SIMD_SIZE = 0;
+  OCL_CALL(clGetKernelSubGroupInfoKHR,kernel,device,CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,sizeof(size_t)*1,locals,sizeof(size_t),&SIMD_SIZE,NULL);
+
+  /* input and expected data */
+  generate_data(wg_func, input, expected, SIMD_SIZE);
+
+  /* prepare input for data type */
+  OCL_CREATE_BUFFER(buf[0], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+  /* set input data for GPU */
+  OCL_MAP_BUFFER(0);
+  memcpy(buf_data[0], input, WG_GLOBAL_SIZE * sizeof(T));
+  OCL_UNMAP_BUFFER(0);
+
+  /* run the kernel on GPU */
+  OCL_NDRANGE(1);
+
+  /* check if mismatch */
+  OCL_MAP_BUFFER(1);
+  T *computed = (T *)buf_data[1];
+  uint32_t mismatches = 0;
+
+  for (uint32_t i = 0; i < WG_GLOBAL_SIZE; i++)
+  {
+    if(computed[i] == expected[i])
+      continue;
+    /* any difference on integer is a mismatch */
+    bool mismatch = true;
+    /* float error is tolerable though */
+    if(!numeric_limits<T>::is_integer){
+      float num_expected = expected[i];
+      float num_diff = (float)computed[i] - num_expected;
+      /* magnitudes by hand: the abs() overload found depends on the headers */
+      float num_scale = num_expected < 0 ? -num_expected : num_expected;
+      num_diff = num_diff < 0 ? -num_diff : num_diff;
+      /* negated test so a NaN ratio (e.g. NaN output) counts as a mismatch */
+      mismatch = !(num_diff / num_scale <= 0.01f);
+    }
+    if(mismatch){
+      mismatches++;
+#if DEBUG_STDOUT
+      /* output mismatch */
+      cout << "Err at " << i << ", " << computed[i] << " != " << expected[i] << endl;
+#endif
+    }
+  }
+
+#if DEBUG_STDOUT
+  /* output mismatch count */
+  cout << "mismatches " << mismatches << endl;
+#endif
+  OCL_UNMAP_BUFFER(1);
+  /* release the arrays generate_data() allocated with new[] */
+  delete[] input;
+  delete[] expected;
+  OCL_ASSERT(mismatches == 0);
+}
+
+/*
+ * Subgroup scan_inclusive add utest functions
+ */
+/* Utest: cl_int subgroup inclusive scan add, checked through subgroup_generic(). */
+void compiler_subgroup_scan_inclusive_add_int(void)
+{
+  cl_int *src_data = NULL;
+  cl_int *ref_data = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive", "compiler_subgroup_scan_inclusive_add_int");
+  subgroup_generic(WG_SCAN_INCLUSIVE_ADD, src_data, ref_data);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_add_int);
+/* Utest: cl_uint subgroup inclusive scan add, checked through subgroup_generic(). */
+void compiler_subgroup_scan_inclusive_add_uint(void)
+{
+  cl_uint *src_data = NULL;
+  cl_uint *ref_data = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive", "compiler_subgroup_scan_inclusive_add_uint");
+  subgroup_generic(WG_SCAN_INCLUSIVE_ADD, src_data, ref_data);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_add_uint);
+/* Utest: cl_long subgroup inclusive scan add; registered through the _WITH_ISSUE macro variant. */
+void compiler_subgroup_scan_inclusive_add_long(void)
+{
+  cl_long *src_data = NULL;
+  cl_long *ref_data = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive", "compiler_subgroup_scan_inclusive_add_long");
+  subgroup_generic(WG_SCAN_INCLUSIVE_ADD, src_data, ref_data);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_inclusive_add_long);
+/* Utest: cl_ulong subgroup inclusive scan add; registered through the _WITH_ISSUE macro variant. */
+void compiler_subgroup_scan_inclusive_add_ulong(void)
+{
+  cl_ulong *src_data = NULL;
+  cl_ulong *ref_data = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive", "compiler_subgroup_scan_inclusive_add_ulong");
+  subgroup_generic(WG_SCAN_INCLUSIVE_ADD, src_data, ref_data);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_inclusive_add_ulong);
+/* Utest: cl_float subgroup inclusive scan add, checked through subgroup_generic(). */
+void compiler_subgroup_scan_inclusive_add_float(void)
+{
+  cl_float *src_data = NULL;
+  cl_float *ref_data = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive", "compiler_subgroup_scan_inclusive_add_float");
+  subgroup_generic(WG_SCAN_INCLUSIVE_ADD, src_data, ref_data);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_add_float);
+
+/*
+ * Subgroup scan_inclusive max utest functions
+ */
+/* Utest: cl_int subgroup inclusive scan max, checked through subgroup_generic(). */
+void compiler_subgroup_scan_inclusive_max_int(void)
+{
+  cl_int *src_data = NULL;
+  cl_int *ref_data = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive", "compiler_subgroup_scan_inclusive_max_int");
+  subgroup_generic(WG_SCAN_INCLUSIVE_MAX, src_data, ref_data);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_max_int);
+/* Utest: cl_uint subgroup inclusive scan max, checked through subgroup_generic(). */
+void compiler_subgroup_scan_inclusive_max_uint(void)
+{
+  cl_uint *src_data = NULL;
+  cl_uint *ref_data = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive", "compiler_subgroup_scan_inclusive_max_uint");
+  subgroup_generic(WG_SCAN_INCLUSIVE_MAX, src_data, ref_data);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_max_uint);
+/* Utest: cl_long subgroup inclusive scan max; registered through the _WITH_ISSUE macro variant. */
+void compiler_subgroup_scan_inclusive_max_long(void)
+{
+  cl_long *src_data = NULL;
+  cl_long *ref_data = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive", "compiler_subgroup_scan_inclusive_max_long");
+  subgroup_generic(WG_SCAN_INCLUSIVE_MAX, src_data, ref_data);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_inclusive_max_long);
+/* Utest: cl_ulong subgroup inclusive scan max; registered through the _WITH_ISSUE macro variant. */
+void compiler_subgroup_scan_inclusive_max_ulong(void)
+{
+  cl_ulong *src_data = NULL;
+  cl_ulong *ref_data = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive", "compiler_subgroup_scan_inclusive_max_ulong");
+  subgroup_generic(WG_SCAN_INCLUSIVE_MAX, src_data, ref_data);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_inclusive_max_ulong);
+/* Utest: cl_float subgroup inclusive scan max, checked through subgroup_generic(). */
+void compiler_subgroup_scan_inclusive_max_float(void)
+{
+  cl_float *src_data = NULL;
+  cl_float *ref_data = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive", "compiler_subgroup_scan_inclusive_max_float");
+  subgroup_generic(WG_SCAN_INCLUSIVE_MAX, src_data, ref_data);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_max_float);
+
+/*
+ * Subgroup scan_inclusive min utest functions
+ */
+/* Utest: cl_int subgroup inclusive scan min, checked through subgroup_generic(). */
+void compiler_subgroup_scan_inclusive_min_int(void)
+{
+  cl_int *src_data = NULL;
+  cl_int *ref_data = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive", "compiler_subgroup_scan_inclusive_min_int");
+  subgroup_generic(WG_SCAN_INCLUSIVE_MIN, src_data, ref_data);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_min_int);
+/* Utest: cl_uint subgroup inclusive scan min, checked through subgroup_generic(). */
+void compiler_subgroup_scan_inclusive_min_uint(void)
+{
+  cl_uint *src_data = NULL;
+  cl_uint *ref_data = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive", "compiler_subgroup_scan_inclusive_min_uint");
+  subgroup_generic(WG_SCAN_INCLUSIVE_MIN, src_data, ref_data);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_min_uint);
+/* Utest: cl_long subgroup inclusive scan min; registered through the _WITH_ISSUE macro variant. */
+void compiler_subgroup_scan_inclusive_min_long(void)
+{
+  cl_long *src_data = NULL;
+  cl_long *ref_data = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive", "compiler_subgroup_scan_inclusive_min_long");
+  subgroup_generic(WG_SCAN_INCLUSIVE_MIN, src_data, ref_data);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_inclusive_min_long);
+/* Utest: cl_ulong subgroup inclusive scan min; registered through the _WITH_ISSUE macro variant. */
+void compiler_subgroup_scan_inclusive_min_ulong(void)
+{
+  cl_ulong *src_data = NULL;
+  cl_ulong *ref_data = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive", "compiler_subgroup_scan_inclusive_min_ulong");
+  subgroup_generic(WG_SCAN_INCLUSIVE_MIN, src_data, ref_data);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_inclusive_min_ulong);
+/* Utest: cl_float subgroup inclusive scan min, checked through subgroup_generic(). */
+void compiler_subgroup_scan_inclusive_min_float(void)
+{
+  cl_float *src_data = NULL;
+  cl_float *ref_data = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive", "compiler_subgroup_scan_inclusive_min_float");
+  subgroup_generic(WG_SCAN_INCLUSIVE_MIN, src_data, ref_data);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_min_float);
+
--
2.7.4
More information about the Beignet
mailing list