[Beignet] [PATCH 4/4] Utest: Add test case for sub_group functions

Xiuli Pan xiuli.pan at intel.com
Tue May 17 00:20:33 UTC 2016


From: Pan Xiuli <xiuli.pan at intel.com>

Long type support still needs to be fixed before gen8, so the long test cases are hidden for now.

Signed-off-by: Pan Xiuli <xiuli.pan at intel.com>
---
 kernels/compiler_subgroup_broadcast.cl      |  34 +++
 kernels/compiler_subgroup_reduce.cl         | 136 ++++++++++
 kernels/compiler_subgroup_scan_exclusive.cl |  98 +++++++
 kernels/compiler_subgroup_scan_inclusive.cl |  98 +++++++
 utests/CMakeLists.txt                       |   4 +
 utests/compiler_subgroup_broadcast.cpp      | 181 +++++++++++++
 utests/compiler_subgroup_reduce.cpp         | 390 ++++++++++++++++++++++++++++
 utests/compiler_subgroup_scan_exclusive.cpp | 350 +++++++++++++++++++++++++
 utests/compiler_subgroup_scan_inclusive.cpp | 341 ++++++++++++++++++++++++
 9 files changed, 1632 insertions(+)
 create mode 100644 kernels/compiler_subgroup_broadcast.cl
 create mode 100644 kernels/compiler_subgroup_reduce.cl
 create mode 100644 kernels/compiler_subgroup_scan_exclusive.cl
 create mode 100644 kernels/compiler_subgroup_scan_inclusive.cl
 create mode 100644 utests/compiler_subgroup_broadcast.cpp
 create mode 100644 utests/compiler_subgroup_reduce.cpp
 create mode 100644 utests/compiler_subgroup_scan_exclusive.cpp
 create mode 100644 utests/compiler_subgroup_scan_inclusive.cpp

diff --git a/kernels/compiler_subgroup_broadcast.cl b/kernels/compiler_subgroup_broadcast.cl
new file mode 100644
index 0000000..4f21cf5
--- /dev/null
+++ b/kernels/compiler_subgroup_broadcast.cl
@@ -0,0 +1,34 @@
+/*
+ * Subgroup broadcast 1D functions
+ */
+
+kernel void compiler_subgroup_broadcast_imm_int(global int *src,
+                                                global int *dst,
+                                                uint simd_id)
+{
+  uint index = get_global_id(0);
+
+  int val = src[index];
+  int broadcast_val = sub_group_broadcast(val, 10);
+  dst[index] = broadcast_val;
+}
+kernel void compiler_subgroup_broadcast_int(global int *src,
+                                                global int *dst,
+                                                uint simd_id)
+{
+  uint index = get_global_id(0);
+
+  int val = src[index];
+  int broadcast_val = sub_group_broadcast(val, simd_id);
+  dst[index] = broadcast_val;
+}
+kernel void compiler_subgroup_broadcast_long(global long *src,
+                                                global long *dst,
+                                                uint simd_id)
+{
+  uint index = get_global_id(0);
+
+  long val = src[index];
+  long broadcast_val = sub_group_broadcast(val, simd_id);
+  dst[index] = broadcast_val;
+}
diff --git a/kernels/compiler_subgroup_reduce.cl b/kernels/compiler_subgroup_reduce.cl
new file mode 100644
index 0000000..77ffb07
--- /dev/null
+++ b/kernels/compiler_subgroup_reduce.cl
@@ -0,0 +1,136 @@
+/*
+ * Subgroup any/all functions
+ */
+kernel void compiler_subgroup_any(global int *src, global int *dst) {
+  int val = src[get_global_id(0)];
+  int predicate = sub_group_any(val);
+  dst[get_global_id(0)] = predicate;
+}
+kernel void compiler_subgroup_all(global int *src, global int *dst) {
+  int val = src[get_global_id(0)];
+  int predicate = sub_group_all(val);
+  dst[get_global_id(0)] = predicate;
+}
+
+/*
+ * Subgroup reduce add functions
+ */
+kernel void compiler_subgroup_reduce_add_char(global char *src, global char *dst) {
+  char val = src[get_global_id(0)];
+  char sum = sub_group_reduce_add(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_reduce_add_uchar(global uchar *src, global uchar *dst) {
+  uchar val = src[get_global_id(0)];
+  uchar sum = sub_group_reduce_add(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_reduce_add_short(global short *src, global short *dst) {
+  short val = src[get_global_id(0)];
+  short sum = sub_group_reduce_add(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_reduce_add_ushort(global ushort *src, global ushort *dst) {
+  ushort val = src[get_global_id(0)];
+  ushort sum = sub_group_reduce_add(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_reduce_add_int(global int *src, global int *dst) {
+  int val = src[get_global_id(0)];
+  int sum = sub_group_reduce_add(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_reduce_add_uint(global uint *src, global uint *dst) {
+  uint val = src[get_global_id(0)];
+  uint sum = sub_group_reduce_add(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_reduce_add_long(global long *src, global long *dst) {
+  long val = src[get_global_id(0)];
+  long sum = sub_group_reduce_add(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_reduce_add_ulong(global ulong *src, global ulong *dst) {
+  ulong val = src[get_global_id(0)];
+  ulong sum = sub_group_reduce_add(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_reduce_add_float(global float *src, global float *dst) {
+  float val = src[get_global_id(0)];
+  float sum = sub_group_reduce_add(val);
+  dst[get_global_id(0)] = sum;
+}
+
+/*
+ * Subgroup reduce max functions
+ */
+kernel void compiler_subgroup_reduce_max_int(global int *src, global int *dst) {
+  int val = src[get_global_id(0)];
+  int sum = sub_group_reduce_max(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_reduce_max_uint(global uint *src, global uint *dst) {
+  uint val = src[get_global_id(0)];
+  uint sum = sub_group_reduce_max(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_reduce_max_long(global long *src, global long *dst) {
+  long val = src[get_global_id(0)];
+  long sum = sub_group_reduce_max(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_reduce_max_ulong(global ulong *src, global ulong *dst) {
+  ulong val = src[get_global_id(0)];
+  ulong sum = sub_group_reduce_max(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_reduce_max_float(global float *src, global float *dst) {
+  float val = src[get_global_id(0)];
+  float sum = sub_group_reduce_max(val);
+  dst[get_global_id(0)] = sum;
+}
+
+/*
+ * Subgroup reduce min functions
+ */
+kernel void compiler_subgroup_reduce_min_int(global int *src, global int *dst) {
+  int val = src[get_global_id(0)];
+  int sum = sub_group_reduce_min(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_reduce_min_uint(global uint *src, global uint *dst) {
+  uint val = src[get_global_id(0)];
+  uint sum = sub_group_reduce_min(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_reduce_min_long(global long *src, global long *dst) {
+  long val = src[get_global_id(0)];
+  long sum = sub_group_reduce_min(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_reduce_min_ulong(global ulong *src, global ulong *dst) {
+  ulong val = src[get_global_id(0)];
+  ulong sum = sub_group_reduce_min(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_reduce_min_float(global float *src, global float *dst) {
+  float val = src[get_global_id(0)];
+  float sum = sub_group_reduce_min(val);
+  dst[get_global_id(0)] = sum;
+}
diff --git a/kernels/compiler_subgroup_scan_exclusive.cl b/kernels/compiler_subgroup_scan_exclusive.cl
new file mode 100644
index 0000000..afc00d0
--- /dev/null
+++ b/kernels/compiler_subgroup_scan_exclusive.cl
@@ -0,0 +1,98 @@
+/*
+ * Subgroup scan exclusive add functions
+ */
+kernel void compiler_subgroup_scan_exclusive_add_int(global int *src, global int *dst) {
+  int val = src[get_global_id(0)];
+  int sum = sub_group_scan_exclusive_add(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_exclusive_add_uint(global uint *src, global uint *dst) {
+  uint val = src[get_global_id(0)];
+  uint sum = sub_group_scan_exclusive_add(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_exclusive_add_long(global long *src, global long *dst) {
+  long val = src[get_global_id(0)];
+  long sum = sub_group_scan_exclusive_add(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_exclusive_add_ulong(global ulong *src, global ulong *dst) {
+  ulong val = src[get_global_id(0)];
+  ulong sum = sub_group_scan_exclusive_add(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_exclusive_add_float(global float *src, global float *dst) {
+  float val = src[get_global_id(0)];
+  float sum = sub_group_scan_exclusive_add(val);
+  dst[get_global_id(0)] = sum;
+}
+
+/*
+ * Subgroup scan exclusive max functions
+ */
+kernel void compiler_subgroup_scan_exclusive_max_int(global int *src, global int *dst) {
+  int val = src[get_global_id(0)];
+  int sum = sub_group_scan_exclusive_max(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_exclusive_max_uint(global uint *src, global uint *dst) {
+  uint val = src[get_global_id(0)];
+  uint sum = sub_group_scan_exclusive_max(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_exclusive_max_long(global long *src, global long *dst) {
+  long val = src[get_global_id(0)];
+  long sum = sub_group_scan_exclusive_max(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_exclusive_max_ulong(global ulong *src, global ulong *dst) {
+  ulong val = src[get_global_id(0)];
+  ulong sum = sub_group_scan_exclusive_max(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_exclusive_max_float(global float *src, global float *dst) {
+  float val = src[get_global_id(0)];
+  float sum = sub_group_scan_exclusive_max(val);
+  dst[get_global_id(0)] = sum;
+}
+
+/*
+ * Subgroup scan exclusive min functions
+ */
+kernel void compiler_subgroup_scan_exclusive_min_int(global int *src, global int *dst) {
+  int val = src[get_global_id(0)];
+  int sum = sub_group_scan_exclusive_min(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_exclusive_min_uint(global uint *src, global uint *dst) {
+  uint val = src[get_global_id(0)];
+  uint sum = sub_group_scan_exclusive_min(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_exclusive_min_long(global long *src, global long *dst) {
+  long val = src[get_global_id(0)];
+  long sum = sub_group_scan_exclusive_min(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_exclusive_min_ulong(global ulong *src, global ulong *dst) {
+  ulong val = src[get_global_id(0)];
+  ulong sum = sub_group_scan_exclusive_min(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_exclusive_min_float(global float *src, global float *dst) {
+  float val = src[get_global_id(0)];
+  float sum = sub_group_scan_exclusive_min(val);
+  dst[get_global_id(0)] = sum;
+}
diff --git a/kernels/compiler_subgroup_scan_inclusive.cl b/kernels/compiler_subgroup_scan_inclusive.cl
new file mode 100644
index 0000000..da1a6e6
--- /dev/null
+++ b/kernels/compiler_subgroup_scan_inclusive.cl
@@ -0,0 +1,98 @@
+/*
+ * Subgroup scan inclusive add functions
+ */
+kernel void compiler_subgroup_scan_inclusive_add_int(global int *src, global int *dst) {
+  int val = src[get_global_id(0)];
+  int sum = sub_group_scan_inclusive_add(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_inclusive_add_uint(global uint *src, global uint *dst) {
+  uint val = src[get_global_id(0)];
+  uint sum = sub_group_scan_inclusive_add(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_inclusive_add_long(global long *src, global long *dst) {
+  long val = src[get_global_id(0)];
+  long sum = sub_group_scan_inclusive_add(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_inclusive_add_ulong(global ulong *src, global ulong *dst) {
+  ulong val = src[get_global_id(0)];
+  ulong sum = sub_group_scan_inclusive_add(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_inclusive_add_float(global float *src, global float *dst) {
+  float val = src[get_global_id(0)];
+  float sum = sub_group_scan_inclusive_add(val);
+  dst[get_global_id(0)] = sum;
+}
+
+/*
+ * Subgroup scan inclusive max functions
+ */
+kernel void compiler_subgroup_scan_inclusive_max_int(global int *src, global int *dst) {
+  int val = src[get_global_id(0)];
+  int sum = sub_group_scan_inclusive_max(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_inclusive_max_uint(global uint *src, global uint *dst) {
+  uint val = src[get_global_id(0)];
+  uint sum = sub_group_scan_inclusive_max(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_inclusive_max_long(global long *src, global long *dst) {
+  long val = src[get_global_id(0)];
+  long sum = sub_group_scan_inclusive_max(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_inclusive_max_ulong(global ulong *src, global ulong *dst) {
+  ulong val = src[get_global_id(0)];
+  ulong sum = sub_group_scan_inclusive_max(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_inclusive_max_float(global float *src, global float *dst) {
+  float val = src[get_global_id(0)];
+  float sum = sub_group_scan_inclusive_max(val);
+  dst[get_global_id(0)] = sum;
+}
+
+/*
+ * Subgroup scan inclusive min functions
+ */
+kernel void compiler_subgroup_scan_inclusive_min_int(global int *src, global int *dst) {
+  int val = src[get_global_id(0)];
+  int sum = sub_group_scan_inclusive_min(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_inclusive_min_uint(global uint *src, global uint *dst) {
+  uint val = src[get_global_id(0)];
+  uint sum = sub_group_scan_inclusive_min(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_inclusive_min_long(global long *src, global long *dst) {
+  long val = src[get_global_id(0)];
+  long sum = sub_group_scan_inclusive_min(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_inclusive_min_ulong(global ulong *src, global ulong *dst) {
+  ulong val = src[get_global_id(0)];
+  ulong sum = sub_group_scan_inclusive_min(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_inclusive_min_float(global float *src, global float *dst) {
+  float val = src[get_global_id(0)];
+  float sum = sub_group_scan_inclusive_min(val);
+  dst[get_global_id(0)] = sum;
+}
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 7ea10e0..e721179 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -166,6 +166,10 @@ set (utests_sources
   compiler_workgroup_reduce.cpp
   compiler_workgroup_scan_exclusive.cpp
   compiler_workgroup_scan_inclusive.cpp
+  compiler_subgroup_broadcast.cpp
+  compiler_subgroup_reduce.cpp
+  compiler_subgroup_scan_exclusive.cpp
+  compiler_subgroup_scan_inclusive.cpp
   compiler_async_stride_copy.cpp
   compiler_insn_selection_min.cpp
   compiler_insn_selection_max.cpp
diff --git a/utests/compiler_subgroup_broadcast.cpp b/utests/compiler_subgroup_broadcast.cpp
new file mode 100644
index 0000000..f029483
--- /dev/null
+++ b/utests/compiler_subgroup_broadcast.cpp
@@ -0,0 +1,181 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+using namespace std;
+
+/* set to 1 for debug, output of input-expected data */
+#define DEBUG_STDOUT    0
+
+/* NDRANGE */
+#define WG_GLOBAL_SIZE  30
+#define WG_LOCAL_SIZE   30
+/*
+ * Generic compute-expected function for op BROADCAST type and any variable
+ * type: each of the SIMD_SIZE expected values is a copy of input[SIMD_ID]
+ */
+template<class T>
+static void compute_expected(T* input,
+                             T* expected,
+                             size_t SIMD_ID,
+                             size_t SIMD_SIZE)
+{
+  for(uint32_t i = 0; i < SIMD_SIZE; i++)
+    expected[i] = input[SIMD_ID]; /* NOTE(review): SIMD_ID is not checked against SIMD_SIZE */
+}
+
+/*
+ * Generic input-expected generate function for op BROADCAST type
+ * and any variable type
+ */
+template<class T>
+static void generate_data(T* &input,
+                          T* &expected,
+                          size_t SIMD_ID,
+                          size_t SIMD_SIZE)
+{
+  /* allocate input and expected arrays */
+  input = new T[WG_GLOBAL_SIZE];
+  expected = new T[WG_GLOBAL_SIZE];
+
+  /* base value for all data types */
+  T base_val = (long)7 << (sizeof(T) * 5 - 3);
+
+  /* seed for random inputs */
+  srand (time(NULL));
+
+  /* generate inputs and expected values */
+  for(uint32_t gid = 0; gid < WG_GLOBAL_SIZE; gid += SIMD_SIZE)
+  {
+#if DEBUG_STDOUT
+    cout << endl << "IN: " << endl;
+#endif
+    SIMD_SIZE = (gid + SIMD_SIZE) > WG_GLOBAL_SIZE ? WG_GLOBAL_SIZE - gid : SIMD_SIZE;
+
+    /* input values */
+    for(uint32_t lid = 0; lid < SIMD_SIZE; lid++)
+    {
+      /* initially 0, augment after */
+      input[gid + lid] = 0;
+
+      /* check all data types, test ideal for QWORD types */
+      input[gid + lid] += ((rand() % 2 - 1) * base_val);
+      /* add trailing random bits, tests GENERAL cases */
+      input[gid + lid] += (rand() % 112);
+
+#if DEBUG_STDOUT
+      /* output generated input */
+      cout << setw(4) << input[gid + lid] << ", " ;
+      if((lid + 1) % 8 == 0)
+        cout << endl;
+#endif
+    }
+
+    /* expected values */
+    compute_expected(input + gid, expected + gid, SIMD_ID, SIMD_SIZE);
+
+#if DEBUG_STDOUT
+    /* output expected input */
+    cout << endl << "EXP: " << endl;
+    for(uint32_t lid = 0; lid < SIMD_SIZE; lid++){
+      cout << setw(4) << expected[gid + lid] << ", " ;
+      if((lid + 1) % 8 == 0)
+        cout << endl;
+    }
+    cout << endl;
+#endif
+
+  }
+}
+
+/*
+ * Generic subgroup utest function for op BROADCAST type and any variable
+ * type: generates data, runs the kernel and compares buf[1] with expected
+ */
+template<class T>
+static void subgroup_generic(T* input,
+                             T* expected)
+{
+  /* get simd size */
+  globals[0] = WG_GLOBAL_SIZE;
+  locals[0] = WG_LOCAL_SIZE;
+  size_t SIMD_SIZE = 0;
+  OCL_CALL(clGetKernelSubGroupInfoKHR,kernel,device,CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,sizeof(size_t)*1,locals,sizeof(size_t),&SIMD_SIZE,NULL);
+
+  cl_uint SIMD_ID = 10; /* same lane the *_imm_int kernel hard-codes */
+  /* input and expected data (both arrays are allocated by generate_data) */
+  generate_data(input, expected, SIMD_ID, SIMD_SIZE);
+
+  /* prepare input for datatype */
+  OCL_CREATE_BUFFER(buf[0], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_uint), &SIMD_ID);
+
+  /* set input data for GPU */
+  OCL_MAP_BUFFER(0);
+  memcpy(buf_data[0], input,  WG_GLOBAL_SIZE* sizeof(T));
+  OCL_UNMAP_BUFFER(0);
+
+  /* run the kernel on GPU */
+  OCL_NDRANGE(1);
+
+  /* check if mismatch */
+  OCL_MAP_BUFFER(1);
+  uint32_t mismatches = 0;
+
+  for (uint32_t i = 0; i < WG_GLOBAL_SIZE; i++)
+    if(((T *)buf_data[1])[i] != *(expected + i))
+    {
+      /* found mismatch, increment */
+      mismatches++;
+
+#if DEBUG_STDOUT
+      /* output mismatch */
+      cout << "Err at " << i << ", " <<
+        ((T *)buf_data[1])[i] << " != " << *(expected + i) << endl;
+#endif
+    }
+
+#if DEBUG_STDOUT
+  /* output mismatch count */
+  cout << "mismatches " << mismatches << endl;
+#endif
+
+  OCL_UNMAP_BUFFER(1);
+  delete[] input; delete[] expected; /* release the arrays new[]-ed by generate_data */
+  OCL_ASSERT(mismatches == 0);
+}
+
+/*
+ * Subgroup broadcast 1D functions
+ */
+void compiler_subgroup_broadcast_imm_int(void)
+{
+  cl_int *input = NULL;
+  cl_int *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_broadcast",
+                              "compiler_subgroup_broadcast_imm_int");
+  subgroup_generic(input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_broadcast_imm_int);
+void compiler_subgroup_broadcast_int(void)
+{
+  cl_int *input = NULL;
+  cl_int *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_broadcast",
+                              "compiler_subgroup_broadcast_int");
+  subgroup_generic(input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_broadcast_int);
+void compiler_subgroup_broadcast_long(void) /* utest for sub_group_broadcast on long data */
+{
+  cl_long *input = NULL; /* cl_long, to match the kernel's global long buffers */
+  cl_long *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_broadcast",
+                              "compiler_subgroup_broadcast_long");
+  subgroup_generic(input, expected); /* T is deduced as cl_long, sizing buffers accordingly */
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_broadcast_long);
diff --git a/utests/compiler_subgroup_reduce.cpp b/utests/compiler_subgroup_reduce.cpp
new file mode 100644
index 0000000..c161831
--- /dev/null
+++ b/utests/compiler_subgroup_reduce.cpp
@@ -0,0 +1,390 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include <cstdlib>
+#include <iomanip>
+#include <algorithm>
+
+#include "utest_helper.hpp"
+
+using namespace std;
+
+/* set to 1 for debug, output of input-expected data */
+#define DEBUG_STDOUT    0
+
+/* NDRANGE */
+#define WG_GLOBAL_SIZE  30
+#define WG_LOCAL_SIZE   30
+enum WG_FUNCTION
+{
+  WG_ANY,
+  WG_ALL,
+  WG_REDUCE_ADD,
+  WG_REDUCE_MIN,
+  WG_REDUCE_MAX
+};
+
+/*
+ * Generic compute-expected function for op REDUCE/ANY/ALL
+ * and any variable type
+ */
+template<class T>
+static void compute_expected(WG_FUNCTION wg_func,
+                    T* input,
+                    T* expected,
+                    size_t SIMD_SIZE)
+{
+  if(wg_func == WG_ANY)
+  {
+    T wg_predicate = input[0];
+    for(uint32_t i = 1; i < SIMD_SIZE; i++)
+      wg_predicate = (int)wg_predicate || (int)input[i];
+    for(uint32_t i = 0; i < SIMD_SIZE; i++)
+      expected[i] = wg_predicate;
+  }
+  else if(wg_func == WG_ALL)
+  {
+    T wg_predicate = input[0];
+    for(uint32_t i = 1; i < SIMD_SIZE; i++)
+      wg_predicate = (int)wg_predicate && (int)input[i];
+    for(uint32_t i = 0; i < SIMD_SIZE; i++)
+      expected[i] = wg_predicate;
+  }
+  else if(wg_func == WG_REDUCE_ADD)
+  {
+    T wg_sum = input[0];
+    for(uint32_t i = 1; i < SIMD_SIZE; i++)
+      wg_sum += input[i];
+    for(uint32_t i = 0; i < SIMD_SIZE; i++)
+      expected[i] = wg_sum;
+  }
+  else if(wg_func == WG_REDUCE_MAX)
+  {
+    T wg_max = input[0];
+    for(uint32_t i = 1; i < SIMD_SIZE; i++)
+      wg_max = max(input[i], wg_max);
+    for(uint32_t i = 0; i < SIMD_SIZE; i++)
+      expected[i] = wg_max;
+  }
+  else if(wg_func == WG_REDUCE_MIN)
+  {
+    T wg_min = input[0];
+    for(uint32_t i = 1; i < SIMD_SIZE; i++)
+      wg_min = min(input[i], wg_min);
+    for(uint32_t i = 0; i < SIMD_SIZE; i++)
+      expected[i] = wg_min;
+  }
+}
+
+/*
+ * Generic input-expected generate function for op REDUCE/ANY/ALL
+ * and any variable type
+ */
+template<class T>
+static void generate_data(WG_FUNCTION wg_func,
+                   T* &input,
+                   T* &expected,
+                   size_t SIMD_SIZE)
+{
+  input = new T[WG_GLOBAL_SIZE];
+  expected = new T[WG_GLOBAL_SIZE];
+
+  /* base value for all data types */
+  T base_val = (long)7 << (sizeof(T) * 5 - 3);
+
+  /* seed for random inputs */
+  srand (time(NULL));
+
+  /* generate inputs and expected values */
+  for(uint32_t gid = 0; gid < WG_GLOBAL_SIZE; gid += SIMD_SIZE)
+  {
+#if DEBUG_STDOUT
+    cout << endl << "IN: " << endl;
+#endif
+    SIMD_SIZE = (gid + SIMD_SIZE) > WG_GLOBAL_SIZE ? WG_GLOBAL_SIZE - gid : SIMD_SIZE;
+
+    /* input values */
+    for (uint32_t lid = 0; lid < SIMD_SIZE; lid++) {
+      /* initially 0, augment after */
+      input[gid + lid] = 0;
+
+      if (numeric_limits<T>::is_integer) {
+        /* check all data types, test ideal for QWORD types */
+        input[gid + lid] += ((rand() % 2 - 1) * base_val);
+        /* add trailing random bits, tests GENERAL cases */
+        input[gid + lid] += (rand() % 112);
+        /* NOTE: the last bit is not forced to 1 here, so an input may be 0 */
+      } else {
+        input[gid + lid] += rand();
+        input[gid + lid] += rand() / ((float)RAND_MAX + 1);
+      }
+
+#if DEBUG_STDOUT
+      /* output generated input */
+      cout << setw(4) << input[gid + lid] << ", " ;
+      if((lid + 1) % 8 == 0)
+        cout << endl;
+#endif
+    }
+
+    /* expected values */
+    compute_expected(wg_func, input + gid, expected + gid, SIMD_SIZE);
+
+#if DEBUG_STDOUT
+    /* output expected input */
+    cout << endl << "EXP: " << endl;
+    for(uint32_t lid = 0; lid < SIMD_SIZE; lid++) {
+      cout << setw(4) << expected[gid + lid] << ", " ;
+      if((lid + 1) % 8 == 0)
+        cout << endl;
+    }
+    cout << endl;
+#endif
+
+  }
+}
+
+/*
+ * Generic subgroup utest function for op REDUCE/ANY/ALL and any variable
+ * type: generates data, runs the kernel and compares buf[1] with expected
+ */
+template<class T>
+static void subgroup_generic(WG_FUNCTION wg_func,
+                       T* input,
+                       T* expected)
+{
+  /* get simd size */
+  globals[0] = WG_GLOBAL_SIZE;
+  locals[0] = WG_LOCAL_SIZE;
+  size_t SIMD_SIZE = 0;
+  OCL_CALL(clGetKernelSubGroupInfoKHR,kernel,device,CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,sizeof(size_t)*1,locals,sizeof(size_t),&SIMD_SIZE,NULL);
+
+  /* input and expected data (both arrays are allocated by generate_data) */
+  generate_data(wg_func, input, expected, SIMD_SIZE);
+
+  /* prepare input for data type */
+  OCL_CREATE_BUFFER(buf[0], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
+
+  /* set input data for GPU */
+  OCL_MAP_BUFFER(0);
+  memcpy(buf_data[0], input, WG_GLOBAL_SIZE * sizeof(T));
+  OCL_UNMAP_BUFFER(0);
+
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+  /* run the kernel on GPU */
+  OCL_NDRANGE(1);
+
+  /* check if mismatch */
+  OCL_MAP_BUFFER(1);
+  uint32_t mismatches = 0;
+
+  for (uint32_t i = 0; i < WG_GLOBAL_SIZE; i++)
+    if(((T *)buf_data[1])[i] != *(expected + i))
+    {
+      /* found mismatch on integer, increment */
+      if (numeric_limits<T>::is_integer) {
+        mismatches++;
+
+#if DEBUG_STDOUT
+        /* output mismatch */
+        cout << "Err at " << i << ", " << ((T *)buf_data[1])[i]
+             << " != " << *(expected + i) << endl;
+#endif
+      }
+      /* float error is tolerable though */
+      else {
+        float num_computed = ((T *)buf_data[1])[i];
+        float num_expected = *(expected + i);
+        float num_diff = abs(num_computed - num_expected) / abs(num_expected); /* relative error */
+        if (num_diff > 0.01f) { /* only a relative error above 1% counts as a mismatch */
+          mismatches++;
+
+#if DEBUG_STDOUT
+          /* output mismatch */
+          cout << "Err at " << i << ", " << ((T *)buf_data[1])[i]
+               << " != " << *(expected + i) << endl;
+#endif
+        }
+      }
+    }
+
+#if DEBUG_STDOUT
+  /* output mismatch count */
+  cout << "mismatches " << mismatches << endl;
+#endif
+
+  OCL_UNMAP_BUFFER(1);
+  delete[] input; delete[] expected; /* release the arrays new[]-ed by generate_data */
+  OCL_ASSERT(mismatches == 0);
+}
+
+/*
+ * Subgroup any/all utest functions
+ */
+void compiler_subgroup_any(void)
+{
+  cl_int *input = NULL;
+  cl_int *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_any");
+  subgroup_generic(WG_ANY, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_any);
+void compiler_subgroup_all(void)
+{
+  cl_int *input = NULL;
+  cl_int *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_all");
+  subgroup_generic(WG_ALL, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_all);
+/*
+ * Subgroup reduce add utest functions
+ */
+void compiler_subgroup_reduce_add_int(void)
+{
+  cl_int *input = NULL;
+  cl_int *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_add_int");
+  subgroup_generic(WG_REDUCE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_add_int);
+void compiler_subgroup_reduce_add_uint(void)
+{
+  cl_uint *input = NULL;
+  cl_uint *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_add_uint");
+  subgroup_generic(WG_REDUCE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_add_uint);
+void compiler_subgroup_reduce_add_long(void)
+{
+  cl_long *input = NULL;
+  cl_long *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_add_long");
+  subgroup_generic(WG_REDUCE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_reduce_add_long);
+void compiler_subgroup_reduce_add_ulong(void)
+{
+  cl_ulong *input = NULL;
+  cl_ulong *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_add_ulong");
+  subgroup_generic(WG_REDUCE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_reduce_add_ulong);
+void compiler_subgroup_reduce_add_float(void)
+{
+  cl_float *input = NULL;
+  cl_float *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_add_float");
+  subgroup_generic(WG_REDUCE_ADD, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_add_float);
+
+/*
+ * Subgroup reduce max utest functions
+ */
+void compiler_subgroup_reduce_max_int(void)
+{
+  cl_int *input = NULL;
+  cl_int *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_max_int");
+  subgroup_generic(WG_REDUCE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_max_int);
+void compiler_subgroup_reduce_max_uint(void)
+{
+  cl_uint *input = NULL;
+  cl_uint *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_max_uint");
+  subgroup_generic(WG_REDUCE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_max_uint);
+void compiler_subgroup_reduce_max_long(void)
+{
+  cl_long *input = NULL;
+  cl_long *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_max_long");
+  subgroup_generic(WG_REDUCE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_reduce_max_long);
+void compiler_subgroup_reduce_max_ulong(void)
+{
+  cl_ulong *input = NULL;
+  cl_ulong *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_max_ulong");
+  subgroup_generic(WG_REDUCE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_reduce_max_ulong);
+void compiler_subgroup_reduce_max_float(void)
+{
+  cl_float *input = NULL;
+  cl_float *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_max_float");
+  subgroup_generic(WG_REDUCE_MAX, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_max_float);
+
+/*
+ * Subgroup reduce min utest functions
+ */
+void compiler_subgroup_reduce_min_int(void)
+{
+  cl_int *input = NULL;
+  cl_int *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_min_int");
+  subgroup_generic(WG_REDUCE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_min_int);
+void compiler_subgroup_reduce_min_uint(void)
+{
+  cl_uint *input = NULL;
+  cl_uint *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_min_uint");
+  subgroup_generic(WG_REDUCE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_min_uint);
+void compiler_subgroup_reduce_min_long(void)
+{
+  cl_long *input = NULL;
+  cl_long *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_min_long");
+  subgroup_generic(WG_REDUCE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_reduce_min_long);
+void compiler_subgroup_reduce_min_ulong(void)
+{
+  cl_ulong *input = NULL;
+  cl_ulong *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_min_ulong");
+  subgroup_generic(WG_REDUCE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_reduce_min_ulong);
+void compiler_subgroup_reduce_min_float(void)
+{
+  cl_float *input = NULL;
+  cl_float *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_reduce",
+                              "compiler_subgroup_reduce_min_float");
+  subgroup_generic(WG_REDUCE_MIN, input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_reduce_min_float);
diff --git a/utests/compiler_subgroup_scan_exclusive.cpp b/utests/compiler_subgroup_scan_exclusive.cpp
new file mode 100644
index 0000000..abcec6e
--- /dev/null
+++ b/utests/compiler_subgroup_scan_exclusive.cpp
@@ -0,0 +1,350 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include <cstdlib>
+#include <iomanip>
+#include <algorithm>
+
+#include "utest_helper.hpp"
+
+using namespace std;
+
+/* set to 1 for debug, output of input-expected data */
+#define DEBUG_STDOUT    0
+
+/* NDRANGE */
+#define WG_GLOBAL_SIZE  30
+#define WG_LOCAL_SIZE   30
+
+enum WG_FUNCTION
+{
+  WG_SCAN_EXCLUSIVE_ADD,
+  WG_SCAN_EXCLUSIVE_MAX,
+  WG_SCAN_EXCLUSIVE_MIN
+};
+
+/*
+ * Compute the expected exclusive scan of one subgroup.
+ * wg_func   - scan operator (add, max or min)
+ * input     - SIMD_SIZE input values of the subgroup
+ * expected  - receives SIMD_SIZE results; expected[i] combines input[0..i-1]
+ * SIMD_SIZE - number of values in the subgroup, 0 and 1 are handled
+ *
+ * expected[0] is the identity of the operator: 0 for add, the minimum
+ * (or -infinity) of T for max and the maximum (or +infinity) for min.
+ */
+template<class T>
+static void compute_expected(WG_FUNCTION wg_func,
+                    T* input,
+                    T* expected,
+                    size_t SIMD_SIZE)
+{
+  /* nothing to write, expected[0] would already be out of bounds */
+  if(SIMD_SIZE == 0)
+    return;
+
+  if(wg_func == WG_SCAN_EXCLUSIVE_ADD)
+    expected[0] = 0;
+  else if(wg_func == WG_SCAN_EXCLUSIVE_MAX)
+    expected[0] = numeric_limits<T>::is_integer ?
+      numeric_limits<T>::min() : - numeric_limits<T>::infinity();
+  else if(wg_func == WG_SCAN_EXCLUSIVE_MIN)
+    expected[0] = numeric_limits<T>::is_integer ?
+      numeric_limits<T>::max() : numeric_limits<T>::infinity();
+
+  /* starting at 1 keeps a one element subgroup inside its buffer */
+  for(size_t i = 1; i < SIMD_SIZE; i++)
+  {
+    if(wg_func == WG_SCAN_EXCLUSIVE_ADD)
+      expected[i] = input[i - 1] + expected[i - 1];
+    else if(wg_func == WG_SCAN_EXCLUSIVE_MAX)
+      expected[i] = max(input[i - 1], expected[i - 1]);
+    else if(wg_func == WG_SCAN_EXCLUSIVE_MIN)
+      expected[i] = min(input[i - 1], expected[i - 1]);
+  }
+}
+
+/*
+ * Generate random inputs and the matching expected exclusive scan results.
+ * wg_func   - scan operator forwarded to compute_expected
+ * input     - set to a new T[WG_GLOBAL_SIZE] array holding the inputs
+ * expected  - set to a new T[WG_GLOBAL_SIZE] array holding the results
+ * SIMD_SIZE - subgroup size, the last subgroup may be shorter
+ * Both arrays are allocated with new[] and are not freed here.
+ */
+template<class T>
+static void generate_data(WG_FUNCTION wg_func,
+                   T* &input,
+                   T* &expected,
+                   size_t SIMD_SIZE)
+{
+  input = new T[WG_GLOBAL_SIZE];
+  expected = new T[WG_GLOBAL_SIZE];
+
+  /* base value; int64_t keeps the shift defined where long is 32 bit */
+  T base_val = (int64_t)7 << (sizeof(T) * 5 - 3);
+
+  /* seed for random inputs */
+  srand (time(NULL));
+
+  /* generate inputs and expected values, one subgroup per iteration */
+  for(uint32_t gid = 0; gid < WG_GLOBAL_SIZE; gid += SIMD_SIZE)
+  {
+#if DEBUG_STDOUT
+    cout << endl << "IN: " << endl;
+#endif
+    /* clip the last subgroup to the end of the arrays */
+    SIMD_SIZE = (gid + SIMD_SIZE) > WG_GLOBAL_SIZE ? WG_GLOBAL_SIZE - gid : SIMD_SIZE;
+    /* input values */
+    for(uint32_t lid = 0; lid < SIMD_SIZE; lid++)
+    {
+      /* initially 0, augment after */
+      input[gid + lid] = 0;
+      /* adds 0 or -base_val, test ideal for QWORD types */
+      input[gid + lid] += ((rand() % 2 - 1) * base_val);
+      /* add trailing random bits, tests GENERAL cases */
+      input[gid + lid] += (rand() % 112);
+#if DEBUG_STDOUT
+      /* output generated input */
+      cout << setw(4) << input[gid + lid] << ", " ;
+      if((lid + 1) % 8 == 0)
+        cout << endl;
+#endif
+    }
+    /* expected values */
+    compute_expected(wg_func, input + gid, expected + gid, SIMD_SIZE);
+#if DEBUG_STDOUT
+    /* output expected values */
+    cout << endl << "EXP: " << endl;
+    for(uint32_t lid = 0; lid < SIMD_SIZE; lid++) {
+      cout << setw(4) << expected[gid + lid] << ", " ;
+      if((lid + 1) % 8 == 0)
+        cout << endl;
+    }
+    cout << endl;
+#endif
+  }
+}
+
+/*
+ * Run the current exclusive scan kernel on random data and check it.
+ * input and expected are replaced by the arrays from generate_data and
+ * freed again before returning. Integer results must match exactly,
+ * other types within 1% of the expected value.
+ */
+template<class T>
+static void subgroup_generic(WG_FUNCTION wg_func,
+                       T* input,
+                       T* expected)
+{
+  /* get simd size */
+  globals[0] = WG_GLOBAL_SIZE;
+  locals[0] = WG_LOCAL_SIZE;
+  size_t SIMD_SIZE = 0;
+  OCL_CALL(clGetKernelSubGroupInfoKHR,kernel,device,CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,sizeof(size_t)*1,locals,sizeof(size_t),&SIMD_SIZE,NULL);
+
+  /* input and expected data */
+  generate_data(wg_func, input, expected, SIMD_SIZE);
+
+  /* prepare input for data type */
+  OCL_CREATE_BUFFER(buf[0], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+  /* set input data for GPU */
+  OCL_MAP_BUFFER(0);
+  memcpy(buf_data[0], input, WG_GLOBAL_SIZE * sizeof(T));
+  OCL_UNMAP_BUFFER(0);
+
+  /* run the kernel on GPU */
+  OCL_NDRANGE(1);
+
+  /* check if mismatch */
+  OCL_MAP_BUFFER(1);
+  uint32_t mismatches = 0;
+
+  for (uint32_t i = 0; i < WG_GLOBAL_SIZE; i++)
+    if(((T *)buf_data[1])[i] != *(expected + i))
+    {
+      /* any difference on integer is a mismatch */
+      bool mismatch = numeric_limits<T>::is_integer;
+      /* float error is tolerable though */
+      if(!mismatch) {
+        float num_computed = ((T *)buf_data[1])[i];
+        float num_expected = *(expected + i);
+        float num_ratio = (num_computed - num_expected) / num_expected;
+        /* negated test: a NaN ratio, e.g. inf / inf on the infinite
+         * identity of max and min, has to count as a mismatch too */
+        mismatch = !(num_ratio >= -0.01f && num_ratio <= 0.01f);
+      }
+      if(mismatch) {
+        mismatches++;
+#if DEBUG_STDOUT
+        /* output mismatch */
+        cout << "Err at " << i << ", " <<
+          ((T *)buf_data[1])[i] << " != " << *(expected + i) << endl;
+#endif
+      }
+    }
+
+#if DEBUG_STDOUT
+  /* output mismatch count */
+  cout << "mismatches " << mismatches << endl;
+#endif
+
+  OCL_UNMAP_BUFFER(1);
+
+  /* release the host arrays allocated by generate_data */
+  delete[] input;
+  delete[] expected;
+
+  OCL_ASSERT(mismatches == 0);
+}
+
+/*
+ * Subgroup scan_exclusive add utest functions
+ */
+void compiler_subgroup_scan_exclusive_add_int(void)
+{
+  /* cl_int case: both host pointers are handed over as NULL */
+  cl_int *src = NULL, *ref = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+                              "compiler_subgroup_scan_exclusive_add_int");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_ADD, src, ref);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_add_int);
+void compiler_subgroup_scan_exclusive_add_uint(void)
+{
+  /* cl_uint case: both host pointers are handed over as NULL */
+  cl_uint *src = NULL, *ref = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+                              "compiler_subgroup_scan_exclusive_add_uint");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_ADD, src, ref);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_add_uint);
+void compiler_subgroup_scan_exclusive_add_long(void)
+{
+  /* cl_long case: both host pointers are handed over as NULL */
+  cl_long *src = NULL, *ref = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+                              "compiler_subgroup_scan_exclusive_add_long");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_ADD, src, ref);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_exclusive_add_long);
+void compiler_subgroup_scan_exclusive_add_ulong(void)
+{
+  /* cl_ulong case: both host pointers are handed over as NULL */
+  cl_ulong *src = NULL, *ref = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+                              "compiler_subgroup_scan_exclusive_add_ulong");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_ADD, src, ref);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_exclusive_add_ulong);
+void compiler_subgroup_scan_exclusive_add_float(void)
+{
+  /* cl_float case: both host pointers are handed over as NULL */
+  cl_float *src = NULL, *ref = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+                              "compiler_subgroup_scan_exclusive_add_float");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_ADD, src, ref);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_add_float);
+
+/*
+ * Subgroup scan_exclusive max utest functions
+ */
+void compiler_subgroup_scan_exclusive_max_int(void)
+{
+  /* cl_int case: both host pointers are handed over as NULL */
+  cl_int *src = NULL, *ref = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+                              "compiler_subgroup_scan_exclusive_max_int");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_MAX, src, ref);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_max_int);
+void compiler_subgroup_scan_exclusive_max_uint(void)
+{
+  /* cl_uint case: both host pointers are handed over as NULL */
+  cl_uint *src = NULL, *ref = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+                              "compiler_subgroup_scan_exclusive_max_uint");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_MAX, src, ref);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_max_uint);
+void compiler_subgroup_scan_exclusive_max_long(void)
+{
+  /* cl_long case: both host pointers are handed over as NULL */
+  cl_long *src = NULL, *ref = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+                              "compiler_subgroup_scan_exclusive_max_long");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_MAX, src, ref);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_exclusive_max_long);
+void compiler_subgroup_scan_exclusive_max_ulong(void)
+{
+  /* cl_ulong case: both host pointers are handed over as NULL */
+  cl_ulong *src = NULL, *ref = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+                              "compiler_subgroup_scan_exclusive_max_ulong");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_MAX, src, ref);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_exclusive_max_ulong);
+void compiler_subgroup_scan_exclusive_max_float(void)
+{
+  /* cl_float case: both host pointers are handed over as NULL */
+  cl_float *src = NULL, *ref = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+                              "compiler_subgroup_scan_exclusive_max_float");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_MAX, src, ref);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_max_float);
+
+/*
+ * Subgroup scan_exclusive min utest functions
+ */
+void compiler_subgroup_scan_exclusive_min_int(void)
+{
+  /* cl_int case: both host pointers are handed over as NULL */
+  cl_int *src = NULL, *ref = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+                              "compiler_subgroup_scan_exclusive_min_int");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_MIN, src, ref);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_min_int);
+void compiler_subgroup_scan_exclusive_min_uint(void)
+{
+  /* cl_uint case: both host pointers are handed over as NULL */
+  cl_uint *src = NULL, *ref = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+                              "compiler_subgroup_scan_exclusive_min_uint");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_MIN, src, ref);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_min_uint);
+void compiler_subgroup_scan_exclusive_min_long(void)
+{
+  /* cl_long case: both host pointers are handed over as NULL */
+  cl_long *src = NULL, *ref = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+                              "compiler_subgroup_scan_exclusive_min_long");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_MIN, src, ref);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_exclusive_min_long);
+void compiler_subgroup_scan_exclusive_min_ulong(void)
+{
+  /* cl_ulong case: both host pointers are handed over as NULL */
+  cl_ulong *src = NULL, *ref = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+                              "compiler_subgroup_scan_exclusive_min_ulong");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_MIN, src, ref);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_exclusive_min_ulong);
+void compiler_subgroup_scan_exclusive_min_float(void)
+{
+  /* cl_float case: both host pointers are handed over as NULL */
+  cl_float *src = NULL, *ref = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_exclusive",
+                              "compiler_subgroup_scan_exclusive_min_float");
+  subgroup_generic(WG_SCAN_EXCLUSIVE_MIN, src, ref);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_exclusive_min_float);
diff --git a/utests/compiler_subgroup_scan_inclusive.cpp b/utests/compiler_subgroup_scan_inclusive.cpp
new file mode 100644
index 0000000..1528f09
--- /dev/null
+++ b/utests/compiler_subgroup_scan_inclusive.cpp
@@ -0,0 +1,341 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include <cstdlib>
+#include <iomanip>
+#include <algorithm>
+
+#include "utest_helper.hpp"
+
+using namespace std;
+
+/* set to 1 for debug, output of input-expected data */
+#define DEBUG_STDOUT    0
+
+/* NDRANGE */
+#define WG_GLOBAL_SIZE  30
+#define WG_LOCAL_SIZE   30
+
+enum WG_FUNCTION
+{
+  WG_SCAN_INCLUSIVE_ADD,
+  WG_SCAN_INCLUSIVE_MAX,
+  WG_SCAN_INCLUSIVE_MIN
+};
+
+/*
+ * Reference inclusive scan of one subgroup: expected[i] combines
+ * input[0..i] with the operator selected by wg_func (add, max or min).
+ * SIMD_SIZE is the number of values in the subgroup.
+ */
+template<class T>
+static void compute_expected(WG_FUNCTION wg_func,
+                    T* input,
+                    T* expected,
+                    size_t SIMD_SIZE)
+{
+  /* any other wg_func leaves expected untouched */
+  if(wg_func != WG_SCAN_INCLUSIVE_ADD &&
+     wg_func != WG_SCAN_INCLUSIVE_MAX &&
+     wg_func != WG_SCAN_INCLUSIVE_MIN)
+    return;
+  /* the first value is its own scan result */
+  expected[0] = input[0];
+  for(uint32_t idx = 1; idx < SIMD_SIZE; idx++)
+  {
+    const T prev = expected[idx - 1];
+    switch(wg_func)
+    {
+      case WG_SCAN_INCLUSIVE_ADD: expected[idx] = input[idx] + prev; break;
+      case WG_SCAN_INCLUSIVE_MAX: expected[idx] = max(input[idx], prev); break;
+      default:                    expected[idx] = min(input[idx], prev); break;
+    }
+  }
+}
+
+/*
+ * Generate random inputs and the matching expected inclusive scan results.
+ * wg_func   - scan operator forwarded to compute_expected
+ * input     - set to a new T[WG_GLOBAL_SIZE] array holding the inputs
+ * expected  - set to a new T[WG_GLOBAL_SIZE] array holding the results
+ * SIMD_SIZE - subgroup size, the last subgroup may be shorter
+ * Both arrays are allocated with new[] and are not freed here.
+ */
+template<class T>
+static void generate_data(WG_FUNCTION wg_func,
+                   T* &input,
+                   T* &expected,
+                   size_t SIMD_SIZE)
+{
+  input = new T[WG_GLOBAL_SIZE];
+  expected = new T[WG_GLOBAL_SIZE];
+
+  /* base value; int64_t keeps the shift defined where long is 32 bit */
+  T base_val = (int64_t)7 << (sizeof(T) * 5 - 3);
+
+  /* seed for random inputs */
+  srand (time(NULL));
+
+  /* generate inputs and expected values, one subgroup per iteration */
+  for(uint32_t gid = 0; gid < WG_GLOBAL_SIZE; gid += SIMD_SIZE)
+  {
+#if DEBUG_STDOUT
+    cout << endl << "IN: " << endl;
+#endif
+    /* clip the last subgroup to the end of the arrays */
+    SIMD_SIZE = (gid + SIMD_SIZE) > WG_GLOBAL_SIZE ? WG_GLOBAL_SIZE - gid : SIMD_SIZE;
+
+    /* input values */
+    for(uint32_t lid = 0; lid < SIMD_SIZE; lid++)
+    {
+      /* initially 0, augment after */
+      input[gid + lid] = 0;
+      /* adds 0 or -base_val, test ideal for QWORD types */
+      input[gid + lid] += ((rand() % 2 - 1) * base_val);
+      /* add trailing random bits, tests GENERAL cases */
+      input[gid + lid] += (rand() % 112);
+#if DEBUG_STDOUT
+      /* output generated input */
+      cout << setw(4) << input[gid + lid] << ", " ;
+      if((lid + 1) % 8 == 0)
+        cout << endl;
+#endif
+    }
+    /* expected values */
+    compute_expected(wg_func, input + gid, expected + gid, SIMD_SIZE);
+#if DEBUG_STDOUT
+    /* output expected values */
+    cout << endl << "EXP: " << endl;
+    for(uint32_t lid = 0; lid < SIMD_SIZE; lid++) {
+      cout << setw(4) << expected[gid + lid] << ", " ;
+      if((lid + 1) % 8 == 0)
+        cout << endl;
+    }
+    cout << endl;
+#endif
+  }
+}
+
+/*
+ * Run the current inclusive scan kernel on random data and check it.
+ * input and expected are replaced by the arrays from generate_data and
+ * freed again before returning. Integer results must match exactly,
+ * other types within 1% of the expected value.
+ */
+template<class T>
+static void subgroup_generic(WG_FUNCTION wg_func,
+                       T* input,
+                       T* expected)
+{
+  /* get simd size */
+  globals[0] = WG_GLOBAL_SIZE;
+  locals[0] = WG_LOCAL_SIZE;
+  size_t SIMD_SIZE = 0;
+  OCL_CALL(clGetKernelSubGroupInfoKHR,kernel,device,CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,sizeof(size_t)*1,locals,sizeof(size_t),&SIMD_SIZE,NULL);
+
+  /* input and expected data */
+  generate_data(wg_func, input, expected, SIMD_SIZE);
+
+  /* prepare input for data type */
+  OCL_CREATE_BUFFER(buf[0], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+  /* set input data for GPU */
+  OCL_MAP_BUFFER(0);
+  memcpy(buf_data[0], input, WG_GLOBAL_SIZE * sizeof(T));
+  OCL_UNMAP_BUFFER(0);
+
+  /* run the kernel on GPU */
+  OCL_NDRANGE(1);
+
+  /* check if mismatch */
+  OCL_MAP_BUFFER(1);
+  uint32_t mismatches = 0;
+
+  for (uint32_t i = 0; i < WG_GLOBAL_SIZE; i++)
+    if(((T *)buf_data[1])[i] != *(expected + i))
+    {
+      /* any difference on integer is a mismatch */
+      bool mismatch = numeric_limits<T>::is_integer;
+      /* float error is tolerable though */
+      if(!mismatch) {
+        float num_computed = ((T *)buf_data[1])[i];
+        float num_expected = *(expected + i);
+        float num_ratio = (num_computed - num_expected) / num_expected;
+        /* negated test: a NaN ratio, as produced by a NaN result,
+         * has to count as a mismatch too */
+        mismatch = !(num_ratio >= -0.01f && num_ratio <= 0.01f);
+      }
+      if(mismatch) {
+        mismatches++;
+#if DEBUG_STDOUT
+        /* output mismatch */
+        cout << "Err at " << i << ", " <<
+          ((T *)buf_data[1])[i] << " != " << *(expected + i) << endl;
+#endif
+      }
+    }
+
+#if DEBUG_STDOUT
+  /* output mismatch count */
+  cout << "mismatches " << mismatches << endl;
+#endif
+
+  OCL_UNMAP_BUFFER(1);
+
+  /* release the host arrays allocated by generate_data */
+  delete[] input;
+  delete[] expected;
+
+  OCL_ASSERT(mismatches == 0);
+}
+
+/*
+ * Subgroup scan_inclusive add utest functions
+ */
+void compiler_subgroup_scan_inclusive_add_int(void)
+{
+  /* cl_int case: both host pointers are handed over as NULL */
+  cl_int *src = NULL, *ref = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+                              "compiler_subgroup_scan_inclusive_add_int");
+  subgroup_generic(WG_SCAN_INCLUSIVE_ADD, src, ref);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_add_int);
+void compiler_subgroup_scan_inclusive_add_uint(void)
+{
+  /* cl_uint case: both host pointers are handed over as NULL */
+  cl_uint *src = NULL, *ref = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+                              "compiler_subgroup_scan_inclusive_add_uint");
+  subgroup_generic(WG_SCAN_INCLUSIVE_ADD, src, ref);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_add_uint);
+void compiler_subgroup_scan_inclusive_add_long(void)
+{
+  /* cl_long case: both host pointers are handed over as NULL */
+  cl_long *src = NULL, *ref = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+                              "compiler_subgroup_scan_inclusive_add_long");
+  subgroup_generic(WG_SCAN_INCLUSIVE_ADD, src, ref);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_inclusive_add_long);
+void compiler_subgroup_scan_inclusive_add_ulong(void)
+{
+  /* cl_ulong case: both host pointers are handed over as NULL */
+  cl_ulong *src = NULL, *ref = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+                              "compiler_subgroup_scan_inclusive_add_ulong");
+  subgroup_generic(WG_SCAN_INCLUSIVE_ADD, src, ref);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_inclusive_add_ulong);
+void compiler_subgroup_scan_inclusive_add_float(void)
+{
+  /* cl_float case: both host pointers are handed over as NULL */
+  cl_float *src = NULL, *ref = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+                              "compiler_subgroup_scan_inclusive_add_float");
+  subgroup_generic(WG_SCAN_INCLUSIVE_ADD, src, ref);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_add_float);
+
+/*
+ * Subgroup scan_inclusive max utest functions
+ */
+void compiler_subgroup_scan_inclusive_max_int(void)
+{
+  /* cl_int case: both host pointers are handed over as NULL */
+  cl_int *src = NULL, *ref = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+                              "compiler_subgroup_scan_inclusive_max_int");
+  subgroup_generic(WG_SCAN_INCLUSIVE_MAX, src, ref);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_max_int);
+void compiler_subgroup_scan_inclusive_max_uint(void)
+{
+  /* cl_uint case: both host pointers are handed over as NULL */
+  cl_uint *src = NULL, *ref = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+                              "compiler_subgroup_scan_inclusive_max_uint");
+  subgroup_generic(WG_SCAN_INCLUSIVE_MAX, src, ref);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_max_uint);
+void compiler_subgroup_scan_inclusive_max_long(void)
+{
+  /* cl_long case: both host pointers are handed over as NULL */
+  cl_long *src = NULL, *ref = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+                              "compiler_subgroup_scan_inclusive_max_long");
+  subgroup_generic(WG_SCAN_INCLUSIVE_MAX, src, ref);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_inclusive_max_long);
+void compiler_subgroup_scan_inclusive_max_ulong(void)
+{
+  /* cl_ulong case: both host pointers are handed over as NULL */
+  cl_ulong *src = NULL, *ref = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+                              "compiler_subgroup_scan_inclusive_max_ulong");
+  subgroup_generic(WG_SCAN_INCLUSIVE_MAX, src, ref);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_inclusive_max_ulong);
+void compiler_subgroup_scan_inclusive_max_float(void)
+{
+  /* cl_float case: both host pointers are handed over as NULL */
+  cl_float *src = NULL, *ref = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+                              "compiler_subgroup_scan_inclusive_max_float");
+  subgroup_generic(WG_SCAN_INCLUSIVE_MAX, src, ref);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_max_float);
+
+/*
+ * Subgroup scan_inclusive min utest functions
+ */
+void compiler_subgroup_scan_inclusive_min_int(void)
+{
+  /* cl_int case: both host pointers are handed over as NULL */
+  cl_int *src = NULL, *ref = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+                              "compiler_subgroup_scan_inclusive_min_int");
+  subgroup_generic(WG_SCAN_INCLUSIVE_MIN, src, ref);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_min_int);
+void compiler_subgroup_scan_inclusive_min_uint(void)
+{
+  /* cl_uint case: both host pointers are handed over as NULL */
+  cl_uint *src = NULL, *ref = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+                              "compiler_subgroup_scan_inclusive_min_uint");
+  subgroup_generic(WG_SCAN_INCLUSIVE_MIN, src, ref);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_min_uint);
+void compiler_subgroup_scan_inclusive_min_long(void)
+{
+  /* cl_long case: both host pointers are handed over as NULL */
+  cl_long *src = NULL, *ref = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+                              "compiler_subgroup_scan_inclusive_min_long");
+  subgroup_generic(WG_SCAN_INCLUSIVE_MIN, src, ref);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_inclusive_min_long);
+void compiler_subgroup_scan_inclusive_min_ulong(void)
+{
+  /* cl_ulong case: both host pointers are handed over as NULL */
+  cl_ulong *src = NULL, *ref = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+                              "compiler_subgroup_scan_inclusive_min_ulong");
+  subgroup_generic(WG_SCAN_INCLUSIVE_MIN, src, ref);
+}
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_scan_inclusive_min_ulong);
+void compiler_subgroup_scan_inclusive_min_float(void)
+{
+  /* cl_float case: both host pointers are handed over as NULL */
+  cl_float *src = NULL, *ref = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_scan_inclusive",
+                              "compiler_subgroup_scan_inclusive_min_float");
+  subgroup_generic(WG_SCAN_INCLUSIVE_MIN, src, ref);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_scan_inclusive_min_float);
+
-- 
2.7.4



More information about the Beignet mailing list