[Piglit] [PATCH] cl: Add tests for function calls

Matt Arsenault arsenm2 at gmail.com
Tue Sep 19 01:53:24 UTC 2017


Passes on ROCm, I haven't tried clover recently. Last
time I did it errored because the AsmParser wasn't properly
initialized.

v2: Fix non-unique test names, Wrap noinline in unguarded macro,
use prettier test names, use device_regex (effectively restricting to ROCm)
---
 tests/cl/program/execute/call-clobbers-amdgcn.cl |  68 +++
 tests/cl/program/execute/calls-struct.cl         | 179 +++++++
 tests/cl/program/execute/calls-workitem-id.cl    |  77 +++
 tests/cl/program/execute/calls.cl                | 607 +++++++++++++++++++++++
 tests/cl/program/execute/tail-calls.cl           | 305 ++++++++++++
 5 files changed, 1236 insertions(+)
 create mode 100644 tests/cl/program/execute/call-clobbers-amdgcn.cl
 create mode 100644 tests/cl/program/execute/calls-struct.cl
 create mode 100644 tests/cl/program/execute/calls-workitem-id.cl
 create mode 100644 tests/cl/program/execute/calls.cl
 create mode 100644 tests/cl/program/execute/tail-calls.cl

diff --git a/tests/cl/program/execute/call-clobbers-amdgcn.cl b/tests/cl/program/execute/call-clobbers-amdgcn.cl
new file mode 100644
index 000000000..400771795
--- /dev/null
+++ b/tests/cl/program/execute/call-clobbers-amdgcn.cl
@@ -0,0 +1,68 @@
+/*!
+
+[config]
+name: amdgcn call clobbers
+clc_version_min: 10
+device_regex: gfx[\d]*
+
+[test]
+name: callee saved sgpr
+kernel_name: call_clobber_s40
+dimensions: 1
+global_size: 1 0 0
+arg_out: 0 buffer int[1] 0xabcd1234
+
+[test]
+name: callee saved vgpr
+kernel_name: call_clobber_v40
+dimensions: 1
+global_size: 1 0 0
+arg_out: 0 buffer int[1] 0xabcd1234
+
+!*/
+
+#ifndef __AMDGCN__
+#error This test is only for amdgcn
+#endif
+
+__attribute__((noinline))
+void clobber_s40()
+{
+    __asm volatile("s_mov_b32 s40, 0xdead" : : : "s40");
+}
+
+kernel void call_clobber_s40(__global int* ret)
+{
+    __asm volatile("s_mov_b32 s40, 0xabcd1234" : : : "s40");
+
+    clobber_s40();
+
+    int tmp;
+
+    __asm volatile("v_mov_b32 %0, s40"
+                  : "=v"(tmp)
+                  :
+                  : "s40");
+    *ret = tmp;
+}
+
+__attribute__((noinline))
+void clobber_v40()
+{
+    __asm volatile("v_mov_b32 v40, 0xdead" : : : "v40");
+}
+
+kernel void call_clobber_v40(__global int* ret)
+{
+    __asm volatile("v_mov_b32 v40, 0xabcd1234" : : : "v40");
+
+    clobber_v40();
+
+    int tmp;
+    __asm volatile("v_mov_b32 %0, v40"
+                  : "=v"(tmp)
+                  :
+                  : "v40");
+    *ret = tmp;
+}
+
diff --git a/tests/cl/program/execute/calls-struct.cl b/tests/cl/program/execute/calls-struct.cl
new file mode 100644
index 000000000..04f769dac
--- /dev/null
+++ b/tests/cl/program/execute/calls-struct.cl
@@ -0,0 +1,179 @@
+/*!
+
+[config]
+name: calls with structs
+clc_version_min: 10
+
+[test]
+name: byval struct
+kernel_name: call_i32_func_byval_Char_IntArray
+dimensions: 1
+global_size: 16 0 0
+
+arg_out: 0 buffer int[16]        \
+ 1021 1022 1023 1024 1025 1026 1027 1028 \
+ 1029 1030 1031 1032 1033 1034 1035 1036
+
+arg_out: 1 buffer int[16] \
+  14   14   14   14 \
+  14   14   14   14 \
+  14   14   14   14 \
+  14   14   14   14 \
+
+arg_in: 2 buffer int[16] \
+ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+
+
+[test]
+name: sret struct
+kernel_name: call_sret_Char_IntArray_func
+dimensions: 1
+global_size: 16 0 0
+
+arg_out: 0 buffer int[16]        \
+ 921 922 923 924 925 926 927 928 \
+ 929 930 931 932 933 934 935 936
+
+arg_in: 1 buffer int[16] \
+ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+
+
+[test]
+name: byval struct and sret struct
+kernel_name: call_sret_Char_IntArray_func_byval_Char_IntArray
+dimensions: 1
+global_size: 16 0 0
+
+arg_out: 0 buffer int[16]        \
+  86 87 88 89   \
+  90 91 92 93   \
+  94 95 96 97   \
+  98 99 100 101
+
+arg_out: 1 buffer int[16]        \
+  134  135  136  137  \
+  138  139  140  141  \
+  142  143  144  145  \
+  146  147  148  149
+
+arg_in: 2 buffer int[16] \
+ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+
+!*/
+
+#define NOINLINE __attribute__((noinline))
+
+typedef struct ByVal_Char_IntArray {
+    char c;
+    int i[4];
+} ByVal_Char_IntArray;
+
+NOINLINE
+int i32_func_byval_Char_IntArray(ByVal_Char_IntArray st)
+{
+    st.i[0] += 100;
+
+    int sum = 0;
+    for (int i = 0; i < 4; ++i)
+    {
+        sum += st.i[i];
+    }
+
+    sum += st.c;
+    return sum;
+}
+
+kernel void call_i32_func_byval_Char_IntArray(global int* out0,
+                                              global int* out1,
+                                              global int* input)
+{
+    ByVal_Char_IntArray st;
+    st.c = 15;
+
+    int id = get_global_id(0);
+
+    int val = input[id];
+    st.i[0] = 14;
+    st.i[1] = -8;
+    st.i[2] = val;
+    st.i[3] = 900;
+
+    int result = i32_func_byval_Char_IntArray(st);
+    out0[id] = result;
+    out1[id] = st.i[0];
+}
+
+NOINLINE
+ByVal_Char_IntArray sret_Char_IntArray_func(global int* input, int id)
+{
+    ByVal_Char_IntArray st;
+    st.c = 15;
+
+    int val = input[id];
+    st.i[0] = 14;
+    st.i[1] = -8;
+    st.i[2] = val;
+    st.i[3] = 900;
+
+    return st;
+}
+
+kernel void call_sret_Char_IntArray_func(global int* output, global int* input)
+{
+    int id = get_global_id(0);
+    ByVal_Char_IntArray st = sret_Char_IntArray_func(input, id);
+
+    int sum = 0;
+    for (int i = 0; i < 4; ++i)
+    {
+        sum += st.i[i];
+    }
+
+    sum += st.c;
+    output[id] = sum;
+}
+
+NOINLINE
+ByVal_Char_IntArray sret_Char_IntArray_func_byval_Char_IntArray(ByVal_Char_IntArray st)
+{
+    st.c += 15;
+
+    st.i[0] += 14;
+    st.i[1] -= 8;
+    st.i[2] += 9;
+    st.i[3] += 18;
+
+    return st;
+}
+
+kernel void call_sret_Char_IntArray_func_byval_Char_IntArray(global int* output0,
+                                                             global int* output1,
+                                                             global int* input)
+{
+    int id = get_global_id(0);
+
+    volatile ByVal_Char_IntArray st0;
+    st0.c = -20;
+
+    int val = input[id];
+    st0.i[0] = 14;
+    st0.i[1] = -8;
+    st0.i[2] = val;
+    st0.i[3] = 100;
+
+    ByVal_Char_IntArray st1 = sret_Char_IntArray_func_byval_Char_IntArray(st0);
+
+    int sum0 = 0;
+    int sum1 = 0;
+    for (int i = 0; i < 4; ++i)
+    {
+        sum0 += st0.i[i];
+        sum1 += st1.i[i];
+    }
+
+    sum0 += st0.c;
+    sum1 += st1.c;
+
+    output0[id] = sum0;
+    output1[id] = sum1;
+}
diff --git a/tests/cl/program/execute/calls-workitem-id.cl b/tests/cl/program/execute/calls-workitem-id.cl
new file mode 100644
index 000000000..7edfad7e9
--- /dev/null
+++ b/tests/cl/program/execute/calls-workitem-id.cl
@@ -0,0 +1,77 @@
+/*!
+
+[config]
+name: calls workitem IDs
+clc_version_min: 10
+
+[test]
+name: Callee function use get_global_id(0)
+kernel_name: kernel_call_pass_get_global_id_0
+dimensions: 1
+global_size: 64 0 0
+arg_out: 0 buffer uint[64] \
+  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 \
+ 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 \
+ 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 \
+ 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
+
+[test]
+name: Callee function use get_global_id 0..2
+kernel_name: kernel_call_pass_get_global_id_012
+dimensions: 3
+global_size: 8 4 2
+arg_out: 0 buffer uint[64] \
+  0  1  2  3  4  5  6  7  0  1  2  3  4  5  6  7 \
+  0  1  2  3  4  5  6  7  0  1  2  3  4  5  6  7 \
+  0  1  2  3  4  5  6  7  0  1  2  3  4  5  6  7 \
+  0  1  2  3  4  5  6  7  0  1  2  3  4  5  6  7
+
+arg_out: 1 buffer uint[64] \
+  0  0  0  0  0  0  0  0  1  1  1  1  1  1  1  1 \
+  2  2  2  2  2  2  2  2  3  3  3  3  3  3  3  3 \
+  0  0  0  0  0  0  0  0  1  1  1  1  1  1  1  1 \
+  2  2  2  2  2  2  2  2  3  3  3  3  3  3  3  3
+
+arg_out: 2 buffer uint[64] \
+  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 \
+  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 \
+  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 \
+  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
+
+!*/
+
+#define NOINLINE __attribute__((noinline))
+
+NOINLINE
+void func_get_global_id_0(volatile global uint* out)
+{
+    uint gid = get_global_id(0);
+    out[gid] = gid;
+}
+
+kernel void kernel_call_pass_get_global_id_0(global uint *out)
+{
+    func_get_global_id_0(out);
+}
+
+NOINLINE
+void func_get_global_id_012(volatile global uint* out0,
+                            volatile global uint* out1,
+                            volatile global uint* out2)
+{
+    uint id0 = get_global_id(0);
+    uint id1 = get_global_id(1);
+    uint id2 = get_global_id(2);
+    uint flat_id = (id2 * get_global_size(1) + id1) * get_global_size(0) + id0;
+
+    out0[flat_id] = id0;
+    out1[flat_id] = id1;
+    out2[flat_id] = id2;
+}
+
+kernel void kernel_call_pass_get_global_id_012(global uint *out0,
+                                               global uint *out1,
+                                               global uint *out2)
+{
+    func_get_global_id_012(out0, out1, out2);
+}
diff --git a/tests/cl/program/execute/calls.cl b/tests/cl/program/execute/calls.cl
new file mode 100644
index 000000000..f4f55be31
--- /dev/null
+++ b/tests/cl/program/execute/calls.cl
@@ -0,0 +1,607 @@
+/*!
+
+[config]
+name: calls
+clc_version_min: 10
+
+[test]
+name: Call void_func_void
+kernel_name: call_void_func_void
+dimensions: 1
+global_size: 1 0 0
+arg_out: 0 buffer int[1] 12345
+
+[test]
+name: Call i32_func_void
+kernel_name: call_i32_func_void
+dimensions: 1
+global_size: 1 0 0
+arg_out: 0 buffer int[1] 0x12345
+
+[test]
+name: Call i64_func_void
+kernel_name: call_i64_func_void
+dimensions: 1
+global_size: 1 0 0
+arg_out: 0 buffer long[1] 0x100000000000
+
+
+[test]
+name: Call call_i32_func_void_callee_stack
+kernel_name: call_i32_func_void_callee_stack
+dimensions: 1
+global_size: 1 0 0
+arg_out: 0 buffer int[1] 290
+
+[test]
+name: Call call_i32_func_p0i32_i32_caller_stack
+kernel_name: call_i32_func_p0i32_i32_caller_stack
+dimensions: 1
+global_size: 1 0 0
+arg_out: 0 buffer int[1] 175
+
+[test]
+name: Call call_i32_func_p0i32_i32_indirect_kernel_stack
+kernel_name: call_i32_func_p0i32_i32_indirect_kernel_stack
+dimensions: 1
+global_size: 1 0 0
+arg_out: 0 buffer int[1] 241
+
+[test]
+name: Call call_i32_func_p0i32_i32_indirect_function_stack
+kernel_name: call_i32_func_p0i32_i32_indirect_function_stack
+dimensions: 1
+global_size: 1 0 0
+arg_out: 0 buffer int[1] 291
+
+[test]
+name: callee stack corruption
+kernel_name: kernel_call_nested_stack_usage
+dimensions: 1
+global_size: 10 0 0
+
+arg_out: 0 buffer int4[10] \
+    53    48   156   160  \
+    84   248   102   150  \
+   102    56   217   106  \
+   100   123   151   139  \
+    80   150   135   163  \
+   223    99   117   199  \
+   187   262   223   169  \
+   277   129    73   121  \
+   162   165   138   137  \
+   204   207   223   145  \
+
+
+arg_in: 1 buffer int4[10] \
+     0    13    76    46  \
+     4    74    33    63  \
+    26     9    95     7  \
+    41    54    47    29  \
+    15    68    38    39  \
+    91    43    14    95  \
+    44    83    69    70  \
+    89    54    14    45  \
+    77    63    21    21  \
+    64    70    80    70
+
+arg_in: 2 buffer int4[10] \
+    53    22     4    68  \
+    76   100    36    24  \
+    50    38    27    92  \
+    18    15    57    81  \
+    50    14    59    85  \
+    41    13    89     9  \
+    99    96    85    29  \
+    99    21    45    31  \
+     8    39    96    95  \
+    76    67    63     5
+
+[test]
+name: nested calls
+kernel_name: kernel_nested_calls
+dimensions: 1
+global_size: 4 0 0
+
+arg_out: 0 buffer int[4] \
+  1    7   155     -4
+
+arg_in: 1 buffer int[4] \
+  0   100  1234  -912
+
+arg_in: 2 buffer int[4] \
+  1    4      2    45
+
+
+[test]
+name: Kernel call stack argument
+kernel_name: kernel_call_stack_arg
+dimensions: 1
+global_size: 10 0 0
+
+
+arg_out: 0 buffer int4[10] \
+ 11440  1348 29304 16698  \
+ 47975  3626 30850 13224  \
+  8235 30495 31995  1455  \
+ 16048 40512 33992  7028  \
+  9450  5356 21330 23130  \
+ 21120 35186 52896 49968  \
+ 34083 28520     0     0  \
+ 12384 41492  4420 17880  \
+ 37310 19320 37518 13175  \
+ 23852 16014 22734 24284  \
+
+
+arg_in: 1 buffer int4[10] \
+     0    13    76    46  \
+    63    76   100    36  \
+    27    92    53    46  \
+    53    50    96    75  \
+    99    41    14    57  \
+    35    45    81    94  \
+    80    71    74     1  \
+    78    73    32    42  \
+    60    17    83    15  \
+    13    53    31    59
+
+arg_in: 2 buffer int4[10] \
+    53    22     4    68  \
+    24    99    72    76  \
+    95     5    76    77  \
+    56    89    63    85  \
+    25    49    46    97  \
+    65    21    68    91  \
+    89    53    46     6  \
+    68    68    20    84  \
+    99    25    23    10  \
+    52    43    26    37
+
+arg_in: 3 buffer int4[10] \
+    68    94    38    52  \
+    65     7    63    89  \
+    83    12     1    69  \
+    16    21    72    13  \
+    12    20    32    63  \
+    25    86    47    51  \
+    72    49    67    68  \
+    71    83     9     8  \
+    22    64    70    80  \
+    39    45    48    39
+
+arg_in: 4 buffer int4[10] \
+    83     3     5    53  \
+    27    44    77    48  \
+    87    63    74    73  \
+     9    27     0    41  \
+    12    65    62    81  \
+    60    82    76    46  \
+    20    92    87    89  \
+    77    63    21    21  \
+    70    76    67    63  \
+    28     7    37    25
+
+arg_in: 5 buffer int4[10] \
+    67     0    38     6  \
+    24    27    36    16  \
+   100    89    23    30  \
+     2    71    94    24  \
+    25    48    39    20  \
+    96    63    44    83  \
+    54    14    45    99  \
+     8    39    96    95  \
+     5    60    22    32  \
+    67    68    51    73
+
+arg_in: 6 buffer int4[10] \
+    42    69    59    93  \
+    49    90    91     6  \
+    35    51    59    85  \
+    18    32    89    65  \
+     2    91    43    14  \
+    69    70    99    96  \
+    21    45    31    51  \
+    39    27    69    28  \
+    70    11    77    53  \
+    72    95    46    94
+
+arg_in: 7 buffer int4[10] \
+    85    53     9    66  \
+    91    50    52    32  \
+    41    84    27    41  \
+    15    68    38    39  \
+    95    41    13    89  \
+    85    29    54    51  \
+    89    44    47    81  \
+    78    79    42    28  \
+    55    59    33    71  \
+    32    46    52    66
+
+arg_in: 8 buffer int4[10] \
+    42    70    91    76  \
+    99    49    26     9  \
+    54    47    29    18  \
+    50    14    59    85  \
+     9    16     7    36  \
+    10    41    58    88  \
+    36    21   100    15  \
+    19     1    19    99  \
+    14    16    49    86  \
+    40    61    99    15
+
+arg_in: 9 buffer int4[10] \
+    26     4    74    33  \
+    95     7    50    38  \
+    15    57    81     3  \
+    59    96    56    14  \
+    25    13    79    45  \
+    44    73    87    72  \
+    63    62     0     0  \
+    24    82    13    40  \
+    82    56    74    31  \
+    67    34    54    52
+
+!*/
+
+// The inline asm is necessary to defeat interprocedural sparse
+// conditional constant propagation eliminating some of the trivial
+// calls.
+#ifdef __AMDGCN__
+#define USE_ASM 1
+#endif
+
+#define NOINLINE __attribute__((noinline))
+
+NOINLINE
+void void_func_void(void)
+{
+#if USE_ASM
+  __asm("");
+#endif
+}
+
+kernel void call_void_func_void(__global int* ret)
+{
+  void_func_void();
+  *ret = 12345;
+}
+
+NOINLINE
+int i32_func_void(void)
+{
+    int ret;
+#if USE_ASM
+    __asm("v_mov_b32 %0, 0x12345" : "=v"(ret));
+#else
+    ret = 0x12345;
+#endif
+
+    return ret;
+}
+
+kernel void call_i32_func_void(__global int* ret)
+{
+    *ret = i32_func_void();
+}
+
+NOINLINE
+long i64_func_void(void)
+{
+    long ret;
+#if USE_ASM
+    __asm("v_lshlrev_b64 %0, 44, 1" : "=v"(ret));
+#else
+    ret = 1ull << 44;
+#endif
+    return ret;
+}
+
+kernel void call_i64_func_void(__global long* ret)
+{
+    *ret = i64_func_void();
+}
+
+
+NOINLINE
+int i32_func_void_callee_stack(void)
+{
+    int ret;
+#if USE_ASM
+    __asm("v_mov_b32 %0, 0x64" : "=v"(ret));
+#else
+    ret = 0x64;
+#endif
+
+    volatile int alloca[20];
+
+    for (int i = 0; i < 20; ++i)
+    {
+        alloca[i] = i;
+    }
+
+    for (int i = 0; i < 20; ++i)
+    {
+        ret += alloca[i];
+    }
+
+    return ret;
+}
+
+kernel void call_i32_func_void_callee_stack(__global int* ret)
+{
+    volatile int alloca[10];
+
+    for (int i = 0; i < 10; ++i)
+    {
+        alloca[i] = 0xffff;
+    }
+
+
+    *ret = i32_func_void_callee_stack();
+}
+
+NOINLINE
+int i32_func_p0i32_i32_caller_stack(volatile int* stack, int n)
+{
+    int ret;
+#if USE_ASM
+    __asm("v_mov_b32 %0, 0x64" : "=v"(ret));
+#else
+    ret = 0x64;
+#endif
+
+    for (int i = 0; i < n; ++i)
+    {
+        ret += stack[i];
+    }
+
+    return ret;
+}
+
+kernel void call_i32_func_p0i32_i32_caller_stack(__global int* ret)
+{
+    volatile int alloca[10];
+
+    for (int i = 0; i < 10; ++i)
+    {
+        alloca[i] = 3 + i;
+    }
+
+    *ret = i32_func_p0i32_i32_caller_stack(alloca, 10);
+}
+
+NOINLINE
+int i32_func_p0i32_i32_indirect_stack(volatile int* stack, int n)
+{
+    int ret;
+#if USE_ASM
+    __asm("v_mov_b32 %0, 0x64" : "=v"(ret));
+#else
+    ret = 0x64;
+#endif
+    for (int i = 0; i < n; ++i)
+    {
+        ret += stack[i];
+    }
+
+    return ret;
+}
+
+// Access a stack object in the parent kernel's frame.
+NOINLINE
+int i32_func_p0i32_i32_pass_kernel_stack(volatile int* stack, int n)
+{
+    int ret;
+#if USE_ASM
+    __asm("v_mov_b32 %0, 0x42" : "=v"(ret));
+#else
+    ret = 0x42;
+#endif
+
+    volatile int local_object[10];
+    for (int i = 0; i < 10; ++i)
+        local_object[i] = -1;
+
+    ret += i32_func_p0i32_i32_indirect_stack(stack, n);
+
+    return ret;
+}
+
+kernel void call_i32_func_p0i32_i32_indirect_kernel_stack(volatile __global int* ret)
+{
+    volatile int alloca[10];
+
+    for (int i = 0; i < 10; ++i)
+    {
+        alloca[i] = 3 + i;
+    }
+
+    *ret = i32_func_p0i32_i32_pass_kernel_stack(alloca, 10);
+}
+
+// Access a stack object in a parent non-kernel function's stack frame.
+NOINLINE
+int i32_func_void_pass_function_stack()
+{
+    int ret;
+#if USE_ASM
+    __asm("v_mov_b32 %0, 0x42" : "=v"(ret));
+#else
+    ret = 0x42;
+#endif
+
+    volatile int local_object[10];
+    for (int i = 0; i < 10; ++i)
+        local_object[i] = 8 + i;
+
+    ret += i32_func_p0i32_i32_indirect_stack(local_object, 10);
+    return ret;
+}
+
+kernel void call_i32_func_p0i32_i32_indirect_function_stack(__global int* ret)
+{
+  *ret = i32_func_void_pass_function_stack();
+}
+
+NOINLINE
+int4 v4i32_func_v4i32_v4i32_stack(int4 arg0, int4 arg1)
+{
+    // Force stack usage.
+    volatile int4 args[8] = { arg0, arg1 };
+
+    int4 total = 0;
+    for (int i = 0; i < 8; ++i)
+    {
+        total += args[i];
+    }
+
+    return total;
+}
+
+// Make sure using stack in a callee function from a callee function
+// doesn't corrupt caller's stack objects.
+NOINLINE
+int4 nested_stack_usage_v4i32_func_v4i32_v4i32(int4 arg0, int4 arg1)
+{
+    volatile int stack_object[4];
+    for (int i = 0; i < 4; ++i) {
+        const int test_val = 0x04030200 | i;
+        stack_object[i] = test_val;
+    }
+
+    arg0 *= 2;
+
+    int4 result = v4i32_func_v4i32_v4i32_stack(arg0, arg1);
+
+    // Check for stack corruption
+    for (int i = 0; i < 4; ++i)
+    {
+        const int test_val = 0x04030200 | i;
+        if (stack_object[i] != test_val)
+            result = -1;
+    }
+
+    return result;
+}
+
+kernel void kernel_call_nested_stack_usage(global int4* output,
+                                           global int4* input0,
+                                           global int4* input1)
+{
+    int id = get_global_id(0);
+    output[id] = nested_stack_usage_v4i32_func_v4i32_v4i32(
+        input0[id],
+        input1[id]);
+}
+
+NOINLINE
+int func_div_add(int x, int y)
+{
+    return x / y + 4;
+}
+
+NOINLINE
+int call_i32_func_i32_i32(int x, int y, volatile int* ptr)
+{
+    int tmp = func_div_add(x, y) >> 2;
+    return tmp + *ptr;
+}
+
+kernel void kernel_nested_calls(global int* output,
+                                global int* input0,
+                                global int* input1)
+{
+    int id = get_global_id(0);
+    volatile int zero = 0;
+    output[id] = call_i32_func_i32_i32(input0[id], input1[id], &zero);
+}
+
+NOINLINE
+int4 v4i32_func_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32(
+    int4 arg0, int4 arg1, int4 arg2, int4 arg3,
+    int4 arg4, int4 arg5, int4 arg6, int4 arg7,
+    int4 arg8)
+{
+    // Try to make sure we can't clobber the incoming stack arguments
+    // with local stack objects.
+    volatile int4 args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 };
+    volatile int4 last_arg = arg8;
+
+    int4 total = 0;
+    for (int i = 0; i < 8; ++i)
+    {
+        total += args[i];
+    }
+
+    return total * last_arg;
+}
+
+ // Test argument passed on stack, but doesn't use byval.
+NOINLINE
+int4 stack_arg_v4i32_func_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32(
+    int4 arg0, int4 arg1, int4 arg2, int4 arg3,
+    int4 arg4, int4 arg5, int4 arg6, int4 arg7,
+    int4 arg8)
+{
+    volatile int stack_object[8];
+    for (int i = 0; i < 8; ++i) {
+        const int test_val = 0x04030200 | i;
+        stack_object[i] = test_val;
+    }
+
+    arg0 *= 2;
+
+    int4 result = v4i32_func_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32(
+        arg0, arg1, arg2, arg3, arg4,
+        arg5, arg6, arg7, arg8);
+
+    // Check for stack corruption.
+    for (int i = 0; i < 8; ++i)
+    {
+        const int test_val = 0x04030200 | i;
+        if (stack_object[i] != test_val)
+            result = -1;
+    }
+
+    return result;
+}
+
+kernel void kernel_call_stack_arg(global int4* output,
+                                  global int4* input0,
+                                  global int4* input1,
+                                  global int4* input2,
+                                  global int4* input3,
+                                  global int4* input4,
+                                  global int4* input5,
+                                  global int4* input6,
+                                  global int4* input7,
+                                  global int4* input8)
+{
+    int id = get_global_id(0);
+
+    volatile int stack_object[8];
+    for (int i = 0; i < 8; ++i) {
+        const int test_val = 0x05060700 | i;
+        stack_object[i] = test_val;
+    }
+
+    output[id] = stack_arg_v4i32_func_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32(
+        input0[id],
+        input1[id],
+        input2[id],
+        input3[id],
+        input4[id],
+        input5[id],
+        input6[id],
+        input7[id],
+        input8[id]);
+
+    // Check for stack corruption.
+    for (int i = 0; i < 8; ++i)
+    {
+        const int test_val = 0x05060700 | i;
+        if (stack_object[i] != test_val)
+            output[id] = -1;
+    }
+
+}
diff --git a/tests/cl/program/execute/tail-calls.cl b/tests/cl/program/execute/tail-calls.cl
new file mode 100644
index 000000000..cf33373cc
--- /dev/null
+++ b/tests/cl/program/execute/tail-calls.cl
@@ -0,0 +1,305 @@
+/*!
+
+[config]
+name: tail calls
+clc_version_min: 10
+dimensions: 1
+
+[test]
+name: Basic tail call
+kernel_name: kernel_call_tailcall
+global_size: 4 0 0
+
+arg_out: 0 buffer int[4] \
+  4    11   107     -12
+
+arg_in: 1 buffer int[4] \
+  0   100  1234  -912
+
+arg_in: 2 buffer int[4] \
+  1    4      2    45
+
+[test]
+name: Tail call with more arguments than caller
+kernel_name: kernel_call_tailcall_extra_arg
+global_size: 4 0 0
+
+arg_out: 0 buffer int[4] \
+  2    112   1340   -882
+
+arg_in: 1 buffer int[4] \
+  0   100  1234  -912
+
+arg_in: 2 buffer int[4] \
+  1    4      2    45
+
+[test]
+name: Tail call with fewer arguments than acller
+kernel_name: kernel_call_tailcall_fewer_args
+global_size: 4 0 0
+
+arg_out: 0 buffer int[4] \
+  4    8   81   -10
+
+arg_in: 1 buffer int[4] \
+  0   100  1234  -912
+
+arg_in: 2 buffer int[4] \
+  1    4      2    45
+
+arg_in: 3 buffer int[4] \
+  3    8      4    9
+
+[test]
+name: Tail call with stack passed argument
+kernel_name: kernel_call_tailcall_stack_passed_args
+global_size: 10 0 0
+
+arg_out: 0 buffer int4[10] \
+ 11440  8762 10296 13156  \
+ 19649 31311 18081 24745  \
+ 10476 11772 17766 11070  \
+ 22165 18005 28665 35945  \
+   624   938   768   990  \
+ 30618 28791 30240 31815  \
+ 49851 47676 46806 47676  \
+  4400  4272  3392  2632  \
+ 10582  8712  8514  7854  \
+ 19737 21199 23865 18533  \
+
+
+arg_in: 1 buffer int4[10] \
+     0    13    76    46  \
+     4    74    33    63  \
+    26     9    95     7  \
+    41    54    47    29  \
+    15    68    38    39  \
+    91    43    14    95  \
+    44    83    69    70  \
+    89    54    14    45  \
+    77    63    21    21  \
+    64    70    80    70
+
+arg_in: 2 buffer int4[10] \
+    53    22     4    68  \
+    76   100    36    24  \
+    50    38    27    92  \
+    18    15    57    81  \
+    50    14    59    85  \
+    41    13    89     9  \
+    99    96    85    29  \
+    99    21    45    31  \
+     8    39    96    95  \
+    76    67    63     5
+
+arg_in: 3 buffer int4[10] \
+    68    94    38    52  \
+    99    72    76    65  \
+    53    46    95     5  \
+     3    53    50    96  \
+    59    96    56    14  \
+    16     7    36    25  \
+    54    51    10    41  \
+    51    89    44    47  \
+    39    27    69    28  \
+    60    22    32    70
+
+arg_in: 4 buffer int4[10] \
+    83     3     5    53  \
+     7    63    89    27  \
+    76    77    83    12  \
+    75    56    89    63  \
+    99    41    14    57  \
+    13    79    45    35  \
+    58    88    44    73  \
+    81    36    21   100  \
+    78    79    42    28  \
+    11    77    53    55
+
+arg_in: 5 buffer int4[10] \
+    67     0    38     6  \
+    44    77    48    24  \
+     1    69    87    63  \
+    85    16    21    72  \
+    25    49    46    97  \
+    45    81    94    65  \
+    87    72    80    71  \
+    15    63    62     0  \
+    19     1    19    99  \
+    59    33    71    14
+
+arg_in: 6 buffer int4[10] \
+    42    69    59    93  \
+    27    36    16    49  \
+    74    73   100    89  \
+    13     9    27     0  \
+    12    20    32    63  \
+    21    68    91    25  \
+    74     1    89    53  \
+     0    78    73    32  \
+    24    82    13    40  \
+    16    49    86    82
+
+arg_in: 7 buffer int4[10] \
+    85    53     9    66  \
+    90    91     6    91  \
+    23    30    35    51  \
+    41     2    71    94  \
+    12    65    62    81  \
+    86    47    51    60  \
+    46     6    72    49  \
+    42    68    68    20  \
+    60    17    83    15  \
+    56    74    31    13
+
+arg_in: 8 buffer int4[10] \
+    42    70    91    76  \
+    50    52    32    99  \
+    59    85    41    84  \
+    24    18    32    89  \
+    25    48    39    20  \
+    82    76    46    96  \
+    67    68    20    92  \
+    84    71    83     9  \
+    99    25    23    10  \
+    53    31    59    52
+
+arg_in: 9 buffer int[10] \
+   26  \
+   49  \
+   27  \
+   65  \
+    2  \
+   63  \
+   87  \
+    8  \
+   22  \
+   43
+
+!*/
+
+#define NOINLINE __attribute__((noinline))
+
+NOINLINE
+int i32_func_i32_i32(int x, int y)
+{
+    return x / y + 4;
+}
+
+NOINLINE
+int i32_func_i32_i32_i32(int x, int y, int z)
+{
+    return x / y + z;
+}
+
+// Test a basic tail call
+NOINLINE
+int tailcall_i32_func_i32_i32(int x, int y)
+{
+    x += 5;
+    y += 10;
+    return i32_func_i32_i32(x, y);
+}
+
+// Test a basic tail call with more arguments in the callee than
+// caller.
+NOINLINE
+int tailcall_i32_func_i32_i32_extra_arg(int x, int y)
+{
+    int z = x + y + 1;
+    x += 5;
+    y += 10;
+    return i32_func_i32_i32_i32(x, y, z);
+}
+
+// Test a basic tail call with fewere arguments in the callee than
+// caller.
+NOINLINE
+int tailcall_i32_func_i32_i32_i32_fewer_args(int x, int y, int z)
+{
+    x += 5;
+    y += 10;
+    return i32_func_i32_i32(x, y + z);
+}
+
+kernel void kernel_call_tailcall(global int* output,
+                                 global int* input0,
+                                 global int* input1)
+{
+    int id = get_global_id(0);
+    output[id] = tailcall_i32_func_i32_i32(input0[id], input1[id]);
+}
+
+kernel void kernel_call_tailcall_extra_arg(global int* output,
+                                           global int* input0,
+                                           global int* input1)
+{
+    int id = get_global_id(0);
+    output[id] = tailcall_i32_func_i32_i32_extra_arg(input0[id], input1[id]);
+}
+
+kernel void kernel_call_tailcall_fewer_args(global int* output,
+                                            global int* input0,
+                                            global int* input1,
+                                            global int* input2)
+{
+    int id = get_global_id(0);
+    output[id] = tailcall_i32_func_i32_i32_i32_fewer_args(input0[id], input1[id], input2[id]);
+}
+
+NOINLINE
+int4 v4i32_func_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_i32(
+    int4 arg0, int4 arg1, int4 arg2, int4 arg3,
+    int4 arg4, int4 arg5, int4 arg6, int4 arg7,
+    int arg8)
+{
+    // Try to make sure we can't clobber the incoming stack arguments
+    // with local stack objects.
+    volatile int4 args[8] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 };
+    volatile int scalar_arg = arg8;
+
+    int4 total = 0;
+    for (int i = 0; i < 8; ++i)
+    {
+        total += args[i];
+    }
+
+    return total * scalar_arg;
+}
+
+// Test a basic tail call
+NOINLINE
+int4 tailcall_v4i32_func_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_i32(
+    int4 arg0, int4 arg1, int4 arg2, int4 arg3,
+    int4 arg4, int4 arg5, int4 arg6, int4 arg7,
+    int arg8)
+{
+    arg0 *= 2;
+    return v4i32_func_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_i32(
+        arg0, arg1, arg2, arg3, arg4,
+        arg5, arg6, arg7, arg8);
+}
+
+kernel void kernel_call_tailcall_stack_passed_args(global int4* output,
+                                                   global int4* input0,
+                                                   global int4* input1,
+                                                   global int4* input2,
+                                                   global int4* input3,
+                                                   global int4* input4,
+                                                   global int4* input5,
+                                                   global int4* input6,
+                                                   global int4* input7,
+                                                   global int* input8)
+{
+    int id = get_global_id(0);
+    output[id] = tailcall_v4i32_func_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_v4i32_i32(
+        input0[id],
+        input1[id],
+        input2[id],
+        input3[id],
+        input4[id],
+        input5[id],
+        input6[id],
+        input7[id],
+        input8[id]);
+}
-- 
2.11.0



More information about the Piglit mailing list