[Beignet] [PATCH] Fix atomic test failed in GT1.

Tue Jul 2 00:22:24 PDT 2013

Barrier only ensure one work group finish, can't guarantee all work item's atomic ops
have finished before the last atomic_add.
So use atomic_xchg to update first work group's local buffer to other global buffer position.

Signed-off-by: Yang Rong <rong.r.yang at intel.com>
---
 kernels/compiler_atomic_functions.cl |   19 +++++++++++++------
 utests/compiler_atomic_functions.cpp |   23 ++++++++++++-----------
 2 files changed, 25 insertions(+), 17 deletions(-)

diff --git a/kernels/compiler_atomic_functions.cl b/kernels/compiler_atomic_functions.cl
index 61ce2f4..fbc16fb 100644
--- a/kernels/compiler_atomic_functions.cl
+++ b/kernels/compiler_atomic_functions.cl
@@ -1,14 +1,21 @@
 __kernel void compiler_atomic_functions(__global int *dst, __local int *tmp, __global int *src) {
   int lid = get_local_id(0);
   int i = lid % 12;
-  atomic_xchg(&tmp[4], -1);
+  if(lid == 0) {
+    for(int j=0; j<12; j=j+1) {
+      atomic_xchg(&tmp[j], 0);
+    }
+    atomic_xchg(&tmp[4], -1);
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+
   switch(i) {
     case 0: atomic_inc(&tmp[i]); break;
     case 1: atomic_dec(&tmp[i]); break;
     case 2: atomic_add(&tmp[i], src[lid]); break;
     case 3: atomic_sub(&tmp[i], src[lid]); break;
-    case 4: atomic_and(&tmp[i], ~(src[lid]<<(lid / 4))); break;
-    case 5: atomic_or (&tmp[i], src[lid]<<(lid / 4)); break;
+    case 4: atomic_and(&tmp[i], ~(src[lid]<<(lid / 16))); break;
+    case 5: atomic_or (&tmp[i], src[lid]<<(lid / 16)); break;
     case 6: atomic_xor(&tmp[i], src[lid]); break;
     case 7: atomic_min(&tmp[i], -src[lid]); break;
     case 8: atomic_max(&tmp[i], src[lid]); break;
@@ -23,8 +30,8 @@ __kernel void compiler_atomic_functions(__global int *dst, __local int *tmp, __g
     case 1: atomic_dec(&dst[i]); break;
     case 2: atomic_add(&dst[i], src[lid]); break;
     case 3: atomic_sub(&dst[i], src[lid]); break;
-    case 4: atomic_and(&dst[i], ~(src[lid]<<(lid / 4))); break;
-    case 5: atomic_or (&dst[i], src[lid]<<(lid / 4)); break;
+    case 4: atomic_and(&dst[i], ~(src[lid]<<(lid / 16))); break;
+    case 5: atomic_or (&dst[i], src[lid]<<(lid / 16)); break;
     case 6: atomic_xor(&dst[i], src[lid]); break;
     case 7: atomic_min(&dst[i], -src[lid]); break;
     case 8: atomic_max(&dst[i], src[lid]); break;
@@ -38,6 +45,6 @@ __kernel void compiler_atomic_functions(__global int *dst, __local int *tmp, __g
 
   if(get_global_id(0) == 0) {
     for(i=0; i<12; i=i+1)
-      atomic_add(&dst[i], tmp[i]);
+      atomic_xchg(&dst[i+12], tmp[i]);
   }
 }
diff --git a/utests/compiler_atomic_functions.cpp b/utests/compiler_atomic_functions.cpp
index 571e0c6..65f1c5a 100644
--- a/utests/compiler_atomic_functions.cpp
+++ b/utests/compiler_atomic_functions.cpp
@@ -4,12 +4,12 @@
 #include <string.h>
 
 #define GROUP_NUM 16
-#define LOCAL_SIZE 64
+#define LOCAL_SIZE 256
 static void cpu_compiler_atomic(int *dst, int *src)
 {
   dst[4] = 0xffffffff;
   int tmp[16] = { 0 };
-
+  tmp[4] = -1;
   for(int j=0; j<LOCAL_SIZE; j++) {
     int i = j % 12;
 
@@ -18,8 +18,8 @@ static void cpu_compiler_atomic(int *dst, int *src)
       case 1: tmp[i] -= 1; break;
       case 2: tmp[i] += src[j]; break;
       case 3: tmp[i] -= src[j]; break;
-      case 4: tmp[i] &= ~(src[j]<<(j>>2)); break;
-      case 5: tmp[i] |= src[j]<<(j>>2); break;
+      case 4: tmp[i] &= ~(src[j]<<(j>>4)); break;
+      case 5: tmp[i] |= src[j]<<(j>>4); break;
       case 6: tmp[i] ^= src[j]; break;
       case 7: tmp[i] = tmp[i] < -src[j] ? tmp[i] : -src[j]; break;
       case 8: tmp[i] = tmp[i] > src[j] ? tmp[i] : src[j]; break;
@@ -39,8 +39,8 @@ static void cpu_compiler_atomic(int *dst, int *src)
         case 1: dst[i] -= 1; break;
         case 2: dst[i] += src[j]; break;
         case 3: dst[i] -= src[j]; break;
-        case 4: dst[i] &= ~(src[j]<<(j>>2)); break;
-        case 5: dst[i] |= src[j]<<(j>>2); break;
+        case 4: dst[i] &= ~(src[j]<<(j>>4)); break;
+        case 5: dst[i] |= src[j]<<(j>>4); break;
         case 6: dst[i] ^= src[j]; break;
         case 7: dst[i] = dst[i] < -src[j] ? dst[i] : -src[j]; break;
         case 8: dst[i] = dst[i] > src[j] ? dst[i] : src[j]; break;
@@ -53,27 +53,28 @@ static void cpu_compiler_atomic(int *dst, int *src)
   }
 
   for(int i=0; i<12; i++)
-    dst[i] += tmp[i];
+    dst[i+12] = tmp[i];
 }
 
 static void compiler_atomic_functions(void)
 {
   const size_t n = GROUP_NUM * LOCAL_SIZE;
-  int cpu_dst[16] = {0}, cpu_src[256];
+  int cpu_dst[24] = {0}, cpu_src[256];
 
   globals[0] = n;
   locals[0] = LOCAL_SIZE;
 
   // Setup kernel and buffers
   OCL_CREATE_KERNEL("compiler_atomic_functions");
-  OCL_CREATE_BUFFER(buf[0], 0, 16 * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[0], 0, 24 * sizeof(int), NULL);
   OCL_CREATE_BUFFER(buf[1], 0, locals[0] * sizeof(int), NULL);
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
   OCL_SET_ARG(1, 16 * sizeof(int), NULL);
   OCL_SET_ARG(2, sizeof(cl_mem), &buf[1]);
 
   OCL_MAP_BUFFER(0);
-  memset(buf_data[0], 0, 16 * sizeof(int));
+  memset(buf_data[0], 0, 24 * sizeof(int));
+  ((int *)buf_data[0])[4] = -1;
   OCL_UNMAP_BUFFER(0);
 
   OCL_MAP_BUFFER(1);
@@ -86,7 +87,7 @@ static void compiler_atomic_functions(void)
   OCL_MAP_BUFFER(0);
 
   // Check results
-  for(int i=0; i<12; i++) {
+  for(int i=0; i<24; i++) {
     //printf("The dst(%d) gpu(0x%x) cpu(0x%x)\n", i, ((uint32_t *)buf_data[0])[i], cpu_dst[i]);
     OCL_ASSERT(((int *)buf_data[0])[i] == cpu_dst[i]);
   }
-- 
1.7.10.4