[Beignet] [PATCH 1/2] Backend: Add intel_sub_group_shuffle_down/up/xor with shuffle

Xiuli Pan xiuli.pan at intel.com
Mon Jul 18 07:06:28 UTC 2016


From: Pan Xiuli <xiuli.pan at intel.com>

We first compute the shuffled result for each of the two sources, then
select between them with a range condition. If we used if/else around the
shuffles, the sources could be affected by the if/else predication and the
result could be wrong.
shuffle_xor reuses the existing shuffle with an xor-ed index.

Signed-off-by: Pan Xiuli <xiuli.pan at intel.com>
---
 backend/src/libocl/script/ocl_simd.def   |  9 +++++++
 backend/src/libocl/tmpl/ocl_simd.tmpl.cl | 40 ++++++++++++++++++++++++++++++++
 backend/src/libocl/tmpl/ocl_simd.tmpl.h  |  9 +++++++
 3 files changed, 58 insertions(+)

diff --git a/backend/src/libocl/script/ocl_simd.def b/backend/src/libocl/script/ocl_simd.def
index e26243e..aa47735 100644
--- a/backend/src/libocl/script/ocl_simd.def
+++ b/backend/src/libocl/script/ocl_simd.def
@@ -2,3 +2,12 @@
 floatn intel_sub_group_shuffle(floatn x, uint c)
 intn intel_sub_group_shuffle(intn x, uint c)
 uintn intel_sub_group_shuffle(uintn x, uint c)
+floatn intel_sub_group_shuffle_down(floatn x, floatn y, uint c)
+intn intel_sub_group_shuffle_down(intn x, intn y, uint c)
+uintn intel_sub_group_shuffle_down(uintn x, uintn y, uint c)
+floatn intel_sub_group_shuffle_up(floatn x, floatn y, uint c)
+intn intel_sub_group_shuffle_up(intn x, intn y, uint c)
+uintn intel_sub_group_shuffle_up(uintn x, uintn y, uint c)
+floatn intel_sub_group_shuffle_xor(floatn x, uint c)
+intn intel_sub_group_shuffle_xor(intn x, uint c)
+uintn intel_sub_group_shuffle_xor(uintn x, uint c)
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
index b066502..ad30c3d 100644
--- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
@@ -18,6 +18,7 @@
 
 #include "ocl_simd.h"
 #include "ocl_workitem.h"
+#include "ocl_as.h"
 
 uint get_max_sub_group_size(void)
 {
@@ -216,3 +217,42 @@ OVERLOADABLE void intel_sub_group_block_write8(image2d_t p, int2 cord, uint8 dat
 {
   __gen_ocl_sub_group_block_write_image8(p, cord.x, cord.y, data);
 }
+
+PURE CONST uint __gen_ocl_sub_group_shuffle_delta(uint x, uint y, uint c, uint inRange); /* prototype only; not called in the hunks of this patch */
+static OVERLOADABLE INLINE uint as_uint(uint x) /* identity overload: hands a uint back unchanged */
+{
+  return x;
+}
+#define SHUFFLE_DOWN(TYPE) /* result comes from x while id + c stays below the sub-group size, from y once it wraps */ \
+OVERLOADABLE TYPE intel_sub_group_shuffle_down(TYPE x, TYPE y, uint c) { \
+  TYPE res0, res1; /* both shuffles run unconditionally; the selection happens only afterwards */ \
+  res0 = intel_sub_group_shuffle(x, (get_sub_group_local_id() + c)%get_max_sub_group_size()); \
+  res1 = intel_sub_group_shuffle(y, (get_sub_group_local_id() + c)%get_max_sub_group_size()); \
+  /* lower bound is >= 0, not > 0: with c == 0 the work item with local id 0 must still take x */ bool inRange = ((int)c + (int)get_sub_group_local_id() >= 0) && (((int)c + (int)get_sub_group_local_id() < (int) get_max_sub_group_size())); \
+  return inRange ? res0 : res1; /* res0 was shuffled from x, res1 from y, both at index (id + c) % size */ \
+}
+SHUFFLE_DOWN(float)
+SHUFFLE_DOWN(int)
+SHUFFLE_DOWN(uint)
+#undef SHUFFLE_DOWN
+
+#define SHUFFLE_UP(TYPE) /* result comes from x when 0 < c - id < sub-group size, from y otherwise */ \
+OVERLOADABLE TYPE intel_sub_group_shuffle_up(TYPE x, TYPE y, uint c) { \
+  TYPE res0, res1; /* both shuffles are evaluated before the select below, never inside a branch */ \
+  res0 = intel_sub_group_shuffle(x, (get_max_sub_group_size() + get_sub_group_local_id() - c)%get_max_sub_group_size()); \
+  res1 = intel_sub_group_shuffle(y, (get_max_sub_group_size() + get_sub_group_local_id() - c)%get_max_sub_group_size()); \
+  bool inRange = ((int)c - (int)get_sub_group_local_id() > 0) && (((int)c - (int)get_sub_group_local_id() < (int) get_max_sub_group_size())); /* true when c exceeds the local id by less than the sub-group size */ \
+  return inRange ? res0 : res1; /* res0 was shuffled from x, res1 from y, both at index (size + id - c) % size */ \
+}
+SHUFFLE_UP(float)
+SHUFFLE_UP(int)
+SHUFFLE_UP(uint)
+#undef SHUFFLE_UP
+#define SHUFFLE_XOR(TYPE) /* forwards to intel_sub_group_shuffle with index (local id ^ c) % max sub-group size */ \
+OVERLOADABLE TYPE intel_sub_group_shuffle_xor(TYPE x, uint c) { \
+  return intel_sub_group_shuffle(x, (get_sub_group_local_id() ^ c) % get_max_sub_group_size()); /* the modulo keeps the xor-ed index below the sub-group size */ \
+}
+SHUFFLE_XOR(float)
+SHUFFLE_XOR(int)
+SHUFFLE_XOR(uint)
+#undef SHUFFLE_XOR
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.h b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
index 799f772..15da0e7 100644
--- a/backend/src/libocl/tmpl/ocl_simd.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
@@ -132,6 +132,15 @@ OVERLOADABLE double sub_group_scan_exclusive_max(double x);
 OVERLOADABLE float intel_sub_group_shuffle(float x, uint c);
 OVERLOADABLE int intel_sub_group_shuffle(int x, uint c);
 OVERLOADABLE uint intel_sub_group_shuffle(uint x, uint c);
+OVERLOADABLE float intel_sub_group_shuffle_down(float x, float y, uint c);
+OVERLOADABLE int intel_sub_group_shuffle_down(int x, int y, uint c);
+OVERLOADABLE uint intel_sub_group_shuffle_down(uint x, uint y, uint c);
+OVERLOADABLE float intel_sub_group_shuffle_up(float x, float y, uint c);
+OVERLOADABLE int intel_sub_group_shuffle_up(int x, int y, uint c);
+OVERLOADABLE uint intel_sub_group_shuffle_up(uint x, uint y, uint c);
+OVERLOADABLE float intel_sub_group_shuffle_xor(float x, uint c);
+OVERLOADABLE int intel_sub_group_shuffle_xor(int x, uint c);
+OVERLOADABLE uint intel_sub_group_shuffle_xor(uint x, uint c);
 
 /* blocak read/write */
 OVERLOADABLE uint intel_sub_group_block_read(const global uint* p);
-- 
2.5.0



More information about the Beignet mailing list