[Beignet] [PATCH 3/4] Backend: Add intel_sub_group_shuffle_down/up/xor

Xiuli Pan xiuli.pan at intel.com
Fri Jun 3 02:43:48 UTC 2016


From: Pan Xiuli <xiuli.pan at intel.com>

Use a single shuffle delta function for down/up, with a flag that switches
between the current and the down/up source. The flag and the index are
precalculated in libocl. The shuffle delta instruction only uses the flag to
mask the dst with the other source. The existing shuffle is reused, with an
xor on the index, for shuffle_xor.

Signed-off-by: Pan Xiuli <xiuli.pan at intel.com>
---
 backend/src/backend/gen_insn_selection.cpp | 65 ++++++++++++++++++++++++++++++
 backend/src/ir/instruction.cpp             | 44 ++++++++++++++++++++
 backend/src/ir/instruction.hpp             |  9 +++++
 backend/src/ir/instruction.hxx             |  1 +
 backend/src/libocl/script/ocl_simd.def     |  9 +++++
 backend/src/libocl/tmpl/ocl_simd.tmpl.cl   | 34 ++++++++++++++++
 backend/src/libocl/tmpl/ocl_simd.tmpl.h    |  9 +++++
 backend/src/llvm/llvm_gen_backend.cpp      | 13 ++++++
 backend/src/llvm/llvm_gen_ocl_function.hxx |  2 +
 9 files changed, 186 insertions(+)

diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 39688ad..006e956 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -6691,6 +6691,70 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
     DECL_CTOR(MediaBlockWriteInstruction, 1, 1);
   };
 
+  /*! SIMD shuffle delta pattern: dst = shuffle(srcx) where inRange != 0, shuffle(srcy) elsewhere */
+  DECL_PATTERN(SimdShuffleDeltaInstruction)
+  {
+    bool emitOne(Selection::Opaque &sel, const ir::SimdShuffleDeltaInstruction &insn, bool &markChildren) const
+    {
+      using namespace ir;
+      const GenRegister dst = sel.selReg(insn.getDst(0), TYPE_U32);
+      const GenRegister srcx = sel.selReg(insn.getSrc(0), TYPE_U32);
+      const GenRegister srcy = sel.selReg(insn.getSrc(1), TYPE_U32);
+      const GenRegister index = sel.selReg(insn.getSrc(2), TYPE_U32);    // lane to read from
+      const GenRegister inRange = sel.selReg(insn.getSrc(3), TYPE_U32);  // non zero: keep the srcx result
+      const GenRegister constZero = GenRegister::immud(0);
+      const GenRegister shiftL = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);  // index << 2, shared by both shuffles
+      bool hasShiftL = false;  // true once shiftL has been computed
+
+      sel.push();
+      sel.curr.predicate = GEN_PREDICATE_NONE;
+      /* First shuffle for srcx, emitted without predication */
+      if (sel.isScalarReg(insn.getSrc(0))) {
+        sel.MOV(dst, srcx);  // a scalar source holds the same value for every lane
+      } else {
+        if (index.file == GEN_IMMEDIATE_VALUE) {
+          sel.push();
+          uint32_t offset = index.value.ud % sel.curr.execWidth;
+          GenRegister reg = GenRegister::subphysicaloffset(srcx, offset);
+          reg.vstride = GEN_VERTICAL_STRIDE_0;  // <0;1,0> region: replicate the selected element
+          reg.hstride = GEN_HORIZONTAL_STRIDE_0;
+          reg.width = GEN_WIDTH_1;
+          sel.MOV(dst, reg);
+          sel.pop();  // restore the state saved by the push above, keeping the state stack balanced
+        } else {
+          sel.SHL(shiftL, index, GenRegister::immud(0x2));  // shiftL = index * 4
+          hasShiftL = true;
+          sel.SIMD_SHUFFLE(dst, srcx, shiftL);
+        }
+      }
+      sel.curr.flag = 0;
+      sel.curr.subFlag = 1;
+      sel.CMP(GEN_CONDITIONAL_EQ, inRange, constZero);  // flag 0.1 marks the lanes where inRange == 0
+      sel.curr.predicate = GEN_PREDICATE_NORMAL;
+      /* Now shuffle for srcy, predicated on the flag set by the compare above */
+      if (sel.isScalarReg(insn.getSrc(1))) {
+        sel.MOV(dst, srcy);
+      } else {
+        if (index.file == GEN_IMMEDIATE_VALUE) {
+          sel.push();
+          uint32_t offset = index.value.ud % sel.curr.execWidth;
+          GenRegister reg = GenRegister::subphysicaloffset(srcy, offset);
+          reg.vstride = GEN_VERTICAL_STRIDE_0;
+          reg.hstride = GEN_HORIZONTAL_STRIDE_0;
+          reg.width = GEN_WIDTH_1;
+          sel.MOV(dst, reg);
+          sel.pop();
+        } else {
+          if (!hasShiftL)  // srcx was scalar, so the shifted index was not computed yet
+            sel.SHL(shiftL, index, GenRegister::immud(0x2));
+          sel.SIMD_SHUFFLE(dst, srcy, shiftL);
+        }
+      }
+      sel.pop();
+      return true;
+    }
+    DECL_CTOR(SimdShuffleDeltaInstruction, 1, 1);
+  };
 
   /*! Sort patterns */
   INLINE bool cmp(const SelectionPattern *p0, const SelectionPattern *p1) {
@@ -6735,6 +6799,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
     this->insert<PrintfInstructionPattern>();
     this->insert<MediaBlockReadInstructionPattern>();
     this->insert<MediaBlockWriteInstructionPattern>();
+    this->insert<SimdShuffleDeltaInstructionPattern>();
 
     // Sort all the patterns with the number of instructions they output
     for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index ed64580..a274626 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -1136,6 +1136,35 @@ namespace ir {
       uint8_t vec_size;
     };
 
+    /*! Sub-group shuffle delta: one destination register and srcNum sources kept in a
+     *  tuple. wellFormed() only accepts srcNum == 4 */
+    class ALIGNED_INSTRUCTION SimdShuffleDeltaInstruction :
+      public BasePolicy,
+      public TupleSrcPolicy<SimdShuffleDeltaInstruction >,
+      public NDstPolicy<SimdShuffleDeltaInstruction, 1>
+    {
+    public:
+      INLINE SimdShuffleDeltaInstruction(Register dst, Tuple srcTuple, uint8_t srcNum) {
+        this->opcode = OP_SIMD_SHUFFLE_DELTA;
+        this->dst[0] = dst;
+        this->src = srcTuple;
+        this->srcNum = srcNum;
+      }
+      INLINE bool wellFormed(const Function &fn, std::string &why) const;
+      INLINE void out(std::ostream &out, const Function &fn) const { // prints: <opcode> %dst {%src0 %src1 ...}
+        this->outOpcode(out);
+        out << " %" << this->getDst(fn, 0);
+        out << " {";
+        for (uint32_t i = 0; i < srcNum; ++i)
+          out << "%" << this->getSrc(fn, i) << (i != (srcNum-1u) ? " " : ""); // no separator after the last source
+        out << "}";
+      }
+
+      Tuple src;       //!< tuple holding the source registers
+      Register dst[1]; //!< the single destination register
+      uint8_t srcNum;  //!< number of sources stored in src
+    };
+
 #undef ALIGNED_INSTRUCTION
 
     /////////////////////////////////////////////////////////////////////////
@@ -1679,6 +1708,14 @@ namespace ir {
       return true;
     }
 
+    INLINE bool SimdShuffleDeltaInstruction::wellFormed(const Function &fn, std::string &whyNot) const {
+      // The instruction is only valid with exactly four sources
+      const bool srcCountOk = (this->srcNum == 4);
+      if (!srcCountOk)
+        whyNot = "Wrong number of source.";
+      return srcCountOk;
+    }
+
 #undef CHECK_TYPE
 
     /////////////////////////////////////////////////////////////////////////
@@ -2154,6 +2191,10 @@ START_INTROSPECTION(MediaBlockWriteInstruction)
 #include "ir/instruction.hxx"
 END_INTROSPECTION(MediaBlockWriteInstruction)
 
+START_INTROSPECTION(SimdShuffleDeltaInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(SimdShuffleDeltaInstruction)
+
 #undef END_INTROSPECTION
 #undef START_INTROSPECTION
 #undef DECL_INSN
@@ -2691,6 +2732,9 @@ DECL_MEM_FN(MemInstruction, void,     setBtiReg(Register reg), setBtiReg(reg))
     return internal::MediaBlockWriteInstruction(imageIndex, srcTuple, srcNum, vec_size).convert();
   }
 
+  Instruction SIMD_SHUFFLE_DELTA(Register dst, Tuple srcTuple, uint8_t srcNum) {
+    return internal::SimdShuffleDeltaInstruction(dst, srcTuple, srcNum).convert(); // build the internal form, hand it back as an Instruction
+  }
 
   std::ostream &operator<< (std::ostream &out, const Instruction &insn) {
     const Function &fn = insn.getFunction();
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index b2b0b49..7ee59a2 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -653,6 +653,13 @@ namespace ir {
     uint8_t getVectorSize() const;
   };
 
+  /*! simd shuffle delta (sub group shuffle down/up): shuffles one of two sources per lane */
+  class SimdShuffleDeltaInstruction : public Instruction {
+  public:
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
+  };
+
   /*! Specialize the instruction. Also performs typechecking first based on the
    *  opcode. Crashes if it fails
    */
@@ -889,6 +896,8 @@ namespace ir {
   Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple coord, uint8_t srcNum);
   /*! media block write */
   Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size);
+  /*! sub_group_shuffle_delta dst srcTuple */
+  Instruction SIMD_SHUFFLE_DELTA(Register dst, Tuple srcTuple, uint8_t srcNum);
 } /* namespace ir */
 } /* namespace gbe */
 
diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
index 7d755ae..35f9623 100644
--- a/backend/src/ir/instruction.hxx
+++ b/backend/src/ir/instruction.hxx
@@ -116,3 +116,4 @@ DECL_INSN(SUBGROUP, SubGroupInstruction)
 DECL_INSN(PRINTF, PrintfInstruction)
 DECL_INSN(MBREAD, MediaBlockReadInstruction)
 DECL_INSN(MBWRITE, MediaBlockWriteInstruction)
+DECL_INSN(SIMD_SHUFFLE_DELTA, SimdShuffleDeltaInstruction)
diff --git a/backend/src/libocl/script/ocl_simd.def b/backend/src/libocl/script/ocl_simd.def
index e26243e..aa47735 100644
--- a/backend/src/libocl/script/ocl_simd.def
+++ b/backend/src/libocl/script/ocl_simd.def
@@ -2,3 +2,12 @@
 floatn intel_sub_group_shuffle(floatn x, uint c)
 intn intel_sub_group_shuffle(intn x, uint c)
 uintn intel_sub_group_shuffle(uintn x, uint c)
+floatn intel_sub_group_shuffle_down(floatn x, floatn y, uint c)
+intn intel_sub_group_shuffle_down(intn x, intn y, uint c)
+uintn intel_sub_group_shuffle_down(uintn x, uintn y, uint c)
+floatn intel_sub_group_shuffle_up(floatn x, floatn y, uint c)
+intn intel_sub_group_shuffle_up(intn x, intn y, uint c)
+uintn intel_sub_group_shuffle_up(uintn x, uintn y, uint c)
+floatn intel_sub_group_shuffle_xor(floatn x, uint c)
+intn intel_sub_group_shuffle_xor(intn x, uint c)
+uintn intel_sub_group_shuffle_xor(uintn x, uint c)
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
index 753a045..ed76b7a 100644
--- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
@@ -18,6 +18,7 @@
 
 #include "ocl_simd.h"
 #include "ocl_workitem.h"
+#include "ocl_as.h"
 
 uint get_max_sub_group_size(void)
 {
@@ -208,3 +209,36 @@ OVERLOADABLE uint8 intel_sub_group_block_read8(image2d_t p, int2 cord)
 {
   return __gen_ocl_sub_group_block_read_image8(p, cord.x, cord.y);
 }
+
+PURE CONST uint __gen_ocl_sub_group_shuffle_delta(uint x, uint y, uint c, uint inRange);
+static OVERLOADABLE INLINE uint as_uint(uint x) /* identity overload: lets the SHUFFLE_* macros call as_uint() when TYPE is uint */
+{
+  return x;
+}
+#define SHUFFLE_DOWN(TYPE) /* lane i takes x of lane i+c while i+c < max sub group size, else y of lane (i+c) % size */ \
+OVERLOADABLE TYPE intel_sub_group_shuffle_down(TYPE x, TYPE y, uint c) { \
+  uint inRange = ((int)c + (int)get_sub_group_local_id() < get_max_sub_group_size()); \
+  return as_##TYPE(__gen_ocl_sub_group_shuffle_delta(as_uint(x), as_uint(y), (get_sub_group_local_id() + c) % get_max_sub_group_size(), inRange ));\
+}
+SHUFFLE_DOWN(float)
+SHUFFLE_DOWN(int)
+SHUFFLE_DOWN(uint)
+#undef SHUFFLE_DOWN
+
+#define SHUFFLE_UP(TYPE) /* lane i takes x of lane (size + i - c) % size when c > i, else y of lane i - c */ \
+OVERLOADABLE TYPE intel_sub_group_shuffle_up(TYPE x, TYPE y, uint c) { \
+  uint inRange = ((int)c - (int)get_sub_group_local_id() > 0); \
+  return as_##TYPE(__gen_ocl_sub_group_shuffle_delta(as_uint(x), as_uint(y), (get_max_sub_group_size() + get_sub_group_local_id() - c) % get_max_sub_group_size(), inRange ));\
+}
+SHUFFLE_UP(float)
+SHUFFLE_UP(int)
+SHUFFLE_UP(uint)
+#undef SHUFFLE_UP
+#define SHUFFLE_XOR(TYPE) /* lane i takes x of lane (i ^ c) % max sub group size, through the plain shuffle */ \
+OVERLOADABLE TYPE intel_sub_group_shuffle_xor(TYPE x, uint c) { \
+  return intel_sub_group_shuffle(x, (get_sub_group_local_id() ^ c) % get_max_sub_group_size()); \
+}
+SHUFFLE_XOR(float)
+SHUFFLE_XOR(int)
+SHUFFLE_XOR(uint)
+#undef SHUFFLE_XOR
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.h b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
index 799f772..15da0e7 100644
--- a/backend/src/libocl/tmpl/ocl_simd.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
@@ -132,6 +132,15 @@ OVERLOADABLE double sub_group_scan_exclusive_max(double x);
 OVERLOADABLE float intel_sub_group_shuffle(float x, uint c);
 OVERLOADABLE int intel_sub_group_shuffle(int x, uint c);
 OVERLOADABLE uint intel_sub_group_shuffle(uint x, uint c);
+OVERLOADABLE float intel_sub_group_shuffle_down(float x, float y, uint c);
+OVERLOADABLE int intel_sub_group_shuffle_down(int x, int y, uint c);
+OVERLOADABLE uint intel_sub_group_shuffle_down(uint x, uint y, uint c);
+OVERLOADABLE float intel_sub_group_shuffle_up(float x, float y, uint c);
+OVERLOADABLE int intel_sub_group_shuffle_up(int x, int y, uint c);
+OVERLOADABLE uint intel_sub_group_shuffle_up(uint x, uint y, uint c);
+OVERLOADABLE float intel_sub_group_shuffle_xor(float x, uint c);
+OVERLOADABLE int intel_sub_group_shuffle_xor(int x, uint c);
+OVERLOADABLE uint intel_sub_group_shuffle_xor(uint x, uint c);
 
 /* blocak read/write */
 OVERLOADABLE uint intel_sub_group_block_read(const global uint* p);
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 38c0f2b..870f35e 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -3750,6 +3750,7 @@ namespace gbe
       case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2:
       case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4:
       case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE8:
+      case GEN_OCL_SUB_GROUP_SHUFFLE_DELTA:
         this->newRegister(&I);
         break;
       case GEN_OCL_PRINTF:
@@ -4897,6 +4898,18 @@ namespace gbe
             this->emitBlockReadWriteImageInst(I, CS, true, 4); break;
           case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE8:
             this->emitBlockReadWriteImageInst(I, CS, true, 8); break;
+          case GEN_OCL_SUB_GROUP_SHUFFLE_DELTA:
+          {
+            vector<ir::Register> srcTupleData;
+            for (uint32_t i = 0; i < 4; ++i) {
+              srcTupleData.push_back(this->getRegister(*AI));
+              ++AI;
+            }
+            const ir::Register dst = this->getRegister(&I);
+            ir::Tuple tuple = ctx.arrayTuple(&srcTupleData[0], 4);
+            ctx.SIMD_SHUFFLE_DELTA(dst, tuple, 4);
+            break;
+          }
           default: break;
         }
       }
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 456ab58..c8e40d3 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -228,5 +228,7 @@ DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE2, __gen_ocl_sub_group_block_w
 DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE4, __gen_ocl_sub_group_block_write_image4)
 DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE8, __gen_ocl_sub_group_block_write_image8)
 
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SHUFFLE_DELTA, __gen_ocl_sub_group_shuffle_delta)
+
 // common function
 DECL_LLVM_GEN_FUNCTION(LRP, __gen_ocl_lrp)
-- 
2.7.4



More information about the Beignet mailing list