[Beignet] [PATCH V2 5/6] Backend: Add intel_sub_group_shuffle_down/up/xor

Xiuli Pan xiuli.pan at intel.com
Thu Jul 7 03:09:34 UTC 2016


From: Pan Xiuli <xiuli.pan at intel.com>

Use a shuffle delta function for down/up, with a flag that switches between
the current and the down/up source. The flag and index are precalculated in
libocl. The shuffle delta only uses the flag to mask the dst with a different
src. Use the old shuffle with xor for shuffle_xor.

Signed-off-by: Pan Xiuli <xiuli.pan at intel.com>
---
 backend/src/backend/gen_insn_selection.cpp | 65 ++++++++++++++++++++++++++++++
 backend/src/ir/instruction.cpp             | 44 ++++++++++++++++++++
 backend/src/ir/instruction.hpp             |  9 +++++
 backend/src/ir/instruction.hxx             |  1 +
 backend/src/libocl/script/ocl_simd.def     |  9 +++++
 backend/src/libocl/tmpl/ocl_simd.tmpl.cl   | 34 ++++++++++++++++
 backend/src/libocl/tmpl/ocl_simd.tmpl.h    |  9 +++++
 backend/src/llvm/llvm_gen_backend.cpp      | 13 ++++++
 backend/src/llvm/llvm_gen_ocl_function.hxx |  2 +
 9 files changed, 186 insertions(+)

diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index e342161..7b646e0 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -6738,6 +6738,70 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
     DECL_CTOR(MediaBlockWriteInstruction, 1, 1);
   };
 
+  /*! SIMD shuffle delta pattern: dst = shuffle(srcx, index), then overwritten by shuffle(srcy, index) on lanes where inRange == 0 */
+  DECL_PATTERN(SimdShuffleDeltaInstruction)
+  {
+    bool emitOne(Selection::Opaque &sel, const ir::SimdShuffleDeltaInstruction &insn, bool &markChildren) const
+    {
+      using namespace ir;
+      const GenRegister dst = sel.selReg(insn.getDst(0), TYPE_U32);
+      const GenRegister srcx = sel.selReg(insn.getSrc(0), TYPE_U32);
+      const GenRegister srcy = sel.selReg(insn.getSrc(1), TYPE_U32);
+      const GenRegister index = sel.selReg(insn.getSrc(2), TYPE_U32);
+      const GenRegister inRange = sel.selReg(insn.getSrc(3), TYPE_U32);
+      const GenRegister constZero = GenRegister::immud(0);
+      const GenRegister shiftL = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32); // temporary holding index << 2
+      bool hasShiftL = false; // true once the SHL into shiftL has been emitted
+
+      sel.push();
+      sel.curr.predicate = GEN_PREDICATE_NONE;
+      /* First shuffle for srcx, emitted with predication disabled */
+      if (sel.isScalarReg(insn.getSrc(0))) {
+        sel.MOV(dst, srcx);
+      } else {
+        if (index.file == GEN_IMMEDIATE_VALUE) {
+          sel.push();
+          uint32_t offset = index.value.ud % sel.curr.execWidth;
+          GenRegister reg = GenRegister::subphysicaloffset(srcx, offset);
+          reg.vstride = GEN_VERTICAL_STRIDE_0;
+          reg.hstride = GEN_HORIZONTAL_STRIDE_0;
+          reg.width = GEN_WIDTH_1;
+          sel.MOV(dst, reg);
+          sel.pop(); // balances the push() that opens this branch (was a second push(), which left the state stack unbalanced)
+        } else {
+          sel.SHL(shiftL, index, GenRegister::immud(0x2)); // presumably lane index -> byte offset of a 32-bit element (TODO confirm)
+          hasShiftL = true;
+          sel.SIMD_SHUFFLE(dst, srcx, shiftL);
+        }
+      }
+      sel.curr.flag = 0;
+      sel.curr.subFlag = 1;
+      sel.CMP(GEN_CONDITIONAL_EQ, inRange, constZero); // compares inRange against 0 using flag 0, sub-flag 1
+      sel.curr.predicate = GEN_PREDICATE_NORMAL;
+      /* Now shuffle for srcy, predicated on the flag written by the CMP above */
+      if (sel.isScalarReg(insn.getSrc(1))) {
+        sel.MOV(dst, srcy);
+      } else {
+        if (index.file == GEN_IMMEDIATE_VALUE) {
+          sel.push();
+          uint32_t offset = index.value.ud % sel.curr.execWidth;
+          GenRegister reg = GenRegister::subphysicaloffset(srcy, offset);
+          reg.vstride = GEN_VERTICAL_STRIDE_0;
+          reg.hstride = GEN_HORIZONTAL_STRIDE_0;
+          reg.width = GEN_WIDTH_1;
+          sel.MOV(dst, reg);
+          sel.pop();
+        } else {
+          if (!hasShiftL) // srcx was scalar, so the shift has not been emitted yet
+            sel.SHL(shiftL, index, GenRegister::immud(0x2));
+          sel.SIMD_SHUFFLE(dst, srcy, shiftL);
+        }
+      }
+      sel.pop();
+      return true;
+    }
+    DECL_CTOR(SimdShuffleDeltaInstruction, 1, 1);
+  };
 
   /*! Sort patterns */
   INLINE bool cmp(const SelectionPattern *p0, const SelectionPattern *p1) {
@@ -6782,6 +6846,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
     this->insert<PrintfInstructionPattern>();
     this->insert<MediaBlockReadInstructionPattern>();
     this->insert<MediaBlockWriteInstructionPattern>();
+    this->insert<SimdShuffleDeltaInstructionPattern>();
 
     // Sort all the patterns with the number of instructions they output
     for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index ed64580..a274626 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -1136,6 +1136,35 @@ namespace ir {
       uint8_t vec_size;
     };
 
+    class ALIGNED_INSTRUCTION SimdShuffleDeltaInstruction : // IR instruction carrying opcode OP_SIMD_SHUFFLE_DELTA
+      public BasePolicy,
+      public TupleSrcPolicy<SimdShuffleDeltaInstruction >, // sources are kept as a Tuple plus srcNum
+      public NDstPolicy<SimdShuffleDeltaInstruction, 1> // a single destination register
+    {
+    public:
+      /*! Store the destination register, the source tuple and the number of sources */
+      INLINE SimdShuffleDeltaInstruction(Register dst, Tuple srcTuple, uint8_t srcNum) {
+        this->opcode = OP_SIMD_SHUFFLE_DELTA;
+        this->dst[0] = dst;
+        this->src = srcTuple;
+        this->srcNum = srcNum;
+      }
+      INLINE bool wellFormed(const Function &fn, std::string &why) const; // defined later in this file
+      INLINE void out(std::ostream &out, const Function &fn) const { // prints: opcode, " %dst", then the sources space-separated inside braces
+        this->outOpcode(out);
+        out << " %" << this->getDst(fn, 0);
+        out << " {";
+        for (uint32_t i = 0; i < srcNum; ++i)
+          out << "%" << this->getSrc(fn, i) << (i != (srcNum-1u) ? " " : ""); // no trailing space after the last source
+        out << "}";
+      }
+
+      Tuple src; // source registers
+      Register dst[1]; // destination register
+      uint8_t srcNum; // number of sources in src; wellFormed() requires 4
+    };
+
+
 #undef ALIGNED_INSTRUCTION
 
     /////////////////////////////////////////////////////////////////////////
@@ -1679,6 +1708,14 @@ namespace ir {
       return true;
     }
 
+    INLINE bool SimdShuffleDeltaInstruction::wellFormed(const Function &fn, std::string &whyNot) const { // returns false and fills whyNot when the check fails
+      if (this->srcNum != 4) { // the instruction selection pattern reads sources 0..3
+        whyNot = "Wrong number of source.";
+        return false;
+      }
+      return true;
+    }
+
 #undef CHECK_TYPE
 
     /////////////////////////////////////////////////////////////////////////
@@ -2154,6 +2191,10 @@ START_INTROSPECTION(MediaBlockWriteInstruction)
 #include "ir/instruction.hxx"
 END_INTROSPECTION(MediaBlockWriteInstruction)
 
+START_INTROSPECTION(SimdShuffleDeltaInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(SimdShuffleDeltaInstruction)
+
 #undef END_INTROSPECTION
 #undef START_INTROSPECTION
 #undef DECL_INSN
@@ -2691,6 +2732,9 @@ DECL_MEM_FN(MemInstruction, void,     setBtiReg(Register reg), setBtiReg(reg))
     return internal::MediaBlockWriteInstruction(imageIndex, srcTuple, srcNum, vec_size).convert();
   }
 
+  Instruction SIMD_SHUFFLE_DELTA(Register dst, Tuple srcTuple, uint8_t srcNum) { // builds the internal instruction and returns its convert() result
+    return internal::SimdShuffleDeltaInstruction(dst, srcTuple, srcNum).convert();
+  }
 
   std::ostream &operator<< (std::ostream &out, const Instruction &insn) {
     const Function &fn = insn.getFunction();
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index b2b0b49..7ee59a2 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -653,6 +653,13 @@ namespace ir {
     uint8_t getVectorSize() const;
   };
 
+  /*! SIMD shuffle delta instruction (created with SIMD_SHUFFLE_DELTA) */
+  class SimdShuffleDeltaInstruction : public Instruction {
+  public:
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
+  };
+
   /*! Specialize the instruction. Also performs typechecking first based on the
    *  opcode. Crashes if it fails
    */
@@ -889,6 +896,8 @@ namespace ir {
   Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple coord, uint8_t srcNum);
   /*! media block write */
   Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size);
+  /*! sub_group_shuffle_delta dst srcTuple (srcNum must be 4 for the instruction to be well formed) */
+  Instruction SIMD_SHUFFLE_DELTA(Register dst, Tuple srcTuple, uint8_t srcNum);
 } /* namespace ir */
 } /* namespace gbe */
 
diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
index 7d755ae..35f9623 100644
--- a/backend/src/ir/instruction.hxx
+++ b/backend/src/ir/instruction.hxx
@@ -116,3 +116,4 @@ DECL_INSN(SUBGROUP, SubGroupInstruction)
 DECL_INSN(PRINTF, PrintfInstruction)
 DECL_INSN(MBREAD, MediaBlockReadInstruction)
 DECL_INSN(MBWRITE, MediaBlockWriteInstruction)
+DECL_INSN(SIMD_SHUFFLE_DELTA, SimdShuffleDeltaInstruction)
diff --git a/backend/src/libocl/script/ocl_simd.def b/backend/src/libocl/script/ocl_simd.def
index e26243e..aa47735 100644
--- a/backend/src/libocl/script/ocl_simd.def
+++ b/backend/src/libocl/script/ocl_simd.def
@@ -2,3 +2,12 @@
 floatn intel_sub_group_shuffle(floatn x, uint c)
 intn intel_sub_group_shuffle(intn x, uint c)
 uintn intel_sub_group_shuffle(uintn x, uint c)
+floatn intel_sub_group_shuffle_down(floatn x, floatn y, uint c)
+intn intel_sub_group_shuffle_down(intn x, intn y, uint c)
+uintn intel_sub_group_shuffle_down(uintn x, uintn y, uint c)
+floatn intel_sub_group_shuffle_up(floatn x, floatn y, uint c)
+intn intel_sub_group_shuffle_up(intn x, intn y, uint c)
+uintn intel_sub_group_shuffle_up(uintn x, uintn y, uint c)
+floatn intel_sub_group_shuffle_xor(floatn x, uint c)
+intn intel_sub_group_shuffle_xor(intn x, uint c)
+uintn intel_sub_group_shuffle_xor(uintn x, uint c)
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
index b066502..6aee94e 100644
--- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
@@ -18,6 +18,7 @@
 
 #include "ocl_simd.h"
 #include "ocl_workitem.h"
+#include "ocl_as.h"
 
 uint get_max_sub_group_size(void)
 {
@@ -216,3 +217,36 @@ OVERLOADABLE void intel_sub_group_block_write8(image2d_t p, int2 cord, uint8 dat
 {
   __gen_ocl_sub_group_block_write_image8(p, cord.x, cord.y, data);
 }
+
+PURE CONST uint __gen_ocl_sub_group_shuffle_delta(uint x, uint y, uint c, uint inRange); /* declared only; the macros below pass a precomputed lane index as c and a 0/1 flag as inRange */
+static OVERLOADABLE INLINE uint as_uint(uint x) /* identity overload so the macros below can call as_uint() when TYPE is uint */
+{
+  return x;
+}
+#define SHUFFLE_DOWN(TYPE) /* inRange = (c + lane id < max sub-group size); lane index = (lane id + c) % max sub-group size */ \
+OVERLOADABLE TYPE intel_sub_group_shuffle_down(TYPE x, TYPE y, uint c) { \
+  uint inRange = ((int)c + (int)get_sub_group_local_id() < get_max_sub_group_size()); \
+  return as_##TYPE(__gen_ocl_sub_group_shuffle_delta(as_uint(x), as_uint(y), (get_sub_group_local_id() + c) % get_max_sub_group_size(), inRange ));\
+}
+SHUFFLE_DOWN(float)
+SHUFFLE_DOWN(int)
+SHUFFLE_DOWN(uint)
+#undef SHUFFLE_DOWN
+
+#define SHUFFLE_UP(TYPE) /* inRange = (c - lane id > 0); lane index = (max sub-group size + lane id - c) % max sub-group size */ \
+OVERLOADABLE TYPE intel_sub_group_shuffle_up(TYPE x, TYPE y, uint c) { \
+  uint inRange = ((int)c - (int)get_sub_group_local_id() > 0); \
+  return as_##TYPE(__gen_ocl_sub_group_shuffle_delta(as_uint(x), as_uint(y), (get_max_sub_group_size() + get_sub_group_local_id() - c) % get_max_sub_group_size(), inRange ));\
+}
+SHUFFLE_UP(float)
+SHUFFLE_UP(int)
+SHUFFLE_UP(uint)
+#undef SHUFFLE_UP
+#define SHUFFLE_XOR(TYPE) /* forwards to intel_sub_group_shuffle with lane index (lane id ^ c) % max sub-group size */ \
+OVERLOADABLE TYPE intel_sub_group_shuffle_xor(TYPE x, uint c) { \
+  return intel_sub_group_shuffle(x, (get_sub_group_local_id() ^ c) % get_max_sub_group_size()); \
+}
+SHUFFLE_XOR(float)
+SHUFFLE_XOR(int)
+SHUFFLE_XOR(uint)
+#undef SHUFFLE_XOR
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.h b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
index 799f772..15da0e7 100644
--- a/backend/src/libocl/tmpl/ocl_simd.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
@@ -132,6 +132,15 @@ OVERLOADABLE double sub_group_scan_exclusive_max(double x);
 OVERLOADABLE float intel_sub_group_shuffle(float x, uint c);
 OVERLOADABLE int intel_sub_group_shuffle(int x, uint c);
 OVERLOADABLE uint intel_sub_group_shuffle(uint x, uint c);
+OVERLOADABLE float intel_sub_group_shuffle_down(float x, float y, uint c);
+OVERLOADABLE int intel_sub_group_shuffle_down(int x, int y, uint c);
+OVERLOADABLE uint intel_sub_group_shuffle_down(uint x, uint y, uint c);
+OVERLOADABLE float intel_sub_group_shuffle_up(float x, float y, uint c);
+OVERLOADABLE int intel_sub_group_shuffle_up(int x, int y, uint c);
+OVERLOADABLE uint intel_sub_group_shuffle_up(uint x, uint y, uint c);
+OVERLOADABLE float intel_sub_group_shuffle_xor(float x, uint c);
+OVERLOADABLE int intel_sub_group_shuffle_xor(int x, uint c);
+OVERLOADABLE uint intel_sub_group_shuffle_xor(uint x, uint c);
 
 /* blocak read/write */
 OVERLOADABLE uint intel_sub_group_block_read(const global uint* p);
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 41cb783..8f0bcea 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -3733,6 +3733,7 @@ namespace gbe
       case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2:
       case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4:
       case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE8:
+      case GEN_OCL_SUB_GROUP_SHUFFLE_DELTA:
         this->newRegister(&I);
         break;
       case GEN_OCL_PRINTF:
@@ -4897,6 +4898,18 @@ namespace gbe
             this->emitBlockReadWriteImageInst(I, CS, true, 4); break;
           case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE8:
             this->emitBlockReadWriteImageInst(I, CS, true, 8); break;
+          case GEN_OCL_SUB_GROUP_SHUFFLE_DELTA:
+          {
+            vector<ir::Register> srcTupleData;
+            for (uint32_t i = 0; i < 4; ++i) {
+              srcTupleData.push_back(this->getRegister(*AI));
+              ++AI;
+            }
+            const ir::Register dst = this->getRegister(&I);
+            ir::Tuple tuple = ctx.arrayTuple(&srcTupleData[0], 4);
+            ctx.SIMD_SHUFFLE_DELTA(dst, tuple, 4);
+            break;
+          }
           default: break;
         }
       }
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 48a72d1..dbd25b0 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -234,5 +234,7 @@ DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE2, __gen_ocl_sub_group_block_w
 DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE4, __gen_ocl_sub_group_block_write_image4)
 DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE8, __gen_ocl_sub_group_block_write_image8)
 
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SHUFFLE_DELTA, __gen_ocl_sub_group_shuffle_delta)
+
 // common function
 DECL_LLVM_GEN_FUNCTION(LRP, __gen_ocl_lrp)
-- 
2.7.4



More information about the Beignet mailing list