[Beignet] [PATCH 1/2] add 3 simd level built-in functions: shuffle, simdsize and simdid

Guo Yejun yejun.guo at intel.com
Thu Mar 19 22:57:33 PDT 2015


uint __gen_ocl_get_simd_size();
Returns the SIMD width of the kernel: 8 in SIMD8 mode, 16 in SIMD16 mode.

uint __gen_ocl_get_simd_id();
Returns the ID of the calling SIMD channel; the value ranges from 0 to simdsize - 1.

floatN __gen_ocl_simd_shuffle(floatN x, uint c);
intN   __gen_ocl_simd_shuffle(intN x, uint c);
uintN  __gen_ocl_simd_shuffle(uintN x, uint c);
Returns, in every SIMD channel, the value that x holds in the c-th channel of the SIMD.
The behavior is undefined if c is larger than simdsize - 1.

Signed-off-by: Guo Yejun <yejun.guo at intel.com>
---
 backend/src/backend/gen8_context.cpp               |  29 ++++-
 backend/src/backend/gen_context.cpp                | 127 +++++++++++++++------
 backend/src/backend/gen_context.hpp                |   1 +
 .../src/backend/gen_insn_gen7_schedule_info.hxx    |   1 +
 backend/src/backend/gen_insn_selection.cpp         |  60 ++++++++++
 backend/src/backend/gen_insn_selection.hxx         |   2 +
 backend/src/backend/program.h                      |   1 +
 backend/src/ir/context.hpp                         |   6 +
 backend/src/ir/instruction.cpp                     |  32 ++++++
 backend/src/ir/instruction.hpp                     |  17 +++
 backend/src/ir/instruction.hxx                     |   3 +
 backend/src/ir/liveness.cpp                        |   5 +
 backend/src/ir/profile.cpp                         |   2 +
 backend/src/ir/profile.hpp                         |   5 +-
 backend/src/libocl/CMakeLists.txt                  |   2 +-
 backend/src/libocl/include/ocl.h                   |   1 +
 backend/src/libocl/include/ocl_misc.h              |   8 --
 backend/src/libocl/script/ocl_simd.def             |   4 +
 backend/src/libocl/tmpl/ocl_simd.tmpl.cl           |  19 +++
 backend/src/libocl/tmpl/ocl_simd.tmpl.h            |  34 ++++++
 backend/src/llvm/llvm_gen_backend.cpp              |  27 +++++
 backend/src/llvm/llvm_gen_ocl_function.hxx         |   4 +
 src/cl_command_queue_gen7.c                        |   8 ++
 23 files changed, 351 insertions(+), 47 deletions(-)
 create mode 100644 backend/src/libocl/script/ocl_simd.def
 create mode 100644 backend/src/libocl/tmpl/ocl_simd.tmpl.cl
 create mode 100644 backend/src/libocl/tmpl/ocl_simd.tmpl.h

diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp
index 3f57cf6..144fd00 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -240,6 +240,9 @@ namespace gbe
   }
 
   void Gen8Context::emitBinaryInstruction(const SelectionInstruction &insn) {
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const GenRegister src0 = ra->genReg(insn.src(0));
+    const GenRegister src1 = ra->genReg(insn.src(1));
     switch (insn.opcode) {
       case SEL_OP_SEL_INT64:
       case SEL_OP_I64AND:
@@ -250,14 +253,34 @@ namespace gbe
         break;
       case SEL_OP_UPSAMPLE_LONG:
       {
-        const GenRegister dst = ra->genReg(insn.dst(0));
-        const GenRegister src0 = ra->genReg(insn.src(0));
-        const GenRegister src1 = ra->genReg(insn.src(1));
         p->MOV(dst, src0);
         p->SHL(dst, dst, GenRegister::immud(32));
         p->ADD(dst, dst, src1);
         break;
       }
+      case SEL_OP_SIMD_SHUFFLE:
+      {
+        uint32_t simd = p->curr.execWidth;
+        if (src1.file == GEN_IMMEDIATE_VALUE) {
+          uint32_t offset = src1.value.ud % simd;
+          uint32_t nr = src0.nr;
+          uint32_t subnr = src0.subnr;
+          subnr = subnr + offset;
+          if (subnr > 8) {
+            nr = nr + 1;
+            subnr = subnr - 8;
+          }
+          p->MOV(dst, GenRegister::ud1grf(nr, subnr));
+        } else {
+          uint32_t base = src0.nr * 32 + src0.subnr * 4;
+          GenRegister baseReg = GenRegister::immuw(base);
+          const GenRegister a0 = GenRegister::addr8(0);
+          p->ADD(a0, GenRegister::unpacked_uw(src1.nr, src1.subnr / typeSize(GEN_TYPE_UW)), baseReg);
+          GenRegister indirect = GenRegister::to_indirect1xN(src0, 0, 0);
+          p->MOV(dst, indirect);
+        }
+        break;
+      }
       default:
         GenContext::emitBinaryInstruction(insn);
     }
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index cdf581c..25c7a5a 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -198,6 +198,22 @@ namespace gbe
     this->labelPos.insert(std::make_pair(label, p->store.size()));
   }
 
+  void GenContext::emitNullaryInstruction(const SelectionInstruction &insn) {
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    switch (insn.opcode) {
+      case SEL_OP_SIMD_ID:
+        {
+          const GenRegister selLaneID = this->simdWidth == 8 ?
+                                GenRegister::ud8grf(ir::ocl::laneid) :
+                                GenRegister::ud16grf(ir::ocl::laneid);
+          const GenRegister laneID = ra->genReg(selLaneID);
+          p->MOV(dst, laneID);
+        }
+        break;
+      default: NOT_IMPLEMENTED;
+    }
+  }
+
   void GenContext::emitUnaryInstruction(const SelectionInstruction &insn) {
     const GenRegister dst = ra->genReg(insn.dst(0));
     const GenRegister src = ra->genReg(insn.src(0));
@@ -583,6 +599,46 @@ namespace gbe
           p->MOV(xdst.bottom_half(), xsrc1.bottom_half());
         }
         break;
+      case SEL_OP_SIMD_SHUFFLE:
+        {
+          uint32_t simd = p->curr.execWidth;
+          if (src1.file == GEN_IMMEDIATE_VALUE) {
+            uint32_t offset = src1.value.ud % simd;
+            uint32_t nr = src0.nr;
+            uint32_t subnr = src0.subnr;
+            subnr = subnr + offset;
+            if (subnr > 8) {
+              nr = nr + 1;
+              subnr = subnr - 8;
+            }
+            p->MOV(dst, GenRegister::ud1grf(nr, subnr));
+          } else {
+            uint32_t base = src0.nr * 32 + src0.subnr * 4;
+            GenRegister baseReg = GenRegister::immuw(base);
+            const GenRegister a0 = GenRegister::addr8(0);
+
+            p->push();
+              if (simd == 8) {
+                p->ADD(a0, GenRegister::unpacked_uw(src1.nr, src1.subnr / typeSize(GEN_TYPE_UW)), baseReg);
+                GenRegister indirect = GenRegister::to_indirect1xN(src0, 0, 0);
+                p->MOV(dst, indirect);
+              }
+              else if (simd == 16) {
+                p->curr.execWidth = 8;
+                p->ADD(a0, GenRegister::unpacked_uw(src1.nr, src1.subnr / typeSize(GEN_TYPE_UW)), baseReg);
+                GenRegister indirect = GenRegister::to_indirect1xN(src0, 0, 0);
+                p->MOV(dst, indirect);
+
+                p->curr.quarterControl = 1;
+                p->ADD(a0, GenRegister::unpacked_uw(src1.nr+1, src1.subnr / typeSize(GEN_TYPE_UW)), baseReg);
+                p->MOV(GenRegister::offset(dst, 1, 0), indirect);
+              }
+              else
+                NOT_IMPLEMENTED;
+            p->pop();
+          }
+        }
+        break;
       default: NOT_IMPLEMENTED;
     }
   }
@@ -2023,41 +2079,46 @@ namespace gbe
     } else
   
     fn.foreachInstruction([&](ir::Instruction &insn) {
-      const uint32_t srcNum = insn.getSrcNum();
-      for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
-        const ir::Register reg = insn.getSrc(srcID);
-        if (insn.getOpcode() == ir::OP_GET_IMAGE_INFO) {
-          if (srcID != 0) continue;
-          const unsigned char bti = ir::cast<ir::GetImageInfoInstruction>(insn).getImageIndex();
-          const unsigned char type =  ir::cast<ir::GetImageInfoInstruction>(insn).getInfoType();;
-          ir::ImageInfoKey key(bti, type);
-          const ir::Register imageInfo = insn.getSrc(0);
-          if (curbeRegs.find(imageInfo) == curbeRegs.end()) {
-            uint32_t offset = this->getImageInfoCurbeOffset(key, 4);
-            insertCurbeReg(imageInfo, offset);
+      if (insn.getOpcode() == ir::OP_SIMD_ID) {
+        if (curbeRegs.find(laneid) == curbeRegs.end())
+          allocCurbeReg(laneid, GBE_CURBE_LANE_ID);
+      } else {
+        const uint32_t srcNum = insn.getSrcNum();
+        for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+          const ir::Register reg = insn.getSrc(srcID);
+          if (insn.getOpcode() == ir::OP_GET_IMAGE_INFO) {
+            if (srcID != 0) continue;
+            const unsigned char bti = ir::cast<ir::GetImageInfoInstruction>(insn).getImageIndex();
+            const unsigned char type =  ir::cast<ir::GetImageInfoInstruction>(insn).getInfoType();;
+            ir::ImageInfoKey key(bti, type);
+            const ir::Register imageInfo = insn.getSrc(0);
+            if (curbeRegs.find(imageInfo) == curbeRegs.end()) {
+              uint32_t offset = this->getImageInfoCurbeOffset(key, 4);
+              insertCurbeReg(imageInfo, offset);
+            }
+            continue;
           }
-          continue;
+          if (fn.isSpecialReg(reg) == false) continue;
+          if (curbeRegs.find(reg) != curbeRegs.end()) continue;
+          if (reg == ir::ocl::stackptr) GBE_ASSERT(stackUse.size() > 0);
+          INSERT_REG(lsize0, LOCAL_SIZE_X)
+          INSERT_REG(lsize1, LOCAL_SIZE_Y)
+          INSERT_REG(lsize2, LOCAL_SIZE_Z)
+          INSERT_REG(gsize0, GLOBAL_SIZE_X)
+          INSERT_REG(gsize1, GLOBAL_SIZE_Y)
+          INSERT_REG(gsize2, GLOBAL_SIZE_Z)
+          INSERT_REG(goffset0, GLOBAL_OFFSET_X)
+          INSERT_REG(goffset1, GLOBAL_OFFSET_Y)
+          INSERT_REG(goffset2, GLOBAL_OFFSET_Z)
+          INSERT_REG(workdim, WORK_DIM)
+          INSERT_REG(numgroup0, GROUP_NUM_X)
+          INSERT_REG(numgroup1, GROUP_NUM_Y)
+          INSERT_REG(numgroup2, GROUP_NUM_Z)
+          INSERT_REG(stackptr, STACK_POINTER)
+          INSERT_REG(printfbptr, PRINTF_BUF_POINTER)
+          INSERT_REG(printfiptr, PRINTF_INDEX_POINTER)
+          do {} while(0);
         }
-        if (fn.isSpecialReg(reg) == false) continue;
-        if (curbeRegs.find(reg) != curbeRegs.end()) continue;
-        if (reg == ir::ocl::stackptr) GBE_ASSERT(stackUse.size() > 0);
-        INSERT_REG(lsize0, LOCAL_SIZE_X)
-        INSERT_REG(lsize1, LOCAL_SIZE_Y)
-        INSERT_REG(lsize2, LOCAL_SIZE_Z)
-        INSERT_REG(gsize0, GLOBAL_SIZE_X)
-        INSERT_REG(gsize1, GLOBAL_SIZE_Y)
-        INSERT_REG(gsize2, GLOBAL_SIZE_Z)
-        INSERT_REG(goffset0, GLOBAL_OFFSET_X)
-        INSERT_REG(goffset1, GLOBAL_OFFSET_Y)
-        INSERT_REG(goffset2, GLOBAL_OFFSET_Z)
-        INSERT_REG(workdim, WORK_DIM)
-        INSERT_REG(numgroup0, GROUP_NUM_X)
-        INSERT_REG(numgroup1, GROUP_NUM_Y)
-        INSERT_REG(numgroup2, GROUP_NUM_Z)
-        INSERT_REG(stackptr, STACK_POINTER)
-        INSERT_REG(printfbptr, PRINTF_BUF_POINTER)
-        INSERT_REG(printfiptr, PRINTF_INDEX_POINTER)
-        do {} while(0);
       }
     });
 #undef INSERT_REG
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 6ca88db..3ac675e 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -124,6 +124,7 @@ namespace gbe
 
     /*! Final Gen ISA emission helper functions */
     void emitLabelInstruction(const SelectionInstruction &insn);
+    virtual void emitNullaryInstruction(const SelectionInstruction &insn);
     virtual void emitUnaryInstruction(const SelectionInstruction &insn);
     virtual void emitUnaryWithTempInstruction(const SelectionInstruction &insn);
     virtual void emitBinaryInstruction(const SelectionInstruction &insn);
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index d054820..fd7e1a4 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -1,5 +1,6 @@
 //                 Family     Latency     SIMD16     SIMD8
 DECL_GEN7_SCHEDULE(Label,           0,         0,        0)
+DECL_GEN7_SCHEDULE(Nullary,         20,        4,        2)
 DECL_GEN7_SCHEDULE(Unary,           20,        4,        2)
 DECL_GEN7_SCHEDULE(UnaryWithTemp,   20,        40,      20)
 DECL_GEN7_SCHEDULE(Binary,          20,        4,        2)
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index c240261..1586098 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -477,6 +477,8 @@ namespace gbe
     /*! To make function prototypes more readable */
     typedef const GenRegister &Reg;
 
+#define ALU0(OP) \
+  INLINE void OP(Reg dst) { ALU0(SEL_OP_##OP, dst); }
 #define ALU1(OP) \
   INLINE void OP(Reg dst, Reg src) { ALU1(SEL_OP_##OP, dst, src); }
 #define ALU1WithTemp(OP) \
@@ -530,12 +532,15 @@ namespace gbe
     ALU2WithTemp(HADD)
     ALU2WithTemp(RHADD)
     ALU2(UPSAMPLE_LONG)
+    ALU2(SIMD_SHUFFLE)
+    ALU0(SIMD_ID)
     ALU1WithTemp(CONVI_TO_I64)
     ALU1WithTemp(CONVF_TO_I64)
     ALU1(CONVI64_TO_I)
     I64Shift(I64SHL)
     I64Shift(I64SHR)
     I64Shift(I64ASR)
+#undef ALU0
 #undef ALU1
 #undef ALU1WithTemp
 #undef ALU2
@@ -622,6 +627,8 @@ namespace gbe
     void MATH(Reg dst, uint32_t function, Reg src0, Reg src1);
     /*! Extended math function (1 argument) */
     void MATH(Reg dst, uint32_t function, Reg src);
+    /*! Encode nullary instructions */
+    void ALU0(SelectionOpcode opcode, Reg dst);
     /*! Encode unary instructions */
     void ALU1(SelectionOpcode opcode, Reg dst, Reg src);
     /*! Encode unary with temp reg instructions */
@@ -1435,6 +1442,11 @@ namespace gbe
       insn->dst(i + 1) = tmp[i];
   }
 
+  void Selection::Opaque::ALU0(SelectionOpcode opcode, Reg dst) {
+    SelectionInstruction *insn = this->appendInsn(opcode, 1, 0);
+    insn->dst(0) = dst;
+  }
+
   void Selection::Opaque::ALU1(SelectionOpcode opcode, Reg dst, Reg src) {
     SelectionInstruction *insn = this->appendInsn(opcode, 1, 1);
     insn->dst(0) = dst;
@@ -2054,6 +2066,42 @@ namespace gbe
 #define DECL_CTOR(FAMILY, INSN_NUM, COST) \
   FAMILY##Pattern(void) : OneToManyPattern<FAMILY##Pattern, ir::FAMILY>(INSN_NUM, COST) {}
 
+  /*! Nullary instruction patterns */
+  class NullaryInstructionPattern : public SelectionPattern
+  {
+  public:
+    NullaryInstructionPattern(void) : SelectionPattern(1,1) {
+      for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
+        if (ir::isOpcodeFrom<ir::NullaryInstruction>(ir::Opcode(op)) == true)
+          this->opcodes.push_back(ir::Opcode(op));
+    }
+
+    INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const {
+      using namespace ir;
+      const ir::NullaryInstruction &insn = cast<NullaryInstruction>(dag.insn);
+      const Opcode opcode = insn.getOpcode();
+      const Type type = insn.getType();
+      GenRegister dst = sel.selReg(insn.getDst(0), type);
+
+      sel.push();
+      switch (opcode) {
+        case ir::OP_SIMD_SIZE:
+          {
+            const GenRegister src = GenRegister::immud(sel.curr.execWidth);
+            sel.curr.execWidth = 1;
+            sel.MOV(dst, src);
+          }
+          break;
+        case ir::OP_SIMD_ID:
+          sel.SIMD_ID(dst);
+          break;
+        default: NOT_SUPPORTED;
+      }
+      sel.pop();
+      return true;
+    }
+  };
+
   /*! Unary instruction patterns */
   DECL_PATTERN(UnaryInstruction)
   {
@@ -2563,6 +2611,17 @@ namespace gbe
         case OP_UPSAMPLE_LONG:
           sel.UPSAMPLE_LONG(dst, src0, src1);
           break;
+        case OP_SIMD_SHUFFLE:
+          {
+            if (src1.file == GEN_IMMEDIATE_VALUE) {
+              sel.SIMD_SHUFFLE(dst, src0, src1);
+            } else {
+              GenRegister shiftL = GenRegister::udxgrf(sel.curr.execWidth, sel.reg(FAMILY_DWORD));
+              sel.SHL(shiftL, src1, GenRegister::immud(0x2));
+              sel.SIMD_SHUFFLE(dst, src0, shiftL);
+            }
+          }
+          break;
         default: NOT_IMPLEMENTED;
       }
       sel.pop();
@@ -4789,6 +4848,7 @@ namespace gbe
     this->insert<GetImageInfoInstructionPattern>();
     this->insert<ReadARFInstructionPattern>();
     this->insert<RegionInstructionPattern>();
+    this->insert<NullaryInstructionPattern>();
 
     // Sort all the patterns with the number of instructions they output
     for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index 09f5aaf..87ccee3 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -77,6 +77,8 @@ DECL_SELECTION_IR(RHADD, BinaryWithTempInstruction)
 DECL_SELECTION_IR(I64HADD, I64HADDInstruction)
 DECL_SELECTION_IR(I64RHADD, I64RHADDInstruction)
 DECL_SELECTION_IR(UPSAMPLE_LONG, BinaryInstruction)
+DECL_SELECTION_IR(SIMD_SHUFFLE, BinaryInstruction)
+DECL_SELECTION_IR(SIMD_ID, NullaryInstruction)
 DECL_SELECTION_IR(CONVI_TO_I64, UnaryWithTempInstruction)
 DECL_SELECTION_IR(CONVI64_TO_I, UnaryInstruction)
 DECL_SELECTION_IR(CONVI64_TO_F, I64ToFloatInstruction)
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index dc5662f..c4023ec 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -99,6 +99,7 @@ enum gbe_curbe_type {
   GBE_CURBE_THREAD_NUM,
   GBE_CURBE_ZERO,
   GBE_CURBE_ONE,
+  GBE_CURBE_LANE_ID,
   GBE_CURBE_SLM_OFFSET,
 };
 
diff --git a/backend/src/ir/context.hpp b/backend/src/ir/context.hpp
index cf5109d..af65ff3 100644
--- a/backend/src/ir/context.hpp
+++ b/backend/src/ir/context.hpp
@@ -176,6 +176,12 @@ namespace ir {
     DECL_THREE_SRC_INSN(MAD);
 #undef DECL_THREE_SRC_INSN
 
+    /*! For all nullary functions */
+    void ALU0(Opcode opcode, Type type, Register dst) {
+      const Instruction insn = gbe::ir::ALU0(opcode, type, dst);
+      this->append(insn);
+    }
+
     /*! For all unary functions */
     void ALU1(Opcode opcode, Type type, Register dst, Register src) {
       const Instruction insn = gbe::ir::ALU1(opcode, type, dst, src);
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 797552f..9c3331b 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -131,6 +131,17 @@ namespace ir {
       Register src[srcNum]; //!< Indices of the sources
     };
 
+    /*! All 0-source arithmetic instructions */
+    class ALIGNED_INSTRUCTION NullaryInstruction : public NaryInstruction<0>
+    {
+    public:
+      NullaryInstruction(Opcode opcode, Type type, Register dst) {
+        this->opcode = opcode;
+        this->type = type;
+        this->dst[0] = dst;
+      }
+    };
+
     /*! All 1-source arithmetic instructions */
     class ALIGNED_INSTRUCTION UnaryInstruction : public NaryInstruction<1>
     {
@@ -1305,6 +1316,10 @@ namespace ir {
     }; \
   }
 
+START_INTROSPECTION(NullaryInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(NullaryInstruction)
+
 START_INTROSPECTION(UnaryInstruction)
 #include "ir/instruction.hxx"
 END_INTROSPECTION(UnaryInstruction)
@@ -1532,6 +1547,7 @@ END_FUNCTION(Instruction, Register)
     return reinterpret_cast<const internal::CLASS*>(this)->CALL; \
   }
 
+DECL_MEM_FN(NullaryInstruction, Type, getType(void), getType())
 DECL_MEM_FN(UnaryInstruction, Type, getType(void), getType())
 DECL_MEM_FN(BinaryInstruction, Type, getType(void), getType())
 DECL_MEM_FN(BinaryInstruction, bool, commutes(void), commutes())
@@ -1586,6 +1602,21 @@ DECL_MEM_FN(GetImageInfoInstruction, uint8_t, getImageIndex(void), getImageIndex
   // Implements the emission functions
   ///////////////////////////////////////////////////////////////////////////
 
+  // For all nullary functions with given opcode
+  Instruction ALU0(Opcode opcode, Type type, Register dst) {
+    return internal::NullaryInstruction(opcode, type, dst).convert();
+  }
+
+  // All nullary functions
+#define DECL_EMIT_FUNCTION(NAME) \
+  Instruction NAME(Type type, Register dst) { \
+    return ALU0(OP_##NAME, type, dst);\
+  }
+
+  DECL_EMIT_FUNCTION(SIMD_SIZE)
+
+#undef DECL_EMIT_FUNCTION
+
   // For all unary functions with given opcode
   Instruction ALU1(Opcode opcode, Type type, Register dst, Register src) {
     return internal::UnaryInstruction(opcode, type, dst, src).convert();
@@ -1645,6 +1676,7 @@ DECL_MEM_FN(GetImageInfoInstruction, uint8_t, getImageIndex(void), getImageIndex
   DECL_EMIT_FUNCTION(RHADD)
   DECL_EMIT_FUNCTION(I64HADD)
   DECL_EMIT_FUNCTION(I64RHADD)
+  DECL_EMIT_FUNCTION(SIMD_SHUFFLE)
 
 #undef DECL_EMIT_FUNCTION
 
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index 24d27aa..6dd3e81 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -198,6 +198,15 @@ namespace ir {
   /*! Output the instruction string in the given stream */
   std::ostream &operator<< (std::ostream &out, const Instruction &proxy);
 
+  /*! Nullary instructions are typed. They have a destination but no source */
+  class NullaryInstruction : public Instruction {
+  public:
+    /*! Get the type manipulated by the instruction */
+    Type getType(void) const;
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
+  };
+
   /*! Unary instructions are typed. dst and sources share the same type */
   class UnaryInstruction : public Instruction {
   public:
@@ -558,6 +567,12 @@ namespace ir {
   /// All emission functions
   ///////////////////////////////////////////////////////////////////////////
 
+  /*! alu0.type dst */
+  Instruction ALU0(Opcode opcode, Type type, Register dst);
+  /*! simd_size.type dst */
+  Instruction SIMD_SIZE(Type type, Register dst);
+  /*! simd_id.type dst */
+  Instruction SIMD_ID(Type type, Register dst);
   /*! alu1.type dst src */
   Instruction ALU1(Opcode opcode, Type type, Register dst, Register src);
   /*! mov.type dst src */
@@ -670,6 +685,8 @@ namespace ir {
   Instruction GT(Type type, Register dst, Register src0, Register src1);
   /*! ord.type dst src0 src1 */
   Instruction ORD(Type type, Register dst, Register src0, Register src1);
+  /*! simd_shuffle.type dst src0 src1 */
+  Instruction SIMD_SHUFFLE(Type type, Register dst, Register src0, Register src1);
   /*! BITCAST.{dstType <- srcType} dst src */
   Instruction BITCAST(Type dstType, Type srcType, Tuple dst, Tuple src, uint8_t dstNum, uint8_t srcNum);
   /*! cvt.{dstType <- srcType} dst src */
diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
index de4abfb..76269bd 100644
--- a/backend/src/ir/instruction.hxx
+++ b/backend/src/ir/instruction.hxx
@@ -25,6 +25,8 @@
  * \file instruction.hxx
  * \author Benjamin Segovia <benjamin.segovia at intel.com>
  */
+DECL_INSN(SIMD_SIZE, NullaryInstruction)
+DECL_INSN(SIMD_ID, NullaryInstruction)
 DECL_INSN(MOV, UnaryInstruction)
 DECL_INSN(COS, UnaryInstruction)
 DECL_INSN(SIN, UnaryInstruction)
@@ -57,6 +59,7 @@ DECL_INSN(BSB, BinaryInstruction)
 DECL_INSN(OR, BinaryInstruction)
 DECL_INSN(XOR, BinaryInstruction)
 DECL_INSN(AND, BinaryInstruction)
+DECL_INSN(SIMD_SHUFFLE, BinaryInstruction)
 DECL_INSN(SEL, SelectInstruction)
 DECL_INSN(EQ, CompareInstruction)
 DECL_INSN(NE, CompareInstruction)
diff --git a/backend/src/ir/liveness.cpp b/backend/src/ir/liveness.cpp
index 2b1ffdb..26c4129 100644
--- a/backend/src/ir/liveness.cpp
+++ b/backend/src/ir/liveness.cpp
@@ -66,6 +66,11 @@ namespace ir {
         const uint32_t srcNum = insn.getSrcNum();
         const uint32_t dstNum = insn.getDstNum();
         bool uniform = true;
+
+        // With no source there is nothing to prove that dst is uniform, so treat it as non-uniform
+        if (srcNum == 0)
+          uniform = false;
+
         for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
           const Register reg = insn.getSrc(srcID);
           if (!fn.isUniformRegister(reg))
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index 4c272bd..55aedb4 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -43,6 +43,7 @@ namespace ir {
         "zero", "one",
         "retVal", "slm_offset",
         "printf_buffer_pointer", "printf_index_buffer_pointer",
+        "lane_id",
         "invalid"
     };
 
@@ -86,6 +87,7 @@ namespace ir {
       DECL_NEW_REG(FAMILY_DWORD, slmoffset, 1);
       DECL_NEW_REG(FAMILY_DWORD, printfbptr, 1);
       DECL_NEW_REG(FAMILY_DWORD, printfiptr, 1);
+      DECL_NEW_REG(FAMILY_DWORD, laneid, 0);
       DECL_NEW_REG(FAMILY_DWORD, invalid, 1);
     }
 #undef DECL_NEW_REG
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
index 7259d9f..d310128 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -71,8 +71,9 @@ namespace ir {
     static const Register slmoffset = Register(27);  // Group's SLM offset in total 64K SLM
     static const Register printfbptr = Register(28); // printf buffer address .
     static const Register printfiptr = Register(29); // printf index buffer address.
-    static const Register invalid = Register(30);  // used for valid comparation.
-    static const uint32_t regNum = 31;             // number of special registers
+    static const Register laneid = Register(30); // lane (SIMD channel) id.
+    static const Register invalid = Register(31);  // used for valid comparation.
+    static const uint32_t regNum = 32;             // number of special registers
     extern const char *specialRegMean[];           // special register name.
   } /* namespace ocl */
 
diff --git a/backend/src/libocl/CMakeLists.txt b/backend/src/libocl/CMakeLists.txt
index 16f00ee..623affc 100644
--- a/backend/src/libocl/CMakeLists.txt
+++ b/backend/src/libocl/CMakeLists.txt
@@ -90,7 +90,7 @@ MACRO(GENERATE_SOURCE_PY _mod)
 	)
 ENDMACRO(GENERATE_SOURCE_PY)
 
-SET (OCL_PY_GENERATED_MODULES ocl_common ocl_relational ocl_integer ocl_math)
+SET (OCL_PY_GENERATED_MODULES ocl_common ocl_relational ocl_integer ocl_math ocl_simd)
 FOREACH(M ${OCL_PY_GENERATED_MODULES})
     GENERATE_HEADER_PY(${M})
     GENERATE_SOURCE_PY(${M})
diff --git a/backend/src/libocl/include/ocl.h b/backend/src/libocl/include/ocl.h
index e886670..a53f4c0 100644
--- a/backend/src/libocl/include/ocl.h
+++ b/backend/src/libocl/include/ocl.h
@@ -30,6 +30,7 @@
 #include "ocl_image.h"
 #include "ocl_integer.h"
 #include "ocl_math.h"
+#include "ocl_simd.h"
 #include "ocl_misc.h"
 #include "ocl_printf.h"
 #include "ocl_relational.h"
diff --git a/backend/src/libocl/include/ocl_misc.h b/backend/src/libocl/include/ocl_misc.h
index aa3f504..359025b 100644
--- a/backend/src/libocl/include/ocl_misc.h
+++ b/backend/src/libocl/include/ocl_misc.h
@@ -128,14 +128,6 @@ DEF(ulong)
 #undef DEC16
 #undef DEC16X
 
-
-/* Temp to add the SIMD functions here. */
-/////////////////////////////////////////////////////////////////////////////
-// SIMD level function
-/////////////////////////////////////////////////////////////////////////////
-short __gen_ocl_simd_any(short);
-short __gen_ocl_simd_all(short);
-
 struct time_stamp {
   // time tick
   ulong tick;
diff --git a/backend/src/libocl/script/ocl_simd.def b/backend/src/libocl/script/ocl_simd.def
new file mode 100644
index 0000000..ccda619
--- /dev/null
+++ b/backend/src/libocl/script/ocl_simd.def
@@ -0,0 +1,4 @@
+##simd level functions
+floatn __gen_ocl_simd_shuffle(floatn x, uint c)
+intn __gen_ocl_simd_shuffle(intn x, uint c)
+uintn __gen_ocl_simd_shuffle(uintn x, uint c)
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
new file mode 100644
index 0000000..b9da5e2
--- /dev/null
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
@@ -0,0 +1,19 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "ocl_simd.h"
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.h b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
new file mode 100644
index 0000000..42afc7b
--- /dev/null
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef __OCL_SIMD_H__
+#define __OCL_SIMD_H__
+
+#include "ocl_types.h"
+
+/////////////////////////////////////////////////////////////////////////////
+// SIMD level function
+/////////////////////////////////////////////////////////////////////////////
+short __gen_ocl_simd_any(short);
+short __gen_ocl_simd_all(short);
+
+uint __gen_ocl_get_simd_size(void);
+uint __gen_ocl_get_simd_id(void);
+
+OVERLOADABLE float __gen_ocl_simd_shuffle(float x, uint c);
+OVERLOADABLE int __gen_ocl_simd_shuffle(int x, uint c);
+OVERLOADABLE uint __gen_ocl_simd_shuffle(uint x, uint c);
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index bf03a13..4fcb8bb 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2790,10 +2790,17 @@ namespace gbe
       case GEN_OCL_CONV_F32_TO_F16:
       case GEN_OCL_SIMD_ANY:
       case GEN_OCL_SIMD_ALL:
+      case GEN_OCL_SIMD_SHUFFLE:
       case GEN_OCL_READ_TM:
       case GEN_OCL_REGION:
         this->newRegister(&I);
         break;
+      case GEN_OCL_SIMD_SIZE:
+        this->newRegister(&I, NULL, true);
+        break;
+      case GEN_OCL_SIMD_ID:
+        this->newRegister(&I, NULL, false);
+        break;
       case GEN_OCL_PRINTF:
         break;
       default:
@@ -3053,6 +3060,26 @@ namespace gbe
             ctx.ALU1(ir::OP_SIMD_ANY, ir::TYPE_S16, dst, src);
             break;
           }
+          case GEN_OCL_SIMD_SIZE:
+          {
+            const ir::Register dst = this->getRegister(&I);
+            ctx.ALU0(ir::OP_SIMD_SIZE, getType(ctx, I.getType()), dst);
+            break;
+          }
+          case GEN_OCL_SIMD_ID:
+          {
+            const ir::Register dst = this->getRegister(&I);
+            ctx.ALU0(ir::OP_SIMD_ID, getType(ctx, I.getType()), dst);
+            break;
+          }
+          case GEN_OCL_SIMD_SHUFFLE:
+          {
+            const ir::Register src0 = this->getRegister(*AI); ++AI;
+            const ir::Register src1 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            ctx.SIMD_SHUFFLE(getType(ctx, I.getType()), dst, src0, src1);
+            break;
+          }
           case GEN_OCL_READ_TM:
           {
             const ir::Register dst = this->getRegister(&I);
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 9536a3c..714a293 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -155,6 +155,10 @@ DECL_LLVM_GEN_FUNCTION(CONV_F32_TO_F16, __gen_ocl_f32to16)
 DECL_LLVM_GEN_FUNCTION(SIMD_ANY, __gen_ocl_simd_any)
 DECL_LLVM_GEN_FUNCTION(SIMD_ALL, __gen_ocl_simd_all)
 
+DECL_LLVM_GEN_FUNCTION(SIMD_SIZE, __gen_ocl_get_simd_size)
+DECL_LLVM_GEN_FUNCTION(SIMD_ID, __gen_ocl_get_simd_id)
+DECL_LLVM_GEN_FUNCTION(SIMD_SHUFFLE, __gen_ocl_simd_shuffle)
+
 DECL_LLVM_GEN_FUNCTION(READ_TM, __gen_ocl_read_tm)
 DECL_LLVM_GEN_FUNCTION(REGION, __gen_ocl_region)
 
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 253c4f2..3f73de0 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -202,6 +202,14 @@ cl_curbe_fill(cl_kernel ker,
   UPLOAD(GBE_CURBE_WORK_DIM, work_dim);
 #undef UPLOAD
 
+  /* __gen_ocl_get_simd_id reads the per-lane IDs (0 .. simd_sz - 1) uploaded here */
+  if ((offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LANE_ID, 0)) >= 0) {
+    const uint32_t simd_sz = interp_kernel_get_simd_width(ker->opaque);
+    uint32_t *laneid = (uint32_t *) (ker->curbe + offset);
+    int32_t i;
+    for (i = 0; i < (int32_t) simd_sz; ++i) laneid[i] = i;
+  }
+
   /* Write identity for the stack pointer. This is required by the stack pointer
    * computation in the kernel
    */
-- 
1.9.1



More information about the Beignet mailing list