[Beignet] [PATCH 1/2] Backend: Add intel_sub_group_block_read/write from buffer

Xiuli Pan xiuli.pan at intel.com
Thu May 19 21:55:54 UTC 2016


From: Pan Xiuli <xiuli.pan at intel.com>

Using OWORD_BLOCK_RW to read/write a block of data for a thread.

Signed-off-by: Pan Xiuli <xiuli.pan at intel.com>
---
 backend/src/backend/gen/gen_mesa_disasm.c          | 15 +++++
 backend/src/backend/gen_context.cpp                | 63 ++++++++++++++++++
 backend/src/backend/gen_context.hpp                |  2 +
 backend/src/backend/gen_encoder.cpp                | 38 ++++++++++-
 backend/src/backend/gen_encoder.hpp                |  4 ++
 .../src/backend/gen_insn_gen7_schedule_info.hxx    |  2 +
 backend/src/backend/gen_insn_selection.cpp         | 77 ++++++++++++++++++++--
 backend/src/backend/gen_insn_selection.hpp         |  4 ++
 backend/src/backend/gen_insn_selection.hxx         |  2 +
 backend/src/ir/instruction.cpp                     | 26 ++++++--
 backend/src/ir/instruction.hpp                     |  8 ++-
 backend/src/ir/liveness.cpp                        |  5 ++
 backend/src/libocl/CMakeLists.txt                  |  2 +-
 backend/src/libocl/src/ocl_substore.ll             |  9 +++
 backend/src/libocl/tmpl/ocl_simd.tmpl.cl           | 54 +++++++++++++++
 backend/src/libocl/tmpl/ocl_simd.tmpl.h            | 11 ++++
 backend/src/llvm/llvm_gen_backend.cpp              | 65 ++++++++++++++++++
 backend/src/llvm/llvm_gen_ocl_function.hxx         |  5 +-
 18 files changed, 377 insertions(+), 15 deletions(-)
 create mode 100644 backend/src/libocl/src/ocl_substore.ll

diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c
index 067ddd8..9200c26 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.c
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -432,6 +432,14 @@ static const char *data_port_data_cache_category[] = {
   "scratch",
 };
 
+static const char *data_port_data_cache_block_size[] = {
+  "1 OWORD LOW",
+  "1 OWORD HIGH",
+  "2 OWORD",
+  "4 OWORD",
+  "8 OWORD",
+};
+
 static const char *data_port_scratch_block_size[] = {
   "1 register",
   "2 registers",
@@ -576,6 +584,7 @@ static int gen_version;
 #define MSG_GW_ACKREQ(inst)        GEN_BITS_FIELD(inst, bits3.gen7_msg_gw.ackreq)
 #define GENERIC_MSG_LENGTH(inst)   GEN_BITS_FIELD(inst, bits3.generic_gen5.msg_length)
 #define GENERIC_RESPONSE_LENGTH(inst) GEN_BITS_FIELD(inst, bits3.generic_gen5.response_length)
+#define OWORD_RW_BLOCK_SIZE(inst)    GEN_BITS_FIELD(inst, bits3.gen7_oblock_rw.block_size)
 
 static int is_special_acc(const void* inst)
 {
@@ -1483,6 +1492,12 @@ int gen_disasm (FILE *file, const void *inst, uint32_t deviceID, uint32_t compac
                    data_port_data_cache_byte_scattered_simd_mode[BYTE_RW_SIMD_MODE(inst)],
                    data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
                    data_port_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
+            else if(UNTYPED_RW_MSG_TYPE(inst) == 0 || UNTYPED_RW_MSG_TYPE(inst) == 8)
+              format(file, " (bti: %d, data size: %s, %s, %s)",
+                   UNTYPED_RW_BTI(inst),
+                   data_port_data_cache_block_size[OWORD_RW_BLOCK_SIZE(inst)],
+                   data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
+                   data_port_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
             else
               format(file, " not implemented");
           } else {
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 4d0a3f3..cfb8be1 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -3487,6 +3487,69 @@ namespace gbe
     p->pop();
   }
 
+  void GenContext::emitOBReadInstruction(const SelectionInstruction &insn) {
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const GenRegister addr = ra->genReg(insn.src(0));
+    const GenRegister first = GenRegister::ud1grf(addr.nr,addr.subnr/sizeof(float));
+    GenRegister header;
+    if (simdWidth == 8)
+      header = GenRegister::retype(ra->genReg(insn.src(1)), GEN_TYPE_F);
+    else
+      header = GenRegister::retype(GenRegister::Qn(ra->genReg(insn.src(1)),1), GEN_TYPE_F);
+
+    p->push();
+      // Copy r0 into the header first
+      p->curr.execWidth = 8;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->MOV(header, GenRegister::f8grf(0,0));
+
+      // Update the header with the current address, shifted right by 4 (bytes -> 16-byte owords)
+      p->curr.execWidth = 1;
+      const uint32_t nr = header.nr;
+      const uint32_t subnr = header.subnr / sizeof(float);
+      p->SHR(GenRegister::ud1grf(nr, subnr+2), first, GenRegister::immud(4));
+      //p->MOV(GenRegister::ud1grf(nr, subnr+2), first);
+
+      // Put zero in the general state base address
+      p->MOV(GenRegister::f1grf(nr, subnr+5), GenRegister::immf(0));
+
+    p->pop();
+    // Now read the data
+    p->OBREAD(dst, header, insn.getbti(), insn.extra.elem);
+  }
+
+  void GenContext::emitOBWriteInstruction(const SelectionInstruction &insn) {
+    const GenRegister addr = ra->genReg(insn.src(2));
+    const GenRegister first = GenRegister::ud1grf(addr.nr,addr.subnr/sizeof(float));
+    GenRegister header;
+    if (simdWidth == 8)
+      header = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_F);
+    else
+      header = GenRegister::retype(GenRegister::Qn(ra->genReg(insn.src(0)),1), GEN_TYPE_F);
+
+    p->push();
+      // Copy r0 into the header first
+      p->curr.execWidth = 8;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->MOV(header, GenRegister::f8grf(0,0));
+
+      // Update the header with the current address, shifted right by 4 (bytes -> 16-byte owords)
+      p->curr.execWidth = 1;
+      const uint32_t nr = header.nr;
+      const uint32_t subnr = header.subnr / sizeof(float);
+      p->SHR(GenRegister::ud1grf(nr, subnr+2), first, GenRegister::immud(4));
+
+      // Put zero in the general state base address
+      p->MOV(GenRegister::f1grf(nr, subnr+5), GenRegister::immf(0));
+
+    p->pop();
+    // Now write the data
+    p->OBWRITE(header, insn.getbti(), insn.extra.elem);
+  }
+
+
   BVAR(OCL_OUTPUT_REG_ALLOC, false);
   BVAR(OCL_OUTPUT_ASM, false);
 
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 4c43ccb..56a5ec2 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -187,6 +187,8 @@ namespace gbe
     void scratchRead(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
     unsigned beforeMessage(const SelectionInstruction &insn, GenRegister bti, GenRegister flagTemp, GenRegister btiTmp, unsigned desc);
     void afterMessage(const SelectionInstruction &insn, GenRegister bti, GenRegister flagTemp, GenRegister btiTmp, unsigned jip0);
+    virtual void emitOBReadInstruction(const SelectionInstruction &insn);
+    virtual void emitOBWriteInstruction(const SelectionInstruction &insn);
 
     /*! Implements base class */
     virtual Kernel *allocateKernel(void);
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index 31afa67..fc7b5cf 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -258,7 +258,7 @@ namespace gbe
     else
       NOT_SUPPORTED;
   }
-#if 0
+
   static void setOBlockRW(GenEncoder *p,
                           GenNativeInstruction *insn,
                           uint32_t bti,
@@ -272,10 +272,10 @@ namespace gbe
     assert(size == 2 || size == 4);
     insn->bits3.gen7_oblock_rw.msg_type = msg_type;
     insn->bits3.gen7_oblock_rw.bti = bti;
+    GBE_ASSERT(size == 2 || size == 4);
     insn->bits3.gen7_oblock_rw.block_size = size == 2 ? 2 : 3;
     insn->bits3.gen7_oblock_rw.header_present = 1;
   }
-#endif
 
   static void setDWordScatterMessgae(GenEncoder *p,
                                      GenNativeInstruction *insn,
@@ -1244,6 +1244,40 @@ namespace gbe
      setScratchMessage(this, insn, offset, block_size, channel_mode, GEN_SCRATCH_READ, 1, dst_num);
   }
 
+  void GenEncoder::OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t size) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    const uint32_t msg_length = 1;
+    const uint32_t response_length = size / 2; // Size is in owords
+    this->setHeader(insn);
+    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+    this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
+    this->setSrc1(insn, GenRegister::immud(0));
+    setOBlockRW(this,
+                insn,
+                bti,
+                size,
+                GEN7_OBLOCK_READ,
+                msg_length,
+                response_length);
+  }
+
+  void GenEncoder::OBWRITE(GenRegister header, uint32_t bti, uint32_t size) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    const uint32_t msg_length = 1 + size / 2; // Size is in owords
+    const uint32_t response_length = 0;
+    this->setHeader(insn);
+    this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
+    this->setSrc1(insn, GenRegister::immud(0));
+    this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
+    setOBlockRW(this,
+                insn,
+                bti,
+                size,
+                GEN7_OBLOCK_WRITE,
+                msg_length,
+                response_length);
+  }
+
   void GenEncoder::EOT(uint32_t msg) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
     this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index 0239293..a53c879 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -267,6 +267,10 @@ namespace gbe
     virtual bool canHandleLong(uint32_t opcode, GenRegister dst, GenRegister src0,
                             GenRegister src1 = GenRegister::null());
     virtual void handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src0, GenRegister src1 = GenRegister::null());
+    /*! OBlock read */
+    void OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize);
+    /*! OBlock write */
+    void OBWRITE(GenRegister header, uint32_t bti, uint32_t elemSize);
 
     GBE_CLASS(GenEncoder); //!< Use custom allocators
     virtual void alu3(uint32_t opcode, GenRegister dst,
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index cb5c4f1..d297726 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -50,3 +50,5 @@ DECL_GEN7_SCHEDULE(StoreProfiling,  80,        1,        1)
 DECL_GEN7_SCHEDULE(WorkGroupOp,     80,        1,        1)
 DECL_GEN7_SCHEDULE(SubGroupOp,      80,        1,        1)
 DECL_GEN7_SCHEDULE(Printf,          80,        1,        1)
+DECL_GEN7_SCHEDULE(OBRead,          80,        1,        1)
+DECL_GEN7_SCHEDULE(OBWrite,         80,        1,        1)
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 596e70b..7c49242 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -188,7 +188,8 @@ namespace gbe
            this->opcode == SEL_OP_BYTE_GATHER  ||
            this->opcode == SEL_OP_SAMPLE ||
            this->opcode == SEL_OP_VME ||
-           this->opcode == SEL_OP_DWORD_GATHER;
+           this->opcode == SEL_OP_DWORD_GATHER ||
+           this->opcode == SEL_OP_OBREAD;
   }
 
   bool SelectionInstruction::modAcc(void) const {
@@ -210,7 +211,8 @@ namespace gbe
            this->opcode == SEL_OP_WRITE64       ||
            this->opcode == SEL_OP_ATOMIC        ||
            this->opcode == SEL_OP_BYTE_SCATTER  ||
-           this->opcode == SEL_OP_TYPED_WRITE;
+           this->opcode == SEL_OP_TYPED_WRITE ||
+           this->opcode == SEL_OP_OBWRITE;
   }
 
   bool SelectionInstruction::isBranch(void) const {
@@ -697,6 +699,11 @@ namespace gbe
     /*! Sub Group Operations */
     void SUBGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src,
                       GenRegister tmpData1, GenRegister tmpData2);
+    /*! Oblock read */
+    void OBREAD(GenRegister dst, GenRegister addr, GenRegister header, uint32_t bti, uint32_t size);
+    /*! Oblock write */
+    void OBWRITE(GenRegister addr, GenRegister value, GenRegister header, uint32_t bti, uint32_t size);
+
     /* common functions for both binary instruction and sel_cmp and compare instruction.
        It will handle the IMM or normal register assignment, and will try to avoid LOADI
        as much as possible. */
@@ -2014,6 +2021,40 @@ namespace gbe
     insn->src(0) = src;
     insn->src(1) = tmpData2;
   }
+  void Selection::Opaque::OBREAD(GenRegister dst,
+                                 GenRegister addr,
+                                 GenRegister header,
+                                 uint32_t bti,
+                                 uint32_t size) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, 1, 2);
+    insn->dst(0) = dst;
+    insn->src(0) = addr;
+    insn->src(1) = header;
+    insn->setbti(bti);
+    insn->extra.elem = size / sizeof(int[4]); // number of owords
+  }
+
+  void Selection::Opaque::OBWRITE(GenRegister addr,
+                                  GenRegister value,
+                                  GenRegister header,
+                                  uint32_t bti,
+                                  uint32_t size) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE, 0, 3);
+    SelectionVector *vector = this->appendVector();
+    insn->src(0) = header;
+    insn->src(1) = value;
+    insn->src(2) = addr;
+    insn->state = this->curr;
+    insn->setbti(bti);
+    insn->extra.elem = size / sizeof(int[4]); // number of owords
+
+    // We need to put the header and the data together
+    vector->regNum = 2;
+    vector->reg = &insn->src(0);
+    vector->offsetID = 0;
+    vector->isSrc = 1;
+  }
+
 
   // Boiler plate to initialize the selection library at c++ pre-main
   static SelectionLibrary *selLib = NULL;
@@ -4002,6 +4043,18 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
       }
     }
 
+    void emitOWordRead(Selection::Opaque &sel,
+                       const ir::LoadInstruction &insn,
+                       GenRegister address,
+                       ir::BTI bti) const
+    {
+      using namespace ir;
+      const GenRegister header = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+      const GenRegister value = sel.selReg(insn.getValue(0), TYPE_U32);
+      const uint32_t simdWidth = sel.ctx.getSimdWidth();
+      sel.OBREAD(value, address, header, bti.imm, simdWidth * sizeof(int));
+    }
+
     // check whether all binded table index point to constant memory
     INLINE bool isAllConstant(const ir::BTI &bti) const {
       if (bti.isConst && bti.imm == BTI_CONSTANT)
@@ -4037,7 +4090,9 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
       const uint32_t elemSize = getByteScatterGatherSize(sel, type);
       bool allConstant = isAllConstant(bti);
 
-      if (allConstant) {
+      if (insn.isBlock())
+        this->emitOWordRead(sel, insn, address, bti);
+      else if (allConstant) {
         // XXX TODO read 64bit constant through constant cache
         // Per HW Spec, constant cache messages can read at least DWORD data.
         // So, byte/short data type, we have to read through data cache.
@@ -4164,6 +4219,18 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
       }
     }
 
+    void emitOWordWrite(Selection::Opaque &sel,
+                        const ir::StoreInstruction &insn,
+                        GenRegister address,
+                        ir::BTI bti) const
+    {
+      using namespace ir;
+      const GenRegister header = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+      const GenRegister value = sel.selReg(insn.getValue(0), TYPE_U32);
+      const uint32_t simdWidth = sel.ctx.getSimdWidth();
+      sel.OBWRITE(address, value, header, bti.imm, simdWidth * sizeof(int));
+    }
+
     virtual bool emit(Selection::Opaque  &sel, SelectionDAG &dag) const
     {
       using namespace ir;
@@ -4185,7 +4252,9 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
         assert(0 && "stateless not supported yet");
       }
 
-      if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+      if (insn.isBlock())
+        this->emitOWordWrite(sel, insn, address, bti);
+      else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
         this->emitWrite64(sel, insn, address, bti);
       else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
         this->emitUntypedWrite(sel, insn, address,  bti);
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index 8d2e1da..51af686 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -175,6 +175,8 @@ namespace gbe
     INLINE uint32_t getbti() const {
       GBE_ASSERT(isRead() || isWrite());
       switch (opcode) {
+        case SEL_OP_OBREAD:
+        case SEL_OP_OBWRITE:
         case SEL_OP_DWORD_GATHER: return extra.function;
         case SEL_OP_SAMPLE: return extra.rdbti;
         case SEL_OP_VME: return extra.vme_bti;
@@ -188,6 +190,8 @@ namespace gbe
     INLINE void setbti(uint32_t bti) {
       GBE_ASSERT(isRead() || isWrite());
       switch (opcode) {
+        case SEL_OP_OBREAD:
+        case SEL_OP_OBWRITE:
         case SEL_OP_DWORD_GATHER: extra.function = bti; return;
         case SEL_OP_SAMPLE: extra.rdbti = bti; return;
         case SEL_OP_VME: extra.vme_bti = bti; return;
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index 0e11f9f..4a7caff 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -96,3 +96,5 @@ DECL_SELECTION_IR(STORE_PROFILING, StoreProfilingInstruction)
 DECL_SELECTION_IR(WORKGROUP_OP, WorkGroupOpInstruction)
 DECL_SELECTION_IR(SUBGROUP_OP, SubGroupOpInstruction)
 DECL_SELECTION_IR(PRINTF, PrintfInstruction)
+DECL_SELECTION_IR(OBREAD, OBReadInstruction)
+DECL_SELECTION_IR(OBWRITE, OBWriteInstruction)
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 47606b2..88491a7 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -483,10 +483,12 @@ namespace ir {
                         AddressSpace AS,
                         uint32_t _valueNum,
                         bool dwAligned,
-                        AddressMode AM)
+                        AddressMode AM,
+                        bool ifBlock = false)
                       : MemInstruction(AM, AS, dwAligned, type, offset),
                         valueNum(_valueNum),
-                        values(dstValues)
+                        values(dstValues),
+                        ifBlock(ifBlock)
         {
           this->opcode = OP_LOAD;
         }
@@ -519,9 +521,11 @@ namespace ir {
         }
         INLINE bool wellFormed(const Function &fn, std::string &why) const;
         INLINE void out(std::ostream &out, const Function &fn) const;
+        INLINE bool isBlock() const { return ifBlock; }
 
         uint8_t         valueNum;
         Tuple             values;
+        bool             ifBlock;
     };
     class ALIGNED_INSTRUCTION StoreInstruction :
       public MemInstruction,
@@ -534,12 +538,14 @@ namespace ir {
                          AddressSpace addrSpace,
                          uint32_t valueNum,
                          bool dwAligned,
-                         AddressMode AM)
+                         AddressMode AM,
+                         bool ifBlock = false)
           : MemInstruction(AM, addrSpace, dwAligned, type, offset)
         {
           this->opcode = OP_STORE;
           this->values = values;
           this->valueNum = valueNum;
+          this->ifBlock = ifBlock;
         }
         INLINE unsigned getValueNum()      const { return valueNum; }
         INLINE Register getValue(const Function &fn, unsigned id) const {
@@ -565,9 +571,12 @@ namespace ir {
         }
         INLINE bool wellFormed(const Function &fn, std::string &why) const;
         INLINE void out(std::ostream &out, const Function &fn) const;
+        INLINE bool isBlock() const { return ifBlock; }
+
         Register      dst[0];
         uint8_t     valueNum;
         Tuple         values;
+        bool         ifBlock;
     };
 
     class ALIGNED_INSTRUCTION SampleInstruction : // TODO
@@ -1655,6 +1664,8 @@ namespace ir {
     }
 
     INLINE void LoadInstruction::out(std::ostream &out, const Function &fn) const {
+      if(ifBlock)
+        out<< "BLOCK";
       this->outOpcode(out);
       out << "." << type << "." << AS << (dwAligned ? "." : ".un") << "aligned";
       out << " {";
@@ -1672,6 +1683,8 @@ namespace ir {
     }
 
     INLINE void StoreInstruction::out(std::ostream &out, const Function &fn) const {
+      if(ifBlock)
+        out<< "BLOCK";
       this->outOpcode(out);
       out << "." << type << "." << AS << (dwAligned ? "." : ".un") << "aligned";
       out << " %" << this->getSrc(fn, 0) << " {";
@@ -2221,7 +2234,9 @@ DECL_MEM_FN(MemInstruction, bool,     isAligned(void), isAligned())
 DECL_MEM_FN(MemInstruction, unsigned, getAddressIndex(void), getAddressIndex())
 DECL_MEM_FN(AtomicInstruction, AtomicOps, getAtomicOpcode(void), getAtomicOpcode())
 DECL_MEM_FN(StoreInstruction, uint32_t, getValueNum(void), getValueNum())
+DECL_MEM_FN(StoreInstruction, bool, isBlock(void), isBlock())
 DECL_MEM_FN(LoadInstruction, uint32_t, getValueNum(void), getValueNum())
+DECL_MEM_FN(LoadInstruction, bool, isBlock(void), isBlock())
 DECL_MEM_FN(LoadImmInstruction, Type, getType(void), getType())
 DECL_MEM_FN(LabelInstruction, LabelIndex, getLabelIndex(void), getLabelIndex())
 DECL_MEM_FN(BranchInstruction, bool, isPredicated(void), isPredicated())
@@ -2475,9 +2490,10 @@ DECL_MEM_FN(MemInstruction, void,     setBtiReg(Register reg), setBtiReg(reg))
                    uint32_t valueNum, \
                    bool dwAligned, \
                    AddressMode AM, \
-                   unsigned SurfaceIndex) \
+                   unsigned SurfaceIndex, \
+                   bool isBlock) \
   { \
-    internal::CLASS insn = internal::CLASS(type,tuple,offset,space,valueNum,dwAligned,AM); \
+    internal::CLASS insn = internal::CLASS(type,tuple,offset,space,valueNum,dwAligned,AM, isBlock); \
     insn.setSurfaceIndex(SurfaceIndex);\
     return insn.convert(); \
   } \
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index 799a7bf..4a5811b 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -356,6 +356,8 @@ namespace ir {
     }
     /*! Return true if the given instruction is an instance of this class */
     static bool isClassOf(const Instruction &insn);
+    /*! Return true if the given instruction is a block write */
+    bool isBlock() const;
   };
 
   /*! Load instruction. The source is simply the address where to get the data.
@@ -372,6 +374,8 @@ namespace ir {
     }
     /*! Return true if the given instruction is an instance of this class */
     static bool isClassOf(const Instruction &insn);
+    /*! Return true if the given instruction is a block read */
+    bool isBlock() const;
   };
 
   /*! Load immediate instruction loads an typed immediate value into the given
@@ -827,10 +831,10 @@ namespace ir {
   /*! ret */
   Instruction RET(void);
   /*! load.type.space {dst1,...,dst_valueNum} offset value, {bti} */
-  Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, AddressMode, unsigned SurfaceIndex);
+  Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, AddressMode, unsigned SurfaceIndex, bool isBlock = false);
   Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, AddressMode, Register bti);
   /*! store.type.space offset {src1,...,src_valueNum} value {bti}*/
-  Instruction STORE(Type type, Tuple src, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, AddressMode, unsigned SurfaceIndex);
+  Instruction STORE(Type type, Tuple src, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, AddressMode, unsigned SurfaceIndex, bool isBlock = false);
   Instruction STORE(Type type, Tuple src, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, AddressMode, Register bti);
   /*! loadi.type dst value */
   Instruction LOADI(Type type, Register dst, ImmediateIndex value);
diff --git a/backend/src/ir/liveness.cpp b/backend/src/ir/liveness.cpp
index d48f067..3162d13 100644
--- a/backend/src/ir/liveness.cpp
+++ b/backend/src/ir/liveness.cpp
@@ -117,11 +117,16 @@ namespace ir {
         if (insn.getOpcode() == ir::OP_SIMD_ID)
           uniform = false;
 
+        // The destination of a block read must never be marked uniform
+        if (insn.getOpcode() == ir::OP_LOAD && ir::cast<ir::LoadInstruction>(insn).isBlock())
+          uniform = false;
+
         for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
           const Register reg = insn.getSrc(srcID);
           if (!fn.isUniformRegister(reg))
             uniform = false;
         }
+
         // A destination is a killed value
         for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
           const Register reg = insn.getDst(dstID);
diff --git a/backend/src/libocl/CMakeLists.txt b/backend/src/libocl/CMakeLists.txt
index 1d1ec68..83e767c 100644
--- a/backend/src/libocl/CMakeLists.txt
+++ b/backend/src/libocl/CMakeLists.txt
@@ -182,7 +182,7 @@ MACRO(ADD_LL_TO_BC_TARGET M)
 	)
 ENDMACRO(ADD_LL_TO_BC_TARGET)
 
-SET (OCL_LL_MODULES ocl_barrier ocl_clz)
+SET (OCL_LL_MODULES ocl_barrier ocl_clz ocl_substore)
 FOREACH(f ${OCL_LL_MODULES})
     COPY_THE_LL(${f})
     ADD_LL_TO_BC_TARGET(${f})
diff --git a/backend/src/libocl/src/ocl_substore.ll b/backend/src/libocl/src/ocl_substore.ll
new file mode 100644
index 0000000..665cdfa
--- /dev/null
+++ b/backend/src/libocl/src/ocl_substore.ll
@@ -0,0 +1,9 @@
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir"
+
+declare void @__gen_ocl_sub_group_block_write_mem(i32 addrspace(1)* nocapture, i32) nounwind alwaysinline noduplicate
+
+define void @_Z27intel_sub_group_block_writePKU3AS1jj(i32 addrspace(1)* %p, i32 %data) nounwind alwaysinline noduplicate {
+  call void @__gen_ocl_sub_group_block_write_mem(i32 addrspace(1)* %p, i32 %data)
+  ret void
+}
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
index a25dcef..66490cc 100644
--- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
@@ -133,3 +133,57 @@ RANGE_OP(scan_exclusive, max, float, true)
 RANGE_OP(scan_exclusive, max, double, true)
 
 #undef RANGE_OP
+PURE CONST uint __gen_ocl_sub_group_block_read_mem(const global uint* p);
+OVERLOADABLE uint intel_sub_group_block_read(const global uint* p)
+{
+  return __gen_ocl_sub_group_block_read_mem(p);
+}
+OVERLOADABLE uint2 intel_sub_group_block_read2(const global uint* p)
+{
+  return (uint2)(intel_sub_group_block_read(p),
+                 intel_sub_group_block_read(p + get_simd_size()));
+}
+OVERLOADABLE uint4 intel_sub_group_block_read4(const global uint* p)
+{
+  return (uint4)(intel_sub_group_block_read(p),
+                 intel_sub_group_block_read(p + get_simd_size()),
+                 intel_sub_group_block_read(p + get_simd_size() * 2),
+                 intel_sub_group_block_read(p + get_simd_size() * 3));
+
+}
+OVERLOADABLE uint8 intel_sub_group_block_read8(const global uint* p)
+{
+  return (uint8)(intel_sub_group_block_read(p),
+                 intel_sub_group_block_read(p + get_simd_size()),
+                 intel_sub_group_block_read(p + get_simd_size() * 2),
+                 intel_sub_group_block_read(p + get_simd_size() * 3),
+                 intel_sub_group_block_read(p + get_simd_size() * 4),
+                 intel_sub_group_block_read(p + get_simd_size() * 5),
+                 intel_sub_group_block_read(p + get_simd_size() * 6),
+                 intel_sub_group_block_read(p + get_simd_size() * 7));
+}
+
+OVERLOADABLE void intel_sub_group_block_write2(const global uint* p, uint2 data)
+{
+  intel_sub_group_block_write(p, data.s0);
+  intel_sub_group_block_write(p + get_simd_size(), data.s1);
+}
+OVERLOADABLE void intel_sub_group_block_write4(const global uint* p,uint4 data)
+{
+  intel_sub_group_block_write(p, data.s0);
+  intel_sub_group_block_write(p + get_simd_size(), data.s1);
+  intel_sub_group_block_write(p + get_simd_size() * 2, data.s2);
+  intel_sub_group_block_write(p + get_simd_size() * 3, data.s3);
+
+}
+OVERLOADABLE void intel_sub_group_block_write8(const global uint* p,uint8 data)
+{
+  intel_sub_group_block_write(p, data.s0);
+  intel_sub_group_block_write(p + get_simd_size(), data.s1);
+  intel_sub_group_block_write(p + get_simd_size() * 2, data.s2);
+  intel_sub_group_block_write(p + get_simd_size() * 3, data.s3);
+  intel_sub_group_block_write(p + get_simd_size() * 4, data.s4);
+  intel_sub_group_block_write(p + get_simd_size() * 5, data.s5);
+  intel_sub_group_block_write(p + get_simd_size() * 6, data.s6);
+  intel_sub_group_block_write(p + get_simd_size() * 7, data.s7);
+}
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.h b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
index 355ee30..d0676be 100644
--- a/backend/src/libocl/tmpl/ocl_simd.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
@@ -132,3 +132,14 @@ OVERLOADABLE double sub_group_scan_exclusive_max(double x);
 OVERLOADABLE float intel_sub_group_shuffle(float x, uint c);
 OVERLOADABLE int intel_sub_group_shuffle(int x, uint c);
 OVERLOADABLE uint intel_sub_group_shuffle(uint x, uint c);
+
+/* block read/write */
+OVERLOADABLE uint intel_sub_group_block_read(const global uint* p);
+OVERLOADABLE uint2 intel_sub_group_block_read2(const global uint* p);
+OVERLOADABLE uint4 intel_sub_group_block_read4(const global uint* p);
+OVERLOADABLE uint8 intel_sub_group_block_read8(const global uint* p);
+
+OVERLOADABLE void intel_sub_group_block_write(const __global uint* p, uint data);
+OVERLOADABLE void intel_sub_group_block_write2(const __global uint* p, uint2 data);
+OVERLOADABLE void intel_sub_group_block_write4(const __global uint* p, uint4 data);
+OVERLOADABLE void intel_sub_group_block_write8(const __global uint* p, uint8 data);
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 3ddbfcc..e77290f 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -697,6 +697,8 @@ namespace gbe
     void emitWorkGroupInst(CallInst &I, CallSite &CS, ir::WorkGroupOps opcode);
     // Emit subgroup instructions
     void emitSubGroupInst(CallInst &I, CallSite &CS, ir::WorkGroupOps opcode);
+    // Emit sub group block read/write memory instructions
+    void emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite);
 
     uint8_t appendSampler(CallSite::arg_iterator AI);
     uint8_t getImageID(CallInst &I);
@@ -3730,6 +3732,9 @@ namespace gbe
       case GEN_OCL_LRP:
         this->newRegister(&I);
         break;
+      case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM:
+        this->newRegister(&I, NULL, false);
+        break;
       case GEN_OCL_PRINTF:
         this->newRegister(&I);  // fall through
       case GEN_OCL_PUTS:
@@ -3744,6 +3749,7 @@ namespace gbe
       case GEN_OCL_CALC_TIMESTAMP:
       case GEN_OCL_STORE_PROFILING:
       case GEN_OCL_DEBUGWAIT:
+      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM:
         break;
       case GEN_OCL_NOT_FOUND:
       default:
@@ -3938,6 +3944,61 @@ namespace gbe
     GBE_ASSERT(AI == AE);
   }
 
+  void GenWriter::emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite) {
+    CallSite::arg_iterator AI = CS.arg_begin();
+    CallSite::arg_iterator AE = CS.arg_end();
+    GBE_ASSERT(AI != AE);
+
+    Value *llvmPtr = *(AI++);
+    Value *llvmValues;
+    ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmPtr->getType()->getPointerAddressSpace());
+    GBE_ASSERT(addrSpace == ir::MEM_GLOBAL);
+    ir::Register pointer = this->getRegister(llvmPtr);
+
+    ir::Register ptr;
+    ir::Register btiReg;
+    unsigned SurfaceIndex = 0xff;
+
+    ir::AddressMode AM;
+    if (legacyMode) {
+      Value *bti = getBtiRegister(llvmPtr);
+      Value *ptrBase = getPointerBase(llvmPtr);
+      ir::Register baseReg = this->getRegister(ptrBase);
+      if (isa<ConstantInt>(bti)) {
+        AM = ir::AM_StaticBti;
+        SurfaceIndex = cast<ConstantInt>(bti)->getZExtValue();
+        addrSpace = btiToGen(SurfaceIndex);
+      } else {
+        AM = ir::AM_DynamicBti;
+        addrSpace = ir::MEM_MIXED;
+        btiReg = this->getRegister(bti);
+      }
+      const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
+      ptr = ctx.reg(pointerFamily);
+      ctx.SUB(ir::TYPE_U32, ptr, pointer, baseReg);
+    } else {
+      AM = ir::AM_Stateless;
+      ptr = pointer;
+    }
+
+    ir::Type type = ir::TYPE_U32;
+    GBE_ASSERT(AM != ir::AM_DynamicBti);
+
+    if(isWrite){
+      llvmValues = *(AI++);
+      const ir::Register values = getRegister(llvmValues);
+      const ir::Tuple tuple = ctx.arrayTuple(&values, 1);
+      ctx.STORE(type, tuple, ptr, addrSpace, 1, true, AM, SurfaceIndex, true);
+    } else {
+      llvmValues = &I;
+      const ir::Register values = getRegister(llvmValues);
+      const ir::Tuple tuple = ctx.arrayTuple(&values, 1);
+      ctx.LOAD(type, tuple, ptr, addrSpace, 1, true, AM, SurfaceIndex, true);
+    }
+
+    GBE_ASSERT(AI == AE);
+  }
+
   /* append a new sampler. should be called before any reference to
    * a sampler_t value. */
   uint8_t GenWriter::appendSampler(CallSite::arg_iterator AI) {
@@ -4762,6 +4823,10 @@ namespace gbe
             ctx.LRP(ir::TYPE_FLOAT, dst, src0, src1, src2);
             break;
           }
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM:
+            this->emitBlockReadWriteMemInst(I, CS, false); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM:
+            this->emitBlockReadWriteMemInst(I, CS, true); break;
           default: break;
         }
       }
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 213ead0..003be91 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -202,7 +202,7 @@ DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_MIN, __gen_ocl_work_group_scan_
 DECL_LLVM_GEN_FUNCTION(WORK_GROUP_ALL, __gen_ocl_work_group_all)
 DECL_LLVM_GEN_FUNCTION(WORK_GROUP_ANY, __gen_ocl_work_group_any)
 
-// work group function
+// sub group function
 DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BROADCAST, __gen_ocl_sub_group_broadcast)
 
 DECL_LLVM_GEN_FUNCTION(SUB_GROUP_REDUCE_ADD, __gen_ocl_sub_group_reduce_add)
@@ -217,5 +217,8 @@ DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_ADD, __gen_ocl_sub_group_scan_in
 DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_MAX, __gen_ocl_sub_group_scan_inclusive_max)
 DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_MIN, __gen_ocl_sub_group_scan_inclusive_min)
 
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM, __gen_ocl_sub_group_block_read_mem)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM, __gen_ocl_sub_group_block_write_mem)
+
 // common function
 DECL_LLVM_GEN_FUNCTION(LRP, __gen_ocl_lrp)
-- 
2.7.4



More information about the Beignet mailing list