[Beignet] [PATCH 1/2] Backend: Add intel_sub_group_block_read/write from buffer

Andrew Lavin aj.lavin at gmail.com
Wed May 25 21:44:58 UTC 2016


This is great stuff. Will we also get the intel_sub_group_block_* functions
that take an image2d_t argument?

Thanks,
Andrew

On Thu, May 19, 2016 at 2:55 PM, Xiuli Pan <xiuli.pan at intel.com> wrote:

> From: Pan Xiuli <xiuli.pan at intel.com>
>
> Using OWORD_BLOCK_RW to read/write a block of data for a thread.
>
> Signed-off-by: Pan Xiuli <xiuli.pan at intel.com>
> ---
>  backend/src/backend/gen/gen_mesa_disasm.c          | 15 +++++
>  backend/src/backend/gen_context.cpp                | 63 ++++++++++++++++++
>  backend/src/backend/gen_context.hpp                |  2 +
>  backend/src/backend/gen_encoder.cpp                | 38 ++++++++++-
>  backend/src/backend/gen_encoder.hpp                |  4 ++
>  .../src/backend/gen_insn_gen7_schedule_info.hxx    |  2 +
>  backend/src/backend/gen_insn_selection.cpp         | 77
> ++++++++++++++++++++--
>  backend/src/backend/gen_insn_selection.hpp         |  4 ++
>  backend/src/backend/gen_insn_selection.hxx         |  2 +
>  backend/src/ir/instruction.cpp                     | 26 ++++++--
>  backend/src/ir/instruction.hpp                     |  8 ++-
>  backend/src/ir/liveness.cpp                        |  5 ++
>  backend/src/libocl/CMakeLists.txt                  |  2 +-
>  backend/src/libocl/src/ocl_substore.ll             |  9 +++
>  backend/src/libocl/tmpl/ocl_simd.tmpl.cl           | 54 +++++++++++++++
>  backend/src/libocl/tmpl/ocl_simd.tmpl.h            | 11 ++++
>  backend/src/llvm/llvm_gen_backend.cpp              | 65 ++++++++++++++++++
>  backend/src/llvm/llvm_gen_ocl_function.hxx         |  5 +-
>  18 files changed, 377 insertions(+), 15 deletions(-)
>  create mode 100644 backend/src/libocl/src/ocl_substore.ll
>
> diff --git a/backend/src/backend/gen/gen_mesa_disasm.c
> b/backend/src/backend/gen/gen_mesa_disasm.c
> index 067ddd8..9200c26 100644
> --- a/backend/src/backend/gen/gen_mesa_disasm.c
> +++ b/backend/src/backend/gen/gen_mesa_disasm.c
> @@ -432,6 +432,14 @@ static const char *data_port_data_cache_category[] = {
>    "scratch",
>  };
>
> +static const char *data_port_data_cache_block_size[] = {
> +  "1 OWORD LOW",
> +  "1 OWORD HIGH",
> +  "2 OWORD",
> +  "4 OWORD",
> +  "8 OWORD",
> +};
> +
>  static const char *data_port_scratch_block_size[] = {
>    "1 register",
>    "2 registers",
> @@ -576,6 +584,7 @@ static int gen_version;
>  #define MSG_GW_ACKREQ(inst)        GEN_BITS_FIELD(inst,
> bits3.gen7_msg_gw.ackreq)
>  #define GENERIC_MSG_LENGTH(inst)   GEN_BITS_FIELD(inst,
> bits3.generic_gen5.msg_length)
>  #define GENERIC_RESPONSE_LENGTH(inst) GEN_BITS_FIELD(inst,
> bits3.generic_gen5.response_length)
> +#define OWORD_RW_BLOCK_SIZE(inst)    GEN_BITS_FIELD(inst,
> bits3.gen7_oblock_rw.block_size)
>
>  static int is_special_acc(const void* inst)
>  {
> @@ -1483,6 +1492,12 @@ int gen_disasm (FILE *file, const void *inst,
> uint32_t deviceID, uint32_t compac
>
> data_port_data_cache_byte_scattered_simd_mode[BYTE_RW_SIMD_MODE(inst)],
>
> data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
>
> data_port_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
> +            else if(UNTYPED_RW_MSG_TYPE(inst) == 0 ||
> UNTYPED_RW_MSG_TYPE(inst) == 8)
> +              format(file, " (bti: %d, data size: %s, %s, %s)",
> +                   UNTYPED_RW_BTI(inst),
> +
>  data_port_data_cache_block_size[OWORD_RW_BLOCK_SIZE(inst)],
> +
>  data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
> +
>  data_port_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
>              else
>                format(file, " not implemented");
>            } else {
> diff --git a/backend/src/backend/gen_context.cpp
> b/backend/src/backend/gen_context.cpp
> index 4d0a3f3..cfb8be1 100644
> --- a/backend/src/backend/gen_context.cpp
> +++ b/backend/src/backend/gen_context.cpp
> @@ -3487,6 +3487,69 @@ namespace gbe
>      p->pop();
>    }
>
> +  void GenContext::emitOBReadInstruction(const SelectionInstruction
> &insn) {
> +    const GenRegister dst = ra->genReg(insn.dst(0));
> +    const GenRegister addr = ra->genReg(insn.src(0));
> +    const GenRegister first = GenRegister::ud1grf(addr.nr
> ,addr.subnr/sizeof(float));
> +    GenRegister header;
> +    if (simdWidth == 8)
> +      header = GenRegister::retype(ra->genReg(insn.src(1)), GEN_TYPE_F);
> +    else
> +      header =
> GenRegister::retype(GenRegister::Qn(ra->genReg(insn.src(1)),1), GEN_TYPE_F);
> +
> +    p->push();
> +      // Copy r0 into the header first
> +      p->curr.execWidth = 8;
> +      p->curr.predicate = GEN_PREDICATE_NONE;
> +      p->curr.noMask = 1;
> +      p->MOV(header, GenRegister::f8grf(0,0));
> +
> +      // Update the header with the current address
> +      p->curr.execWidth = 1;
> +      const uint32_t nr = header.nr;
> +      const uint32_t subnr = header.subnr / sizeof(float);
> +      p->SHR(GenRegister::ud1grf(nr, subnr+2), first,
> GenRegister::immud(4));
> +      //p->MOV(GenRegister::ud1grf(nr, subnr+2), first);
> +
> +      // Put zero in the general state base address
> +      p->MOV(GenRegister::f1grf(nr, subnr+5), GenRegister::immf(0));
> +
> +    p->pop();
> +    // Now read the data
> +    p->OBREAD(dst, header, insn.getbti(), insn.extra.elem);
> +  }
> +
> +  void GenContext::emitOBWriteInstruction(const SelectionInstruction
> &insn) {
> +    const GenRegister addr = ra->genReg(insn.src(2));
> +    const GenRegister first = GenRegister::ud1grf(addr.nr
> ,addr.subnr/sizeof(float));
> +    GenRegister header;
> +    if (simdWidth == 8)
> +      header = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_F);
> +    else
> +      header =
> GenRegister::retype(GenRegister::Qn(ra->genReg(insn.src(0)),1), GEN_TYPE_F);
> +
> +    p->push();
> +      // Copy r0 into the header first
> +      p->curr.execWidth = 8;
> +      p->curr.predicate = GEN_PREDICATE_NONE;
> +      p->curr.noMask = 1;
> +      p->MOV(header, GenRegister::f8grf(0,0));
> +
> +      // Update the header with the current address
> +      p->curr.execWidth = 1;
> +      const uint32_t nr = header.nr;
> +      const uint32_t subnr = header.subnr / sizeof(float);
> +      p->SHR(GenRegister::ud1grf(nr, subnr+2), first,
> GenRegister::immud(4));
> +
> +      // Put zero in the general state base address
> +      p->MOV(GenRegister::f1grf(nr, subnr+5), GenRegister::immf(0));
> +
> +    p->pop();
> +    // Now write the data
> +    p->OBWRITE(header, insn.getbti(), insn.extra.elem);
> +  }
> +
> +
>    BVAR(OCL_OUTPUT_REG_ALLOC, false);
>    BVAR(OCL_OUTPUT_ASM, false);
>
> diff --git a/backend/src/backend/gen_context.hpp
> b/backend/src/backend/gen_context.hpp
> index 4c43ccb..56a5ec2 100644
> --- a/backend/src/backend/gen_context.hpp
> +++ b/backend/src/backend/gen_context.hpp
> @@ -187,6 +187,8 @@ namespace gbe
>      void scratchRead(const GenRegister dst, const GenRegister header,
> uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t
> channel_mode);
>      unsigned beforeMessage(const SelectionInstruction &insn, GenRegister
> bti, GenRegister flagTemp, GenRegister btiTmp, unsigned desc);
>      void afterMessage(const SelectionInstruction &insn, GenRegister bti,
> GenRegister flagTemp, GenRegister btiTmp, unsigned jip0);
> +    virtual void emitOBReadInstruction(const SelectionInstruction &insn);
> +    virtual void emitOBWriteInstruction(const SelectionInstruction &insn);
>
>      /*! Implements base class */
>      virtual Kernel *allocateKernel(void);
> diff --git a/backend/src/backend/gen_encoder.cpp
> b/backend/src/backend/gen_encoder.cpp
> index 31afa67..fc7b5cf 100644
> --- a/backend/src/backend/gen_encoder.cpp
> +++ b/backend/src/backend/gen_encoder.cpp
> @@ -258,7 +258,7 @@ namespace gbe
>      else
>        NOT_SUPPORTED;
>    }
> -#if 0
> +
>    static void setOBlockRW(GenEncoder *p,
>                            GenNativeInstruction *insn,
>                            uint32_t bti,
> @@ -272,10 +272,10 @@ namespace gbe
>      assert(size == 2 || size == 4);
>      insn->bits3.gen7_oblock_rw.msg_type = msg_type;
>      insn->bits3.gen7_oblock_rw.bti = bti;
> +    GBE_ASSERT(size == 2 || size == 4);
>      insn->bits3.gen7_oblock_rw.block_size = size == 2 ? 2 : 3;
>      insn->bits3.gen7_oblock_rw.header_present = 1;
>    }
> -#endif
>
>    static void setDWordScatterMessgae(GenEncoder *p,
>                                       GenNativeInstruction *insn,
> @@ -1244,6 +1244,40 @@ namespace gbe
>       setScratchMessage(this, insn, offset, block_size, channel_mode,
> GEN_SCRATCH_READ, 1, dst_num);
>    }
>
> +  void GenEncoder::OBREAD(GenRegister dst, GenRegister header, uint32_t
> bti, uint32_t size) {
> +    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> +    const uint32_t msg_length = 1;
> +    const uint32_t response_length = size / 2; // Size is in owords
> +    this->setHeader(insn);
> +    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
> +    this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
> +    this->setSrc1(insn, GenRegister::immud(0));
> +    setOBlockRW(this,
> +                insn,
> +                bti,
> +                size,
> +                GEN7_OBLOCK_READ,
> +                msg_length,
> +                response_length);
> +  }
> +
> +  void GenEncoder::OBWRITE(GenRegister header, uint32_t bti, uint32_t
> size) {
> +    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> +    const uint32_t msg_length = 1 + size / 2; // Size is in owords
> +    const uint32_t response_length = 0;
> +    this->setHeader(insn);
> +    this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
> +    this->setSrc1(insn, GenRegister::immud(0));
> +    this->setDst(insn, GenRegister::retype(GenRegister::null(),
> GEN_TYPE_UW));
> +    setOBlockRW(this,
> +                insn,
> +                bti,
> +                size,
> +                GEN7_OBLOCK_WRITE,
> +                msg_length,
> +                response_length);
> +  }
> +
>    void GenEncoder::EOT(uint32_t msg) {
>      GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
>      this->setDst(insn, GenRegister::retype(GenRegister::null(),
> GEN_TYPE_UD));
> diff --git a/backend/src/backend/gen_encoder.hpp
> b/backend/src/backend/gen_encoder.hpp
> index 0239293..a53c879 100644
> --- a/backend/src/backend/gen_encoder.hpp
> +++ b/backend/src/backend/gen_encoder.hpp
> @@ -267,6 +267,10 @@ namespace gbe
>      virtual bool canHandleLong(uint32_t opcode, GenRegister dst,
> GenRegister src0,
>                              GenRegister src1 = GenRegister::null());
>      virtual void handleDouble(GenEncoder *p, uint32_t opcode, GenRegister
> dst, GenRegister src0, GenRegister src1 = GenRegister::null());
> +    /*! OBlock read */
> +    void OBREAD(GenRegister dst, GenRegister header, uint32_t bti,
> uint32_t elemSize);
> +    /*! OBlock write */
> +    void OBWRITE(GenRegister header, uint32_t bti, uint32_t elemSize);
>
>      GBE_CLASS(GenEncoder); //!< Use custom allocators
>      virtual void alu3(uint32_t opcode, GenRegister dst,
> diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
> b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
> index cb5c4f1..d297726 100644
> --- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
> +++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
> @@ -50,3 +50,5 @@ DECL_GEN7_SCHEDULE(StoreProfiling,  80,        1,
> 1)
>  DECL_GEN7_SCHEDULE(WorkGroupOp,     80,        1,        1)
>  DECL_GEN7_SCHEDULE(SubGroupOp,      80,        1,        1)
>  DECL_GEN7_SCHEDULE(Printf,          80,        1,        1)
> +DECL_GEN7_SCHEDULE(OBRead,          80,        1,        1)
> +DECL_GEN7_SCHEDULE(OBWrite,         80,        1,        1)
> diff --git a/backend/src/backend/gen_insn_selection.cpp
> b/backend/src/backend/gen_insn_selection.cpp
> index 596e70b..7c49242 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -188,7 +188,8 @@ namespace gbe
>             this->opcode == SEL_OP_BYTE_GATHER  ||
>             this->opcode == SEL_OP_SAMPLE ||
>             this->opcode == SEL_OP_VME ||
> -           this->opcode == SEL_OP_DWORD_GATHER;
> +           this->opcode == SEL_OP_DWORD_GATHER ||
> +           this->opcode == SEL_OP_OBREAD;
>    }
>
>    bool SelectionInstruction::modAcc(void) const {
> @@ -210,7 +211,8 @@ namespace gbe
>             this->opcode == SEL_OP_WRITE64       ||
>             this->opcode == SEL_OP_ATOMIC        ||
>             this->opcode == SEL_OP_BYTE_SCATTER  ||
> -           this->opcode == SEL_OP_TYPED_WRITE;
> +           this->opcode == SEL_OP_TYPED_WRITE ||
> +           this->opcode == SEL_OP_OBWRITE;
>    }
>
>    bool SelectionInstruction::isBranch(void) const {
> @@ -697,6 +699,11 @@ namespace gbe
>      /*! Sub Group Operations */
>      void SUBGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src,
>                        GenRegister tmpData1, GenRegister tmpData2);
> +    /*! Oblock read */
> +    void OBREAD(GenRegister dst, GenRegister addr, GenRegister header,
> uint32_t bti, uint32_t size);
> +    /*! Oblock write */
> +    void OBWRITE(GenRegister addr, GenRegister value, GenRegister header,
> uint32_t bti, uint32_t size);
> +
>      /* common functions for both binary instruction and sel_cmp and
> compare instruction.
>         It will handle the IMM or normal register assignment, and will try
> to avoid LOADI
>         as much as possible. */
> @@ -2014,6 +2021,40 @@ namespace gbe
>      insn->src(0) = src;
>      insn->src(1) = tmpData2;
>    }
> +  void Selection::Opaque::OBREAD(GenRegister dst,
> +                                 GenRegister addr,
> +                                 GenRegister header,
> +                                 uint32_t bti,
> +                                 uint32_t size) {
> +    SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, 1, 2);
> +    insn->dst(0) = dst;
> +    insn->src(0) = addr;
> +    insn->src(1) = header;
> +    insn->setbti(bti);
> +    insn->extra.elem = size / sizeof(int[4]); // number of owords
> +  }
> +
> +  void Selection::Opaque::OBWRITE(GenRegister addr,
> +                                  GenRegister value,
> +                                  GenRegister header,
> +                                  uint32_t bti,
> +                                  uint32_t size) {
> +    SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE, 0, 3);
> +    SelectionVector *vector = this->appendVector();
> +    insn->src(0) = header;
> +    insn->src(1) = value;
> +    insn->src(2) = addr;
> +    insn->state = this->curr;
> +    insn->setbti(bti);
> +    insn->extra.elem = size / sizeof(int[4]); // number of owords
> +
> +    // We need to put the header and the data together
> +    vector->regNum = 2;
> +    vector->reg = &insn->src(0);
> +    vector->offsetID = 0;
> +    vector->isSrc = 1;
> +  }
> +
>
>    // Boiler plate to initialize the selection library at c++ pre-main
>    static SelectionLibrary *selLib = NULL;
> @@ -4002,6 +4043,18 @@ extern bool OCL_DEBUGINFO; // first defined by
> calling BVAR in program.cpp
>        }
>      }
>
> +    void emitOWordRead(Selection::Opaque &sel,
> +                       const ir::LoadInstruction &insn,
> +                       GenRegister address,
> +                       ir::BTI bti) const
> +    {
> +      using namespace ir;
> +      const GenRegister header = sel.selReg(sel.reg(FAMILY_DWORD),
> TYPE_U32);
> +      const GenRegister value = sel.selReg(insn.getValue(0), TYPE_U32);
> +      const uint32_t simdWidth = sel.ctx.getSimdWidth();
> +      sel.OBREAD(value, address, header, bti.imm, simdWidth *
> sizeof(int));
> +    }
> +
>      // check whether all binded table index point to constant memory
>      INLINE bool isAllConstant(const ir::BTI &bti) const {
>        if (bti.isConst && bti.imm == BTI_CONSTANT)
> @@ -4037,7 +4090,9 @@ extern bool OCL_DEBUGINFO; // first defined by
> calling BVAR in program.cpp
>        const uint32_t elemSize = getByteScatterGatherSize(sel, type);
>        bool allConstant = isAllConstant(bti);
>
> -      if (allConstant) {
> +      if (insn.isBlock())
> +        this->emitOWordRead(sel, insn, address, bti);
> +      else if (allConstant) {
>          // XXX TODO read 64bit constant through constant cache
>          // Per HW Spec, constant cache messages can read at least DWORD
> data.
>          // So, byte/short data type, we have to read through data cache.
> @@ -4164,6 +4219,18 @@ extern bool OCL_DEBUGINFO; // first defined by
> calling BVAR in program.cpp
>        }
>      }
>
> +    void emitOWordWrite(Selection::Opaque &sel,
> +                        const ir::StoreInstruction &insn,
> +                        GenRegister address,
> +                        ir::BTI bti) const
> +    {
> +      using namespace ir;
> +      const GenRegister header = sel.selReg(sel.reg(FAMILY_DWORD),
> TYPE_U32);
> +      const GenRegister value = sel.selReg(insn.getValue(0), TYPE_U32);
> +      const uint32_t simdWidth = sel.ctx.getSimdWidth();
> +      sel.OBWRITE(address, value, header, bti.imm, simdWidth *
> sizeof(int));
> +    }
> +
>      virtual bool emit(Selection::Opaque  &sel, SelectionDAG &dag) const
>      {
>        using namespace ir;
> @@ -4185,7 +4252,9 @@ extern bool OCL_DEBUGINFO; // first defined by
> calling BVAR in program.cpp
>          assert(0 && "stateless not supported yet");
>        }
>
> -      if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
> +      if (insn.isBlock())
> +        this->emitOWordWrite(sel, insn, address, bti);
> +      else if (insn.isAligned() == true && elemSize ==
> GEN_BYTE_SCATTER_QWORD)
>          this->emitWrite64(sel, insn, address, bti);
>        else if (insn.isAligned() == true && elemSize ==
> GEN_BYTE_SCATTER_DWORD)
>          this->emitUntypedWrite(sel, insn, address,  bti);
> diff --git a/backend/src/backend/gen_insn_selection.hpp
> b/backend/src/backend/gen_insn_selection.hpp
> index 8d2e1da..51af686 100644
> --- a/backend/src/backend/gen_insn_selection.hpp
> +++ b/backend/src/backend/gen_insn_selection.hpp
> @@ -175,6 +175,8 @@ namespace gbe
>      INLINE uint32_t getbti() const {
>        GBE_ASSERT(isRead() || isWrite());
>        switch (opcode) {
> +        case SEL_OP_OBREAD:
> +        case SEL_OP_OBWRITE:
>          case SEL_OP_DWORD_GATHER: return extra.function;
>          case SEL_OP_SAMPLE: return extra.rdbti;
>          case SEL_OP_VME: return extra.vme_bti;
> @@ -188,6 +190,8 @@ namespace gbe
>      INLINE void setbti(uint32_t bti) {
>        GBE_ASSERT(isRead() || isWrite());
>        switch (opcode) {
> +        case SEL_OP_OBREAD:
> +        case SEL_OP_OBWRITE:
>          case SEL_OP_DWORD_GATHER: extra.function = bti; return;
>          case SEL_OP_SAMPLE: extra.rdbti = bti; return;
>          case SEL_OP_VME: extra.vme_bti = bti; return;
> diff --git a/backend/src/backend/gen_insn_selection.hxx
> b/backend/src/backend/gen_insn_selection.hxx
> index 0e11f9f..4a7caff 100644
> --- a/backend/src/backend/gen_insn_selection.hxx
> +++ b/backend/src/backend/gen_insn_selection.hxx
> @@ -96,3 +96,5 @@ DECL_SELECTION_IR(STORE_PROFILING,
> StoreProfilingInstruction)
>  DECL_SELECTION_IR(WORKGROUP_OP, WorkGroupOpInstruction)
>  DECL_SELECTION_IR(SUBGROUP_OP, SubGroupOpInstruction)
>  DECL_SELECTION_IR(PRINTF, PrintfInstruction)
> +DECL_SELECTION_IR(OBREAD, OBReadInstruction)
> +DECL_SELECTION_IR(OBWRITE, OBWriteInstruction)
> diff --git a/backend/src/ir/instruction.cpp
> b/backend/src/ir/instruction.cpp
> index 47606b2..88491a7 100644
> --- a/backend/src/ir/instruction.cpp
> +++ b/backend/src/ir/instruction.cpp
> @@ -483,10 +483,12 @@ namespace ir {
>                          AddressSpace AS,
>                          uint32_t _valueNum,
>                          bool dwAligned,
> -                        AddressMode AM)
> +                        AddressMode AM,
> +                        bool ifBlock = false)
>                        : MemInstruction(AM, AS, dwAligned, type, offset),
>                          valueNum(_valueNum),
> -                        values(dstValues)
> +                        values(dstValues),
> +                        ifBlock(ifBlock)
>          {
>            this->opcode = OP_LOAD;
>          }
> @@ -519,9 +521,11 @@ namespace ir {
>          }
>          INLINE bool wellFormed(const Function &fn, std::string &why)
> const;
>          INLINE void out(std::ostream &out, const Function &fn) const;
> +        INLINE bool isBlock() const { return ifBlock; }
>
>          uint8_t         valueNum;
>          Tuple             values;
> +        bool             ifBlock;
>      };
>      class ALIGNED_INSTRUCTION StoreInstruction :
>        public MemInstruction,
> @@ -534,12 +538,14 @@ namespace ir {
>                           AddressSpace addrSpace,
>                           uint32_t valueNum,
>                           bool dwAligned,
> -                         AddressMode AM)
> +                         AddressMode AM,
> +                         bool ifBlock = false)
>            : MemInstruction(AM, addrSpace, dwAligned, type, offset)
>          {
>            this->opcode = OP_STORE;
>            this->values = values;
>            this->valueNum = valueNum;
> +          this->ifBlock = ifBlock;
>          }
>          INLINE unsigned getValueNum()      const { return valueNum; }
>          INLINE Register getValue(const Function &fn, unsigned id) const {
> @@ -565,9 +571,12 @@ namespace ir {
>          }
>          INLINE bool wellFormed(const Function &fn, std::string &why)
> const;
>          INLINE void out(std::ostream &out, const Function &fn) const;
> +        INLINE bool isBlock() const { return ifBlock; }
> +
>          Register      dst[0];
>          uint8_t     valueNum;
>          Tuple         values;
> +        bool         ifBlock;
>      };
>
>      class ALIGNED_INSTRUCTION SampleInstruction : // TODO
> @@ -1655,6 +1664,8 @@ namespace ir {
>      }
>
>      INLINE void LoadInstruction::out(std::ostream &out, const Function
> &fn) const {
> +      if(ifBlock)
> +        out<< "BLOCK";
>        this->outOpcode(out);
>        out << "." << type << "." << AS << (dwAligned ? "." : ".un") <<
> "aligned";
>        out << " {";
> @@ -1672,6 +1683,8 @@ namespace ir {
>      }
>
>      INLINE void StoreInstruction::out(std::ostream &out, const Function
> &fn) const {
> +      if(ifBlock)
> +        out<< "BLOCK";
>        this->outOpcode(out);
>        out << "." << type << "." << AS << (dwAligned ? "." : ".un") <<
> "aligned";
>        out << " %" << this->getSrc(fn, 0) << " {";
> @@ -2221,7 +2234,9 @@ DECL_MEM_FN(MemInstruction, bool,
>  isAligned(void), isAligned())
>  DECL_MEM_FN(MemInstruction, unsigned, getAddressIndex(void),
> getAddressIndex())
>  DECL_MEM_FN(AtomicInstruction, AtomicOps, getAtomicOpcode(void),
> getAtomicOpcode())
>  DECL_MEM_FN(StoreInstruction, uint32_t, getValueNum(void), getValueNum())
> +DECL_MEM_FN(StoreInstruction, bool, isBlock(void), isBlock())
>  DECL_MEM_FN(LoadInstruction, uint32_t, getValueNum(void), getValueNum())
> +DECL_MEM_FN(LoadInstruction, bool, isBlock(void), isBlock())
>  DECL_MEM_FN(LoadImmInstruction, Type, getType(void), getType())
>  DECL_MEM_FN(LabelInstruction, LabelIndex, getLabelIndex(void),
> getLabelIndex())
>  DECL_MEM_FN(BranchInstruction, bool, isPredicated(void), isPredicated())
> @@ -2475,9 +2490,10 @@ DECL_MEM_FN(MemInstruction, void,
>  setBtiReg(Register reg), setBtiReg(reg))
>                     uint32_t valueNum, \
>                     bool dwAligned, \
>                     AddressMode AM, \
> -                   unsigned SurfaceIndex) \
> +                   unsigned SurfaceIndex, \
> +                   bool isBlock) \
>    { \
> -    internal::CLASS insn =
> internal::CLASS(type,tuple,offset,space,valueNum,dwAligned,AM); \
> +    internal::CLASS insn =
> internal::CLASS(type,tuple,offset,space,valueNum,dwAligned,AM, isBlock); \
>      insn.setSurfaceIndex(SurfaceIndex);\
>      return insn.convert(); \
>    } \
> diff --git a/backend/src/ir/instruction.hpp
> b/backend/src/ir/instruction.hpp
> index 799a7bf..4a5811b 100644
> --- a/backend/src/ir/instruction.hpp
> +++ b/backend/src/ir/instruction.hpp
> @@ -356,6 +356,8 @@ namespace ir {
>      }
>      /*! Return true if the given instruction is an instance of this class
> */
>      static bool isClassOf(const Instruction &insn);
> +    /*! Return true if the given instruction is a block write */
> +    bool isBlock() const;
>    };
>
>    /*! Load instruction. The source is simply the address where to get the
> data.
> @@ -372,6 +374,8 @@ namespace ir {
>      }
>      /*! Return true if the given instruction is an instance of this class
> */
>      static bool isClassOf(const Instruction &insn);
> +    /*! Return true if the given instruction is a block read */
> +    bool isBlock() const;
>    };
>
>    /*! Load immediate instruction loads an typed immediate value into the
> given
> @@ -827,10 +831,10 @@ namespace ir {
>    /*! ret */
>    Instruction RET(void);
>    /*! load.type.space {dst1,...,dst_valueNum} offset value, {bti} */
> -  Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace
> space, uint32_t valueNum, bool dwAligned, AddressMode, unsigned
> SurfaceIndex);
> +  Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace
> space, uint32_t valueNum, bool dwAligned, AddressMode, unsigned
> SurfaceIndex, bool isBlock = false);
>    Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace
> space, uint32_t valueNum, bool dwAligned, AddressMode, Register bti);
>    /*! store.type.space offset {src1,...,src_valueNum} value {bti}*/
> -  Instruction STORE(Type type, Tuple src, Register offset, AddressSpace
> space, uint32_t valueNum, bool dwAligned, AddressMode, unsigned
> SurfaceIndex);
> +  Instruction STORE(Type type, Tuple src, Register offset, AddressSpace
> space, uint32_t valueNum, bool dwAligned, AddressMode, unsigned
> SurfaceIndex, bool isBlock = false);
>    Instruction STORE(Type type, Tuple src, Register offset, AddressSpace
> space, uint32_t valueNum, bool dwAligned, AddressMode, Register bti);
>    /*! loadi.type dst value */
>    Instruction LOADI(Type type, Register dst, ImmediateIndex value);
> diff --git a/backend/src/ir/liveness.cpp b/backend/src/ir/liveness.cpp
> index d48f067..3162d13 100644
> --- a/backend/src/ir/liveness.cpp
> +++ b/backend/src/ir/liveness.cpp
> @@ -117,11 +117,16 @@ namespace ir {
>          if (insn.getOpcode() == ir::OP_SIMD_ID)
>            uniform = false;
>
> +        // do not change dst uniform for block read
> +        if (insn.getOpcode() == ir::OP_LOAD &&
> ir::cast<ir::LoadInstruction>(insn).isBlock())
> +          uniform = false;
> +
>          for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
>            const Register reg = insn.getSrc(srcID);
>            if (!fn.isUniformRegister(reg))
>              uniform = false;
>          }
> +
>          // A destination is a killed value
>          for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
>            const Register reg = insn.getDst(dstID);
> diff --git a/backend/src/libocl/CMakeLists.txt
> b/backend/src/libocl/CMakeLists.txt
> index 1d1ec68..83e767c 100644
> --- a/backend/src/libocl/CMakeLists.txt
> +++ b/backend/src/libocl/CMakeLists.txt
> @@ -182,7 +182,7 @@ MACRO(ADD_LL_TO_BC_TARGET M)
>         )
>  ENDMACRO(ADD_LL_TO_BC_TARGET)
>
> -SET (OCL_LL_MODULES ocl_barrier ocl_clz)
> +SET (OCL_LL_MODULES ocl_barrier ocl_clz ocl_substore)
>  FOREACH(f ${OCL_LL_MODULES})
>      COPY_THE_LL(${f})
>      ADD_LL_TO_BC_TARGET(${f})
> diff --git a/backend/src/libocl/src/ocl_substore.ll
> b/backend/src/libocl/src/ocl_substore.ll
> new file mode 100644
> index 0000000..665cdfa
> --- /dev/null
> +++ b/backend/src/libocl/src/ocl_substore.ll
> @@ -0,0 +1,9 @@
> +target datalayout =
> "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
> +target triple = "spir"
> +
> +declare void @__gen_ocl_sub_group_block_write_mem(i32 addrspace(1)*
> nocapture, i32) nounwind alwaysinline noduplicate
> +
> +define void @_Z27intel_sub_group_block_writePKU3AS1jj(i32 addrspace(1)*
> %p, i32 %data) nounwind alwaysinline noduplicate {
> +  call void @__gen_ocl_sub_group_block_write_mem(i32 addrspace(1)* %p,
> i32 %data)
> +  ret void
> +}
> diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
> b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
> index a25dcef..66490cc 100644
> --- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
> +++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
> @@ -133,3 +133,57 @@ RANGE_OP(scan_exclusive, max, float, true)
>  RANGE_OP(scan_exclusive, max, double, true)
>
>  #undef RANGE_OP
> +PURE CONST uint __gen_ocl_sub_group_block_read_mem(const global uint* p);
> +OVERLOADABLE uint intel_sub_group_block_read(const global uint* p)
> +{
> +  return __gen_ocl_sub_group_block_read_mem(p);
> +}
> +OVERLOADABLE uint2 intel_sub_group_block_read2(const global uint* p)
> +{
> +  return (uint2)(intel_sub_group_block_read(p),
> +                 intel_sub_group_block_read(p + get_simd_size()));
> +}
> +OVERLOADABLE uint4 intel_sub_group_block_read4(const global uint* p)
> +{
> +  return (uint4)(intel_sub_group_block_read(p),
> +                 intel_sub_group_block_read(p + get_simd_size()),
> +                 intel_sub_group_block_read(p + get_simd_size() * 2),
> +                 intel_sub_group_block_read(p + get_simd_size() * 3));
> +
> +}
> +OVERLOADABLE uint8 intel_sub_group_block_read8(const global uint* p)
> +{
> +  return (uint8)(intel_sub_group_block_read(p),
> +                 intel_sub_group_block_read(p + get_simd_size()),
> +                 intel_sub_group_block_read(p + get_simd_size() * 2),
> +                 intel_sub_group_block_read(p + get_simd_size() * 3),
> +                 intel_sub_group_block_read(p + get_simd_size() * 4),
> +                 intel_sub_group_block_read(p + get_simd_size() * 5),
> +                 intel_sub_group_block_read(p + get_simd_size() * 6),
> +                 intel_sub_group_block_read(p + get_simd_size() * 7));
> +}
> +
> +OVERLOADABLE void intel_sub_group_block_write2(const global uint* p,
> uint2 data)
> +{
> +  intel_sub_group_block_write(p, data.s0);
> +  intel_sub_group_block_write(p + get_simd_size(), data.s1);
> +}
> +OVERLOADABLE void intel_sub_group_block_write4(const global uint* p,uint4
> data)
> +{
> +  intel_sub_group_block_write(p, data.s0);
> +  intel_sub_group_block_write(p + get_simd_size(), data.s1);
> +  intel_sub_group_block_write(p + get_simd_size() * 2, data.s2);
> +  intel_sub_group_block_write(p + get_simd_size() * 3, data.s3);
> +
> +}
> +OVERLOADABLE void intel_sub_group_block_write8(const global uint* p,uint8
> data)
> +{
> +  intel_sub_group_block_write(p, data.s0);
> +  intel_sub_group_block_write(p + get_simd_size(), data.s1);
> +  intel_sub_group_block_write(p + get_simd_size() * 2, data.s2);
> +  intel_sub_group_block_write(p + get_simd_size() * 3, data.s3);
> +  intel_sub_group_block_write(p + get_simd_size() * 4, data.s4);
> +  intel_sub_group_block_write(p + get_simd_size() * 5, data.s5);
> +  intel_sub_group_block_write(p + get_simd_size() * 6, data.s6);
> +  intel_sub_group_block_write(p + get_simd_size() * 7, data.s7);
> +}
> diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.h
> b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
> index 355ee30..d0676be 100644
> --- a/backend/src/libocl/tmpl/ocl_simd.tmpl.h
> +++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
> @@ -132,3 +132,14 @@ OVERLOADABLE double
> sub_group_scan_exclusive_max(double x);
>  OVERLOADABLE float intel_sub_group_shuffle(float x, uint c);
>  OVERLOADABLE int intel_sub_group_shuffle(int x, uint c);
>  OVERLOADABLE uint intel_sub_group_shuffle(uint x, uint c);
> +
> +/* block read/write */
> +OVERLOADABLE uint intel_sub_group_block_read(const global uint* p);
> +OVERLOADABLE uint2 intel_sub_group_block_read2(const global uint* p);
> +OVERLOADABLE uint4 intel_sub_group_block_read4(const global uint* p);
> +OVERLOADABLE uint8 intel_sub_group_block_read8(const global uint* p);
> +
> +OVERLOADABLE void intel_sub_group_block_write(const __global uint* p,
> uint data);
> +OVERLOADABLE void intel_sub_group_block_write2(const __global uint* p,
> uint2 data);
> +OVERLOADABLE void intel_sub_group_block_write4(const __global uint* p,
> uint4 data);
> +OVERLOADABLE void intel_sub_group_block_write8(const __global uint* p,
> uint8 data);
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp
> b/backend/src/llvm/llvm_gen_backend.cpp
> index 3ddbfcc..e77290f 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -697,6 +697,8 @@ namespace gbe
>      void emitWorkGroupInst(CallInst &I, CallSite &CS, ir::WorkGroupOps
> opcode);
>      // Emit subgroup instructions
>      void emitSubGroupInst(CallInst &I, CallSite &CS, ir::WorkGroupOps
> opcode);
> +    // Emit subgroup block read/write memory instructions
> +    void emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool
> isWrite);
>
>      uint8_t appendSampler(CallSite::arg_iterator AI);
>      uint8_t getImageID(CallInst &I);
> @@ -3730,6 +3732,9 @@ namespace gbe
>        case GEN_OCL_LRP:
>          this->newRegister(&I);
>          break;
> +      case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM:
> +        this->newRegister(&I, NULL, false);
> +        break;
>        case GEN_OCL_PRINTF:
>          this->newRegister(&I);  // fall through
>        case GEN_OCL_PUTS:
> @@ -3744,6 +3749,7 @@ namespace gbe
>        case GEN_OCL_CALC_TIMESTAMP:
>        case GEN_OCL_STORE_PROFILING:
>        case GEN_OCL_DEBUGWAIT:
> +      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM:
>          break;
>        case GEN_OCL_NOT_FOUND:
>        default:
> @@ -3938,6 +3944,61 @@ namespace gbe
>      GBE_ASSERT(AI == AE);
>    }
>
> +  void GenWriter::emitBlockReadWriteMemInst(CallInst &I, CallSite &CS,
> bool isWrite) {
> +    CallSite::arg_iterator AI = CS.arg_begin();
> +    CallSite::arg_iterator AE = CS.arg_end();
> +    GBE_ASSERT(AI != AE);
> +
> +    Value *llvmPtr = *(AI++);
> +    Value *llvmValues;
> +    ir::AddressSpace addrSpace =
> addressSpaceLLVMToGen(llvmPtr->getType()->getPointerAddressSpace());
> +    GBE_ASSERT(addrSpace == ir::MEM_GLOBAL);
> +    ir::Register pointer = this->getRegister(llvmPtr);
> +
> +    ir::Register ptr;
> +    ir::Register btiReg;
> +    unsigned SurfaceIndex = 0xff;
> +
> +    ir::AddressMode AM;
> +    if (legacyMode) {
> +      Value *bti = getBtiRegister(llvmPtr);
> +      Value *ptrBase = getPointerBase(llvmPtr);
> +      ir::Register baseReg = this->getRegister(ptrBase);
> +      if (isa<ConstantInt>(bti)) {
> +        AM = ir::AM_StaticBti;
> +        SurfaceIndex = cast<ConstantInt>(bti)->getZExtValue();
> +        addrSpace = btiToGen(SurfaceIndex);
> +      } else {
> +        AM = ir::AM_DynamicBti;
> +        addrSpace = ir::MEM_MIXED;
> +        btiReg = this->getRegister(bti);
> +      }
> +      const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
> +      ptr = ctx.reg(pointerFamily);
> +      ctx.SUB(ir::TYPE_U32, ptr, pointer, baseReg);
> +    } else {
> +      AM = ir::AM_Stateless;
> +      ptr = pointer;
> +    }
> +
> +    ir::Type type = ir::TYPE_U32;
> +    GBE_ASSERT(AM != ir::AM_DynamicBti);
> +
> +    if(isWrite){
> +      llvmValues = *(AI++);
> +      const ir::Register values = getRegister(llvmValues);
> +      const ir::Tuple tuple = ctx.arrayTuple(&values, 1);
> +      ctx.STORE(type, tuple, ptr, addrSpace, 1, true, AM, SurfaceIndex,
> true);
> +    } else {
> +      llvmValues = &I;
> +      const ir::Register values = getRegister(llvmValues);
> +      const ir::Tuple tuple = ctx.arrayTuple(&values, 1);
> +      ctx.LOAD(type, tuple, ptr, addrSpace, 1, true, AM, SurfaceIndex,
> true);
> +    }
> +
> +    GBE_ASSERT(AI == AE);
> +  }
> +
>    /* append a new sampler. should be called before any reference to
>     * a sampler_t value. */
>    uint8_t GenWriter::appendSampler(CallSite::arg_iterator AI) {
> @@ -4762,6 +4823,10 @@ namespace gbe
>              ctx.LRP(ir::TYPE_FLOAT, dst, src0, src1, src2);
>              break;
>            }
> +          case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM:
> +            this->emitBlockReadWriteMemInst(I, CS, false); break;
> +          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM:
> +            this->emitBlockReadWriteMemInst(I, CS, true); break;
>            default: break;
>          }
>        }
> diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx
> b/backend/src/llvm/llvm_gen_ocl_function.hxx
> index 213ead0..003be91 100644
> --- a/backend/src/llvm/llvm_gen_ocl_function.hxx
> +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
> @@ -202,7 +202,7 @@ DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_MIN,
> __gen_ocl_work_group_scan_
>  DECL_LLVM_GEN_FUNCTION(WORK_GROUP_ALL, __gen_ocl_work_group_all)
>  DECL_LLVM_GEN_FUNCTION(WORK_GROUP_ANY, __gen_ocl_work_group_any)
>
> -// work group function
> +// sub group function
>  DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BROADCAST, __gen_ocl_sub_group_broadcast)
>
>  DECL_LLVM_GEN_FUNCTION(SUB_GROUP_REDUCE_ADD,
> __gen_ocl_sub_group_reduce_add)
> @@ -217,5 +217,8 @@ DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_ADD,
> __gen_ocl_sub_group_scan_in
>  DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_MAX,
> __gen_ocl_sub_group_scan_inclusive_max)
>  DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_MIN,
> __gen_ocl_sub_group_scan_inclusive_min)
>
> +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM,
> __gen_ocl_sub_group_block_read_mem)
> +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM,
> __gen_ocl_sub_group_block_write_mem)
> +
>  // common function
>  DECL_LLVM_GEN_FUNCTION(LRP, __gen_ocl_lrp)
> --
> 2.7.4
>
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/beignet
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/beignet/attachments/20160525/91dc4493/attachment-0001.html>


More information about the Beignet mailing list