[Beignet] [PATCH 1/2] Backend: Add intel_sub_group_block_read/write from buffer
Andrew Lavin
aj.lavin at gmail.com
Wed May 25 21:44:58 UTC 2016
This is great stuff. Will we also get the intel_sub_group_block_* functions
that take an image2d_t argument?
Thanks,
Andrew
On Thu, May 19, 2016 at 2:55 PM, Xiuli Pan <xiuli.pan at intel.com> wrote:
> From: Pan Xiuli <xiuli.pan at intel.com>
>
> Using OWORD_BLOCK_RW to read/write a block of data for a thread.
>
> Signed-off-by: Pan Xiuli <xiuli.pan at intel.com>
> ---
> backend/src/backend/gen/gen_mesa_disasm.c | 15 +++++
> backend/src/backend/gen_context.cpp | 63 ++++++++++++++++++
> backend/src/backend/gen_context.hpp | 2 +
> backend/src/backend/gen_encoder.cpp | 38 ++++++++++-
> backend/src/backend/gen_encoder.hpp | 4 ++
> .../src/backend/gen_insn_gen7_schedule_info.hxx | 2 +
> backend/src/backend/gen_insn_selection.cpp | 77
> ++++++++++++++++++++--
> backend/src/backend/gen_insn_selection.hpp | 4 ++
> backend/src/backend/gen_insn_selection.hxx | 2 +
> backend/src/ir/instruction.cpp | 26 ++++++--
> backend/src/ir/instruction.hpp | 8 ++-
> backend/src/ir/liveness.cpp | 5 ++
> backend/src/libocl/CMakeLists.txt | 2 +-
> backend/src/libocl/src/ocl_substore.ll | 9 +++
> backend/src/libocl/tmpl/ocl_simd.tmpl.cl | 54 +++++++++++++++
> backend/src/libocl/tmpl/ocl_simd.tmpl.h | 11 ++++
> backend/src/llvm/llvm_gen_backend.cpp | 65 ++++++++++++++++++
> backend/src/llvm/llvm_gen_ocl_function.hxx | 5 +-
> 18 files changed, 377 insertions(+), 15 deletions(-)
> create mode 100644 backend/src/libocl/src/ocl_substore.ll
>
> diff --git a/backend/src/backend/gen/gen_mesa_disasm.c
> b/backend/src/backend/gen/gen_mesa_disasm.c
> index 067ddd8..9200c26 100644
> --- a/backend/src/backend/gen/gen_mesa_disasm.c
> +++ b/backend/src/backend/gen/gen_mesa_disasm.c
> @@ -432,6 +432,14 @@ static const char *data_port_data_cache_category[] = {
> "scratch",
> };
>
> +static const char *data_port_data_cache_block_size[] = {
> + "1 OWORD LOW",
> + "1 OWORD HIGH",
> + "2 OWORD",
> + "4 OWORD",
> + "8 OWORD",
> +};
> +
> static const char *data_port_scratch_block_size[] = {
> "1 register",
> "2 registers",
> @@ -576,6 +584,7 @@ static int gen_version;
> #define MSG_GW_ACKREQ(inst) GEN_BITS_FIELD(inst,
> bits3.gen7_msg_gw.ackreq)
> #define GENERIC_MSG_LENGTH(inst) GEN_BITS_FIELD(inst,
> bits3.generic_gen5.msg_length)
> #define GENERIC_RESPONSE_LENGTH(inst) GEN_BITS_FIELD(inst,
> bits3.generic_gen5.response_length)
> +#define OWORD_RW_BLOCK_SIZE(inst) GEN_BITS_FIELD(inst,
> bits3.gen7_oblock_rw.block_size)
>
> static int is_special_acc(const void* inst)
> {
> @@ -1483,6 +1492,12 @@ int gen_disasm (FILE *file, const void *inst,
> uint32_t deviceID, uint32_t compac
>
> data_port_data_cache_byte_scattered_simd_mode[BYTE_RW_SIMD_MODE(inst)],
>
> data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
>
> data_port_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
> + else if(UNTYPED_RW_MSG_TYPE(inst) == 0 ||
> UNTYPED_RW_MSG_TYPE(inst) == 8)
> + format(file, " (bti: %d, data size: %s, %s, %s)",
> + UNTYPED_RW_BTI(inst),
> +
> data_port_data_cache_block_size[OWORD_RW_BLOCK_SIZE(inst)],
> +
> data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
> +
> data_port_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
> else
> format(file, " not implemented");
> } else {
> diff --git a/backend/src/backend/gen_context.cpp
> b/backend/src/backend/gen_context.cpp
> index 4d0a3f3..cfb8be1 100644
> --- a/backend/src/backend/gen_context.cpp
> +++ b/backend/src/backend/gen_context.cpp
> @@ -3487,6 +3487,69 @@ namespace gbe
> p->pop();
> }
>
> + void GenContext::emitOBReadInstruction(const SelectionInstruction
> &insn) {
> + const GenRegister dst = ra->genReg(insn.dst(0));
> + const GenRegister addr = ra->genReg(insn.src(0));
> + const GenRegister first = GenRegister::ud1grf(addr.nr
> ,addr.subnr/sizeof(float));
> + GenRegister header;
> + if (simdWidth == 8)
> + header = GenRegister::retype(ra->genReg(insn.src(1)), GEN_TYPE_F);
> + else
> + header =
> GenRegister::retype(GenRegister::Qn(ra->genReg(insn.src(1)),1), GEN_TYPE_F);
> +
> + p->push();
> + // Copy r0 into the header first
> + p->curr.execWidth = 8;
> + p->curr.predicate = GEN_PREDICATE_NONE;
> + p->curr.noMask = 1;
> + p->MOV(header, GenRegister::f8grf(0,0));
> +
> + // Update the header with the current address
> + p->curr.execWidth = 1;
> + const uint32_t nr = header.nr;
> + const uint32_t subnr = header.subnr / sizeof(float);
> + p->SHR(GenRegister::ud1grf(nr, subnr+2), first,
> GenRegister::immud(4));
> + //p->MOV(GenRegister::ud1grf(nr, subnr+2), first);
> +
> + // Put zero in the general state base address
> + p->MOV(GenRegister::f1grf(nr, subnr+5), GenRegister::immf(0));
> +
> + p->pop();
> + // Now read the data
> + p->OBREAD(dst, header, insn.getbti(), insn.extra.elem);
> + }
> +
> + void GenContext::emitOBWriteInstruction(const SelectionInstruction
> &insn) {
> + const GenRegister addr = ra->genReg(insn.src(2));
> + const GenRegister first = GenRegister::ud1grf(addr.nr
> ,addr.subnr/sizeof(float));
> + GenRegister header;
> + if (simdWidth == 8)
> + header = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_F);
> + else
> + header =
> GenRegister::retype(GenRegister::Qn(ra->genReg(insn.src(0)),1), GEN_TYPE_F);
> +
> + p->push();
> + // Copy r0 into the header first
> + p->curr.execWidth = 8;
> + p->curr.predicate = GEN_PREDICATE_NONE;
> + p->curr.noMask = 1;
> + p->MOV(header, GenRegister::f8grf(0,0));
> +
> + // Update the header with the current address
> + p->curr.execWidth = 1;
> + const uint32_t nr = header.nr;
> + const uint32_t subnr = header.subnr / sizeof(float);
> + p->SHR(GenRegister::ud1grf(nr, subnr+2), first,
> GenRegister::immud(4));
> +
> + // Put zero in the general state base address
> + p->MOV(GenRegister::f1grf(nr, subnr+5), GenRegister::immf(0));
> +
> + p->pop();
> + // Now write the data
> + p->OBWRITE(header, insn.getbti(), insn.extra.elem);
> + }
> +
> +
> BVAR(OCL_OUTPUT_REG_ALLOC, false);
> BVAR(OCL_OUTPUT_ASM, false);
>
> diff --git a/backend/src/backend/gen_context.hpp
> b/backend/src/backend/gen_context.hpp
> index 4c43ccb..56a5ec2 100644
> --- a/backend/src/backend/gen_context.hpp
> +++ b/backend/src/backend/gen_context.hpp
> @@ -187,6 +187,8 @@ namespace gbe
> void scratchRead(const GenRegister dst, const GenRegister header,
> uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t
> channel_mode);
> unsigned beforeMessage(const SelectionInstruction &insn, GenRegister
> bti, GenRegister flagTemp, GenRegister btiTmp, unsigned desc);
> void afterMessage(const SelectionInstruction &insn, GenRegister bti,
> GenRegister flagTemp, GenRegister btiTmp, unsigned jip0);
> + virtual void emitOBReadInstruction(const SelectionInstruction &insn);
> + virtual void emitOBWriteInstruction(const SelectionInstruction &insn);
>
> /*! Implements base class */
> virtual Kernel *allocateKernel(void);
> diff --git a/backend/src/backend/gen_encoder.cpp
> b/backend/src/backend/gen_encoder.cpp
> index 31afa67..fc7b5cf 100644
> --- a/backend/src/backend/gen_encoder.cpp
> +++ b/backend/src/backend/gen_encoder.cpp
> @@ -258,7 +258,7 @@ namespace gbe
> else
> NOT_SUPPORTED;
> }
> -#if 0
> +
> static void setOBlockRW(GenEncoder *p,
> GenNativeInstruction *insn,
> uint32_t bti,
> @@ -272,10 +272,10 @@ namespace gbe
> assert(size == 2 || size == 4);
> insn->bits3.gen7_oblock_rw.msg_type = msg_type;
> insn->bits3.gen7_oblock_rw.bti = bti;
> + GBE_ASSERT(size == 2 || size == 4);
> insn->bits3.gen7_oblock_rw.block_size = size == 2 ? 2 : 3;
> insn->bits3.gen7_oblock_rw.header_present = 1;
> }
> -#endif
>
> static void setDWordScatterMessgae(GenEncoder *p,
> GenNativeInstruction *insn,
> @@ -1244,6 +1244,40 @@ namespace gbe
> setScratchMessage(this, insn, offset, block_size, channel_mode,
> GEN_SCRATCH_READ, 1, dst_num);
> }
>
> + void GenEncoder::OBREAD(GenRegister dst, GenRegister header, uint32_t
> bti, uint32_t size) {
> + GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> + const uint32_t msg_length = 1;
> + const uint32_t response_length = size / 2; // Size is in owords
> + this->setHeader(insn);
> + this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
> + this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
> + this->setSrc1(insn, GenRegister::immud(0));
> + setOBlockRW(this,
> + insn,
> + bti,
> + size,
> + GEN7_OBLOCK_READ,
> + msg_length,
> + response_length);
> + }
> +
> + void GenEncoder::OBWRITE(GenRegister header, uint32_t bti, uint32_t
> size) {
> + GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> + const uint32_t msg_length = 1 + size / 2; // Size is in owords
> + const uint32_t response_length = 0;
> + this->setHeader(insn);
> + this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
> + this->setSrc1(insn, GenRegister::immud(0));
> + this->setDst(insn, GenRegister::retype(GenRegister::null(),
> GEN_TYPE_UW));
> + setOBlockRW(this,
> + insn,
> + bti,
> + size,
> + GEN7_OBLOCK_WRITE,
> + msg_length,
> + response_length);
> + }
> +
> void GenEncoder::EOT(uint32_t msg) {
> GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> this->setDst(insn, GenRegister::retype(GenRegister::null(),
> GEN_TYPE_UD));
> diff --git a/backend/src/backend/gen_encoder.hpp
> b/backend/src/backend/gen_encoder.hpp
> index 0239293..a53c879 100644
> --- a/backend/src/backend/gen_encoder.hpp
> +++ b/backend/src/backend/gen_encoder.hpp
> @@ -267,6 +267,10 @@ namespace gbe
> virtual bool canHandleLong(uint32_t opcode, GenRegister dst,
> GenRegister src0,
> GenRegister src1 = GenRegister::null());
> virtual void handleDouble(GenEncoder *p, uint32_t opcode, GenRegister
> dst, GenRegister src0, GenRegister src1 = GenRegister::null());
> + /*! OBlock read */
> + void OBREAD(GenRegister dst, GenRegister header, uint32_t bti,
> uint32_t elemSize);
> + /*! OBlock write */
> + void OBWRITE(GenRegister header, uint32_t bti, uint32_t elemSize);
>
> GBE_CLASS(GenEncoder); //!< Use custom allocators
> virtual void alu3(uint32_t opcode, GenRegister dst,
> diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
> b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
> index cb5c4f1..d297726 100644
> --- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
> +++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
> @@ -50,3 +50,5 @@ DECL_GEN7_SCHEDULE(StoreProfiling, 80, 1,
> 1)
> DECL_GEN7_SCHEDULE(WorkGroupOp, 80, 1, 1)
> DECL_GEN7_SCHEDULE(SubGroupOp, 80, 1, 1)
> DECL_GEN7_SCHEDULE(Printf, 80, 1, 1)
> +DECL_GEN7_SCHEDULE(OBRead, 80, 1, 1)
> +DECL_GEN7_SCHEDULE(OBWrite, 80, 1, 1)
> diff --git a/backend/src/backend/gen_insn_selection.cpp
> b/backend/src/backend/gen_insn_selection.cpp
> index 596e70b..7c49242 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -188,7 +188,8 @@ namespace gbe
> this->opcode == SEL_OP_BYTE_GATHER ||
> this->opcode == SEL_OP_SAMPLE ||
> this->opcode == SEL_OP_VME ||
> - this->opcode == SEL_OP_DWORD_GATHER;
> + this->opcode == SEL_OP_DWORD_GATHER ||
> + this->opcode == SEL_OP_OBREAD;
> }
>
> bool SelectionInstruction::modAcc(void) const {
> @@ -210,7 +211,8 @@ namespace gbe
> this->opcode == SEL_OP_WRITE64 ||
> this->opcode == SEL_OP_ATOMIC ||
> this->opcode == SEL_OP_BYTE_SCATTER ||
> - this->opcode == SEL_OP_TYPED_WRITE;
> + this->opcode == SEL_OP_TYPED_WRITE ||
> + this->opcode == SEL_OP_OBWRITE;
> }
>
> bool SelectionInstruction::isBranch(void) const {
> @@ -697,6 +699,11 @@ namespace gbe
> /*! Sub Group Operations */
> void SUBGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src,
> GenRegister tmpData1, GenRegister tmpData2);
> + /*! Oblock read */
> + void OBREAD(GenRegister dst, GenRegister addr, GenRegister header,
> uint32_t bti, uint32_t size);
> + /*! Oblock write */
> + void OBWRITE(GenRegister addr, GenRegister value, GenRegister header,
> uint32_t bti, uint32_t size);
> +
> /* common functions for both binary instruction and sel_cmp and
> compare instruction.
> It will handle the IMM or normal register assignment, and will try
> to avoid LOADI
> as much as possible. */
> @@ -2014,6 +2021,40 @@ namespace gbe
> insn->src(0) = src;
> insn->src(1) = tmpData2;
> }
> + void Selection::Opaque::OBREAD(GenRegister dst,
> + GenRegister addr,
> + GenRegister header,
> + uint32_t bti,
> + uint32_t size) {
> + SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, 1, 2);
> + insn->dst(0) = dst;
> + insn->src(0) = addr;
> + insn->src(1) = header;
> + insn->setbti(bti);
> + insn->extra.elem = size / sizeof(int[4]); // number of owords
> + }
> +
> + void Selection::Opaque::OBWRITE(GenRegister addr,
> + GenRegister value,
> + GenRegister header,
> + uint32_t bti,
> + uint32_t size) {
> + SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE, 0, 3);
> + SelectionVector *vector = this->appendVector();
> + insn->src(0) = header;
> + insn->src(1) = value;
> + insn->src(2) = addr;
> + insn->state = this->curr;
> + insn->setbti(bti);
> + insn->extra.elem = size / sizeof(int[4]); // number of owords
> +
> + // We need to put the header and the data together
> + vector->regNum = 2;
> + vector->reg = &insn->src(0);
> + vector->offsetID = 0;
> + vector->isSrc = 1;
> + }
> +
>
> // Boiler plate to initialize the selection library at c++ pre-main
> static SelectionLibrary *selLib = NULL;
> @@ -4002,6 +4043,18 @@ extern bool OCL_DEBUGINFO; // first defined by
> calling BVAR in program.cpp
> }
> }
>
> + void emitOWordRead(Selection::Opaque &sel,
> + const ir::LoadInstruction &insn,
> + GenRegister address,
> + ir::BTI bti) const
> + {
> + using namespace ir;
> + const GenRegister header = sel.selReg(sel.reg(FAMILY_DWORD),
> TYPE_U32);
> + const GenRegister value = sel.selReg(insn.getValue(0), TYPE_U32);
> + const uint32_t simdWidth = sel.ctx.getSimdWidth();
> + sel.OBREAD(value, address, header, bti.imm, simdWidth *
> sizeof(int));
> + }
> +
> // check whether all binded table index point to constant memory
> INLINE bool isAllConstant(const ir::BTI &bti) const {
> if (bti.isConst && bti.imm == BTI_CONSTANT)
> @@ -4037,7 +4090,9 @@ extern bool OCL_DEBUGINFO; // first defined by
> calling BVAR in program.cpp
> const uint32_t elemSize = getByteScatterGatherSize(sel, type);
> bool allConstant = isAllConstant(bti);
>
> - if (allConstant) {
> + if (insn.isBlock())
> + this->emitOWordRead(sel, insn, address, bti);
> + else if (allConstant) {
> // XXX TODO read 64bit constant through constant cache
> // Per HW Spec, constant cache messages can read at least DWORD
> data.
> // So, byte/short data type, we have to read through data cache.
> @@ -4164,6 +4219,18 @@ extern bool OCL_DEBUGINFO; // first defined by
> calling BVAR in program.cpp
> }
> }
>
> + void emitOWordWrite(Selection::Opaque &sel,
> + const ir::StoreInstruction &insn,
> + GenRegister address,
> + ir::BTI bti) const
> + {
> + using namespace ir;
> + const GenRegister header = sel.selReg(sel.reg(FAMILY_DWORD),
> TYPE_U32);
> + const GenRegister value = sel.selReg(insn.getValue(0), TYPE_U32);
> + const uint32_t simdWidth = sel.ctx.getSimdWidth();
> + sel.OBWRITE(address, value, header, bti.imm, simdWidth *
> sizeof(int));
> + }
> +
> virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
> {
> using namespace ir;
> @@ -4185,7 +4252,9 @@ extern bool OCL_DEBUGINFO; // first defined by
> calling BVAR in program.cpp
> assert(0 && "stateless not supported yet");
> }
>
> - if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
> + if (insn.isBlock())
> + this->emitOWordWrite(sel, insn, address, bti);
> + else if (insn.isAligned() == true && elemSize ==
> GEN_BYTE_SCATTER_QWORD)
> this->emitWrite64(sel, insn, address, bti);
> else if (insn.isAligned() == true && elemSize ==
> GEN_BYTE_SCATTER_DWORD)
> this->emitUntypedWrite(sel, insn, address, bti);
> diff --git a/backend/src/backend/gen_insn_selection.hpp
> b/backend/src/backend/gen_insn_selection.hpp
> index 8d2e1da..51af686 100644
> --- a/backend/src/backend/gen_insn_selection.hpp
> +++ b/backend/src/backend/gen_insn_selection.hpp
> @@ -175,6 +175,8 @@ namespace gbe
> INLINE uint32_t getbti() const {
> GBE_ASSERT(isRead() || isWrite());
> switch (opcode) {
> + case SEL_OP_OBREAD:
> + case SEL_OP_OBWRITE:
> case SEL_OP_DWORD_GATHER: return extra.function;
> case SEL_OP_SAMPLE: return extra.rdbti;
> case SEL_OP_VME: return extra.vme_bti;
> @@ -188,6 +190,8 @@ namespace gbe
> INLINE void setbti(uint32_t bti) {
> GBE_ASSERT(isRead() || isWrite());
> switch (opcode) {
> + case SEL_OP_OBREAD:
> + case SEL_OP_OBWRITE:
> case SEL_OP_DWORD_GATHER: extra.function = bti; return;
> case SEL_OP_SAMPLE: extra.rdbti = bti; return;
> case SEL_OP_VME: extra.vme_bti = bti; return;
> diff --git a/backend/src/backend/gen_insn_selection.hxx
> b/backend/src/backend/gen_insn_selection.hxx
> index 0e11f9f..4a7caff 100644
> --- a/backend/src/backend/gen_insn_selection.hxx
> +++ b/backend/src/backend/gen_insn_selection.hxx
> @@ -96,3 +96,5 @@ DECL_SELECTION_IR(STORE_PROFILING,
> StoreProfilingInstruction)
> DECL_SELECTION_IR(WORKGROUP_OP, WorkGroupOpInstruction)
> DECL_SELECTION_IR(SUBGROUP_OP, SubGroupOpInstruction)
> DECL_SELECTION_IR(PRINTF, PrintfInstruction)
> +DECL_SELECTION_IR(OBREAD, OBReadInstruction)
> +DECL_SELECTION_IR(OBWRITE, OBWriteInstruction)
> diff --git a/backend/src/ir/instruction.cpp
> b/backend/src/ir/instruction.cpp
> index 47606b2..88491a7 100644
> --- a/backend/src/ir/instruction.cpp
> +++ b/backend/src/ir/instruction.cpp
> @@ -483,10 +483,12 @@ namespace ir {
> AddressSpace AS,
> uint32_t _valueNum,
> bool dwAligned,
> - AddressMode AM)
> + AddressMode AM,
> + bool ifBlock = false)
> : MemInstruction(AM, AS, dwAligned, type, offset),
> valueNum(_valueNum),
> - values(dstValues)
> + values(dstValues),
> + ifBlock(ifBlock)
> {
> this->opcode = OP_LOAD;
> }
> @@ -519,9 +521,11 @@ namespace ir {
> }
> INLINE bool wellFormed(const Function &fn, std::string &why)
> const;
> INLINE void out(std::ostream &out, const Function &fn) const;
> + INLINE bool isBlock() const { return ifBlock; }
>
> uint8_t valueNum;
> Tuple values;
> + bool ifBlock;
> };
> class ALIGNED_INSTRUCTION StoreInstruction :
> public MemInstruction,
> @@ -534,12 +538,14 @@ namespace ir {
> AddressSpace addrSpace,
> uint32_t valueNum,
> bool dwAligned,
> - AddressMode AM)
> + AddressMode AM,
> + bool ifBlock = false)
> : MemInstruction(AM, addrSpace, dwAligned, type, offset)
> {
> this->opcode = OP_STORE;
> this->values = values;
> this->valueNum = valueNum;
> + this->ifBlock = ifBlock;
> }
> INLINE unsigned getValueNum() const { return valueNum; }
> INLINE Register getValue(const Function &fn, unsigned id) const {
> @@ -565,9 +571,12 @@ namespace ir {
> }
> INLINE bool wellFormed(const Function &fn, std::string &why)
> const;
> INLINE void out(std::ostream &out, const Function &fn) const;
> + INLINE bool isBlock() const { return ifBlock; }
> +
> Register dst[0];
> uint8_t valueNum;
> Tuple values;
> + bool ifBlock;
> };
>
> class ALIGNED_INSTRUCTION SampleInstruction : // TODO
> @@ -1655,6 +1664,8 @@ namespace ir {
> }
>
> INLINE void LoadInstruction::out(std::ostream &out, const Function
> &fn) const {
> + if(ifBlock)
> + out<< "BLOCK";
> this->outOpcode(out);
> out << "." << type << "." << AS << (dwAligned ? "." : ".un") <<
> "aligned";
> out << " {";
> @@ -1672,6 +1683,8 @@ namespace ir {
> }
>
> INLINE void StoreInstruction::out(std::ostream &out, const Function
> &fn) const {
> + if(ifBlock)
> + out<< "BLOCK";
> this->outOpcode(out);
> out << "." << type << "." << AS << (dwAligned ? "." : ".un") <<
> "aligned";
> out << " %" << this->getSrc(fn, 0) << " {";
> @@ -2221,7 +2234,9 @@ DECL_MEM_FN(MemInstruction, bool,
> isAligned(void), isAligned())
> DECL_MEM_FN(MemInstruction, unsigned, getAddressIndex(void),
> getAddressIndex())
> DECL_MEM_FN(AtomicInstruction, AtomicOps, getAtomicOpcode(void),
> getAtomicOpcode())
> DECL_MEM_FN(StoreInstruction, uint32_t, getValueNum(void), getValueNum())
> +DECL_MEM_FN(StoreInstruction, bool, isBlock(void), isBlock())
> DECL_MEM_FN(LoadInstruction, uint32_t, getValueNum(void), getValueNum())
> +DECL_MEM_FN(LoadInstruction, bool, isBlock(void), isBlock())
> DECL_MEM_FN(LoadImmInstruction, Type, getType(void), getType())
> DECL_MEM_FN(LabelInstruction, LabelIndex, getLabelIndex(void),
> getLabelIndex())
> DECL_MEM_FN(BranchInstruction, bool, isPredicated(void), isPredicated())
> @@ -2475,9 +2490,10 @@ DECL_MEM_FN(MemInstruction, void,
> setBtiReg(Register reg), setBtiReg(reg))
> uint32_t valueNum, \
> bool dwAligned, \
> AddressMode AM, \
> - unsigned SurfaceIndex) \
> + unsigned SurfaceIndex, \
> + bool isBlock) \
> { \
> - internal::CLASS insn =
> internal::CLASS(type,tuple,offset,space,valueNum,dwAligned,AM); \
> + internal::CLASS insn =
> internal::CLASS(type,tuple,offset,space,valueNum,dwAligned,AM, isBlock); \
> insn.setSurfaceIndex(SurfaceIndex);\
> return insn.convert(); \
> } \
> diff --git a/backend/src/ir/instruction.hpp
> b/backend/src/ir/instruction.hpp
> index 799a7bf..4a5811b 100644
> --- a/backend/src/ir/instruction.hpp
> +++ b/backend/src/ir/instruction.hpp
> @@ -356,6 +356,8 @@ namespace ir {
> }
> /*! Return true if the given instruction is an instance of this class
> */
> static bool isClassOf(const Instruction &insn);
> + /*! Return true if the given instruction is block write */
> + bool isBlock() const;
> };
>
> /*! Load instruction. The source is simply the address where to get the
> data.
> @@ -372,6 +374,8 @@ namespace ir {
> }
> /*! Return true if the given instruction is an instance of this class
> */
> static bool isClassOf(const Instruction &insn);
> + /*! Return true if the given instruction is block read */
> + bool isBlock() const;
> };
>
> /*! Load immediate instruction loads an typed immediate value into the
> given
> @@ -827,10 +831,10 @@ namespace ir {
> /*! ret */
> Instruction RET(void);
> /*! load.type.space {dst1,...,dst_valueNum} offset value, {bti} */
> - Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace
> space, uint32_t valueNum, bool dwAligned, AddressMode, unsigned
> SurfaceIndex);
> + Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace
> space, uint32_t valueNum, bool dwAligned, AddressMode, unsigned
> SurfaceIndex, bool isBlock = false);
> Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace
> space, uint32_t valueNum, bool dwAligned, AddressMode, Register bti);
> /*! store.type.space offset {src1,...,src_valueNum} value {bti}*/
> - Instruction STORE(Type type, Tuple src, Register offset, AddressSpace
> space, uint32_t valueNum, bool dwAligned, AddressMode, unsigned
> SurfaceIndex);
> + Instruction STORE(Type type, Tuple src, Register offset, AddressSpace
> space, uint32_t valueNum, bool dwAligned, AddressMode, unsigned
> SurfaceIndex, bool isBlock = false);
> Instruction STORE(Type type, Tuple src, Register offset, AddressSpace
> space, uint32_t valueNum, bool dwAligned, AddressMode, Register bti);
> /*! loadi.type dst value */
> Instruction LOADI(Type type, Register dst, ImmediateIndex value);
> diff --git a/backend/src/ir/liveness.cpp b/backend/src/ir/liveness.cpp
> index d48f067..3162d13 100644
> --- a/backend/src/ir/liveness.cpp
> +++ b/backend/src/ir/liveness.cpp
> @@ -117,11 +117,16 @@ namespace ir {
> if (insn.getOpcode() == ir::OP_SIMD_ID)
> uniform = false;
>
> + // do not change dst uniform for block read
> + if (insn.getOpcode() == ir::OP_LOAD &&
> ir::cast<ir::LoadInstruction>(insn).isBlock())
> + uniform = false;
> +
> for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
> const Register reg = insn.getSrc(srcID);
> if (!fn.isUniformRegister(reg))
> uniform = false;
> }
> +
> // A destination is a killed value
> for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
> const Register reg = insn.getDst(dstID);
> diff --git a/backend/src/libocl/CMakeLists.txt
> b/backend/src/libocl/CMakeLists.txt
> index 1d1ec68..83e767c 100644
> --- a/backend/src/libocl/CMakeLists.txt
> +++ b/backend/src/libocl/CMakeLists.txt
> @@ -182,7 +182,7 @@ MACRO(ADD_LL_TO_BC_TARGET M)
> )
> ENDMACRO(ADD_LL_TO_BC_TARGET)
>
> -SET (OCL_LL_MODULES ocl_barrier ocl_clz)
> +SET (OCL_LL_MODULES ocl_barrier ocl_clz ocl_substore)
> FOREACH(f ${OCL_LL_MODULES})
> COPY_THE_LL(${f})
> ADD_LL_TO_BC_TARGET(${f})
> diff --git a/backend/src/libocl/src/ocl_substore.ll
> b/backend/src/libocl/src/ocl_substore.ll
> new file mode 100644
> index 0000000..665cdfa
> --- /dev/null
> +++ b/backend/src/libocl/src/ocl_substore.ll
> @@ -0,0 +1,9 @@
> +target datalayout =
> "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
> +target triple = "spir"
> +
> +declare void @__gen_ocl_sub_group_block_write_mem(i32 addrspace(1)*
> nocapture, i32) nounwind alwaysinline noduplicate
> +
> +define void @_Z27intel_sub_group_block_writePKU3AS1jj(i32 addrspace(1)*
> %p, i32 %data) nounwind alwaysinline noduplicate {
> + call void @__gen_ocl_sub_group_block_write_mem(i32 addrspace(1)* %p,
> i32 %data)
> + ret void
> +}
> diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
> b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
> index a25dcef..66490cc 100644
> --- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
> +++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
> @@ -133,3 +133,57 @@ RANGE_OP(scan_exclusive, max, float, true)
> RANGE_OP(scan_exclusive, max, double, true)
>
> #undef RANGE_OP
> +PURE CONST uint __gen_ocl_sub_group_block_read_mem(const global uint* p);
> +OVERLOADABLE uint intel_sub_group_block_read(const global uint* p)
> +{
> + return __gen_ocl_sub_group_block_read_mem(p);
> +}
> +OVERLOADABLE uint2 intel_sub_group_block_read2(const global uint* p)
> +{
> + return (uint2)(intel_sub_group_block_read(p),
> + intel_sub_group_block_read(p + get_simd_size()));
> +}
> +OVERLOADABLE uint4 intel_sub_group_block_read4(const global uint* p)
> +{
> + return (uint4)(intel_sub_group_block_read(p),
> + intel_sub_group_block_read(p + get_simd_size()),
> + intel_sub_group_block_read(p + get_simd_size() * 2),
> + intel_sub_group_block_read(p + get_simd_size() * 3));
> +
> +}
> +OVERLOADABLE uint8 intel_sub_group_block_read8(const global uint* p)
> +{
> + return (uint8)(intel_sub_group_block_read(p),
> + intel_sub_group_block_read(p + get_simd_size()),
> + intel_sub_group_block_read(p + get_simd_size() * 2),
> + intel_sub_group_block_read(p + get_simd_size() * 3),
> + intel_sub_group_block_read(p + get_simd_size() * 4),
> + intel_sub_group_block_read(p + get_simd_size() * 5),
> + intel_sub_group_block_read(p + get_simd_size() * 6),
> + intel_sub_group_block_read(p + get_simd_size() * 7));
> +}
> +
> +OVERLOADABLE void intel_sub_group_block_write2(const global uint* p,
> uint2 data)
> +{
> + intel_sub_group_block_write(p, data.s0);
> + intel_sub_group_block_write(p + get_simd_size(), data.s1);
> +}
> +OVERLOADABLE void intel_sub_group_block_write4(const global uint* p,uint4
> data)
> +{
> + intel_sub_group_block_write(p, data.s0);
> + intel_sub_group_block_write(p + get_simd_size(), data.s1);
> + intel_sub_group_block_write(p + get_simd_size() * 2, data.s2);
> + intel_sub_group_block_write(p + get_simd_size() * 3, data.s3);
> +
> +}
> +OVERLOADABLE void intel_sub_group_block_write8(const global uint* p,uint8
> data)
> +{
> + intel_sub_group_block_write(p, data.s0);
> + intel_sub_group_block_write(p + get_simd_size(), data.s1);
> + intel_sub_group_block_write(p + get_simd_size() * 2, data.s2);
> + intel_sub_group_block_write(p + get_simd_size() * 3, data.s3);
> + intel_sub_group_block_write(p + get_simd_size() * 4, data.s4);
> + intel_sub_group_block_write(p + get_simd_size() * 5, data.s5);
> + intel_sub_group_block_write(p + get_simd_size() * 6, data.s6);
> + intel_sub_group_block_write(p + get_simd_size() * 7, data.s7);
> +}
> diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.h
> b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
> index 355ee30..d0676be 100644
> --- a/backend/src/libocl/tmpl/ocl_simd.tmpl.h
> +++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
> @@ -132,3 +132,14 @@ OVERLOADABLE double
> sub_group_scan_exclusive_max(double x);
> OVERLOADABLE float intel_sub_group_shuffle(float x, uint c);
> OVERLOADABLE int intel_sub_group_shuffle(int x, uint c);
> OVERLOADABLE uint intel_sub_group_shuffle(uint x, uint c);
> +
> +/* block read/write */
> +OVERLOADABLE uint intel_sub_group_block_read(const global uint* p);
> +OVERLOADABLE uint2 intel_sub_group_block_read2(const global uint* p);
> +OVERLOADABLE uint4 intel_sub_group_block_read4(const global uint* p);
> +OVERLOADABLE uint8 intel_sub_group_block_read8(const global uint* p);
> +
> +OVERLOADABLE void intel_sub_group_block_write(const __global uint* p,
> uint data);
> +OVERLOADABLE void intel_sub_group_block_write2(const __global uint* p,
> uint2 data);
> +OVERLOADABLE void intel_sub_group_block_write4(const __global uint* p,
> uint4 data);
> +OVERLOADABLE void intel_sub_group_block_write8(const __global uint* p,
> uint8 data);
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp
> b/backend/src/llvm/llvm_gen_backend.cpp
> index 3ddbfcc..e77290f 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -697,6 +697,8 @@ namespace gbe
> void emitWorkGroupInst(CallInst &I, CallSite &CS, ir::WorkGroupOps
> opcode);
> // Emit subgroup instructions
> void emitSubGroupInst(CallInst &I, CallSite &CS, ir::WorkGroupOps
> opcode);
> + // Emit sub-group block read/write memory instructions
> + void emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool
> isWrite);
>
> uint8_t appendSampler(CallSite::arg_iterator AI);
> uint8_t getImageID(CallInst &I);
> @@ -3730,6 +3732,9 @@ namespace gbe
> case GEN_OCL_LRP:
> this->newRegister(&I);
> break;
> + case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM:
> + this->newRegister(&I, NULL, false);
> + break;
> case GEN_OCL_PRINTF:
> this->newRegister(&I); // fall through
> case GEN_OCL_PUTS:
> @@ -3744,6 +3749,7 @@ namespace gbe
> case GEN_OCL_CALC_TIMESTAMP:
> case GEN_OCL_STORE_PROFILING:
> case GEN_OCL_DEBUGWAIT:
> + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM:
> break;
> case GEN_OCL_NOT_FOUND:
> default:
> @@ -3938,6 +3944,61 @@ namespace gbe
> GBE_ASSERT(AI == AE);
> }
>
> + void GenWriter::emitBlockReadWriteMemInst(CallInst &I, CallSite &CS,
> bool isWrite) {
> + CallSite::arg_iterator AI = CS.arg_begin();
> + CallSite::arg_iterator AE = CS.arg_end();
> + GBE_ASSERT(AI != AE);
> +
> + Value *llvmPtr = *(AI++);
> + Value *llvmValues;
> + ir::AddressSpace addrSpace =
> addressSpaceLLVMToGen(llvmPtr->getType()->getPointerAddressSpace());
> + GBE_ASSERT(addrSpace == ir::MEM_GLOBAL);
> + ir::Register pointer = this->getRegister(llvmPtr);
> +
> + ir::Register ptr;
> + ir::Register btiReg;
> + unsigned SurfaceIndex = 0xff;
> +
> + ir::AddressMode AM;
> + if (legacyMode) {
> + Value *bti = getBtiRegister(llvmPtr);
> + Value *ptrBase = getPointerBase(llvmPtr);
> + ir::Register baseReg = this->getRegister(ptrBase);
> + if (isa<ConstantInt>(bti)) {
> + AM = ir::AM_StaticBti;
> + SurfaceIndex = cast<ConstantInt>(bti)->getZExtValue();
> + addrSpace = btiToGen(SurfaceIndex);
> + } else {
> + AM = ir::AM_DynamicBti;
> + addrSpace = ir::MEM_MIXED;
> + btiReg = this->getRegister(bti);
> + }
> + const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
> + ptr = ctx.reg(pointerFamily);
> + ctx.SUB(ir::TYPE_U32, ptr, pointer, baseReg);
> + } else {
> + AM = ir::AM_Stateless;
> + ptr = pointer;
> + }
> +
> + ir::Type type = ir::TYPE_U32;
> + GBE_ASSERT(AM != ir::AM_DynamicBti);
> +
> + if(isWrite){
> + llvmValues = *(AI++);
> + const ir::Register values = getRegister(llvmValues);
> + const ir::Tuple tuple = ctx.arrayTuple(&values, 1);
> + ctx.STORE(type, tuple, ptr, addrSpace, 1, true, AM, SurfaceIndex,
> true);
> + } else {
> + llvmValues = &I;
> + const ir::Register values = getRegister(llvmValues);
> + const ir::Tuple tuple = ctx.arrayTuple(&values, 1);
> + ctx.LOAD(type, tuple, ptr, addrSpace, 1, true, AM, SurfaceIndex,
> true);
> + }
> +
> + GBE_ASSERT(AI == AE);
> + }
> +
> /* append a new sampler. should be called before any reference to
> * a sampler_t value. */
> uint8_t GenWriter::appendSampler(CallSite::arg_iterator AI) {
> @@ -4762,6 +4823,10 @@ namespace gbe
> ctx.LRP(ir::TYPE_FLOAT, dst, src0, src1, src2);
> break;
> }
> + case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM:
> + this->emitBlockReadWriteMemInst(I, CS, false); break;
> + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM:
> + this->emitBlockReadWriteMemInst(I, CS, true); break;
> default: break;
> }
> }
> diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx
> b/backend/src/llvm/llvm_gen_ocl_function.hxx
> index 213ead0..003be91 100644
> --- a/backend/src/llvm/llvm_gen_ocl_function.hxx
> +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
> @@ -202,7 +202,7 @@ DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_MIN,
> __gen_ocl_work_group_scan_
> DECL_LLVM_GEN_FUNCTION(WORK_GROUP_ALL, __gen_ocl_work_group_all)
> DECL_LLVM_GEN_FUNCTION(WORK_GROUP_ANY, __gen_ocl_work_group_any)
>
> -// work group function
> +// sub group function
> DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BROADCAST, __gen_ocl_sub_group_broadcast)
>
> DECL_LLVM_GEN_FUNCTION(SUB_GROUP_REDUCE_ADD,
> __gen_ocl_sub_group_reduce_add)
> @@ -217,5 +217,8 @@ DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_ADD,
> __gen_ocl_sub_group_scan_in
> DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_MAX,
> __gen_ocl_sub_group_scan_inclusive_max)
> DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_MIN,
> __gen_ocl_sub_group_scan_inclusive_min)
>
> +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM,
> __gen_ocl_sub_group_block_read_mem)
> +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM,
> __gen_ocl_sub_group_block_write_mem)
> +
> // common function
> DECL_LLVM_GEN_FUNCTION(LRP, __gen_ocl_lrp)
> --
> 2.7.4
>
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/beignet
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/beignet/attachments/20160525/91dc4493/attachment-0001.html>
More information about the Beignet
mailing list