[Beignet] [PATCH 1/2] Backend: Add intel_sub_group_block_read/write from buffer
Xiuli Pan
xiuli.pan at intel.com
Thu May 19 21:55:54 UTC 2016
From: Pan Xiuli <xiuli.pan at intel.com>
Using OWORD_BLOCK_RW to read/write a block of data for a thread.
Signed-off-by: Pan Xiuli <xiuli.pan at intel.com>
---
backend/src/backend/gen/gen_mesa_disasm.c | 15 +++++
backend/src/backend/gen_context.cpp | 63 ++++++++++++++++++
backend/src/backend/gen_context.hpp | 2 +
backend/src/backend/gen_encoder.cpp | 38 ++++++++++-
backend/src/backend/gen_encoder.hpp | 4 ++
.../src/backend/gen_insn_gen7_schedule_info.hxx | 2 +
backend/src/backend/gen_insn_selection.cpp | 77 ++++++++++++++++++++--
backend/src/backend/gen_insn_selection.hpp | 4 ++
backend/src/backend/gen_insn_selection.hxx | 2 +
backend/src/ir/instruction.cpp | 26 ++++++--
backend/src/ir/instruction.hpp | 8 ++-
backend/src/ir/liveness.cpp | 5 ++
backend/src/libocl/CMakeLists.txt | 2 +-
backend/src/libocl/src/ocl_substore.ll | 9 +++
backend/src/libocl/tmpl/ocl_simd.tmpl.cl | 54 +++++++++++++++
backend/src/libocl/tmpl/ocl_simd.tmpl.h | 11 ++++
backend/src/llvm/llvm_gen_backend.cpp | 65 ++++++++++++++++++
backend/src/llvm/llvm_gen_ocl_function.hxx | 5 +-
18 files changed, 377 insertions(+), 15 deletions(-)
create mode 100644 backend/src/libocl/src/ocl_substore.ll
diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c
index 067ddd8..9200c26 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.c
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -432,6 +432,14 @@ static const char *data_port_data_cache_category[] = {
"scratch",
};
+static const char *data_port_data_cache_block_size[] = { /* printable names for the OWORD block-size field; gen_disasm indexes this table with OWORD_RW_BLOCK_SIZE(inst) */
+ "1 OWORD LOW",
+ "1 OWORD HIGH",
+ "2 OWORD",
+ "4 OWORD",
+ "8 OWORD",
+}; /* NOTE(review): 5 entries only - confirm the decoded block_size field can never exceed 4 */
+
static const char *data_port_scratch_block_size[] = {
"1 register",
"2 registers",
@@ -576,6 +584,7 @@ static int gen_version;
#define MSG_GW_ACKREQ(inst) GEN_BITS_FIELD(inst, bits3.gen7_msg_gw.ackreq)
#define GENERIC_MSG_LENGTH(inst) GEN_BITS_FIELD(inst, bits3.generic_gen5.msg_length)
#define GENERIC_RESPONSE_LENGTH(inst) GEN_BITS_FIELD(inst, bits3.generic_gen5.response_length)
+#define OWORD_RW_BLOCK_SIZE(inst) GEN_BITS_FIELD(inst, bits3.gen7_oblock_rw.block_size)
static int is_special_acc(const void* inst)
{
@@ -1483,6 +1492,12 @@ int gen_disasm (FILE *file, const void *inst, uint32_t deviceID, uint32_t compac
data_port_data_cache_byte_scattered_simd_mode[BYTE_RW_SIMD_MODE(inst)],
data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
data_port_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
+ else if(UNTYPED_RW_MSG_TYPE(inst) == 0 || UNTYPED_RW_MSG_TYPE(inst) == 8)
+ format(file, " (bti: %d, data size: %s, %s, %s)",
+ UNTYPED_RW_BTI(inst),
+ data_port_data_cache_block_size[OWORD_RW_BLOCK_SIZE(inst)],
+ data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
+ data_port_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
else
format(file, " not implemented");
} else {
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 4d0a3f3..cfb8be1 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -3487,6 +3487,69 @@ namespace gbe
p->pop();
}
+ void GenContext::emitOBReadInstruction(const SelectionInstruction &insn) { // Emit an OWORD block read: build the message header, then issue OBREAD into dst(0)
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ const GenRegister addr = ra->genReg(insn.src(0)); // src(0) is the address operand, src(1) the header register
+ const GenRegister first = GenRegister::ud1grf(addr.nr,addr.subnr/sizeof(float)); // ud1grf view of the address register; subnr is divided by sizeof(float) to form the element index
+ GenRegister header;
+ if (simdWidth == 8)
+ header = GenRegister::retype(ra->genReg(insn.src(1)), GEN_TYPE_F);
+ else
+ header = GenRegister::retype(GenRegister::Qn(ra->genReg(insn.src(1)),1), GEN_TYPE_F); // non-SIMD8: take quarter 1 of src(1) - presumably its second GRF; TODO confirm
+
+ p->push();
+ // Copy r0 into the header first (exec width 8, no predicate, noMask)
+ p->curr.execWidth = 8;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->MOV(header, GenRegister::f8grf(0,0));
+
+ // Update the header with the current address: element 2 receives the address shifted right by 4
+ p->curr.execWidth = 1;
+ const uint32_t nr = header.nr;
+ const uint32_t subnr = header.subnr / sizeof(float);
+ p->SHR(GenRegister::ud1grf(nr, subnr+2), first, GenRegister::immud(4));
+ //p->MOV(GenRegister::ud1grf(nr, subnr+2), first);
+
+ // Put zero in the general state base address (header element 5)
+ p->MOV(GenRegister::f1grf(nr, subnr+5), GenRegister::immf(0));
+
+ p->pop();
+ // Now read the data; getbti() and extra.elem are forwarded as OBREAD's bti and size arguments
+ p->OBREAD(dst, header, insn.getbti(), insn.extra.elem);
+ }
+
+ void GenContext::emitOBWriteInstruction(const SelectionInstruction &insn) { // Emit an OWORD block write: build the message header, then issue OBWRITE
+ const GenRegister addr = ra->genReg(insn.src(2)); // src(2) is the address operand, src(0) the header register
+ const GenRegister first = GenRegister::ud1grf(addr.nr,addr.subnr/sizeof(float)); // ud1grf view of the address register; subnr is divided by sizeof(float) to form the element index
+ GenRegister header;
+ if (simdWidth == 8)
+ header = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_F);
+ else
+ header = GenRegister::retype(GenRegister::Qn(ra->genReg(insn.src(0)),1), GEN_TYPE_F); // non-SIMD8: take quarter 1 of src(0) - presumably its second GRF; TODO confirm
+
+ p->push();
+ // Copy r0 into the header first (exec width 8, no predicate, noMask)
+ p->curr.execWidth = 8;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->MOV(header, GenRegister::f8grf(0,0));
+
+ // Update the header with the current address: element 2 receives the address shifted right by 4
+ p->curr.execWidth = 1;
+ const uint32_t nr = header.nr;
+ const uint32_t subnr = header.subnr / sizeof(float);
+ p->SHR(GenRegister::ud1grf(nr, subnr+2), first, GenRegister::immud(4));
+
+ // Put zero in the general state base address (header element 5)
+ p->MOV(GenRegister::f1grf(nr, subnr+5), GenRegister::immf(0));
+
+ p->pop();
+ // Now write the data; only the header is passed - the value in src(1) is presumably expected right after it (TODO confirm)
+ p->OBWRITE(header, insn.getbti(), insn.extra.elem);
+ }
+
+
BVAR(OCL_OUTPUT_REG_ALLOC, false);
BVAR(OCL_OUTPUT_ASM, false);
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 4c43ccb..56a5ec2 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -187,6 +187,8 @@ namespace gbe
void scratchRead(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
unsigned beforeMessage(const SelectionInstruction &insn, GenRegister bti, GenRegister flagTemp, GenRegister btiTmp, unsigned desc);
void afterMessage(const SelectionInstruction &insn, GenRegister bti, GenRegister flagTemp, GenRegister btiTmp, unsigned jip0);
+ virtual void emitOBReadInstruction(const SelectionInstruction &insn);
+ virtual void emitOBWriteInstruction(const SelectionInstruction &insn);
/*! Implements base class */
virtual Kernel *allocateKernel(void);
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index 31afa67..fc7b5cf 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -258,7 +258,7 @@ namespace gbe
else
NOT_SUPPORTED;
}
-#if 0
+
static void setOBlockRW(GenEncoder *p,
GenNativeInstruction *insn,
uint32_t bti,
@@ -272,10 +272,10 @@ namespace gbe
assert(size == 2 || size == 4);
insn->bits3.gen7_oblock_rw.msg_type = msg_type;
insn->bits3.gen7_oblock_rw.bti = bti;
+ GBE_ASSERT(size == 2 || size == 4);
insn->bits3.gen7_oblock_rw.block_size = size == 2 ? 2 : 3;
insn->bits3.gen7_oblock_rw.header_present = 1;
}
-#endif
static void setDWordScatterMessgae(GenEncoder *p,
GenNativeInstruction *insn,
@@ -1244,6 +1244,40 @@ namespace gbe
setScratchMessage(this, insn, offset, block_size, channel_mode, GEN_SCRATCH_READ, 1, dst_num);
}
+ void GenEncoder::OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t size) { // Encode a SEND instruction set up as a GEN7_OBLOCK_READ of `size` owords through surface `bti`
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ const uint32_t msg_length = 1; // the message consists of the header register only
+ const uint32_t response_length = size / 2; // Size is in owords; the response length is half of it
+ this->setHeader(insn);
+ this->setDst(insn, GenRegister::uw16grf(dst.nr, 0)); // only dst.nr is used; the sub-register is forced to 0
+ this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0)); // only header.nr is used; the sub-register is forced to 0
+ this->setSrc1(insn, GenRegister::immud(0));
+ setOBlockRW(this, // fills the message descriptor; it asserts that size is 2 or 4
+ insn,
+ bti,
+ size,
+ GEN7_OBLOCK_READ,
+ msg_length,
+ response_length);
+ }
+
+ void GenEncoder::OBWRITE(GenRegister header, uint32_t bti, uint32_t size) { // Encode a SEND instruction set up as a GEN7_OBLOCK_WRITE of `size` owords through surface `bti`
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ const uint32_t msg_length = 1 + size / 2; // Size is in owords; one header register plus size / 2 more
+ const uint32_t response_length = 0; // a write expects no response
+ this->setHeader(insn);
+ this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0)); // only header.nr is used; the sub-register is forced to 0
+ this->setSrc1(insn, GenRegister::immud(0));
+ this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW)); // null destination
+ setOBlockRW(this, // fills the message descriptor; it asserts that size is 2 or 4
+ insn,
+ bti,
+ size,
+ GEN7_OBLOCK_WRITE,
+ msg_length,
+ response_length);
+ }
+
void GenEncoder::EOT(uint32_t msg) {
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index 0239293..a53c879 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -267,6 +267,10 @@ namespace gbe
virtual bool canHandleLong(uint32_t opcode, GenRegister dst, GenRegister src0,
GenRegister src1 = GenRegister::null());
virtual void handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src0, GenRegister src1 = GenRegister::null());
+ /*! OBlock read */
+ void OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize);
+ /*! OBlock write */
+ void OBWRITE(GenRegister header, uint32_t bti, uint32_t elemSize);
GBE_CLASS(GenEncoder); //!< Use custom allocators
virtual void alu3(uint32_t opcode, GenRegister dst,
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index cb5c4f1..d297726 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -50,3 +50,5 @@ DECL_GEN7_SCHEDULE(StoreProfiling, 80, 1, 1)
DECL_GEN7_SCHEDULE(WorkGroupOp, 80, 1, 1)
DECL_GEN7_SCHEDULE(SubGroupOp, 80, 1, 1)
DECL_GEN7_SCHEDULE(Printf, 80, 1, 1)
+DECL_GEN7_SCHEDULE(OBRead, 80, 1, 1)
+DECL_GEN7_SCHEDULE(OBWrite, 80, 1, 1)
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 596e70b..7c49242 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -188,7 +188,8 @@ namespace gbe
this->opcode == SEL_OP_BYTE_GATHER ||
this->opcode == SEL_OP_SAMPLE ||
this->opcode == SEL_OP_VME ||
- this->opcode == SEL_OP_DWORD_GATHER;
+ this->opcode == SEL_OP_DWORD_GATHER ||
+ this->opcode == SEL_OP_OBREAD;
}
bool SelectionInstruction::modAcc(void) const {
@@ -210,7 +211,8 @@ namespace gbe
this->opcode == SEL_OP_WRITE64 ||
this->opcode == SEL_OP_ATOMIC ||
this->opcode == SEL_OP_BYTE_SCATTER ||
- this->opcode == SEL_OP_TYPED_WRITE;
+ this->opcode == SEL_OP_TYPED_WRITE ||
+ this->opcode == SEL_OP_OBWRITE;
}
bool SelectionInstruction::isBranch(void) const {
@@ -697,6 +699,11 @@ namespace gbe
/*! Sub Group Operations */
void SUBGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src,
GenRegister tmpData1, GenRegister tmpData2);
+ /*! Oblock read */
+ void OBREAD(GenRegister dst, GenRegister addr, GenRegister header, uint32_t bti, uint32_t size);
+ /*! Oblock write */
+ void OBWRITE(GenRegister addr, GenRegister value, GenRegister header, uint32_t bti, uint32_t size);
+
/* common functions for both binary instruction and sel_cmp and compare instruction.
It will handle the IMM or normal register assignment, and will try to avoid LOADI
as much as possible. */
@@ -2014,6 +2021,40 @@ namespace gbe
insn->src(0) = src;
insn->src(1) = tmpData2;
}
+ void Selection::Opaque::OBREAD(GenRegister dst, // Append a SEL_OP_OBREAD selection instruction (created with counts 1 and 2: one dst, two srcs are filled in)
+ GenRegister addr,
+ GenRegister header,
+ uint32_t bti,
+ uint32_t size) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, 1, 2);
+ insn->dst(0) = dst;
+ insn->src(0) = addr; // operand order read back by the code generator: address first, header second
+ insn->src(1) = header;
+ insn->setbti(bti);
+ insn->extra.elem = size / sizeof(int[4]); // number of owords: size is divided by the size of four ints
+ }
+
+ void Selection::Opaque::OBWRITE(GenRegister addr, // Append a SEL_OP_OBWRITE selection instruction (created with counts 0 and 3: no dst, three srcs are filled in)
+ GenRegister value,
+ GenRegister header,
+ uint32_t bti,
+ uint32_t size) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE, 0, 3);
+ SelectionVector *vector = this->appendVector();
+ insn->src(0) = header; // operand order: header, value, address
+ insn->src(1) = value;
+ insn->src(2) = addr;
+ insn->state = this->curr;
+ insn->setbti(bti);
+ insn->extra.elem = size / sizeof(int[4]); // number of owords: size is divided by the size of four ints
+
+ // We need to put the header and the data together: a 2-register source vector starting at src(0)
+ vector->regNum = 2;
+ vector->reg = &insn->src(0);
+ vector->offsetID = 0;
+ vector->isSrc = 1;
+ }
+
// Boiler plate to initialize the selection library at c++ pre-main
static SelectionLibrary *selLib = NULL;
@@ -4002,6 +4043,18 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
}
}
+ void emitOWordRead(Selection::Opaque &sel, // Select an OWORD block read (sel.OBREAD) for a load instruction
+ const ir::LoadInstruction &insn,
+ GenRegister address,
+ ir::BTI bti) const
+ {
+ using namespace ir;
+ const GenRegister header = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32); // register obtained from sel.reg() to hold the message header
+ const GenRegister value = sel.selReg(insn.getValue(0), TYPE_U32); // only value 0 of the load is used
+ const uint32_t simdWidth = sel.ctx.getSimdWidth();
+ sel.OBREAD(value, address, header, bti.imm, simdWidth * sizeof(int)); // size in bytes: one int per SIMD lane; NOTE(review): bti.imm is used without checking bti.isConst
+ }
+
// check whether all binded table index point to constant memory
INLINE bool isAllConstant(const ir::BTI &bti) const {
if (bti.isConst && bti.imm == BTI_CONSTANT)
@@ -4037,7 +4090,9 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
const uint32_t elemSize = getByteScatterGatherSize(sel, type);
bool allConstant = isAllConstant(bti);
- if (allConstant) {
+ if (insn.isBlock())
+ this->emitOWordRead(sel, insn, address, bti);
+ else if (allConstant) {
// XXX TODO read 64bit constant through constant cache
// Per HW Spec, constant cache messages can read at least DWORD data.
// So, byte/short data type, we have to read through data cache.
@@ -4164,6 +4219,18 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
}
}
+ void emitOWordWrite(Selection::Opaque &sel, // Select an OWORD block write (sel.OBWRITE) for a store instruction
+ const ir::StoreInstruction &insn,
+ GenRegister address,
+ ir::BTI bti) const
+ {
+ using namespace ir;
+ const GenRegister header = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32); // register obtained from sel.reg() to hold the message header
+ const GenRegister value = sel.selReg(insn.getValue(0), TYPE_U32); // only value 0 of the store is used
+ const uint32_t simdWidth = sel.ctx.getSimdWidth();
+ sel.OBWRITE(address, value, header, bti.imm, simdWidth * sizeof(int)); // size in bytes: one int per SIMD lane; NOTE(review): bti.imm is used without checking bti.isConst
+ }
+
virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
{
using namespace ir;
@@ -4185,7 +4252,9 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
assert(0 && "stateless not supported yet");
}
- if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+ if (insn.isBlock())
+ this->emitOWordWrite(sel, insn, address, bti);
+ else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
this->emitWrite64(sel, insn, address, bti);
else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
this->emitUntypedWrite(sel, insn, address, bti);
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index 8d2e1da..51af686 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -175,6 +175,8 @@ namespace gbe
INLINE uint32_t getbti() const {
GBE_ASSERT(isRead() || isWrite());
switch (opcode) {
+ case SEL_OP_OBREAD:
+ case SEL_OP_OBWRITE:
case SEL_OP_DWORD_GATHER: return extra.function;
case SEL_OP_SAMPLE: return extra.rdbti;
case SEL_OP_VME: return extra.vme_bti;
@@ -188,6 +190,8 @@ namespace gbe
INLINE void setbti(uint32_t bti) {
GBE_ASSERT(isRead() || isWrite());
switch (opcode) {
+ case SEL_OP_OBREAD:
+ case SEL_OP_OBWRITE:
case SEL_OP_DWORD_GATHER: extra.function = bti; return;
case SEL_OP_SAMPLE: extra.rdbti = bti; return;
case SEL_OP_VME: extra.vme_bti = bti; return;
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index 0e11f9f..4a7caff 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -96,3 +96,5 @@ DECL_SELECTION_IR(STORE_PROFILING, StoreProfilingInstruction)
DECL_SELECTION_IR(WORKGROUP_OP, WorkGroupOpInstruction)
DECL_SELECTION_IR(SUBGROUP_OP, SubGroupOpInstruction)
DECL_SELECTION_IR(PRINTF, PrintfInstruction)
+DECL_SELECTION_IR(OBREAD, OBReadInstruction)
+DECL_SELECTION_IR(OBWRITE, OBWriteInstruction)
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 47606b2..88491a7 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -483,10 +483,12 @@ namespace ir {
AddressSpace AS,
uint32_t _valueNum,
bool dwAligned,
- AddressMode AM)
+ AddressMode AM,
+ bool ifBlock = false)
: MemInstruction(AM, AS, dwAligned, type, offset),
valueNum(_valueNum),
- values(dstValues)
+ values(dstValues),
+ ifBlock(ifBlock)
{
this->opcode = OP_LOAD;
}
@@ -519,9 +521,11 @@ namespace ir {
}
INLINE bool wellFormed(const Function &fn, std::string &why) const;
INLINE void out(std::ostream &out, const Function &fn) const;
+ INLINE bool isBlock() const { return ifBlock; }
uint8_t valueNum;
Tuple values;
+ bool ifBlock;
};
class ALIGNED_INSTRUCTION StoreInstruction :
public MemInstruction,
@@ -534,12 +538,14 @@ namespace ir {
AddressSpace addrSpace,
uint32_t valueNum,
bool dwAligned,
- AddressMode AM)
+ AddressMode AM,
+ bool ifBlock = false)
: MemInstruction(AM, addrSpace, dwAligned, type, offset)
{
this->opcode = OP_STORE;
this->values = values;
this->valueNum = valueNum;
+ this->ifBlock = ifBlock;
}
INLINE unsigned getValueNum() const { return valueNum; }
INLINE Register getValue(const Function &fn, unsigned id) const {
@@ -565,9 +571,12 @@ namespace ir {
}
INLINE bool wellFormed(const Function &fn, std::string &why) const;
INLINE void out(std::ostream &out, const Function &fn) const;
+ INLINE bool isBlock() const { return ifBlock; }
+
Register dst[0];
uint8_t valueNum;
Tuple values;
+ bool ifBlock;
};
class ALIGNED_INSTRUCTION SampleInstruction : // TODO
@@ -1655,6 +1664,8 @@ namespace ir {
}
INLINE void LoadInstruction::out(std::ostream &out, const Function &fn) const {
+ if(ifBlock)
+ out<< "BLOCK";
this->outOpcode(out);
out << "." << type << "." << AS << (dwAligned ? "." : ".un") << "aligned";
out << " {";
@@ -1672,6 +1683,8 @@ namespace ir {
}
INLINE void StoreInstruction::out(std::ostream &out, const Function &fn) const {
+ if(ifBlock)
+ out<< "BLOCK";
this->outOpcode(out);
out << "." << type << "." << AS << (dwAligned ? "." : ".un") << "aligned";
out << " %" << this->getSrc(fn, 0) << " {";
@@ -2221,7 +2234,9 @@ DECL_MEM_FN(MemInstruction, bool, isAligned(void), isAligned())
DECL_MEM_FN(MemInstruction, unsigned, getAddressIndex(void), getAddressIndex())
DECL_MEM_FN(AtomicInstruction, AtomicOps, getAtomicOpcode(void), getAtomicOpcode())
DECL_MEM_FN(StoreInstruction, uint32_t, getValueNum(void), getValueNum())
+DECL_MEM_FN(StoreInstruction, bool, isBlock(void), isBlock())
DECL_MEM_FN(LoadInstruction, uint32_t, getValueNum(void), getValueNum())
+DECL_MEM_FN(LoadInstruction, bool, isBlock(void), isBlock())
DECL_MEM_FN(LoadImmInstruction, Type, getType(void), getType())
DECL_MEM_FN(LabelInstruction, LabelIndex, getLabelIndex(void), getLabelIndex())
DECL_MEM_FN(BranchInstruction, bool, isPredicated(void), isPredicated())
@@ -2475,9 +2490,10 @@ DECL_MEM_FN(MemInstruction, void, setBtiReg(Register reg), setBtiReg(reg))
uint32_t valueNum, \
bool dwAligned, \
AddressMode AM, \
- unsigned SurfaceIndex) \
+ unsigned SurfaceIndex, \
+ bool isBlock) \
{ \
- internal::CLASS insn = internal::CLASS(type,tuple,offset,space,valueNum,dwAligned,AM); \
+ internal::CLASS insn = internal::CLASS(type,tuple,offset,space,valueNum,dwAligned,AM, isBlock); \
insn.setSurfaceIndex(SurfaceIndex);\
return insn.convert(); \
} \
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index 799a7bf..4a5811b 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -356,6 +356,8 @@ namespace ir {
}
/*! Return true if the given instruction is an instance of this class */
static bool isClassOf(const Instruction &insn);
+ /*! Return true if this store instruction is a block write */
+ bool isBlock() const;
};
/*! Load instruction. The source is simply the address where to get the data.
@@ -372,6 +374,8 @@ namespace ir {
}
/*! Return true if the given instruction is an instance of this class */
static bool isClassOf(const Instruction &insn);
+ /*! Return true if this load instruction is a block read */
+ bool isBlock() const;
};
/*! Load immediate instruction loads an typed immediate value into the given
@@ -827,10 +831,10 @@ namespace ir {
/*! ret */
Instruction RET(void);
/*! load.type.space {dst1,...,dst_valueNum} offset value, {bti} */
- Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, AddressMode, unsigned SurfaceIndex);
+ Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, AddressMode, unsigned SurfaceIndex, bool isBlock = false);
Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, AddressMode, Register bti);
/*! store.type.space offset {src1,...,src_valueNum} value {bti}*/
- Instruction STORE(Type type, Tuple src, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, AddressMode, unsigned SurfaceIndex);
+ Instruction STORE(Type type, Tuple src, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, AddressMode, unsigned SurfaceIndex, bool isBlock = false);
Instruction STORE(Type type, Tuple src, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, AddressMode, Register bti);
/*! loadi.type dst value */
Instruction LOADI(Type type, Register dst, ImmediateIndex value);
diff --git a/backend/src/ir/liveness.cpp b/backend/src/ir/liveness.cpp
index d48f067..3162d13 100644
--- a/backend/src/ir/liveness.cpp
+++ b/backend/src/ir/liveness.cpp
@@ -117,11 +117,16 @@ namespace ir {
if (insn.getOpcode() == ir::OP_SIMD_ID)
uniform = false;
+ // never treat the destination of a block read as uniform
+ if (insn.getOpcode() == ir::OP_LOAD && ir::cast<ir::LoadInstruction>(insn).isBlock())
+ uniform = false;
+
for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
const Register reg = insn.getSrc(srcID);
if (!fn.isUniformRegister(reg))
uniform = false;
}
+
// A destination is a killed value
for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
const Register reg = insn.getDst(dstID);
diff --git a/backend/src/libocl/CMakeLists.txt b/backend/src/libocl/CMakeLists.txt
index 1d1ec68..83e767c 100644
--- a/backend/src/libocl/CMakeLists.txt
+++ b/backend/src/libocl/CMakeLists.txt
@@ -182,7 +182,7 @@ MACRO(ADD_LL_TO_BC_TARGET M)
)
ENDMACRO(ADD_LL_TO_BC_TARGET)
-SET (OCL_LL_MODULES ocl_barrier ocl_clz)
+SET (OCL_LL_MODULES ocl_barrier ocl_clz ocl_substore)
FOREACH(f ${OCL_LL_MODULES})
COPY_THE_LL(${f})
ADD_LL_TO_BC_TARGET(${f})
diff --git a/backend/src/libocl/src/ocl_substore.ll b/backend/src/libocl/src/ocl_substore.ll
new file mode 100644
index 0000000..665cdfa
--- /dev/null
+++ b/backend/src/libocl/src/ocl_substore.ll
@@ -0,0 +1,9 @@
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir"
+
+declare void @__gen_ocl_sub_group_block_write_mem(i32 addrspace(1)* nocapture, i32) nounwind alwaysinline noduplicate
+
+define void @_Z27intel_sub_group_block_writePKU3AS1jj(i32 addrspace(1)* %p, i32 %data) nounwind alwaysinline noduplicate { ; thin wrapper: forwards both arguments to @__gen_ocl_sub_group_block_write_mem
+ call void @__gen_ocl_sub_group_block_write_mem(i32 addrspace(1)* %p, i32 %data)
+ ret void
+}
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
index a25dcef..66490cc 100644
--- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
@@ -133,3 +133,57 @@ RANGE_OP(scan_exclusive, max, float, true)
RANGE_OP(scan_exclusive, max, double, true)
#undef RANGE_OP
+PURE uint __gen_ocl_sub_group_block_read_mem(const global uint* p); // back-end built-in; PURE only - it reads the memory behind p, so CONST would let reads be merged or hoisted across block writes
+OVERLOADABLE uint intel_sub_group_block_read(const global uint* p) // public entry point: forwards p unchanged to the built-in
+{
+ return __gen_ocl_sub_group_block_read_mem(p);
+}
+OVERLOADABLE uint2 intel_sub_group_block_read2(const global uint* p) // two single block reads; component k comes from p + k * get_simd_size()
+{
+ return (uint2)(intel_sub_group_block_read(p),
+ intel_sub_group_block_read(p + get_simd_size()));
+}
+OVERLOADABLE uint4 intel_sub_group_block_read4(const global uint* p) // four single block reads; component k comes from p + k * get_simd_size()
+{
+ return (uint4)(intel_sub_group_block_read(p),
+ intel_sub_group_block_read(p + get_simd_size()),
+ intel_sub_group_block_read(p + get_simd_size() * 2),
+ intel_sub_group_block_read(p + get_simd_size() * 3));
+
+}
+OVERLOADABLE uint8 intel_sub_group_block_read8(const global uint* p) // eight single block reads; component k comes from p + k * get_simd_size()
+{
+ return (uint8)(intel_sub_group_block_read(p),
+ intel_sub_group_block_read(p + get_simd_size()),
+ intel_sub_group_block_read(p + get_simd_size() * 2),
+ intel_sub_group_block_read(p + get_simd_size() * 3),
+ intel_sub_group_block_read(p + get_simd_size() * 4),
+ intel_sub_group_block_read(p + get_simd_size() * 5),
+ intel_sub_group_block_read(p + get_simd_size() * 6),
+ intel_sub_group_block_read(p + get_simd_size() * 7));
+}
+
+OVERLOADABLE void intel_sub_group_block_write2(const global uint* p, uint2 data) // two single block writes; component k goes to p + k * get_simd_size()
+{
+ intel_sub_group_block_write(p, data.s0);
+ intel_sub_group_block_write(p + get_simd_size(), data.s1);
+}
+OVERLOADABLE void intel_sub_group_block_write4(const global uint* p,uint4 data) // four single block writes; component k goes to p + k * get_simd_size()
+{
+ intel_sub_group_block_write(p, data.s0);
+ intel_sub_group_block_write(p + get_simd_size(), data.s1);
+ intel_sub_group_block_write(p + get_simd_size() * 2, data.s2);
+ intel_sub_group_block_write(p + get_simd_size() * 3, data.s3);
+
+}
+OVERLOADABLE void intel_sub_group_block_write8(const global uint* p,uint8 data) // eight single block writes; component k goes to p + k * get_simd_size()
+{
+ intel_sub_group_block_write(p, data.s0);
+ intel_sub_group_block_write(p + get_simd_size(), data.s1);
+ intel_sub_group_block_write(p + get_simd_size() * 2, data.s2);
+ intel_sub_group_block_write(p + get_simd_size() * 3, data.s3);
+ intel_sub_group_block_write(p + get_simd_size() * 4, data.s4);
+ intel_sub_group_block_write(p + get_simd_size() * 5, data.s5);
+ intel_sub_group_block_write(p + get_simd_size() * 6, data.s6);
+ intel_sub_group_block_write(p + get_simd_size() * 7, data.s7);
+}
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.h b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
index 355ee30..d0676be 100644
--- a/backend/src/libocl/tmpl/ocl_simd.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
@@ -132,3 +132,14 @@ OVERLOADABLE double sub_group_scan_exclusive_max(double x);
OVERLOADABLE float intel_sub_group_shuffle(float x, uint c);
OVERLOADABLE int intel_sub_group_shuffle(int x, uint c);
OVERLOADABLE uint intel_sub_group_shuffle(uint x, uint c);
+
+/* block read/write */
+OVERLOADABLE uint intel_sub_group_block_read(const global uint* p);
+OVERLOADABLE uint2 intel_sub_group_block_read2(const global uint* p);
+OVERLOADABLE uint4 intel_sub_group_block_read4(const global uint* p);
+OVERLOADABLE uint8 intel_sub_group_block_read8(const global uint* p);
+
+OVERLOADABLE void intel_sub_group_block_write(const __global uint* p, uint data);
+OVERLOADABLE void intel_sub_group_block_write2(const __global uint* p, uint2 data);
+OVERLOADABLE void intel_sub_group_block_write4(const __global uint* p, uint4 data);
+OVERLOADABLE void intel_sub_group_block_write8(const __global uint* p, uint8 data);
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 3ddbfcc..e77290f 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -697,6 +697,8 @@ namespace gbe
void emitWorkGroupInst(CallInst &I, CallSite &CS, ir::WorkGroupOps opcode);
// Emit subgroup instructions
void emitSubGroupInst(CallInst &I, CallSite &CS, ir::WorkGroupOps opcode);
+ // Emit sub-group block read/write memory instructions
+ void emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite);
uint8_t appendSampler(CallSite::arg_iterator AI);
uint8_t getImageID(CallInst &I);
@@ -3730,6 +3732,9 @@ namespace gbe
case GEN_OCL_LRP:
this->newRegister(&I);
break;
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM:
+ this->newRegister(&I, NULL, false);
+ break;
case GEN_OCL_PRINTF:
this->newRegister(&I); // fall through
case GEN_OCL_PUTS:
@@ -3744,6 +3749,7 @@ namespace gbe
case GEN_OCL_CALC_TIMESTAMP:
case GEN_OCL_STORE_PROFILING:
case GEN_OCL_DEBUGWAIT:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM:
break;
case GEN_OCL_NOT_FOUND:
default:
@@ -3938,6 +3944,61 @@ namespace gbe
GBE_ASSERT(AI == AE);
}
+ void GenWriter::emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite) { // Lower a sub-group block read/write call to a LOAD/STORE IR instruction flagged as a block access
+ CallSite::arg_iterator AI = CS.arg_begin();
+ CallSite::arg_iterator AE = CS.arg_end();
+ GBE_ASSERT(AI != AE);
+
+ Value *llvmPtr = *(AI++); // first call argument: the pointer
+ Value *llvmValues;
+ ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmPtr->getType()->getPointerAddressSpace());
+ GBE_ASSERT(addrSpace == ir::MEM_GLOBAL); // only global pointers are accepted
+ ir::Register pointer = this->getRegister(llvmPtr);
+
+ ir::Register ptr;
+ ir::Register btiReg;
+ unsigned SurfaceIndex = 0xff; // value kept unless a constant BTI is found below
+
+ ir::AddressMode AM;
+ if (legacyMode) {
+ Value *bti = getBtiRegister(llvmPtr);
+ Value *ptrBase = getPointerBase(llvmPtr);
+ ir::Register baseReg = this->getRegister(ptrBase);
+ if (isa<ConstantInt>(bti)) {
+ AM = ir::AM_StaticBti;
+ SurfaceIndex = cast<ConstantInt>(bti)->getZExtValue();
+ addrSpace = btiToGen(SurfaceIndex);
+ } else {
+ AM = ir::AM_DynamicBti; // rejected by the assertion further down
+ addrSpace = ir::MEM_MIXED;
+ btiReg = this->getRegister(bti); // NOTE(review): btiReg is not read again in this function
+ }
+ const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
+ ptr = ctx.reg(pointerFamily);
+ ctx.SUB(ir::TYPE_U32, ptr, pointer, baseReg); // ptr = pointer - base register
+ } else {
+ AM = ir::AM_Stateless;
+ ptr = pointer; // stateless: the pointer register is used as-is
+ }
+
+ ir::Type type = ir::TYPE_U32;
+ GBE_ASSERT(AM != ir::AM_DynamicBti); // a dynamic BTI is not handled for block accesses
+
+ if(isWrite){
+ llvmValues = *(AI++); // second call argument: the data to store
+ const ir::Register values = getRegister(llvmValues);
+ const ir::Tuple tuple = ctx.arrayTuple(&values, 1);
+ ctx.STORE(type, tuple, ptr, addrSpace, 1, true, AM, SurfaceIndex, true); // valueNum = 1, dwAligned = true, isBlock = true
+ } else {
+ llvmValues = &I; // the call instruction itself names the loaded value
+ const ir::Register values = getRegister(llvmValues);
+ const ir::Tuple tuple = ctx.arrayTuple(&values, 1);
+ ctx.LOAD(type, tuple, ptr, addrSpace, 1, true, AM, SurfaceIndex, true); // valueNum = 1, dwAligned = true, isBlock = true
+ }
+
+ GBE_ASSERT(AI == AE); // every call argument must have been consumed
+ }
+
/* append a new sampler. should be called before any reference to
* a sampler_t value. */
uint8_t GenWriter::appendSampler(CallSite::arg_iterator AI) {
@@ -4762,6 +4823,10 @@ namespace gbe
ctx.LRP(ir::TYPE_FLOAT, dst, src0, src1, src2);
break;
}
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM:
+ this->emitBlockReadWriteMemInst(I, CS, false); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM:
+ this->emitBlockReadWriteMemInst(I, CS, true); break;
default: break;
}
}
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 213ead0..003be91 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -202,7 +202,7 @@ DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_MIN, __gen_ocl_work_group_scan_
DECL_LLVM_GEN_FUNCTION(WORK_GROUP_ALL, __gen_ocl_work_group_all)
DECL_LLVM_GEN_FUNCTION(WORK_GROUP_ANY, __gen_ocl_work_group_any)
-// work group function
+// sub group function
DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BROADCAST, __gen_ocl_sub_group_broadcast)
DECL_LLVM_GEN_FUNCTION(SUB_GROUP_REDUCE_ADD, __gen_ocl_sub_group_reduce_add)
@@ -217,5 +217,8 @@ DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_ADD, __gen_ocl_sub_group_scan_in
DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_MAX, __gen_ocl_sub_group_scan_inclusive_max)
DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_MIN, __gen_ocl_sub_group_scan_inclusive_min)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM, __gen_ocl_sub_group_block_read_mem)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM, __gen_ocl_sub_group_block_write_mem)
+
// common function
DECL_LLVM_GEN_FUNCTION(LRP, __gen_ocl_lrp)
--
2.7.4
More information about the Beignet
mailing list