[Beignet] [PATCH 1/2] Backend: Refine block read/write buffer
Xiuli Pan
xiuli.pan at intel.com
Fri Jun 24 02:20:14 UTC 2016
From: Pan Xiuli <xiuli.pan at intel.com>
Use up to 8 OWORDs as the read/write size for higher performance.
V4: Reuse the tmp registers for OWORD reads with small SIMD widths and smaller vectors.
Signed-off-by: Pan Xiuli <xiuli.pan at intel.com>
---
backend/src/backend/gen_context.cpp | 154 ++++++++++++++++++++++++++---
backend/src/backend/gen_encoder.cpp | 6 +-
backend/src/backend/gen_insn_selection.cpp | 84 +++++++++++-----
backend/src/libocl/tmpl/ocl_simd.tmpl.cl | 41 +++-----
backend/src/llvm/llvm_gen_backend.cpp | 46 ++++++---
backend/src/llvm/llvm_gen_ocl_function.hxx | 6 ++
backend/src/llvm/llvm_scalarize.cpp | 12 +++
7 files changed, 263 insertions(+), 86 deletions(-)
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 081033a..5303191 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -3488,11 +3488,17 @@ namespace gbe
}
void GenContext::emitOBReadInstruction(const SelectionInstruction &insn) {
- const GenRegister dst = ra->genReg(insn.dst(0));
+ const GenRegister dst= GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
const GenRegister addr = GenRegister::toUniform(ra->genReg(insn.src(0)), GEN_TYPE_UD);
- GenRegister header = GenRegister::retype(ra->genReg(insn.src(1)), GEN_TYPE_UD);
+ const GenRegister header = GenRegister::retype(ra->genReg(insn.src(1)), GEN_TYPE_UD);
+ const GenRegister headeraddr = GenRegister::offset(header, 0, 2*4);
+ const uint32_t vec_size = insn.extra.elem;
+ const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(vec_size)), GEN_TYPE_UD);
+ const uint32_t simdWidth = p->curr.execWidth;
+ // Make header
p->push();
+ {
// Copy r0 into the header first
p->curr.execWidth = 8;
p->curr.predicate = GEN_PREDICATE_NONE;
@@ -3501,23 +3507,81 @@ namespace gbe
// Update the header with the current address
p->curr.execWidth = 1;
- p->SHR(GenRegister::offset(header, 0, 2*4), addr, GenRegister::immud(4));
+ p->SHR(headeraddr, addr, GenRegister::immud(4));
// Put zero in the general state base address
- p->MOV(GenRegister::offset(header, 0, 5*4), GenRegister::immud(0));
-
+ p->MOV(GenRegister::offset(header, 0, 5 * 4), GenRegister::immud(0));
+ }
p->pop();
- // Now read the data
- p->OBREAD(dst, header, insn.getbti(), insn.extra.elem);
+ // Now read the data, oword block read can only work with simd16 and no mask
+ if (vec_size == 1) {
+ p->push();
+ {
+ p->curr.execWidth = 16;
+ p->curr.noMask = 1;
+ p->OBREAD(dst, header, insn.getbti(), simdWidth / 4);
+ }
+ p->pop();
+ } else if (vec_size == 2) {
+ p->push();
+ {
+ p->curr.execWidth = 16;
+ p->curr.noMask = 1;
+ p->OBREAD(tmp, header, insn.getbti(), simdWidth / 2);
+ }
+ p->pop();
+ p->MOV(ra->genReg(insn.dst(0)), GenRegister::offset(tmp, 0));
+ p->MOV(ra->genReg(insn.dst(1)), GenRegister::offset(tmp, simdWidth / 8));
+ } else if (vec_size == 4 || vec_size == 8) {
+ if (simdWidth == 8) {
+ for (uint32_t i = 0; i < vec_size / 4; i++) {
+ if (i > 0) {
+ p->push();
+ {
+ // Update the address in header
+ p->curr.execWidth = 1;
+ p->ADD(headeraddr, headeraddr, GenRegister::immud(8));
+ }
+ p->pop();
+ }
+ p->push();
+ {
+ p->curr.execWidth = 16;
+ p->curr.noMask = 1;
+ p->OBREAD(tmp, header, insn.getbti(), 8);
+ }
+ p->pop();
+ for (uint32_t j = 0; j < 4; j++)
+ p->MOV(ra->genReg(insn.dst(j + i * 4)), GenRegister::offset(tmp, j));
+ }
+ } else {
+ for (uint32_t i = 0; i < vec_size / 2; i++) {
+ if (i > 0) {
+ p->push();
+ {
+ // Update the address in header
+ p->curr.execWidth = 1;
+ p->ADD(headeraddr, headeraddr, GenRegister::immud(8));
+ }
+ p->pop();
+ }
+ p->OBREAD(tmp, header, insn.getbti(), 8);
+ for (uint32_t j = 0; j < 2; j++)
+ p->MOV(ra->genReg(insn.dst(j + i * 2)), GenRegister::offset(tmp, j*2));
+ }
+ }
+ } else NOT_SUPPORTED;
}
void GenContext::emitOBWriteInstruction(const SelectionInstruction &insn) {
- const GenRegister addr = GenRegister::toUniform(ra->genReg(insn.src(2)), GEN_TYPE_UD);
- GenRegister header;
- if (simdWidth == 8)
- header = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_UD);
- else
- header = GenRegister::retype(GenRegister::Qn(ra->genReg(insn.src(0)),1), GEN_TYPE_UD);
+ const GenRegister addr = GenRegister::toUniform(ra->genReg(insn.src(0)), GEN_TYPE_UD);
+ const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
+ const GenRegister headeraddr = GenRegister::offset(header, 0, 2*4);
+ const uint32_t vec_size = insn.extra.elem;
+ const GenRegister tmp = GenRegister::offset(header, 1);
+ const uint32_t simdWidth = p->curr.execWidth;
+ uint32_t tmp_size = simdWidth * vec_size / 8;
+ tmp_size = tmp_size > 4 ? 4 : tmp_size;
p->push();
// Copy r0 into the header first
@@ -3528,14 +3592,72 @@ namespace gbe
// Update the header with the current address
p->curr.execWidth = 1;
- p->SHR(GenRegister::offset(header, 0, 2*4), addr, GenRegister::immud(4));
+ p->SHR(headeraddr, addr, GenRegister::immud(4));
// Put zero in the general state base address
p->MOV(GenRegister::offset(header, 0, 5*4), GenRegister::immud(0));
p->pop();
- // Now write the data
- p->OBWRITE(header, insn.getbti(), insn.extra.elem);
+ // Now write the data, oword block write can only work with simd16 and no mask
+ if (vec_size == 1) {
+ p->MOV(tmp, ra->genReg(insn.src(1)));
+ p->push();
+ {
+ p->curr.execWidth = 16;
+ p->curr.noMask = 1;
+ p->OBWRITE(header, insn.getbti(), simdWidth / 4);
+ }
+ p->pop();
+ } else if (vec_size == 2) {
+ p->MOV(GenRegister::offset(tmp, 0), ra->genReg(insn.src(1))) ;
+ p->MOV(GenRegister::offset(tmp, simdWidth / 8), ra->genReg(insn.src(2))) ;
+ p->push();
+ {
+ p->curr.execWidth = 16;
+ p->curr.noMask = 1;
+ p->OBWRITE(header, insn.getbti(), simdWidth / 2);
+ }
+ p->pop();
+ } else if (vec_size == 4 || vec_size == 8) {
+ if (simdWidth == 8) {
+ for (uint32_t i = 0; i < vec_size / 4; i++) {
+ for (uint32_t j = 0; j < 4; j++)
+ p->MOV(GenRegister::offset(tmp, j), ra->genReg(insn.src(1 + j + i*4))) ;
+ if (i > 0) {
+ p->push();
+ {
+ // Update the address in header
+ p->curr.execWidth = 1;
+ p->ADD(headeraddr, headeraddr, GenRegister::immud(8));
+ }
+ p->pop();
+ }
+ p->push();
+ {
+ p->curr.execWidth = 16;
+ p->curr.noMask = 1;
+ p->OBWRITE(header, insn.getbti(), 8);
+ }
+ p->pop();
+ }
+ } else {
+ for (uint32_t i = 0; i < vec_size / 2; i++) {
+ for (uint32_t j = 0; j < 2; j++)
+ p->MOV(GenRegister::offset(tmp, j * 2), ra->genReg(insn.src(1 + j + i*2))) ;
+ if (i > 0) {
+ p->push();
+ {
+ // Update the address in header
+ p->curr.execWidth = 1;
+ p->ADD(headeraddr, headeraddr, GenRegister::immud(8));
+ }
+ p->pop();
+ }
+ p->OBWRITE(header, insn.getbti(), 8);
+ }
+ }
+ } else NOT_SUPPORTED;
+
}
void GenContext::emitMBReadInstruction(const SelectionInstruction &insn) {
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index eb9fbeb..f8c99b2 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -269,10 +269,10 @@ namespace gbe
{
const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA;
p->setMessageDescriptor(insn, sfid, msg_length, response_length);
- assert(size == 2 || size == 4);
+ assert(size == 2 || size == 4 || size == 8);
insn->bits3.gen7_oblock_rw.msg_type = msg_type;
insn->bits3.gen7_oblock_rw.bti = bti;
- insn->bits3.gen7_oblock_rw.block_size = size == 2 ? 2 : 3;
+ insn->bits3.gen7_oblock_rw.block_size = size == 2 ? 2 : (size == 4 ? 3 : 4);
insn->bits3.gen7_oblock_rw.header_present = 1;
}
@@ -1261,7 +1261,7 @@ namespace gbe
void GenEncoder::OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t size) {
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
const uint32_t msg_length = 1;
- const uint32_t response_length = size / 2; // Size is in owords
+ const uint32_t response_length = size / 2; // Size is in regs
this->setHeader(insn);
this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 9a5567d..c566957 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -702,9 +702,9 @@ namespace gbe
void SUBGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src,
GenRegister tmpData1, GenRegister tmpData2);
/*! Oblock read */
- void OBREAD(GenRegister dst, GenRegister addr, GenRegister header, uint32_t bti, uint32_t size);
+ void OBREAD(GenRegister* dsts, uint32_t vec_size, GenRegister addr, GenRegister header, uint32_t bti, GenRegister* tmp, uint32_t tmp_size);
/*! Oblock write */
- void OBWRITE(GenRegister addr, GenRegister value, GenRegister header, uint32_t bti, uint32_t size);
+ void OBWRITE(GenRegister addr, GenRegister* values, uint32_t vec_size, GenRegister header, uint32_t bti, GenRegister* tmp, uint32_t tmp_size);
/*! Media block read */
void MBREAD(GenRegister* dsts, GenRegister coordx, GenRegister coordy, GenRegister header, GenRegister* tmp, uint32_t bti, uint32_t vec_size);
/*! Media block write */
@@ -2027,38 +2027,54 @@ namespace gbe
insn->src(0) = src;
insn->src(1) = tmpData2;
}
- void Selection::Opaque::OBREAD(GenRegister dst,
+ void Selection::Opaque::OBREAD(GenRegister* dsts,
+ uint32_t vec_size,
GenRegister addr,
GenRegister header,
uint32_t bti,
- uint32_t size) {
- SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, 1, 2);
- insn->dst(0) = dst;
+ GenRegister* tmp,
+ uint32_t tmp_size) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, vec_size + tmp_size, 2);
+ SelectionVector *vector = this->appendVector();
+ for (uint32_t i = 0; i < vec_size; ++i)
+ insn->dst(i) = dsts[i];
+ for (uint32_t i = 0; i < tmp_size; ++i)
+ insn->dst(i + vec_size) = tmp[i];
insn->src(0) = addr;
insn->src(1) = header;
insn->setbti(bti);
- insn->extra.elem = size / sizeof(int[4]); // number of owords
+ insn->extra.elem = vec_size; // number of vector size
+
+ // tmp regs for OWORD read dst
+ vector->regNum = tmp_size;
+ vector->reg = &insn->dst(vec_size);
+ vector->offsetID = vec_size;
+ vector->isSrc = 0;
}
void Selection::Opaque::OBWRITE(GenRegister addr,
- GenRegister value,
+ GenRegister* values,
+ uint32_t vec_size,
GenRegister header,
uint32_t bti,
- uint32_t size) {
- SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE, 0, 3);
+ GenRegister* tmp,
+ uint32_t tmp_size) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE, tmp_size + 1, vec_size + 1);
SelectionVector *vector = this->appendVector();
- insn->src(0) = header;
- insn->src(1) = value;
- insn->src(2) = addr;
- insn->state = this->curr;
+ insn->src(0) = addr;
+ for (uint32_t i = 0; i < vec_size; ++i)
+ insn->src(i + 1) = values[i];
+ insn->dst(0) = header;
+ for (uint32_t i = 0; i < tmp_size; ++i)
+ insn->dst(i + 1) = tmp[i];
insn->setbti(bti);
- insn->extra.elem = size / sizeof(int[4]); // number of owords
+ insn->extra.elem = vec_size; // number of vector_size
- // We need to put the header and the data together
- vector->regNum = 2;
- vector->reg = &insn->src(0);
+ // tmp regs for OWORD read dst
+ vector->regNum = tmp_size + 1;
+ vector->reg = &insn->dst(0);
vector->offsetID = 0;
- vector->isSrc = 1;
+ vector->isSrc = 0;
}
void Selection::Opaque::MBREAD(GenRegister* dsts,
@@ -4113,10 +4129,19 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
ir::BTI bti) const
{
using namespace ir;
- const GenRegister header = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
- const GenRegister value = sel.selReg(insn.getValue(0), TYPE_U32);
+ const uint32_t vec_size = insn.getValueNum();
const uint32_t simdWidth = sel.ctx.getSimdWidth();
- sel.OBREAD(value, address, header, bti.imm, simdWidth * sizeof(int));
+ const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32);
+ vector<GenRegister> valuesVec;
+ for(uint32_t i = 0; i < vec_size; i++)
+ valuesVec.push_back(sel.selReg(insn.getValue(i), TYPE_U32));
+ // check tmp_size for OWORD read need, max 8 OWROD thus 4 regs
+ uint32_t tmp_size = simdWidth * vec_size / 8;
+ tmp_size = tmp_size > 4 ? 4 : tmp_size;
+ vector<GenRegister> tmpVec;
+ for(uint32_t i = 0; i < tmp_size; i++)
+ tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32));
+ sel.OBREAD(&valuesVec[0], vec_size, address, header, bti.imm, &tmpVec[0], tmp_size);
}
// check whether all binded table index point to constant memory
@@ -4289,10 +4314,19 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
ir::BTI bti) const
{
using namespace ir;
- const GenRegister header = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
- const GenRegister value = sel.selReg(insn.getValue(0), TYPE_U32);
+ const uint32_t vec_size = insn.getValueNum();
const uint32_t simdWidth = sel.ctx.getSimdWidth();
- sel.OBWRITE(address, value, header, bti.imm, simdWidth * sizeof(int));
+ const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32);
+ vector<GenRegister> valuesVec;
+ for(uint32_t i = 0; i < vec_size; i++)
+ valuesVec.push_back(sel.selReg(insn.getValue(i), TYPE_U32));
+ // check tmp_size for OWORD write need, max 8 OWROD thus 4 regs
+ uint32_t tmp_size = simdWidth * vec_size / 8;
+ tmp_size = tmp_size > 4 ? 4 : tmp_size;
+ vector<GenRegister> tmpVec;
+ for(uint32_t i = 0; i < tmp_size; i++)
+ tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32));
+ sel.OBWRITE(address, &valuesVec[0], vec_size, header, bti.imm, &tmpVec[0], tmp_size);
}
virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
index 5d3d0bb..b066502 100644
--- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
@@ -134,63 +134,46 @@ RANGE_OP(scan_exclusive, max, double, true)
#undef RANGE_OP
PURE CONST uint __gen_ocl_sub_group_block_read_mem(const global uint* p);
+PURE CONST uint2 __gen_ocl_sub_group_block_read_mem2(const global uint* p);
+PURE CONST uint4 __gen_ocl_sub_group_block_read_mem4(const global uint* p);
+PURE CONST uint8 __gen_ocl_sub_group_block_read_mem8(const global uint* p);
OVERLOADABLE uint intel_sub_group_block_read(const global uint* p)
{
return __gen_ocl_sub_group_block_read_mem(p);
}
OVERLOADABLE uint2 intel_sub_group_block_read2(const global uint* p)
{
- return (uint2)(intel_sub_group_block_read(p),
- intel_sub_group_block_read(p + get_simd_size()));
+ return __gen_ocl_sub_group_block_read_mem2(p);
}
OVERLOADABLE uint4 intel_sub_group_block_read4(const global uint* p)
{
- return (uint4)(intel_sub_group_block_read(p),
- intel_sub_group_block_read(p + get_simd_size()),
- intel_sub_group_block_read(p + get_simd_size() * 2),
- intel_sub_group_block_read(p + get_simd_size() * 3));
+ return __gen_ocl_sub_group_block_read_mem4(p);
}
OVERLOADABLE uint8 intel_sub_group_block_read8(const global uint* p)
{
- return (uint8)(intel_sub_group_block_read(p),
- intel_sub_group_block_read(p + get_simd_size()),
- intel_sub_group_block_read(p + get_simd_size() * 2),
- intel_sub_group_block_read(p + get_simd_size() * 3),
- intel_sub_group_block_read(p + get_simd_size() * 4),
- intel_sub_group_block_read(p + get_simd_size() * 5),
- intel_sub_group_block_read(p + get_simd_size() * 6),
- intel_sub_group_block_read(p + get_simd_size() * 7));
+ return __gen_ocl_sub_group_block_read_mem8(p);
}
-
void __gen_ocl_sub_group_block_write_mem(const global uint* p, uint data);
+void __gen_ocl_sub_group_block_write_mem2(const global uint* p, uint2 data);
+void __gen_ocl_sub_group_block_write_mem4(const global uint* p, uint4 data);
+void __gen_ocl_sub_group_block_write_mem8(const global uint* p, uint8 data);
OVERLOADABLE void intel_sub_group_block_write(const global uint* p, uint data)
{
__gen_ocl_sub_group_block_write_mem(p, data);
}
OVERLOADABLE void intel_sub_group_block_write2(const global uint* p, uint2 data)
{
- intel_sub_group_block_write(p, data.s0);
- intel_sub_group_block_write(p + get_simd_size(), data.s1);
+ __gen_ocl_sub_group_block_write_mem2(p, data);
}
OVERLOADABLE void intel_sub_group_block_write4(const global uint* p,uint4 data)
{
- intel_sub_group_block_write(p, data.s0);
- intel_sub_group_block_write(p + get_simd_size(), data.s1);
- intel_sub_group_block_write(p + get_simd_size() * 2, data.s2);
- intel_sub_group_block_write(p + get_simd_size() * 3, data.s3);
+ __gen_ocl_sub_group_block_write_mem4(p, data);
}
OVERLOADABLE void intel_sub_group_block_write8(const global uint* p,uint8 data)
{
- intel_sub_group_block_write(p, data.s0);
- intel_sub_group_block_write(p + get_simd_size(), data.s1);
- intel_sub_group_block_write(p + get_simd_size() * 2, data.s2);
- intel_sub_group_block_write(p + get_simd_size() * 3, data.s3);
- intel_sub_group_block_write(p + get_simd_size() * 4, data.s4);
- intel_sub_group_block_write(p + get_simd_size() * 5, data.s5);
- intel_sub_group_block_write(p + get_simd_size() * 6, data.s6);
- intel_sub_group_block_write(p + get_simd_size() * 7, data.s7);
+ __gen_ocl_sub_group_block_write_mem8(p, data);
}
PURE CONST uint __gen_ocl_sub_group_block_read_image(image2d_t p, int x, int y);
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 419f585..074391f 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -698,7 +698,7 @@ namespace gbe
// Emit subgroup instructions
void emitSubGroupInst(CallInst &I, CallSite &CS, ir::WorkGroupOps opcode);
// Emit subgroup instructions
- void emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite);
+ void emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size);
void emitBlockReadWriteImageInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size);
uint8_t appendSampler(CallSite::arg_iterator AI);
@@ -3726,6 +3726,9 @@ namespace gbe
case GEN_OCL_SUB_GROUP_SCAN_INCLUSIVE_MIN:
case GEN_OCL_LRP:
case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM2:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM4:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM8:
case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE:
case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2:
case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4:
@@ -3747,6 +3750,9 @@ namespace gbe
case GEN_OCL_STORE_PROFILING:
case GEN_OCL_DEBUGWAIT:
case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM2:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM4:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM8:
case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE:
case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE2:
case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE4:
@@ -3945,13 +3951,12 @@ namespace gbe
GBE_ASSERT(AI == AE);
}
- void GenWriter::emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite) {
+ void GenWriter::emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size) {
CallSite::arg_iterator AI = CS.arg_begin();
CallSite::arg_iterator AE = CS.arg_end();
GBE_ASSERT(AI != AE);
Value *llvmPtr = *(AI++);
- Value *llvmValues;
ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmPtr->getType()->getPointerAddressSpace());
GBE_ASSERT(addrSpace == ir::MEM_GLOBAL);
ir::Register pointer = this->getRegister(llvmPtr);
@@ -3986,15 +3991,18 @@ namespace gbe
GBE_ASSERT(AM != ir::AM_DynamicBti);
if(isWrite){
- llvmValues = *(AI++);
- const ir::Register values = getRegister(llvmValues);
- const ir::Tuple tuple = ctx.arrayTuple(&values, 1);
- ctx.STORE(type, tuple, ptr, addrSpace, 1, true, AM, SurfaceIndex, true);
+ Value *llvmValues = *(AI++);
+ vector<ir::Register> srcTupleData;
+ for(int i = 0;i < vec_size; i++)
+ srcTupleData.push_back(getRegister(llvmValues, i));
+ const ir::Tuple tuple = ctx.arrayTuple(&srcTupleData[0], vec_size);
+ ctx.STORE(type, tuple, ptr, addrSpace, vec_size, true, AM, SurfaceIndex, true);
} else {
- llvmValues = &I;
- const ir::Register values = getRegister(llvmValues);
- const ir::Tuple tuple = ctx.arrayTuple(&values, 1);
- ctx.LOAD(type, tuple, ptr, addrSpace, 1, true, AM, SurfaceIndex, true);
+ vector<ir::Register> dstTupleData;
+ for(int i = 0;i < vec_size; i++)
+ dstTupleData.push_back(getRegister(&I, i));
+ const ir::Tuple tuple = ctx.arrayTuple(&dstTupleData[0], vec_size);
+ ctx.LOAD(type, tuple, ptr, addrSpace, vec_size, true, AM, SurfaceIndex, true);
}
GBE_ASSERT(AI == AE);
@@ -4858,9 +4866,21 @@ namespace gbe
break;
}
case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM:
- this->emitBlockReadWriteMemInst(I, CS, false); break;
+ this->emitBlockReadWriteMemInst(I, CS, false, 1); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM2:
+ this->emitBlockReadWriteMemInst(I, CS, false, 2); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM4:
+ this->emitBlockReadWriteMemInst(I, CS, false, 4); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM8:
+ this->emitBlockReadWriteMemInst(I, CS, false, 8); break;
case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM:
- this->emitBlockReadWriteMemInst(I, CS, true); break;
+ this->emitBlockReadWriteMemInst(I, CS, true, 1); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM2:
+ this->emitBlockReadWriteMemInst(I, CS, true, 2); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM4:
+ this->emitBlockReadWriteMemInst(I, CS, true, 4); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM8:
+ this->emitBlockReadWriteMemInst(I, CS, true, 8); break;
case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE:
this->emitBlockReadWriteImageInst(I, CS, false, 1); break;
case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2:
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 456ab58..48a72d1 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -218,7 +218,13 @@ DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_MAX, __gen_ocl_sub_group_scan_in
DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_MIN, __gen_ocl_sub_group_scan_inclusive_min)
DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM, __gen_ocl_sub_group_block_read_mem)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM2, __gen_ocl_sub_group_block_read_mem2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM4, __gen_ocl_sub_group_block_read_mem4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM8, __gen_ocl_sub_group_block_read_mem8)
DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM, __gen_ocl_sub_group_block_write_mem)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM2, __gen_ocl_sub_group_block_write_mem2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM4, __gen_ocl_sub_group_block_write_mem4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM8, __gen_ocl_sub_group_block_write_mem8)
DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE, __gen_ocl_sub_group_block_read_image)
DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE2, __gen_ocl_sub_group_block_read_image2)
DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE4, __gen_ocl_sub_group_block_read_image4)
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
index e60bf4b..615fb50 100644
--- a/backend/src/llvm/llvm_scalarize.cpp
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -693,7 +693,19 @@ namespace gbe {
*CI = InsertToVector(call, *CI);
break;
}
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM2:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM4:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM8:
+ {
+ if ((*CI)->getType()->isVectorTy())
+ *CI = InsertToVector(call, *CI);
+ break;
+ }
case GEN_OCL_VME:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM2:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM4:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM8:
case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2:
case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4:
case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE8:
--
2.7.4
More information about the Beignet
mailing list