[Beignet] [PATCH V5] Backend: Refine block read/write buffer
Yang, Rong R
rong.r.yang at intel.com
Fri Jun 24 09:02:16 UTC 2016
The patchset LGTM, pushed.
> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> Xiuli Pan
> Sent: Friday, June 24, 2016 11:08
> To: beignet at lists.freedesktop.org
> Cc: Pan, Xiuli <xiuli.pan at intel.com>
> Subject: [Beignet] [PATCH V5] Backend: Refine block read/write buffer
>
> From: Pan Xiuli <xiuli.pan at intel.com>
>
> Use up to 8 OWORDs as the read/write size for higher performance.
> V4: Reuse tmp for oword read for small and less vector.
> V5: Move all tmp registers into dst.
>
> Signed-off-by: Pan Xiuli <xiuli.pan at intel.com>
> ---
> backend/src/backend/gen_context.cpp | 154
> ++++++++++++++++++++++++++---
> backend/src/backend/gen_encoder.cpp | 6 +-
> backend/src/backend/gen_insn_selection.cpp | 86 +++++++++++-----
> backend/src/libocl/tmpl/ocl_simd.tmpl.cl | 41 +++-----
> backend/src/llvm/llvm_gen_backend.cpp | 46 ++++++---
> backend/src/llvm/llvm_gen_ocl_function.hxx | 6 ++
> backend/src/llvm/llvm_scalarize.cpp | 12 +++
> 7 files changed, 264 insertions(+), 87 deletions(-)
>
> diff --git a/backend/src/backend/gen_context.cpp
> b/backend/src/backend/gen_context.cpp
> index 081033a..5ddf084 100644
> --- a/backend/src/backend/gen_context.cpp
> +++ b/backend/src/backend/gen_context.cpp
> @@ -3488,11 +3488,17 @@ namespace gbe
> }
>
> void GenContext::emitOBReadInstruction(const SelectionInstruction &insn)
> {
> - const GenRegister dst = ra->genReg(insn.dst(0));
> + const GenRegister dst= GenRegister::retype(ra->genReg(insn.dst(1)),
> GEN_TYPE_UD);
> const GenRegister addr = GenRegister::toUniform(ra->genReg(insn.src(0)),
> GEN_TYPE_UD);
> - GenRegister header = GenRegister::retype(ra->genReg(insn.src(1)),
> GEN_TYPE_UD);
> + const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)),
> GEN_TYPE_UD);
> + const GenRegister headeraddr = GenRegister::offset(header, 0, 2*4);
> + const uint32_t vec_size = insn.extra.elem;
> + const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1 +
> vec_size)), GEN_TYPE_UD);
> + const uint32_t simdWidth = p->curr.execWidth;
>
> + // Make header
> p->push();
> + {
> // Copy r0 into the header first
> p->curr.execWidth = 8;
> p->curr.predicate = GEN_PREDICATE_NONE;
> @@ -3501,23 +3507,81 @@ namespace gbe
>
> // Update the header with the current address
> p->curr.execWidth = 1;
> - p->SHR(GenRegister::offset(header, 0, 2*4), addr,
> GenRegister::immud(4));
> + p->SHR(headeraddr, addr, GenRegister::immud(4));
>
> // Put zero in the general state base address
> - p->MOV(GenRegister::offset(header, 0, 5*4), GenRegister::immud(0));
> -
> + p->MOV(GenRegister::offset(header, 0, 5 * 4), GenRegister::immud(0));
> + }
> p->pop();
> - // Now read the data
> - p->OBREAD(dst, header, insn.getbti(), insn.extra.elem);
> + // Now read the data, oword block read can only work with simd16 and no
> mask
> + if (vec_size == 1) {
> + p->push();
> + {
> + p->curr.execWidth = 16;
> + p->curr.noMask = 1;
> + p->OBREAD(dst, header, insn.getbti(), simdWidth / 4);
> + }
> + p->pop();
> + } else if (vec_size == 2) {
> + p->push();
> + {
> + p->curr.execWidth = 16;
> + p->curr.noMask = 1;
> + p->OBREAD(tmp, header, insn.getbti(), simdWidth / 2);
> + }
> + p->pop();
> + p->MOV(ra->genReg(insn.dst(1)), GenRegister::offset(tmp, 0));
> + p->MOV(ra->genReg(insn.dst(2)), GenRegister::offset(tmp, simdWidth /
> 8));
> + } else if (vec_size == 4 || vec_size == 8) {
> + if (simdWidth == 8) {
> + for (uint32_t i = 0; i < vec_size / 4; i++) {
> + if (i > 0) {
> + p->push();
> + {
> + // Update the address in header
> + p->curr.execWidth = 1;
> + p->ADD(headeraddr, headeraddr, GenRegister::immud(8));
> + }
> + p->pop();
> + }
> + p->push();
> + {
> + p->curr.execWidth = 16;
> + p->curr.noMask = 1;
> + p->OBREAD(tmp, header, insn.getbti(), 8);
> + }
> + p->pop();
> + for (uint32_t j = 0; j < 4; j++)
> + p->MOV(ra->genReg(insn.dst(1 + j + i * 4)), GenRegister::offset(tmp,
> j));
> + }
> + } else {
> + for (uint32_t i = 0; i < vec_size / 2; i++) {
> + if (i > 0) {
> + p->push();
> + {
> + // Update the address in header
> + p->curr.execWidth = 1;
> + p->ADD(headeraddr, headeraddr, GenRegister::immud(8));
> + }
> + p->pop();
> + }
> + p->OBREAD(tmp, header, insn.getbti(), 8);
> + for (uint32_t j = 0; j < 2; j++)
> + p->MOV(ra->genReg(insn.dst(1 + j + i * 2)), GenRegister::offset(tmp,
> j*2));
> + }
> + }
> + } else NOT_SUPPORTED;
> }
>
> void GenContext::emitOBWriteInstruction(const SelectionInstruction &insn)
> {
> - const GenRegister addr = GenRegister::toUniform(ra-
> >genReg(insn.src(2)), GEN_TYPE_UD);
> - GenRegister header;
> - if (simdWidth == 8)
> - header = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_UD);
> - else
> - header = GenRegister::retype(GenRegister::Qn(ra-
> >genReg(insn.src(0)),1), GEN_TYPE_UD);
> + const GenRegister addr = GenRegister::toUniform(ra-
> >genReg(insn.src(0)), GEN_TYPE_UD);
> + const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)),
> GEN_TYPE_UD);
> + const GenRegister headeraddr = GenRegister::offset(header, 0, 2*4);
> + const uint32_t vec_size = insn.extra.elem;
> + const GenRegister tmp = GenRegister::offset(header, 1);
> + const uint32_t simdWidth = p->curr.execWidth;
> + uint32_t tmp_size = simdWidth * vec_size / 8;
> + tmp_size = tmp_size > 4 ? 4 : tmp_size;
>
> p->push();
> // Copy r0 into the header first
> @@ -3528,14 +3592,72 @@ namespace gbe
>
> // Update the header with the current address
> p->curr.execWidth = 1;
> - p->SHR(GenRegister::offset(header, 0, 2*4), addr,
> GenRegister::immud(4));
> + p->SHR(headeraddr, addr, GenRegister::immud(4));
>
> // Put zero in the general state base address
> p->MOV(GenRegister::offset(header, 0, 5*4), GenRegister::immud(0));
>
> p->pop();
> - // Now write the data
> - p->OBWRITE(header, insn.getbti(), insn.extra.elem);
> + // Now write the data, oword block write can only work with simd16 and
> no mask
> + if (vec_size == 1) {
> + p->MOV(tmp, ra->genReg(insn.src(1)));
> + p->push();
> + {
> + p->curr.execWidth = 16;
> + p->curr.noMask = 1;
> + p->OBWRITE(header, insn.getbti(), simdWidth / 4);
> + }
> + p->pop();
> + } else if (vec_size == 2) {
> + p->MOV(GenRegister::offset(tmp, 0), ra->genReg(insn.src(1))) ;
> + p->MOV(GenRegister::offset(tmp, simdWidth / 8), ra-
> >genReg(insn.src(2))) ;
> + p->push();
> + {
> + p->curr.execWidth = 16;
> + p->curr.noMask = 1;
> + p->OBWRITE(header, insn.getbti(), simdWidth / 2);
> + }
> + p->pop();
> + } else if (vec_size == 4 || vec_size == 8) {
> + if (simdWidth == 8) {
> + for (uint32_t i = 0; i < vec_size / 4; i++) {
> + for (uint32_t j = 0; j < 4; j++)
> + p->MOV(GenRegister::offset(tmp, j), ra->genReg(insn.src(1 + j +
> i*4))) ;
> + if (i > 0) {
> + p->push();
> + {
> + // Update the address in header
> + p->curr.execWidth = 1;
> + p->ADD(headeraddr, headeraddr, GenRegister::immud(8));
> + }
> + p->pop();
> + }
> + p->push();
> + {
> + p->curr.execWidth = 16;
> + p->curr.noMask = 1;
> + p->OBWRITE(header, insn.getbti(), 8);
> + }
> + p->pop();
> + }
> + } else {
> + for (uint32_t i = 0; i < vec_size / 2; i++) {
> + for (uint32_t j = 0; j < 2; j++)
> + p->MOV(GenRegister::offset(tmp, j * 2), ra->genReg(insn.src(1 + j +
> i*2))) ;
> + if (i > 0) {
> + p->push();
> + {
> + // Update the address in header
> + p->curr.execWidth = 1;
> + p->ADD(headeraddr, headeraddr, GenRegister::immud(8));
> + }
> + p->pop();
> + }
> + p->OBWRITE(header, insn.getbti(), 8);
> + }
> + }
> + } else NOT_SUPPORTED;
> +
> }
>
> void GenContext::emitMBReadInstruction(const SelectionInstruction &insn)
> {
> diff --git a/backend/src/backend/gen_encoder.cpp
> b/backend/src/backend/gen_encoder.cpp
> index eb9fbeb..f8c99b2 100644
> --- a/backend/src/backend/gen_encoder.cpp
> +++ b/backend/src/backend/gen_encoder.cpp
> @@ -269,10 +269,10 @@ namespace gbe
> {
> const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA;
> p->setMessageDescriptor(insn, sfid, msg_length, response_length);
> - assert(size == 2 || size == 4);
> + assert(size == 2 || size == 4 || size == 8);
> insn->bits3.gen7_oblock_rw.msg_type = msg_type;
> insn->bits3.gen7_oblock_rw.bti = bti;
> - insn->bits3.gen7_oblock_rw.block_size = size == 2 ? 2 : 3;
> + insn->bits3.gen7_oblock_rw.block_size = size == 2 ? 2 : (size == 4 ? 3 : 4);
> insn->bits3.gen7_oblock_rw.header_present = 1;
> }
>
> @@ -1261,7 +1261,7 @@ namespace gbe
> void GenEncoder::OBREAD(GenRegister dst, GenRegister header, uint32_t
> bti, uint32_t size) {
> GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> const uint32_t msg_length = 1;
> - const uint32_t response_length = size / 2; // Size is in owords
> + const uint32_t response_length = size / 2; // Size is in regs
> this->setHeader(insn);
> this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
> this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
> diff --git a/backend/src/backend/gen_insn_selection.cpp
> b/backend/src/backend/gen_insn_selection.cpp
> index 9a5567d..788a69a 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -702,9 +702,9 @@ namespace gbe
> void SUBGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src,
> GenRegister tmpData1, GenRegister tmpData2);
> /*! Oblock read */
> - void OBREAD(GenRegister dst, GenRegister addr, GenRegister header,
> uint32_t bti, uint32_t size);
> + void OBREAD(GenRegister* dsts, uint32_t vec_size, GenRegister addr,
> GenRegister header, uint32_t bti, GenRegister* tmp, uint32_t tmp_size);
> /*! Oblock write */
> - void OBWRITE(GenRegister addr, GenRegister value, GenRegister header,
> uint32_t bti, uint32_t size);
> + void OBWRITE(GenRegister addr, GenRegister* values, uint32_t vec_size,
> GenRegister header, uint32_t bti, GenRegister* tmp, uint32_t tmp_size);
> /*! Media block read */
> void MBREAD(GenRegister* dsts, GenRegister coordx, GenRegister
> coordy, GenRegister header, GenRegister* tmp, uint32_t bti, uint32_t
> vec_size);
> /*! Media block write */
> @@ -2027,38 +2027,54 @@ namespace gbe
> insn->src(0) = src;
> insn->src(1) = tmpData2;
> }
> - void Selection::Opaque::OBREAD(GenRegister dst,
> + void Selection::Opaque::OBREAD(GenRegister* dsts,
> + uint32_t vec_size,
> GenRegister addr,
> GenRegister header,
> uint32_t bti,
> - uint32_t size) {
> - SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, 1, 2);
> - insn->dst(0) = dst;
> + GenRegister* tmp,
> + uint32_t tmp_size) {
> + SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, 1 +
> vec_size + tmp_size, 1);
> + SelectionVector *vector = this->appendVector();
> + insn->dst(0) = header;
> + for (uint32_t i = 0; i < vec_size; ++i)
> + insn->dst(1 + i) = dsts[i];
> + for (uint32_t i = 0; i < tmp_size; ++i)
> + insn->dst(1 + i + vec_size) = tmp[i];
> insn->src(0) = addr;
> - insn->src(1) = header;
> insn->setbti(bti);
> - insn->extra.elem = size / sizeof(int[4]); // number of owords
> + insn->extra.elem = vec_size; // number of vector size
> +
> + // tmp regs for OWORD read dst
> + vector->regNum = tmp_size;
> + vector->reg = &insn->dst(1 + vec_size);
> + vector->offsetID = 1 + vec_size;
> + vector->isSrc = 0;
> }
>
> void Selection::Opaque::OBWRITE(GenRegister addr,
> - GenRegister value,
> + GenRegister* values,
> + uint32_t vec_size,
> GenRegister header,
> uint32_t bti,
> - uint32_t size) {
> - SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE, 0, 3);
> + GenRegister* tmp,
> + uint32_t tmp_size) {
> + SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE,
> tmp_size + 1, vec_size + 1);
> SelectionVector *vector = this->appendVector();
> - insn->src(0) = header;
> - insn->src(1) = value;
> - insn->src(2) = addr;
> - insn->state = this->curr;
> + insn->src(0) = addr;
> + for (uint32_t i = 0; i < vec_size; ++i)
> + insn->src(i + 1) = values[i];
> + insn->dst(0) = header;
> + for (uint32_t i = 0; i < tmp_size; ++i)
> + insn->dst(i + 1) = tmp[i];
> insn->setbti(bti);
> - insn->extra.elem = size / sizeof(int[4]); // number of owords
> + insn->extra.elem = vec_size; // number of vector_size
>
> - // We need to put the header and the data together
> - vector->regNum = 2;
> - vector->reg = &insn->src(0);
> + // header and tmp regs for the OWORD write
> + vector->regNum = tmp_size + 1;
> + vector->reg = &insn->dst(0);
> vector->offsetID = 0;
> - vector->isSrc = 1;
> + vector->isSrc = 0;
> }
>
> void Selection::Opaque::MBREAD(GenRegister* dsts,
> @@ -4113,10 +4129,19 @@ extern bool OCL_DEBUGINFO; // first defined by
> calling BVAR in program.cpp
> ir::BTI bti) const
> {
> using namespace ir;
> - const GenRegister header = sel.selReg(sel.reg(FAMILY_DWORD),
> TYPE_U32);
> - const GenRegister value = sel.selReg(insn.getValue(0), TYPE_U32);
> + const uint32_t vec_size = insn.getValueNum();
> const uint32_t simdWidth = sel.ctx.getSimdWidth();
> - sel.OBREAD(value, address, header, bti.imm, simdWidth * sizeof(int));
> + const GenRegister header =
> GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)),
> TYPE_U32);
> + vector<GenRegister> valuesVec;
> + for(uint32_t i = 0; i < vec_size; i++)
> + valuesVec.push_back(sel.selReg(insn.getValue(i), TYPE_U32));
> + // check tmp_size needed for OWORD read: max 8 OWORDs, thus 4 regs
> + uint32_t tmp_size = simdWidth * vec_size / 8;
> + tmp_size = tmp_size > 4 ? 4 : tmp_size;
> + vector<GenRegister> tmpVec;
> + for(uint32_t i = 0; i < tmp_size; i++)
> +
> tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY
> _DWORD)), TYPE_U32));
> + sel.OBREAD(&valuesVec[0], vec_size, address, header, bti.imm,
> &tmpVec[0], tmp_size);
> }
>
> // check whether all binded table index point to constant memory
> @@ -4289,10 +4314,19 @@ extern bool OCL_DEBUGINFO; // first defined by
> calling BVAR in program.cpp
> ir::BTI bti) const
> {
> using namespace ir;
> - const GenRegister header = sel.selReg(sel.reg(FAMILY_DWORD),
> TYPE_U32);
> - const GenRegister value = sel.selReg(insn.getValue(0), TYPE_U32);
> + const uint32_t vec_size = insn.getValueNum();
> const uint32_t simdWidth = sel.ctx.getSimdWidth();
> - sel.OBWRITE(address, value, header, bti.imm, simdWidth * sizeof(int));
> + const GenRegister header =
> GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)),
> TYPE_U32);
> + vector<GenRegister> valuesVec;
> + for(uint32_t i = 0; i < vec_size; i++)
> + valuesVec.push_back(sel.selReg(insn.getValue(i), TYPE_U32));
> + // check tmp_size needed for OWORD write: max 8 OWORDs, thus 4 regs
> + uint32_t tmp_size = simdWidth * vec_size / 8;
> + tmp_size = tmp_size > 4 ? 4 : tmp_size;
> + vector<GenRegister> tmpVec;
> + for(uint32_t i = 0; i < tmp_size; i++)
> +
> tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY
> _DWORD)), TYPE_U32));
> + sel.OBWRITE(address, &valuesVec[0], vec_size, header, bti.imm,
> &tmpVec[0], tmp_size);
> }
>
> virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
> diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
> b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
> index 5d3d0bb..b066502 100644
> --- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
> +++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
> @@ -134,63 +134,46 @@ RANGE_OP(scan_exclusive, max, double, true)
>
> #undef RANGE_OP
> PURE CONST uint __gen_ocl_sub_group_block_read_mem(const global
> uint* p);
> +PURE CONST uint2 __gen_ocl_sub_group_block_read_mem2(const global
> uint* p);
> +PURE CONST uint4 __gen_ocl_sub_group_block_read_mem4(const global
> uint* p);
> +PURE CONST uint8 __gen_ocl_sub_group_block_read_mem8(const global
> uint* p);
> OVERLOADABLE uint intel_sub_group_block_read(const global uint* p)
> {
> return __gen_ocl_sub_group_block_read_mem(p);
> }
> OVERLOADABLE uint2 intel_sub_group_block_read2(const global uint* p)
> {
> - return (uint2)(intel_sub_group_block_read(p),
> - intel_sub_group_block_read(p + get_simd_size()));
> + return __gen_ocl_sub_group_block_read_mem2(p);
> }
> OVERLOADABLE uint4 intel_sub_group_block_read4(const global uint* p)
> {
> - return (uint4)(intel_sub_group_block_read(p),
> - intel_sub_group_block_read(p + get_simd_size()),
> - intel_sub_group_block_read(p + get_simd_size() * 2),
> - intel_sub_group_block_read(p + get_simd_size() * 3));
> + return __gen_ocl_sub_group_block_read_mem4(p);
>
> }
> OVERLOADABLE uint8 intel_sub_group_block_read8(const global uint* p)
> {
> - return (uint8)(intel_sub_group_block_read(p),
> - intel_sub_group_block_read(p + get_simd_size()),
> - intel_sub_group_block_read(p + get_simd_size() * 2),
> - intel_sub_group_block_read(p + get_simd_size() * 3),
> - intel_sub_group_block_read(p + get_simd_size() * 4),
> - intel_sub_group_block_read(p + get_simd_size() * 5),
> - intel_sub_group_block_read(p + get_simd_size() * 6),
> - intel_sub_group_block_read(p + get_simd_size() * 7));
> + return __gen_ocl_sub_group_block_read_mem8(p);
> }
> -
> void __gen_ocl_sub_group_block_write_mem(const global uint* p, uint
> data);
> +void __gen_ocl_sub_group_block_write_mem2(const global uint* p, uint2
> data);
> +void __gen_ocl_sub_group_block_write_mem4(const global uint* p, uint4
> data);
> +void __gen_ocl_sub_group_block_write_mem8(const global uint* p, uint8
> data);
> OVERLOADABLE void intel_sub_group_block_write(const global uint* p, uint
> data)
> {
> __gen_ocl_sub_group_block_write_mem(p, data);
> }
> OVERLOADABLE void intel_sub_group_block_write2(const global uint* p,
> uint2 data)
> {
> - intel_sub_group_block_write(p, data.s0);
> - intel_sub_group_block_write(p + get_simd_size(), data.s1);
> + __gen_ocl_sub_group_block_write_mem2(p, data);
> }
> OVERLOADABLE void intel_sub_group_block_write4(const global uint*
> p,uint4 data)
> {
> - intel_sub_group_block_write(p, data.s0);
> - intel_sub_group_block_write(p + get_simd_size(), data.s1);
> - intel_sub_group_block_write(p + get_simd_size() * 2, data.s2);
> - intel_sub_group_block_write(p + get_simd_size() * 3, data.s3);
> + __gen_ocl_sub_group_block_write_mem4(p, data);
>
> }
> OVERLOADABLE void intel_sub_group_block_write8(const global uint*
> p,uint8 data)
> {
> - intel_sub_group_block_write(p, data.s0);
> - intel_sub_group_block_write(p + get_simd_size(), data.s1);
> - intel_sub_group_block_write(p + get_simd_size() * 2, data.s2);
> - intel_sub_group_block_write(p + get_simd_size() * 3, data.s3);
> - intel_sub_group_block_write(p + get_simd_size() * 4, data.s4);
> - intel_sub_group_block_write(p + get_simd_size() * 5, data.s5);
> - intel_sub_group_block_write(p + get_simd_size() * 6, data.s6);
> - intel_sub_group_block_write(p + get_simd_size() * 7, data.s7);
> + __gen_ocl_sub_group_block_write_mem8(p, data);
> }
>
> PURE CONST uint __gen_ocl_sub_group_block_read_image(image2d_t p,
> int x, int y);
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp
> b/backend/src/llvm/llvm_gen_backend.cpp
> index 419f585..074391f 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -698,7 +698,7 @@ namespace gbe
> // Emit subgroup instructions
> void emitSubGroupInst(CallInst &I, CallSite &CS, ir::WorkGroupOps
> opcode);
> // Emit subgroup instructions
> - void emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite);
> + void emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite,
> uint8_t vec_size);
> void emitBlockReadWriteImageInst(CallInst &I, CallSite &CS, bool isWrite,
> uint8_t vec_size);
>
> uint8_t appendSampler(CallSite::arg_iterator AI);
> @@ -3726,6 +3726,9 @@ namespace gbe
> case GEN_OCL_SUB_GROUP_SCAN_INCLUSIVE_MIN:
> case GEN_OCL_LRP:
> case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM:
> + case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM2:
> + case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM4:
> + case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM8:
> case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE:
> case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2:
> case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4:
> @@ -3747,6 +3750,9 @@ namespace gbe
> case GEN_OCL_STORE_PROFILING:
> case GEN_OCL_DEBUGWAIT:
> case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM:
> + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM2:
> + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM4:
> + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM8:
> case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE:
> case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE2:
> case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE4:
> @@ -3945,13 +3951,12 @@ namespace gbe
> GBE_ASSERT(AI == AE);
> }
>
> - void GenWriter::emitBlockReadWriteMemInst(CallInst &I, CallSite &CS,
> bool isWrite) {
> + void GenWriter::emitBlockReadWriteMemInst(CallInst &I, CallSite &CS,
> bool isWrite, uint8_t vec_size) {
> CallSite::arg_iterator AI = CS.arg_begin();
> CallSite::arg_iterator AE = CS.arg_end();
> GBE_ASSERT(AI != AE);
>
> Value *llvmPtr = *(AI++);
> - Value *llvmValues;
> ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmPtr-
> >getType()->getPointerAddressSpace());
> GBE_ASSERT(addrSpace == ir::MEM_GLOBAL);
> ir::Register pointer = this->getRegister(llvmPtr);
> @@ -3986,15 +3991,18 @@ namespace gbe
> GBE_ASSERT(AM != ir::AM_DynamicBti);
>
> if(isWrite){
> - llvmValues = *(AI++);
> - const ir::Register values = getRegister(llvmValues);
> - const ir::Tuple tuple = ctx.arrayTuple(&values, 1);
> - ctx.STORE(type, tuple, ptr, addrSpace, 1, true, AM, SurfaceIndex, true);
> + Value *llvmValues = *(AI++);
> + vector<ir::Register> srcTupleData;
> + for(int i = 0;i < vec_size; i++)
> + srcTupleData.push_back(getRegister(llvmValues, i));
> + const ir::Tuple tuple = ctx.arrayTuple(&srcTupleData[0], vec_size);
> + ctx.STORE(type, tuple, ptr, addrSpace, vec_size, true, AM, SurfaceIndex,
> true);
> } else {
> - llvmValues = &I;
> - const ir::Register values = getRegister(llvmValues);
> - const ir::Tuple tuple = ctx.arrayTuple(&values, 1);
> - ctx.LOAD(type, tuple, ptr, addrSpace, 1, true, AM, SurfaceIndex, true);
> + vector<ir::Register> dstTupleData;
> + for(int i = 0;i < vec_size; i++)
> + dstTupleData.push_back(getRegister(&I, i));
> + const ir::Tuple tuple = ctx.arrayTuple(&dstTupleData[0], vec_size);
> + ctx.LOAD(type, tuple, ptr, addrSpace, vec_size, true, AM, SurfaceIndex,
> true);
> }
>
> GBE_ASSERT(AI == AE);
> @@ -4858,9 +4866,21 @@ namespace gbe
> break;
> }
> case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM:
> - this->emitBlockReadWriteMemInst(I, CS, false); break;
> + this->emitBlockReadWriteMemInst(I, CS, false, 1); break;
> + case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM2:
> + this->emitBlockReadWriteMemInst(I, CS, false, 2); break;
> + case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM4:
> + this->emitBlockReadWriteMemInst(I, CS, false, 4); break;
> + case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM8:
> + this->emitBlockReadWriteMemInst(I, CS, false, 8); break;
> case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM:
> - this->emitBlockReadWriteMemInst(I, CS, true); break;
> + this->emitBlockReadWriteMemInst(I, CS, true, 1); break;
> + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM2:
> + this->emitBlockReadWriteMemInst(I, CS, true, 2); break;
> + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM4:
> + this->emitBlockReadWriteMemInst(I, CS, true, 4); break;
> + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM8:
> + this->emitBlockReadWriteMemInst(I, CS, true, 8); break;
> case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE:
> this->emitBlockReadWriteImageInst(I, CS, false, 1); break;
> case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2:
> diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx
> b/backend/src/llvm/llvm_gen_ocl_function.hxx
> index 456ab58..48a72d1 100644
> --- a/backend/src/llvm/llvm_gen_ocl_function.hxx
> +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
> @@ -218,7 +218,13 @@
> DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_MAX,
> __gen_ocl_sub_group_scan_in
> DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_MIN,
> __gen_ocl_sub_group_scan_inclusive_min)
>
> DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM,
> __gen_ocl_sub_group_block_read_mem)
> +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM2,
> __gen_ocl_sub_group_block_read_mem2)
> +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM4,
> __gen_ocl_sub_group_block_read_mem4)
> +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM8,
> __gen_ocl_sub_group_block_read_mem8)
> DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM,
> __gen_ocl_sub_group_block_write_mem)
> +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM2,
> __gen_ocl_sub_group_block_write_mem2)
> +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM4,
> __gen_ocl_sub_group_block_write_mem4)
> +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM8,
> __gen_ocl_sub_group_block_write_mem8)
> DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE,
> __gen_ocl_sub_group_block_read_image)
> DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE2,
> __gen_ocl_sub_group_block_read_image2)
> DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE4,
> __gen_ocl_sub_group_block_read_image4)
> diff --git a/backend/src/llvm/llvm_scalarize.cpp
> b/backend/src/llvm/llvm_scalarize.cpp
> index e60bf4b..615fb50 100644
> --- a/backend/src/llvm/llvm_scalarize.cpp
> +++ b/backend/src/llvm/llvm_scalarize.cpp
> @@ -693,7 +693,19 @@ namespace gbe {
> *CI = InsertToVector(call, *CI);
> break;
> }
> + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM:
> + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM2:
> + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM4:
> + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM8:
> + {
> + if ((*CI)->getType()->isVectorTy())
> + *CI = InsertToVector(call, *CI);
> + break;
> + }
> case GEN_OCL_VME:
> + case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM2:
> + case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM4:
> + case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM8:
> case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2:
> case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4:
> case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE8:
> --
> 2.7.4
>
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/beignet
More information about the Beignet
mailing list