[Beignet] [PATCH V4] Backend: Refine block read/write instruction selection
Yang, Rong R
rong.r.yang at intel.com
Mon Dec 26 03:36:40 UTC 2016
Pushed, thanks.
> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> Guo, Yejun
> Sent: Monday, December 19, 2016 16:43
> To: Pan, Xiuli <xiuli.pan at intel.com>; beignet at lists.freedesktop.org
> Cc: Pan, Xiuli <xiuli.pan at intel.com>
> Subject: Re: [Beignet] [PATCH V4] Backend: Refine block read/write
> instruction selection
>
> LGTM, thanks.
>
> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> Xiuli Pan
> Sent: Monday, December 19, 2016 3:58 PM
> To: beignet at lists.freedesktop.org
> Cc: Pan, Xiuli
> Subject: [Beignet] [PATCH V4] Backend: Refine block read/write instruction
> selection
>
> From: Pan Xiuli <xiuli.pan at intel.com>
>
> Move the block pack/unpack into instruction selection so that it can
> be optimized. Also rename some variables to avoid misleading names,
> and add some new helper functions to the GenEncoder class.
> V2: Use ud8grf instead of f8grf to save a retype.
> V3: Merge the variable renaming patch and fix some comments.
> V4: Fix some SIMD8-related bugs and comment typos.
>
> Signed-off-by: Pan Xiuli <xiuli.pan at intel.com>
> ---
> backend/src/backend/gen8_encoder.cpp | 40 ++-
> backend/src/backend/gen_context.cpp | 459 ++---------------------------
> backend/src/backend/gen_encoder.cpp | 105 ++++---
> backend/src/backend/gen_encoder.hpp | 18 +-
> backend/src/backend/gen_insn_selection.cpp | 448
> +++++++++++++++++++++-------
> 5 files changed, 440 insertions(+), 630 deletions(-)
>
> diff --git a/backend/src/backend/gen8_encoder.cpp
> b/backend/src/backend/gen8_encoder.cpp
> index 8f73346..39dcfd3 100644
> --- a/backend/src/backend/gen8_encoder.cpp
> +++ b/backend/src/backend/gen8_encoder.cpp
> @@ -840,20 +840,15 @@ namespace gbe
> gen8_insn->bits3.gen8_block_rw_a64.header_present = 1;
> }
>
> - void Gen8Encoder::OBREADA64(GenRegister dst, GenRegister header,
> uint32_t bti, uint32_t size) {
> - GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> + void Gen8Encoder::OBREADA64(GenRegister dst, GenRegister header,
> uint32_t bti, uint32_t ow_size) {
> + GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> const uint32_t msg_length = 1;
> - uint32_t rsize = size / 2;
> - uint32_t msgsize = size;
> - // When size is 1 OWord, which means half a reg, we need to know which
> half to use
> - if (size == 1) {
> - if (dst.subnr == 0)
> - msgsize = 0;
> - else
> - msgsize = 1;
> - }
> - rsize = rsize == 0 ? 1 : rsize;
> - const uint32_t response_length = rsize; // Size is in regs
> + uint32_t sizeinreg = ow_size / 2;
> + // half reg should also have size 1
> + sizeinreg = sizeinreg == 0 ? 1 : sizeinreg;
> + const uint32_t block_size = getOBlockSize(ow_size, dst.subnr == 0);
> + const uint32_t response_length = sizeinreg; // Size is in reg
> +
> this->setHeader(insn);
> this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
> this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
> @@ -861,21 +856,22 @@ namespace gbe
> setOBlockRWA64(this,
> insn,
> bti,
> - msgsize,
> + block_size,
> GEN8_P1_BLOCK_READ_A64,
> msg_length,
> response_length);
>
> }
>
> - void Gen8Encoder::OBWRITEA64(GenRegister header, uint32_t bti,
> uint32_t size) {
> - GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> - uint32_t rsize = size / 2;
> - rsize = rsize == 0 ? 1 : rsize;
> - const uint32_t msg_length = 1 + rsize; // Size is in owords
> + void Gen8Encoder::OBWRITEA64(GenRegister header, uint32_t bti,
> uint32_t ow_size) {
> + GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> + uint32_t sizeinreg = ow_size / 2;
> + // half reg should also have size 1
> + sizeinreg = sizeinreg == 0 ? 1 : sizeinreg;
> + const uint32_t msg_length = 1 + sizeinreg; // Size is in reg and header
> const uint32_t response_length = 0;
> - uint32_t msgsize = size;
> - msgsize = msgsize == 1 ? 0 : msgsize;
> + const uint32_t block_size = getOBlockSize(ow_size);
> +
> this->setHeader(insn);
> this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
> this->setSrc1(insn, GenRegister::immud(0));
> @@ -883,7 +879,7 @@ namespace gbe
> setOBlockRWA64(this,
> insn,
> bti,
> - msgsize,
> + block_size,
> GEN8_P1_BLOCK_WRITE_A64,
> msg_length,
> response_length);
> diff --git a/backend/src/backend/gen_context.cpp
> b/backend/src/backend/gen_context.cpp
> index 8288fa5..791e607 100644
> --- a/backend/src/backend/gen_context.cpp
> +++ b/backend/src/backend/gen_context.cpp
> @@ -3551,458 +3551,39 @@ namespace gbe
> }
>
> void GenContext::emitOBReadInstruction(const SelectionInstruction &insn)
> {
> - const GenRegister dst= ra->genReg(insn.dst(1));
> - const GenRegister addrreg = ra->genReg(insn.src(0));
> - uint32_t type = dst.type;
> - uint32_t typesize = typeSize(type);
> - const uint32_t vec_size = insn.extra.elem;
> - const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1 +
> vec_size)), type);
> - const uint32_t simdWidth = p->curr.execWidth;
> - const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)),
> GEN_TYPE_UD);
> - const GenRegister addr = GenRegister::toUniform(addrreg, addrreg.type);
> - GenRegister headeraddr;
> - bool isA64 = insn.getbti() == 255;
> + const GenRegister header = ra->genReg(insn.src(0));
> + const GenRegister tmp = ra->genReg(insn.dst(0));
> + const uint32_t bti = insn.getbti();
> + const uint32_t ow_size = insn.extra.elem;
> + bool isA64 = bti == 255;
> if (isA64)
> - headeraddr = GenRegister::retype(GenRegister::offset(header, 0, 0),
> GEN_TYPE_UL);
> + p->OBREADA64(tmp, header, bti, ow_size);
> else
> - headeraddr = GenRegister::offset(header, 0, 2*4);
> -
> - // Make header
> - p->push();
> - {
> - // Copy r0 into the header first
> - p->curr.execWidth = 8;
> - p->curr.predicate = GEN_PREDICATE_NONE;
> - p->curr.noMask = 1;
> - p->MOV(header, GenRegister::ud8grf(0, 0));
> -
> - // Update the header with the current address
> - p->curr.execWidth = 1;
> - p->MOV(headeraddr, addr);
> -
> - // Put zero in the general state base address
> - if (!isA64)
> - p->MOV(GenRegister::offset(header, 0, 5 * 4), GenRegister::immud(0));
> -
> - }
> - p->pop();
> - // Now read the data, oword block read can only work with simd16 and no
> mask
> - if (vec_size == 1) {
> - p->push();
> - {
> - p->curr.execWidth = 16;
> - p->curr.noMask = 1;
> - if (isA64) {
> - //p->curr.execWidth = 8;
> - p->OBREADA64(dst, header, insn.getbti(), simdWidth * typesize / 16);
> - }
> - else
> - p->OBREAD(dst, header, insn.getbti(), simdWidth * typesize / 16);
> - }
> - p->pop();
> - } else if (vec_size == 2) {
> - p->push();
> - {
> - p->curr.execWidth = 16;
> - p->curr.noMask = 1;
> - if (isA64)
> - p->OBREADA64(tmp, header, insn.getbti(), simdWidth * typesize / 8);
> - else
> - p->OBREAD(tmp, header, insn.getbti(), simdWidth * typesize / 8);
> - }
> - p->pop();
> - p->MOV(ra->genReg(insn.dst(1)), GenRegister::offset(tmp, 0));
> - p->MOV(ra->genReg(insn.dst(2)), GenRegister::offset(tmp, 0,
> simdWidth * typesize ));
> - } else if (vec_size == 4) {
> - if (simdWidth == 8) {
> - p->push();
> - {
> - p->curr.execWidth = 16;
> - p->curr.noMask = 1;
> - if (isA64)
> - p->OBREADA64(tmp, header, insn.getbti(), 2 * typesize);
> - else
> - p->OBREAD(tmp, header, insn.getbti(), 2 * typesize);
> - }
> - p->pop();
> - for (uint32_t j = 0; j < 4; j++)
> - p->MOV(ra->genReg(insn.dst(1 + j)), GenRegister::offset(tmp, 0, j *
> simdWidth * typesize ));
> - } else {
> - for (uint32_t i = 0; i < typesize / 2; i++) {
> - if (i > 0) {
> - p->push();
> - {
> - // Update the address in header
> - p->curr.execWidth = 1;
> - p->ADD(headeraddr, headeraddr, GenRegister::immud(128));
> - }
> - p->pop();
> - }
> - if (isA64)
> - p->OBREADA64(tmp, header, insn.getbti(), 8);
> - else
> - p->OBREAD(tmp, header, insn.getbti(), 8);
> - for (uint32_t j = 0; j < 8 / typesize ; j++)
> - p->MOV(ra->genReg(insn.dst(1 + j + i * 2)), GenRegister::offset(tmp,
> 0 ,j * simdWidth * typesize ));
> - }
> - }
> - } else if (vec_size == 8) {
> - if (simdWidth == 8) {
> - for (uint32_t i = 0; i < typesize / 2; i++) {
> - if (i > 0) {
> - p->push();
> - {
> - // Update the address in header
> - p->curr.execWidth = 1;
> - p->ADD(headeraddr, headeraddr, GenRegister::immud(128));
> - }
> - p->pop();
> - }
> - p->push();
> - {
> - p->curr.execWidth = 16;
> - p->curr.noMask = 1;
> - if (isA64)
> - p->OBREADA64(tmp, header, insn.getbti(), 8);
> - else
> - p->OBREAD(tmp, header, insn.getbti(), 8);
> - }
> - p->pop();
> - for (uint32_t j = 0; j < 16 / typesize; j++)
> - p->MOV(ra->genReg(insn.dst(1 + j + i * 4)), GenRegister::offset(tmp,
> 0, j * simdWidth * typesize ));
> - }
> - } else {
> - for (uint32_t i = 0; i < typesize ; i++) {
> - if (i > 0) {
> - p->push();
> - {
> - // Update the address in header
> - p->curr.execWidth = 1;
> - p->ADD(headeraddr, headeraddr, GenRegister::immud(128));
> - }
> - p->pop();
> - }
> - if (isA64)
> - p->OBREADA64(tmp, header, insn.getbti(), 8);
> - else
> - p->OBREAD(tmp, header, insn.getbti(), 8);
> - for (uint32_t j = 0; j < 8 / typesize; j++)
> - p->MOV(ra->genReg(insn.dst(1 + j + i * 8 / typesize)),
> GenRegister::offset(tmp, 0 ,j * simdWidth * typesize ));
> - }
> - }
> - } else NOT_SUPPORTED;
> + p->OBREAD(tmp, header, bti, ow_size);
> }
>
> void GenContext::emitOBWriteInstruction(const SelectionInstruction &insn)
> {
> - const GenRegister addrreg = ra->genReg(insn.src(0));
> - const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)),
> GEN_TYPE_UD);
> - uint32_t type = ra->genReg(insn.src(1)).type;
> - uint32_t typesize = typeSize(type);
> - const uint32_t vec_size = insn.extra.elem;
> - const GenRegister tmp = GenRegister::offset(header, 1);
> - const GenRegister addr = GenRegister::toUniform(addrreg, addrreg.type);
> - GenRegister headeraddr;
> - bool isA64 = insn.getbti() == 255;
> + const GenRegister header = ra->genReg(insn.src(0));
> + const uint32_t bti = insn.getbti();
> + const uint32_t ow_size = insn.extra.elem;
> + bool isA64 = bti == 255;
> if (isA64)
> - headeraddr = GenRegister::retype(GenRegister::offset(header, 0, 0),
> GEN_TYPE_UL);
> + p->OBWRITEA64(header, bti, ow_size);
> else
> - headeraddr = GenRegister::offset(header, 0, 2*4);
> - const uint32_t simdWidth = p->curr.execWidth;
> - uint32_t tmp_size = simdWidth * vec_size / 8;
> - tmp_size = tmp_size > 4 ? 4 : tmp_size;
> - uint32_t offset_size = isA64 ? 128 : 8;
> -
> - p->push();
> - // Copy r0 into the header first
> - p->curr.execWidth = 8;
> - p->curr.predicate = GEN_PREDICATE_NONE;
> - p->curr.noMask = 1;
> - p->MOV(header, GenRegister::ud8grf(0,0));
> -
> - // Update the header with the current address
> - p->curr.execWidth = 1;
> - if (isA64)
> - p->MOV(headeraddr, addr);
> - else
> - p->SHR(headeraddr, addr, GenRegister::immud(4));
> -
> - // Put zero in the general state base address
> - if (!isA64)
> - p->MOV(GenRegister::offset(header, 0, 5*4), GenRegister::immud(0));
> -
> - p->pop();
> - // Now write the data, oword block write can only work with simd16 and
> no mask
> - if (vec_size == 1) {
> - p->MOV(GenRegister::retype(tmp, type), ra->genReg(insn.src(1)));
> - p->push();
> - {
> - p->curr.execWidth = 16;
> - p->curr.noMask = 1;
> - if (isA64)
> - p->OBWRITEA64(header, insn.getbti(), simdWidth * typesize / 16);
> - else
> - p->OBWRITE(header, insn.getbti(), simdWidth * typesize / 16);
> - }
> - p->pop();
> - } else if (vec_size == 2) {
> - p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, 0), type), ra-
> >genReg(insn.src(1)));
> - p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, simdWidth *
> typesize), type), ra->genReg(insn.src(2)));
> - p->push();
> - {
> - p->curr.execWidth = 16;
> - p->curr.noMask = 1;
> - if (isA64)
> - p->OBWRITEA64(header, insn.getbti(), simdWidth * typesize / 8);
> - else
> - p->OBWRITE(header, insn.getbti(), simdWidth * typesize / 8);
> - }
> - p->pop();
> - } else if (vec_size == 4) {
> - if (simdWidth == 8) {
> - for (uint32_t i = 0; i < 4; i++)
> - p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, i *
> simdWidth * typesize), type), ra->genReg(insn.src(1 + i)));
> - p->push();
> - {
> - p->curr.execWidth = 16;
> - p->curr.noMask = 1;
> - if (isA64)
> - p->OBWRITEA64(header, insn.getbti(), 2 * typesize);
> - else
> - p->OBWRITE(header, insn.getbti(), 2 * typesize);
> - }
> - p->pop();
> - } else {
> - for (uint32_t i = 0; i < typesize / 2; i++) {
> - for (uint32_t j = 0; j < 8 / typesize; j++)
> - p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j *
> simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 8 / typesize)));
> - if (i > 0) {
> - p->push();
> - {
> - // Update the address in header
> - p->curr.execWidth = 1;
> - p->ADD(headeraddr, headeraddr, GenRegister::immud(offset_size));
> - }
> - p->pop();
> - }
> - if (isA64)
> - p->OBWRITEA64(header, insn.getbti(), 8);
> - else
> - p->OBWRITE(header, insn.getbti(), 8);
> - }
> - }
> - } else if (vec_size == 8) {
> - if (simdWidth == 8) {
> - for (uint32_t i = 0; i < typesize / 2; i++) {
> - for (uint32_t j = 0; j < 16 / typesize; j++)
> - p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j *
> simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 16 / typesize)));
> - if (i > 0) {
> - p->push();
> - {
> - // Update the address in header
> - p->curr.execWidth = 1;
> - p->ADD(headeraddr, headeraddr, GenRegister::immud(offset_size));
> - }
> - p->pop();
> - }
> - p->push();
> - {
> - p->curr.execWidth = 16;
> - p->curr.noMask = 1;
> - if (isA64)
> - p->OBWRITEA64(header, insn.getbti(), 8);
> - else
> - p->OBWRITE(header, insn.getbti(), 8);
> - }
> - p->pop();
> - }
> - } else {
> - for (uint32_t i = 0; i < typesize; i++) {
> - for (uint32_t j = 0; j < 8 / typesize; j++)
> - p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j *
> simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 8 / typesize)));
> - if (i > 0) {
> - p->push();
> - {
> - // Update the address in header
> - p->curr.execWidth = 1;
> - p->ADD(headeraddr, headeraddr, GenRegister::immud(offset_size));
> - }
> - p->pop();
> - }
> - if (isA64)
> - p->OBWRITEA64(header, insn.getbti(), 8);
> - else
> - p->OBWRITE(header, insn.getbti(), 8);
> - }
> - }
> - } else NOT_SUPPORTED;
> -
> + p->OBWRITE(header, bti, ow_size);
> }
>
> void GenContext::emitMBReadInstruction(const SelectionInstruction &insn)
> {
> - const GenRegister dst = ra->genReg(insn.dst(1));
> - const GenRegister coordx = GenRegister::toUniform(ra-
> >genReg(insn.src(0)),GEN_TYPE_D);
> - const GenRegister coordy = GenRegister::toUniform(ra-
> >genReg(insn.src(1)),GEN_TYPE_D);
> - const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)),
> GEN_TYPE_UD);
> - const GenRegister offsetx = GenRegister::offset(header, 0, 0*4);
> - const GenRegister offsety = GenRegister::offset(header, 0, 1*4);
> - const GenRegister blocksizereg = GenRegister::offset(header, 0, 2*4);
> - size_t vec_size = insn.extra.elem;
> - uint32_t type = dst.type;
> - uint32_t typesize = typeSize(type);
> - uint32_t block_width = typesize * simdWidth;
> - uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16;
> -
> - if (simdWidth == 8)
> - {
> - p->push();
> - // Copy r0 into the header first
> - p->curr.execWidth = 8;
> - p->curr.predicate = GEN_PREDICATE_NONE;
> - p->curr.noMask = 1;
> - p->MOV(header, GenRegister::ud8grf(0,0));
> -
> - // Update the header with the coord
> - p->curr.execWidth = 1;
> - p->MOV(offsetx, coordx);
> - p->MOV(offsety, coordy);
> - // Update block width and height
> - p->MOV(blocksizereg, GenRegister::immud(blocksize));
> - p->curr.execWidth = 8;
> - // ushort in simd8 will have half reg, but response lenght is still 1
> - uint32_t rsize = vec_size * typesize / 4;
> - rsize = rsize ? rsize : 1;
> - // Now read the data
> - p->MBREAD(dst, header, insn.getbti(), rsize);
> - p->pop();
> -
> - }
> - else if (simdWidth == 16)
> - {
> - const GenRegister tmp = GenRegister::retype(ra-
> >genReg(insn.dst(vec_size + 1)), GEN_TYPE_UD);
> - p->push();
> - // Copy r0 into the header first
> - p->curr.execWidth = 8;
> - p->curr.predicate = GEN_PREDICATE_NONE;
> - p->curr.noMask = 1;
> - p->MOV(header, GenRegister::ud8grf(0,0));
> -
> - // First half
> - // Update the header with the coord
> - p->curr.execWidth = 1;
> - p->MOV(offsetx, coordx);
> - p->MOV(offsety, coordy);
> - // Update block width and height
> - p->MOV(blocksizereg, GenRegister::immud(blocksize));
> - // Now read the data
> - p->curr.execWidth = 8;
> - p->MBREAD(tmp, header, insn.getbti(), vec_size);
> - for (uint32_t i = 0; i < vec_size; i++)
> - p->MOV(GenRegister::retype(ra->genReg(insn.dst(i +
> 1)),GEN_TYPE_UD), GenRegister::offset(tmp, i));
> -
> - if (typesize == 4)
> - {
> - // Second half
> - // Update the header with the coord
> - p->curr.execWidth = 1;
> - p->ADD(offsetx, offsetx, GenRegister::immud(32));
> -
> - // Now read the data
> - p->curr.execWidth = 8;
> - p->MBREAD(tmp, header, insn.getbti(), vec_size);
> -
> - // Move the reg to fit vector rule.
> - for (uint32_t i = 0; i < vec_size; i++)
> - p->MOV(GenRegister::offset(ra->genReg(insn.dst(i + 1)), 1),
> - GenRegister::offset(tmp, i));
> - }
> - p->pop();
> - } else NOT_IMPLEMENTED;
> + const GenRegister dst = ra->genReg(insn.dst(0));
> + const GenRegister header = ra->genReg(insn.src(0));
> + const size_t response_size = insn.extra.elem;
> + p->MBREAD(dst, header, insn.getbti(), response_size);
> }
>
> void GenContext::emitMBWriteInstruction(const SelectionInstruction
> &insn) {
> - const GenRegister coordx = GenRegister::toUniform(ra-
> >genReg(insn.src(0)), GEN_TYPE_D);
> - const GenRegister coordy = GenRegister::toUniform(ra-
> >genReg(insn.src(1)), GEN_TYPE_D);
> - const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)),
> GEN_TYPE_UD);
> - const GenRegister tmp = GenRegister::offset(header, 1);
> - GenRegister offsetx, offsety, blocksizereg;
> - size_t vec_size = insn.extra.elem;
> - uint32_t type = ra->genReg(insn.src(2)).type;
> - uint32_t typesize = typeSize(type);
> - uint32_t block_width = typesize * simdWidth;
> - uint32_t blocksize = (block_width - 1) % 32 | (vec_size-1) << 16;
> -
> - offsetx = GenRegister::offset(header, 0, 0*4);
> - offsety = GenRegister::offset(header, 0, 1*4);
> - blocksizereg = GenRegister::offset(header, 0, 2*4);
> -
> - if (simdWidth == 8)
> - {
> - p->push();
> - // Copy r0 into the header first
> - p->curr.execWidth = 8;
> - p->curr.predicate = GEN_PREDICATE_NONE;
> - p->curr.noMask = 1;
> - p->MOV(header, GenRegister::ud8grf(0,0));
> -
> - // Update the header with the coord
> - p->curr.execWidth = 1;
> - p->MOV(offsetx, coordx);
> - p->MOV(offsety, coordy);
> - // Update block width and height
> - p->MOV(blocksizereg, GenRegister::immud(blocksize));
> - p->curr.execWidth = 8;
> - // Mov what we need into msgs
> - for(uint32_t i = 0; i < vec_size; i++)
> - p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, i *
> block_width), type),
> - ra->genReg(insn.src(2 + i)));
> - // ushort in simd8 will have half reg, but reponse lenght is still 1
> - uint32_t rsize = vec_size * typesize / 4;
> - rsize = rsize ? rsize : 1;
> - // Now read the data
> - p->MBWRITE(header, insn.getbti(), rsize);
> - p->pop();
> -
> - }
> - else
> - {
> - p->push();
> - // Copy r0 into the header first
> - p->curr.execWidth = 8;
> - p->curr.predicate = GEN_PREDICATE_NONE;
> - p->curr.noMask = 1;
> - p->MOV(header, GenRegister::ud8grf(0,0));
> -
> - // First half
> - // Update the header with the coord
> - p->curr.execWidth = 1;
> - p->MOV(offsetx, coordx);
> - p->MOV(offsety, coordy);
> - // Update block width and height
> - p->MOV(blocksizereg, GenRegister::immud(blocksize));
> - // Now read the data
> - p->curr.execWidth = 8;
> - // Mov what we need into msgs
> - for(uint32_t i = 0; i < vec_size; i++)
> - p->MOV(GenRegister::offset(tmp, i), GenRegister::retype(ra-
> >genReg(insn.src(2 + i)), GEN_TYPE_UD));
> - p->MBWRITE(header, insn.getbti(), vec_size);
> -
> - if (typesize == 4)
> - {
> - // Second half
> - // Update the header with the coord
> - p->curr.execWidth = 1;
> - p->ADD(offsetx, offsetx, GenRegister::immud(32));
> -
> - p->curr.execWidth = 8;
> - // Mov what we need into msgs
> - for(uint32_t i = 0; i < vec_size; i++)
> - p->MOV(GenRegister::offset(header, 1 + i), GenRegister::Qn(ra-
> >genReg(insn.src(2 + i)), 1));
> - // Now write the data
> - p->MBWRITE(header, insn.getbti(), vec_size);
> - }
> -
> - p->pop();
> - }
> + const GenRegister header = ra->genReg(insn.dst(0));
> + const size_t data_size = insn.extra.elem;
> + p->MBWRITE(header, insn.getbti(), data_size);
> }
>
> BVAR(OCL_OUTPUT_REG_ALLOC, false);
> diff --git a/backend/src/backend/gen_encoder.cpp
> b/backend/src/backend/gen_encoder.cpp
> index 49d93e8..1bca668 100644
> --- a/backend/src/backend/gen_encoder.cpp
> +++ b/backend/src/backend/gen_encoder.cpp
> @@ -257,32 +257,47 @@ namespace gbe
> NOT_SUPPORTED;
> }
>
> - static void setOBlockRW(GenEncoder *p,
> - GenNativeInstruction *insn,
> - uint32_t bti,
> - uint32_t size,
> - uint32_t msg_type,
> - uint32_t msg_length,
> - uint32_t response_length)
> + void GenEncoder::setOBlockRW(GenNativeInstruction *insn,
> + uint32_t bti,
> + uint32_t block_size,
> + uint32_t msg_type,
> + uint32_t msg_length,
> + uint32_t response_length)
> {
> const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA;
> - p->setMessageDescriptor(insn, sfid, msg_length, response_length);
> - assert(size == 0 || size == 1 || size == 2 || size == 4 || size == 8);
> + setMessageDescriptor(insn, sfid, msg_length, response_length);
> insn->bits3.gen7_oblock_rw.msg_type = msg_type;
> insn->bits3.gen7_oblock_rw.bti = bti;
> - insn->bits3.gen7_oblock_rw.block_size = size <= 2 ? size : (size == 4 ? 3 :
> 4);
> + insn->bits3.gen7_oblock_rw.block_size = block_size;
> insn->bits3.gen7_oblock_rw.header_present = 1;
> }
>
> - static void setMBlockRW(GenEncoder *p,
> - GenNativeInstruction *insn,
> - uint32_t bti,
> - uint32_t msg_type,
> - uint32_t msg_length,
> - uint32_t response_length)
> + uint32_t GenEncoder::getOBlockSize(uint32_t oword_size, bool low_half)
> + {
> + /* 000: 1 OWord, read into or written from the low 128 bits of the
> destination register.
> + * 001: 1 OWord, read into or written from the high 128 bits of the
> destination register.
> + * 010: 2 OWords
> + * 011: 4 OWords
> + * 100: 8 OWords */
> + switch(oword_size)
> + {
> + case 1: return low_half ? 0 : 1;
> + case 2: return 2;
> + case 4: return 3;
> + case 8: return 4;
> + default: NOT_SUPPORTED;
> + }
> + return 0;
> + }
> +
> + void GenEncoder::setMBlockRW(GenNativeInstruction *insn,
> + uint32_t bti,
> + uint32_t msg_type,
> + uint32_t msg_length,
> + uint32_t response_length)
> {
> const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA;
> - p->setMessageDescriptor(insn, sfid, msg_length, response_length);
> + setMessageDescriptor(insn, sfid, msg_length, response_length);
> insn->bits3.gen7_mblock_rw.msg_type = msg_type;
> insn->bits3.gen7_mblock_rw.bti = bti;
> insn->bits3.gen7_mblock_rw.header_present = 1;
> @@ -1312,80 +1327,72 @@ namespace gbe
> setScratchMessage(this, insn, offset, block_size, channel_mode,
> GEN_SCRATCH_READ, 1, dst_num);
> }
>
> - void GenEncoder::OBREAD(GenRegister dst, GenRegister header, uint32_t
> bti, uint32_t size) {
> + void GenEncoder::OBREAD(GenRegister dst, GenRegister header, uint32_t
> bti, uint32_t ow_size) {
> GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> const uint32_t msg_length = 1;
> - uint32_t rsize = size / 2;
> - uint32_t msgsize = size;
> - // When size is 1 OWord, which means half a reg, we need to know which
> half to use
> - if (size == 1) {
> - if (dst.subnr == 0)
> - msgsize = 0;
> - else
> - msgsize = 1;
> - }
> - rsize = rsize == 0 ? 1 : rsize;
> - const uint32_t response_length = rsize; // Size is in regs
> + uint32_t sizeinreg = ow_size / 2;
> + // half reg should also have size 1
> + sizeinreg = sizeinreg == 0 ? 1 : sizeinreg;
> + const uint32_t block_size = getOBlockSize(ow_size, dst.subnr == 0);
> + const uint32_t response_length = sizeinreg; // Size is in reg
> +
> this->setHeader(insn);
> this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
> this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
> this->setSrc1(insn, GenRegister::immud(0));
> - setOBlockRW(this,
> - insn,
> + setOBlockRW(insn,
> bti,
> - msgsize,
> + block_size,
> GEN7_UNALIGNED_OBLOCK_READ,
> msg_length,
> response_length);
> }
>
> - void GenEncoder::OBWRITE(GenRegister header, uint32_t bti, uint32_t size)
> {
> + void GenEncoder::OBWRITE(GenRegister header, uint32_t bti, uint32_t
> ow_size) {
> GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> - uint32_t rsize = size / 2;
> - rsize = rsize == 0 ? 1 : rsize;
> - const uint32_t msg_length = 1 + rsize; // Size is in owords
> + uint32_t sizeinreg = ow_size / 2;
> + // half reg should also have size 1
> + sizeinreg = sizeinreg == 0 ? 1 : sizeinreg;
> + const uint32_t msg_length = 1 + sizeinreg; // Size is in reg and header
> const uint32_t response_length = 0;
> - uint32_t msgsize = size;
> - msgsize = msgsize == 1 ? 0 : msgsize;
> + const uint32_t block_size = getOBlockSize(ow_size);
> +
> this->setHeader(insn);
> this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
> this->setSrc1(insn, GenRegister::immud(0));
> this->setDst(insn, GenRegister::retype(GenRegister::null(),
> GEN_TYPE_UW));
> - setOBlockRW(this,
> - insn,
> + setOBlockRW(insn,
> bti,
> - msgsize,
> + block_size,
> GEN7_OBLOCK_WRITE,
> msg_length,
> response_length);
> }
>
> - void GenEncoder::MBREAD(GenRegister dst, GenRegister header, uint32_t
> bti, uint32_t size) {
> + void GenEncoder::MBREAD(GenRegister dst, GenRegister header,
> uint32_t bti, uint32_t response_size) {
> GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> const uint32_t msg_length = 1;
> - const uint32_t response_length = size; // Size of registers
> + const uint32_t response_length = response_size; // Size of registers
> this->setHeader(insn);
> this->setDst(insn, GenRegister::ud8grf(dst.nr, 0));
> this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
> this->setSrc1(insn, GenRegister::immud(0));
> - setMBlockRW(this,
> - insn,
> + setMBlockRW(insn,
> bti,
> GEN75_P1_MEDIA_BREAD,
> msg_length,
> response_length);
> }
>
> - void GenEncoder::MBWRITE(GenRegister header, uint32_t bti, uint32_t
> size) {
> + void GenEncoder::MBWRITE(GenRegister header, uint32_t bti, uint32_t
> data_size) {
> GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> - const uint32_t msg_length = 1 + size;
> + const uint32_t msg_length = 1 + data_size;
> const uint32_t response_length = 0; // Size of registers
> this->setHeader(insn);
> this->setDst(insn, GenRegister::retype(GenRegister::null(),
> GEN_TYPE_UW));
> this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
> this->setSrc1(insn, GenRegister::immud(0));
> - setMBlockRW(this,
> - insn,
> + setMBlockRW(insn,
> bti,
> GEN75_P1_MEDIA_TYPED_BWRITE,
> msg_length,
> diff --git a/backend/src/backend/gen_encoder.hpp
> b/backend/src/backend/gen_encoder.hpp
> index e5eb2e2..46ec53b 100644
> --- a/backend/src/backend/gen_encoder.hpp
> +++ b/backend/src/backend/gen_encoder.hpp
> @@ -286,18 +286,24 @@ namespace gbe
> virtual bool canHandleLong(uint32_t opcode, GenRegister dst,
> GenRegister src0,
> GenRegister src1 = GenRegister::null());
> virtual void handleDouble(GenEncoder *p, uint32_t opcode, GenRegister
> dst, GenRegister src0, GenRegister src1 = GenRegister::null());
> +
> + /*! OBlock helper function */
> + uint32_t getOBlockSize(uint32_t oword_size, bool low_half = true);
> + void setMBlockRW(GenNativeInstruction *insn, uint32_t bti, uint32_t
> msg_type, uint32_t msg_length, uint32_t response_length);
> + void setOBlockRW(GenNativeInstruction *insn, uint32_t bti, uint32_t
> block_size, uint32_t msg_type, uint32_t msg_length, uint32_t
> response_lengtha);
> +
> /*! OBlock read */
> - void OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t
> elemSize);
> + void OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t
> ow_size);
> /*! OBlock write */
> - void OBWRITE(GenRegister header, uint32_t bti, uint32_t elemSize);
> + void OBWRITE(GenRegister header, uint32_t bti, uint32_t ow_size);
> /*! MBlock read */
> - virtual void MBREAD(GenRegister dst, GenRegister header, uint32_t bti,
> uint32_t elemSize);
> + virtual void MBREAD(GenRegister dst, GenRegister header, uint32_t bti,
> uint32_t response_size);
> /*! MBlock write */
> - virtual void MBWRITE(GenRegister header, uint32_t bti, uint32_t
> elemSize);
> + virtual void MBWRITE(GenRegister header, uint32_t bti, uint32_t
> data_size);
> /*! A64 OBlock read */
> - virtual void OBREADA64(GenRegister dst, GenRegister header, uint32_t
> bti, uint32_t elemSize);
> + virtual void OBREADA64(GenRegister dst, GenRegister header, uint32_t
> bti, uint32_t ow_size);
> /*! A64 OBlock write */
> - virtual void OBWRITEA64(GenRegister header, uint32_t bti, uint32_t
> elemSize);
> + virtual void OBWRITEA64(GenRegister header, uint32_t bti, uint32_t
> ow_size);
>
> GBE_CLASS(GenEncoder); //!< Use custom allocators
> virtual void alu3(uint32_t opcode, GenRegister dst,
> diff --git a/backend/src/backend/gen_insn_selection.cpp
> b/backend/src/backend/gen_insn_selection.cpp
> index 1cd6137..223c384 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -759,13 +759,13 @@ namespace gbe
> void SUBGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src,
> GenRegister tmpData1, GenRegister tmpData2);
> /*! Oblock read */
> - void OBREAD(GenRegister* dsts, uint32_t vec_size, GenRegister addr,
> GenRegister header, uint32_t bti, GenRegister* tmp, uint32_t tmp_size);
> + void OBREAD(GenRegister* dsts, uint32_t tmp_size, GenRegister header,
> uint32_t bti, uint32_t ow_size);
> /*! Oblock write */
> - void OBWRITE(GenRegister addr, GenRegister* values, uint32_t vec_size,
> GenRegister header, uint32_t bti, GenRegister* tmp, uint32_t tmp_size);
> + void OBWRITE(GenRegister header, GenRegister* values, uint32_t
> tmp_size, uint32_t bti, uint32_t ow_size);
> /*! Media block read */
> - void MBREAD(GenRegister* dsts, GenRegister coordx, GenRegister
> coordy, GenRegister header, GenRegister* tmp, uint32_t bti, uint32_t
> vec_size);
> + void MBREAD(GenRegister* dsts, uint32_t tmp_size, GenRegister header,
> uint32_t bti, uint32_t response_size);
> /*! Media block write */
> - void MBWRITE(GenRegister coordx, GenRegister coordy, GenRegister*
> values, GenRegister header, GenRegister* tmp, uint32_t bti, uint32_t
> vec_size);
> + void MBWRITE(GenRegister header, GenRegister* values, uint32_t
> tmp_size, uint32_t bti, uint32_t data_size);
>
> /* common functions for both binary instruction and sel_cmp and
> compare instruction.
> It will handle the IMM or normal register assignment, and will try to avoid
> LOADI
> @@ -2267,118 +2267,84 @@ namespace gbe
> }
> void Selection::Opaque::OBREAD(GenRegister* dsts,
> uint32_t vec_size,
> - GenRegister addr,
> GenRegister header,
> uint32_t bti,
> - GenRegister* tmp,
> - uint32_t tmp_size) {
> - SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, 1 +
> vec_size + tmp_size, 1);
> + uint32_t ow_size) {
> + SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, vec_size,
> 1);
> SelectionVector *vector = this->appendVector();
> - insn->dst(0) = header;
> + insn->src(0) = header;
> for (uint32_t i = 0; i < vec_size; ++i)
> - insn->dst(1 + i) = dsts[i];
> - for (uint32_t i = 0; i < tmp_size; ++i)
> - insn->dst(1 + i + vec_size) = tmp[i];
> - insn->src(0) = addr;
> + insn->dst(i) = dsts[i];
> insn->setbti(bti);
> - insn->extra.elem = vec_size; // number of vector size
> + insn->extra.elem = ow_size; // number of OWord size
>
> // tmp regs for OWORD read dst
> - vector->regNum = tmp_size;
> - vector->reg = &insn->dst(1 + vec_size);
> - vector->offsetID = 1 + vec_size;
> + vector->regNum = vec_size;
> + vector->reg = &insn->dst(0);
> + vector->offsetID = 0;
> vector->isSrc = 0;
> }
>
> - void Selection::Opaque::OBWRITE(GenRegister addr,
> + void Selection::Opaque::OBWRITE(GenRegister header,
> GenRegister* values,
> uint32_t vec_size,
> - GenRegister header,
> uint32_t bti,
> - GenRegister* tmp,
> - uint32_t tmp_size) {
> - SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE,
> tmp_size + 1, vec_size + 1);
> + uint32_t ow_size) {
> + SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE, 0,
> vec_size + 1);
> SelectionVector *vector = this->appendVector();
> - insn->src(0) = addr;
> + insn->src(0) = header;
> for (uint32_t i = 0; i < vec_size; ++i)
> insn->src(i + 1) = values[i];
> - insn->dst(0) = header;
> - for (uint32_t i = 0; i < tmp_size; ++i)
> - insn->dst(i + 1) = tmp[i];
> insn->setbti(bti);
> - insn->extra.elem = vec_size; // number of vector_size
> + insn->extra.elem = ow_size; // number of OWord_size
>
> - // tmp regs for OWORD read dst
> - vector->regNum = tmp_size + 1;
> - vector->reg = &insn->dst(0);
> + // tmp regs for OWORD write header and values
> + vector->regNum = vec_size + 1;
> + vector->reg = &insn->src(0);
> vector->offsetID = 0;
> - vector->isSrc = 0;
> + vector->isSrc = 1;
> +
> }
>
> void Selection::Opaque::MBREAD(GenRegister* dsts,
> - GenRegister coordx,
> - GenRegister coordy,
> + uint32_t tmp_size,
> GenRegister header,
> - GenRegister* tmp,
> uint32_t bti,
> - uint32_t vec_size) {
> -
> - uint32_t simdWidth = curr.execWidth;
> - SelectionInstruction *insn = this->appendInsn(SEL_OP_MBREAD, vec_size
> * simdWidth / 8 + 1, 2);
> - insn->dst(0) = header;
> - for (uint32_t i = 0; i < vec_size; ++i) {
> - insn->dst(i + 1) = dsts[i];
> - if(simdWidth == 16)
> - insn->dst(i + vec_size + 1) = tmp[i];
> - }
> - insn->src(0) = coordx;
> - insn->src(1) = coordy;
> + uint32_t response_size) {
> +
> + SelectionInstruction *insn = this->appendInsn(SEL_OP_MBREAD,
> tmp_size, 1);
> + insn->src(0) = header;
> insn->setbti(bti);
> - insn->extra.elem = vec_size; // vector size
> + insn->extra.elem = response_size; // send response length
>
> - // Only in simd 8 the data is in vector form
> - if(simdWidth == 8) {
> - SelectionVector *vector = this->appendVector();
> - vector->regNum = vec_size;
> - vector->reg = &insn->dst(1);
> - vector->offsetID = 1;
> - vector->isSrc = 0;
> - }
> - if(simdWidth == 16)
> - {
> - SelectionVector *vectortmp = this->appendVector();
> - vectortmp->regNum = vec_size;
> - vectortmp->reg = &insn->dst(vec_size + 1);
> - vectortmp->offsetID = vec_size + 1;
> - vectortmp->isSrc = 0;
> + for (uint32_t i = 0; i < tmp_size; ++i) {
> + insn->dst(i) = dsts[i];
> }
> + SelectionVector *vector = this->appendVector();
> + vector->regNum = tmp_size;
> + vector->reg = &insn->dst(0);
> + vector->offsetID = 0;
> + vector->isSrc = 0;
> }
>
> - void Selection::Opaque::MBWRITE(GenRegister coordx,
> - GenRegister coordy,
> + void Selection::Opaque::MBWRITE(GenRegister header,
> GenRegister* values,
> - GenRegister header,
> - GenRegister* tmp,
> + uint32_t tmp_size,
> uint32_t bti,
> - uint32_t vec_size) {
> - SelectionInstruction *insn = this->appendInsn(SEL_OP_MBWRITE, 1 +
> vec_size, 2 + vec_size);
> + uint32_t data_size) {
> + SelectionInstruction *insn = this->appendInsn(SEL_OP_MBWRITE, 0, 1 +
> tmp_size);
> SelectionVector *vector = this->appendVector();
> - insn->src(0) = coordx;
> - insn->src(1) = coordy;
> - for (uint32_t i = 0; i < vec_size; ++i)
> - insn->src(2 + i) = values[i];
> - insn->dst(0) = header;
> - for (uint32_t i = 0; i < vec_size; ++i)
> - insn->dst(1 + i) = tmp[i];
> - insn->state = this->curr;
> + insn->src(0) = header;
> + for (uint32_t i = 0; i < tmp_size; ++i)
> + insn->src(1 + i) = values[i];
> insn->setbti(bti);
> - insn->extra.elem = vec_size; // vector size
> + insn->extra.elem = data_size; // msg data part size
>
> // We need to put the header and the data together
> - vector->regNum = 1 + vec_size;
> - vector->reg = &insn->dst(0);
> + vector->regNum = 1 + tmp_size;
> + vector->reg = &insn->src(0);
> vector->offsetID = 0;
> - vector->isSrc = 0;
> + vector->isSrc = 1;
> }
>
> // Boiler plate to initialize the selection library at c++ pre-main
> @@ -4715,18 +4681,79 @@ extern bool OCL_DEBUGINFO; // first defined by
> calling BVAR in program.cpp
> const uint32_t simdWidth = sel.ctx.getSimdWidth();
> const Type type = insn.getValueType();
> const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
> - const GenRegister header =
> GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)),
> GEN_TYPE_UD);
> + const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD :
> GEN_TYPE_UW;
> + const RegisterFamily family = getFamily(type);
> + bool isA64 = SI == 255;
> +
> + const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG));
> vector<GenRegister> valuesVec;
> + vector<GenRegister> tmpVec;
> for(uint32_t i = 0; i < vec_size; i++)
> valuesVec.push_back(sel.selReg(insn.getValue(i), type));
> - // check tmp_size for OWORD read need, max 8 OWROD thus 4 regs
> - uint32_t tmp_size = simdWidth * vec_size * typeSize / 32;
> - tmp_size = tmp_size == 0 ? 1 : tmp_size;
> - tmp_size = tmp_size > 4 ? 4 : tmp_size;
> - vector<GenRegister> tmpVec;
> +
> + GenRegister headeraddr;
> + if (isA64)
> + headeraddr = GenRegister::retype(sel.getOffsetReg(header, 0, 0),
> GEN_TYPE_UL);
> + else
> + headeraddr = sel.getOffsetReg(header, 0, 2 * 4);
> + // Make header
> + sel.push();
> + {
> + // Copy r0 into the header first
> + sel.curr.execWidth = 8;
> + sel.curr.predicate = GEN_PREDICATE_NONE;
> + sel.curr.noMask = 1;
> + sel.MOV(header, GenRegister::ud8grf(0, 0));
> +
> + // Update the header with the current address
> + sel.curr.execWidth = 1;
> +
> + // Put zero in the general state base address
> + if (isA64)
> + sel.MOV(headeraddr, GenRegister::toUniform(address,
> GEN_TYPE_UL));
> + else {
> + sel.MOV(headeraddr, GenRegister::toUniform(address,
> GEN_TYPE_UD));
> + sel.MOV(sel.getOffsetReg(header, 0, 5 * 4), GenRegister::immud(0));
> + }
> + }
> + sel.pop();
> +
> + /* For block read we need to unpack the block data into values, and for
> different
> + * simdwidth and vector size with different type size, we may need to
> split the
> + * block read send message.
> + * We can only get a send message with 5 reg length
> + * so for different combination we have different message length and
> tmp vector size
> + * | simd8 | simd16 | simd8 | simd16
> + * r0 |header | | | |
> + * r1 |data | w0,w1 | w0 | dw0 | dw0
> + * r2 |data | w2,w3 | w1 | dw1 | dw0
> + * r3 |data | ...... | ...... | ...... | dw1
> + * r4 |data | ....... | ...... | ...... | dw1
> + */
> +
> + uint32_t totalSize = simdWidth * typeSize * vec_size;
> + uint32_t valueSize = simdWidth * typeSize;
> + uint32_t tmp_size = totalSize > 128 ? (128 / valueSize) : vec_size;
> + uint32_t msg_num = vec_size / tmp_size;
> + uint32_t ow_size = msg_num > 1 ? 8 : (totalSize / 16);
> +
> for(uint32_t i = 0; i < tmp_size; i++)
> -
> tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY
> _DWORD)), GEN_TYPE_UD));
> - sel.OBREAD(&valuesVec[0], vec_size, address, header, SI, &tmpVec[0],
> tmp_size);
> +
> tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)),
> genType));
> + for (uint32_t i = 0; i < msg_num; i++) {
> + if (i > 0) {
> + sel.push();
> + {
> + // Update the address in header
> + sel.curr.execWidth = 1;
> + sel.ADD(headeraddr, headeraddr, GenRegister::immud(128));
> + }
> + sel.pop();
> + }
> + sel.OBREAD(&tmpVec[0], tmp_size, header, SI, ow_size);
> + for (uint32_t j = 0; j < tmp_size; j++)
> + sel.MOV(valuesVec[j + i * tmp_size], tmpVec[j]);
> + }
> +
> }
>
> // check whether all binded table index point to constant memory
> @@ -5161,18 +5188,87 @@ extern bool OCL_DEBUGINFO; // first defined by
> calling BVAR in program.cpp
> const uint32_t simdWidth = sel.ctx.getSimdWidth();
> const Type type = insn.getValueType();
> const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
> - const GenRegister header =
> GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)),
> GEN_TYPE_UD);
> + const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD :
> GEN_TYPE_UW;
> + const RegisterFamily family = getFamily(type);
> + bool isA64 = SI == 255;
> +
> + const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG));
> vector<GenRegister> valuesVec;
> + vector<GenRegister> tmpVec;
> for(uint32_t i = 0; i < vec_size; i++)
> valuesVec.push_back(sel.selReg(insn.getValue(i), type));
> - // check tmp_size for OWORD write need, max 8 OWROD thus 4 regs
> - uint32_t tmp_size = simdWidth * vec_size * typeSize / 32;
> - tmp_size = tmp_size == 0 ? 1 : tmp_size;
> - tmp_size = tmp_size > 4 ? 4 : tmp_size;
> - vector<GenRegister> tmpVec;
> +
> + GenRegister headeraddr;
> + if (isA64)
> + headeraddr = GenRegister::retype(sel.getOffsetReg(header, 0, 0),
> GEN_TYPE_UL);
> + else
> + headeraddr = sel.getOffsetReg(header, 0, 2 * 4);
> + // Make header
> + sel.push();
> + {
> + // Copy r0 into the header first
> + sel.curr.execWidth = 8;
> + sel.curr.predicate = GEN_PREDICATE_NONE;
> + sel.curr.noMask = 1;
> + sel.MOV(header, GenRegister::ud8grf(0, 0));
> +
> + // Update the header with the current address
> + sel.curr.execWidth = 1;
> +
> + // Put zero in the general state base address
> + if (isA64)
> + sel.MOV(headeraddr, GenRegister::toUniform(address,
> GEN_TYPE_UL));
> + else {
> + sel.SHR(headeraddr, GenRegister::toUniform(address,
> GEN_TYPE_UD), GenRegister::immud(4));
> + sel.MOV(sel.getOffsetReg(header, 0, 5 * 4), GenRegister::immud(0));
> + }
> + }
> + sel.pop();
> +
> + /* For block write we need to pack the block data into the tmp, and for
> different
> + * simdwidth and vector size with different type size, we may need to
> split the
> + * block write send message.
> + * We can only get a send message with 5 reg length
> + * so for different combination we have different message length and
> tmp vector size
> + * | simd8 | simd16 | simd8 | simd16
> + * r0 |header | | | |
> + * r1 |data | w0,w1 | w0 | dw0 | dw0
> + * r2 |data | w2,w3 | w1 | dw1 | dw0
> + * r3 |data | ...... | ...... | ...... | dw1
> + * r4 |data | ....... | ...... | ...... | dw1
> + */
> +
> + uint32_t totalSize = simdWidth * typeSize * vec_size;
> + uint32_t valueSize = simdWidth * typeSize;
> + uint32_t tmp_size = totalSize > 128 ? (128 / valueSize) : vec_size;
> + uint32_t msg_num = vec_size / tmp_size;
> + uint32_t ow_size = msg_num > 1 ? 8 : (totalSize / 16);
> +
> for(uint32_t i = 0; i < tmp_size; i++)
> -
> tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY
> _DWORD)), GEN_TYPE_UD));
> - sel.OBWRITE(address, &valuesVec[0], vec_size, header, SI, &tmpVec[0],
> tmp_size);
> +
> tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)),
> genType));
> + for (uint32_t i = 0; i < msg_num; i++) {
> + for (uint32_t j = 0; j < tmp_size; j++)
> + sel.MOV(tmpVec[j], valuesVec[j + i * tmp_size]);
> + if (i > 0) {
> + sel.push();
> + {
> + // Update the address in header
> + sel.curr.execWidth = 1;
> + sel.ADD(headeraddr, headeraddr, GenRegister::immud(8));
> + }
> + sel.pop();
> + }
> + sel.push();
> + // In simd8 mode, when data reg has more than 1 reg, execWidth 8
> will get wrong
> + // result, so set the execWidth to 16.
> + sel.curr.execWidth = 16;
> + sel.curr.predicate = GEN_PREDICATE_NONE;
> + sel.curr.noMask = 1;
> + sel.OBWRITE(header, &tmpVec[0], tmp_size, SI, ow_size);
> + sel.pop();
> + }
> +
> +
> }
>
> virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
> @@ -7662,20 +7758,77 @@ extern bool OCL_DEBUGINFO; // first defined by
> calling BVAR in program.cpp
> uint32_t vec_size = insn.getVectorSize();
> uint32_t simdWidth = sel.curr.execWidth;
> const Type type = insn.getType();
> + const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
> + uint32_t response_size = simdWidth * vec_size * typeSize / 32;
> + // ushort in simd8 will have half reg thus 0.5 reg size, but response length
> is still 1
> + response_size = response_size ? response_size : 1;
> + uint32_t block_width = typeSize * simdWidth;
> + uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16;
> +
> +
> vector<GenRegister> valuesVec;
> vector<GenRegister> tmpVec;
> for (uint32_t i = 0; i < vec_size; ++i) {
> valuesVec.push_back(sel.selReg(insn.getDst(i), type));
> - if(simdWidth == 16)
> -
> tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY
> _DWORD)), GEN_TYPE_UD));
> + if(simdWidth == 16 && typeSize == 4)
> + tmpVec.push_back(GenRegister::ud8grf(sel.reg(FAMILY_REG)));
> }
> - const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32);
> - const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32);
> - const GenRegister header =
> GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)),
> GEN_TYPE_UD);
> - GenRegister *tmp = NULL;
> - if(simdWidth == 16)
> - tmp = &tmpVec[0];
> - sel.MBREAD(&valuesVec[0], coordx, coordy, header, tmp,
> insn.getImageIndex(), insn.getVectorSize());
> + const GenRegister coordx =
> GenRegister::toUniform(sel.selReg(insn.getSrc(0), TYPE_U32),
> GEN_TYPE_UD);
> + const GenRegister coordy =
> GenRegister::toUniform(sel.selReg(insn.getSrc(1), TYPE_U32),
> GEN_TYPE_UD);
> + const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG));
> + const GenRegister offsetx =
> GenRegister::toUniform(sel.getOffsetReg(header, 0, 0 * 4), GEN_TYPE_UD);
> + const GenRegister offsety =
> GenRegister::toUniform(sel.getOffsetReg(header, 0, 1 * 4), GEN_TYPE_UD);
> + const GenRegister blocksizereg = sel.getOffsetReg(header, 0, 2 * 4);
> +
> + // Make header
> + sel.push();
> + // Copy r0 into the header first
> + sel.curr.execWidth = 8;
> + sel.curr.predicate = GEN_PREDICATE_NONE;
> + sel.curr.noMask = 1;
> + sel.MOV(header, GenRegister::ud8grf(0, 0));
> +
> + // Update the header with the coord
> + sel.curr.execWidth = 1;
> + sel.MOV(offsetx, coordx);
> + sel.MOV(offsety, coordy);
> + // Update block width and height
> + sel.MOV(blocksizereg, GenRegister::immud(blocksize));
> + sel.pop();
> +
> + if (simdWidth * typeSize < 64) {
> + sel.push();
> + sel.curr.execWidth = 8;
> + sel.curr.predicate = GEN_PREDICATE_NONE;
> + sel.curr.noMask = 1;
> + // Now read the data
> + sel.MBREAD(&valuesVec[0], vec_size, header, insn.getImageIndex(),
> response_size);
> + sel.pop();
> + } else if (simdWidth * typeSize == 64) {
> + sel.push();
> + sel.curr.execWidth = 8;
> + sel.curr.predicate = GEN_PREDICATE_NONE;
> + sel.curr.noMask = 1;
> + sel.MBREAD(&tmpVec[0], vec_size ,header, insn.getImageIndex(),
> vec_size);
> + for (uint32_t i = 0; i < vec_size; i++)
> + sel.MOV(valuesVec[i], tmpVec[i]);
> +
> + // Second half
> + // Update the header with the coord
> + sel.curr.execWidth = 1;
> + sel.ADD(offsetx, offsetx, GenRegister::immud(32));
> +
> + // Now read the data
> + sel.curr.execWidth = 8;
> + sel.MBREAD(&tmpVec[0], vec_size, header, insn.getImageIndex(),
> vec_size);
> +
> + // Move the reg to fit vector rule.
> + for (uint32_t i = 0; i < vec_size; i++)
> + sel.MOV(sel.getOffsetReg(valuesVec[i], 0, 32) , tmpVec[i]);
> + sel.pop();
> + } else NOT_IMPLEMENTED;
> +
> +
> return true;
> }
> DECL_CTOR(MediaBlockReadInstruction, 1, 1);
> @@ -7689,17 +7842,84 @@ extern bool OCL_DEBUGINFO; // first defined by
> calling BVAR in program.cpp
> using namespace ir;
> uint32_t vec_size = insn.getVectorSize();
> const Type type = insn.getType();
> - const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32);
> - const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32);
> + uint32_t simdWidth = sel.curr.execWidth;
> + const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD :
> GEN_TYPE_UW;
> + const RegisterFamily family = getFamily(type);
> + const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
> + // ushort in simd8 will have half reg, but data length is still 1
> + uint32_t data_size = simdWidth * vec_size * typeSize / 32;
> + data_size = data_size? data_size : 1;
> + uint32_t block_width = typeSize * simdWidth;
> + uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16;
> +
> +
> vector<GenRegister> valuesVec;
> vector<GenRegister> tmpVec;
> - for(uint32_t i = 0; i < vec_size; i++)
> - {
> - valuesVec.push_back(sel.selReg(insn.getSrc(2 + i), type));
> -
> tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY
> _DWORD)), GEN_TYPE_UD));
> - }
> - const GenRegister header =
> GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)),
> GEN_TYPE_UD);
> - sel.MBWRITE(coordx, coordy, &valuesVec[0], header, &tmpVec[0],
> insn.getImageIndex(), vec_size);
> + for (uint32_t i = 0; i < vec_size; ++i) {
> + valuesVec.push_back(sel.selReg(insn.getSrc(2 + i), type));
> + if(simdWidth == 16 && typeSize == 4)
> + tmpVec.push_back(GenRegister::ud8grf(sel.reg(FAMILY_REG)));
> + else
> +
> tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)),
> genType));
> + }
> + const GenRegister coordx =
> GenRegister::toUniform(sel.selReg(insn.getSrc(0), TYPE_U32),
> GEN_TYPE_UD);
> + const GenRegister coordy =
> GenRegister::toUniform(sel.selReg(insn.getSrc(1), TYPE_U32),
> GEN_TYPE_UD);
> + const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG));
> + const GenRegister offsetx =
> GenRegister::toUniform(sel.getOffsetReg(header, 0, 0*4), GEN_TYPE_UD);
> + const GenRegister offsety =
> GenRegister::toUniform(sel.getOffsetReg(header, 0, 1*4), GEN_TYPE_UD);
> + const GenRegister blocksizereg = sel.getOffsetReg(header, 0, 2*4);
> +
> + // Make header
> + sel.push();
> + // Copy r0 into the header first
> + sel.curr.execWidth = 8;
> + sel.curr.predicate = GEN_PREDICATE_NONE;
> + sel.curr.noMask = 1;
> + sel.MOV(header, GenRegister::ud8grf(0, 0));
> +
> + // Update the header with the coord
> + sel.curr.execWidth = 1;
> + sel.MOV(offsetx, coordx);
> + sel.MOV(offsety, coordy);
> + // Update block width and height
> + sel.MOV(blocksizereg, GenRegister::immud(blocksize));
> + sel.pop();
> +
> + if (simdWidth * typeSize < 64) {
> + for (uint32_t i = 0; i < vec_size; ++i) {
> + sel.MOV(tmpVec[i], valuesVec[i]);
> + }
> + sel.push();
> + sel.curr.execWidth = 8;
> + sel.curr.predicate = GEN_PREDICATE_NONE;
> + sel.curr.noMask = 1;
> + // Now write the data
> + sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(),
> data_size);
> + sel.pop();
> + } else if (simdWidth * typeSize == 64) {
> + sel.push();
> + sel.curr.execWidth = 8;
> + sel.curr.predicate = GEN_PREDICATE_NONE;
> + sel.curr.noMask = 1;
> + for (uint32_t i = 0; i < vec_size; i++)
> + sel.MOV(tmpVec[i], valuesVec[i]);
> + sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(),
> vec_size);
> +
> + // Second half
> + // Update the header with the coord
> + sel.curr.execWidth = 1;
> + sel.ADD(offsetx, offsetx, GenRegister::immud(32));
> +
> + sel.curr.execWidth = 8;
> + for (uint32_t i = 0; i < vec_size; i++)
> + sel.MOV(tmpVec[i], sel.getOffsetReg(valuesVec[i], 0, 32));
> + // Now write the data
> + sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(),
> vec_size);
> +
> + // Move the reg to fit vector rule.
> + sel.pop();
> + } else NOT_IMPLEMENTED;
> +
> return true;
> }
> DECL_CTOR(MediaBlockWriteInstruction, 1, 1);
> --
> 2.7.4
>
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/beignet
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/beignet
More information about the Beignet
mailing list