[Beignet] [PATCH V2] Backend: Refine block read/write instruction selection
Xiuli Pan
xiuli.pan at intel.com
Thu Dec 15 05:26:58 UTC 2016
From: Pan Xiuli <xiuli.pan at intel.com>
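Move the block read/write pack/unpack code out of final code emission
(gen_context.cpp) and into instruction selection (gen_insn_selection.cpp),
so that the resulting instructions can be optimized.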
V2: Use ud8grf instead of f8grf to save a retype.
Signed-off-by: Pan Xiuli <xiuli.pan at intel.com>
---
backend/src/backend/gen_context.cpp | 459 ++---------------------------
backend/src/backend/gen_insn_selection.cpp | 439 ++++++++++++++++++++-------
2 files changed, 346 insertions(+), 552 deletions(-)
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 8288fa5..7ab5770 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -3551,458 +3551,39 @@ namespace gbe
}
void GenContext::emitOBReadInstruction(const SelectionInstruction &insn) {
- const GenRegister dst= ra->genReg(insn.dst(1));
- const GenRegister addrreg = ra->genReg(insn.src(0));
- uint32_t type = dst.type;
- uint32_t typesize = typeSize(type);
- const uint32_t vec_size = insn.extra.elem;
- const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1 + vec_size)), type);
- const uint32_t simdWidth = p->curr.execWidth;
- const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
- const GenRegister addr = GenRegister::toUniform(addrreg, addrreg.type);
- GenRegister headeraddr;
- bool isA64 = insn.getbti() == 255;
+ const GenRegister header = ra->genReg(insn.src(0));
+ const GenRegister tmp = ra->genReg(insn.dst(0));
+ const uint32_t bti = insn.getbti();
+ const uint32_t ow_size = insn.extra.elem;
+ bool isA64 = bti == 255;
if (isA64)
- headeraddr = GenRegister::retype(GenRegister::offset(header, 0, 0), GEN_TYPE_UL);
+ p->OBREADA64(tmp, header, bti, ow_size);
else
- headeraddr = GenRegister::offset(header, 0, 2*4);
-
- // Make header
- p->push();
- {
- // Copy r0 into the header first
- p->curr.execWidth = 8;
- p->curr.predicate = GEN_PREDICATE_NONE;
- p->curr.noMask = 1;
- p->MOV(header, GenRegister::ud8grf(0, 0));
-
- // Update the header with the current address
- p->curr.execWidth = 1;
- p->MOV(headeraddr, addr);
-
- // Put zero in the general state base address
- if (!isA64)
- p->MOV(GenRegister::offset(header, 0, 5 * 4), GenRegister::immud(0));
-
- }
- p->pop();
- // Now read the data, oword block read can only work with simd16 and no mask
- if (vec_size == 1) {
- p->push();
- {
- p->curr.execWidth = 16;
- p->curr.noMask = 1;
- if (isA64) {
- //p->curr.execWidth = 8;
- p->OBREADA64(dst, header, insn.getbti(), simdWidth * typesize / 16);
- }
- else
- p->OBREAD(dst, header, insn.getbti(), simdWidth * typesize / 16);
- }
- p->pop();
- } else if (vec_size == 2) {
- p->push();
- {
- p->curr.execWidth = 16;
- p->curr.noMask = 1;
- if (isA64)
- p->OBREADA64(tmp, header, insn.getbti(), simdWidth * typesize / 8);
- else
- p->OBREAD(tmp, header, insn.getbti(), simdWidth * typesize / 8);
- }
- p->pop();
- p->MOV(ra->genReg(insn.dst(1)), GenRegister::offset(tmp, 0));
- p->MOV(ra->genReg(insn.dst(2)), GenRegister::offset(tmp, 0, simdWidth * typesize ));
- } else if (vec_size == 4) {
- if (simdWidth == 8) {
- p->push();
- {
- p->curr.execWidth = 16;
- p->curr.noMask = 1;
- if (isA64)
- p->OBREADA64(tmp, header, insn.getbti(), 2 * typesize);
- else
- p->OBREAD(tmp, header, insn.getbti(), 2 * typesize);
- }
- p->pop();
- for (uint32_t j = 0; j < 4; j++)
- p->MOV(ra->genReg(insn.dst(1 + j)), GenRegister::offset(tmp, 0, j * simdWidth * typesize ));
- } else {
- for (uint32_t i = 0; i < typesize / 2; i++) {
- if (i > 0) {
- p->push();
- {
- // Update the address in header
- p->curr.execWidth = 1;
- p->ADD(headeraddr, headeraddr, GenRegister::immud(128));
- }
- p->pop();
- }
- if (isA64)
- p->OBREADA64(tmp, header, insn.getbti(), 8);
- else
- p->OBREAD(tmp, header, insn.getbti(), 8);
- for (uint32_t j = 0; j < 8 / typesize ; j++)
- p->MOV(ra->genReg(insn.dst(1 + j + i * 2)), GenRegister::offset(tmp, 0 ,j * simdWidth * typesize ));
- }
- }
- } else if (vec_size == 8) {
- if (simdWidth == 8) {
- for (uint32_t i = 0; i < typesize / 2; i++) {
- if (i > 0) {
- p->push();
- {
- // Update the address in header
- p->curr.execWidth = 1;
- p->ADD(headeraddr, headeraddr, GenRegister::immud(128));
- }
- p->pop();
- }
- p->push();
- {
- p->curr.execWidth = 16;
- p->curr.noMask = 1;
- if (isA64)
- p->OBREADA64(tmp, header, insn.getbti(), 8);
- else
- p->OBREAD(tmp, header, insn.getbti(), 8);
- }
- p->pop();
- for (uint32_t j = 0; j < 16 / typesize; j++)
- p->MOV(ra->genReg(insn.dst(1 + j + i * 4)), GenRegister::offset(tmp, 0, j * simdWidth * typesize ));
- }
- } else {
- for (uint32_t i = 0; i < typesize ; i++) {
- if (i > 0) {
- p->push();
- {
- // Update the address in header
- p->curr.execWidth = 1;
- p->ADD(headeraddr, headeraddr, GenRegister::immud(128));
- }
- p->pop();
- }
- if (isA64)
- p->OBREADA64(tmp, header, insn.getbti(), 8);
- else
- p->OBREAD(tmp, header, insn.getbti(), 8);
- for (uint32_t j = 0; j < 8 / typesize; j++)
- p->MOV(ra->genReg(insn.dst(1 + j + i * 8 / typesize)), GenRegister::offset(tmp, 0 ,j * simdWidth * typesize ));
- }
- }
- } else NOT_SUPPORTED;
+ p->OBREAD(tmp, header, bti, ow_size);
}
void GenContext::emitOBWriteInstruction(const SelectionInstruction &insn) {
- const GenRegister addrreg = ra->genReg(insn.src(0));
- const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
- uint32_t type = ra->genReg(insn.src(1)).type;
- uint32_t typesize = typeSize(type);
- const uint32_t vec_size = insn.extra.elem;
- const GenRegister tmp = GenRegister::offset(header, 1);
- const GenRegister addr = GenRegister::toUniform(addrreg, addrreg.type);
- GenRegister headeraddr;
- bool isA64 = insn.getbti() == 255;
+ const GenRegister header = ra->genReg(insn.src(0));
+ const uint32_t bti = insn.getbti();
+ const uint32_t ow_size = insn.extra.elem;
+ bool isA64 = bti == 255;
if (isA64)
- headeraddr = GenRegister::retype(GenRegister::offset(header, 0, 0), GEN_TYPE_UL);
+ p->OBWRITEA64(header, bti, ow_size);
else
- headeraddr = GenRegister::offset(header, 0, 2*4);
- const uint32_t simdWidth = p->curr.execWidth;
- uint32_t tmp_size = simdWidth * vec_size / 8;
- tmp_size = tmp_size > 4 ? 4 : tmp_size;
- uint32_t offset_size = isA64 ? 128 : 8;
-
- p->push();
- // Copy r0 into the header first
- p->curr.execWidth = 8;
- p->curr.predicate = GEN_PREDICATE_NONE;
- p->curr.noMask = 1;
- p->MOV(header, GenRegister::ud8grf(0,0));
-
- // Update the header with the current address
- p->curr.execWidth = 1;
- if (isA64)
- p->MOV(headeraddr, addr);
- else
- p->SHR(headeraddr, addr, GenRegister::immud(4));
-
- // Put zero in the general state base address
- if (!isA64)
- p->MOV(GenRegister::offset(header, 0, 5*4), GenRegister::immud(0));
-
- p->pop();
- // Now write the data, oword block write can only work with simd16 and no mask
- if (vec_size == 1) {
- p->MOV(GenRegister::retype(tmp, type), ra->genReg(insn.src(1)));
- p->push();
- {
- p->curr.execWidth = 16;
- p->curr.noMask = 1;
- if (isA64)
- p->OBWRITEA64(header, insn.getbti(), simdWidth * typesize / 16);
- else
- p->OBWRITE(header, insn.getbti(), simdWidth * typesize / 16);
- }
- p->pop();
- } else if (vec_size == 2) {
- p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, 0), type), ra->genReg(insn.src(1)));
- p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, simdWidth * typesize), type), ra->genReg(insn.src(2)));
- p->push();
- {
- p->curr.execWidth = 16;
- p->curr.noMask = 1;
- if (isA64)
- p->OBWRITEA64(header, insn.getbti(), simdWidth * typesize / 8);
- else
- p->OBWRITE(header, insn.getbti(), simdWidth * typesize / 8);
- }
- p->pop();
- } else if (vec_size == 4) {
- if (simdWidth == 8) {
- for (uint32_t i = 0; i < 4; i++)
- p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, i * simdWidth * typesize), type), ra->genReg(insn.src(1 + i)));
- p->push();
- {
- p->curr.execWidth = 16;
- p->curr.noMask = 1;
- if (isA64)
- p->OBWRITEA64(header, insn.getbti(), 2 * typesize);
- else
- p->OBWRITE(header, insn.getbti(), 2 * typesize);
- }
- p->pop();
- } else {
- for (uint32_t i = 0; i < typesize / 2; i++) {
- for (uint32_t j = 0; j < 8 / typesize; j++)
- p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j * simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 8 / typesize)));
- if (i > 0) {
- p->push();
- {
- // Update the address in header
- p->curr.execWidth = 1;
- p->ADD(headeraddr, headeraddr, GenRegister::immud(offset_size));
- }
- p->pop();
- }
- if (isA64)
- p->OBWRITEA64(header, insn.getbti(), 8);
- else
- p->OBWRITE(header, insn.getbti(), 8);
- }
- }
- } else if (vec_size == 8) {
- if (simdWidth == 8) {
- for (uint32_t i = 0; i < typesize / 2; i++) {
- for (uint32_t j = 0; j < 16 / typesize; j++)
- p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j * simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 16 / typesize)));
- if (i > 0) {
- p->push();
- {
- // Update the address in header
- p->curr.execWidth = 1;
- p->ADD(headeraddr, headeraddr, GenRegister::immud(offset_size));
- }
- p->pop();
- }
- p->push();
- {
- p->curr.execWidth = 16;
- p->curr.noMask = 1;
- if (isA64)
- p->OBWRITEA64(header, insn.getbti(), 8);
- else
- p->OBWRITE(header, insn.getbti(), 8);
- }
- p->pop();
- }
- } else {
- for (uint32_t i = 0; i < typesize; i++) {
- for (uint32_t j = 0; j < 8 / typesize; j++)
- p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j * simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 8 / typesize)));
- if (i > 0) {
- p->push();
- {
- // Update the address in header
- p->curr.execWidth = 1;
- p->ADD(headeraddr, headeraddr, GenRegister::immud(offset_size));
- }
- p->pop();
- }
- if (isA64)
- p->OBWRITEA64(header, insn.getbti(), 8);
- else
- p->OBWRITE(header, insn.getbti(), 8);
- }
- }
- } else NOT_SUPPORTED;
-
+ p->OBWRITE(header, bti, ow_size);
}
void GenContext::emitMBReadInstruction(const SelectionInstruction &insn) {
- const GenRegister dst = ra->genReg(insn.dst(1));
- const GenRegister coordx = GenRegister::toUniform(ra->genReg(insn.src(0)),GEN_TYPE_D);
- const GenRegister coordy = GenRegister::toUniform(ra->genReg(insn.src(1)),GEN_TYPE_D);
- const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
- const GenRegister offsetx = GenRegister::offset(header, 0, 0*4);
- const GenRegister offsety = GenRegister::offset(header, 0, 1*4);
- const GenRegister blocksizereg = GenRegister::offset(header, 0, 2*4);
- size_t vec_size = insn.extra.elem;
- uint32_t type = dst.type;
- uint32_t typesize = typeSize(type);
- uint32_t block_width = typesize * simdWidth;
- uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16;
-
- if (simdWidth == 8)
- {
- p->push();
- // Copy r0 into the header first
- p->curr.execWidth = 8;
- p->curr.predicate = GEN_PREDICATE_NONE;
- p->curr.noMask = 1;
- p->MOV(header, GenRegister::ud8grf(0,0));
-
- // Update the header with the coord
- p->curr.execWidth = 1;
- p->MOV(offsetx, coordx);
- p->MOV(offsety, coordy);
- // Update block width and height
- p->MOV(blocksizereg, GenRegister::immud(blocksize));
- p->curr.execWidth = 8;
- // ushort in simd8 will have half reg, but response lenght is still 1
- uint32_t rsize = vec_size * typesize / 4;
- rsize = rsize ? rsize : 1;
- // Now read the data
- p->MBREAD(dst, header, insn.getbti(), rsize);
- p->pop();
-
- }
- else if (simdWidth == 16)
- {
- const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(vec_size + 1)), GEN_TYPE_UD);
- p->push();
- // Copy r0 into the header first
- p->curr.execWidth = 8;
- p->curr.predicate = GEN_PREDICATE_NONE;
- p->curr.noMask = 1;
- p->MOV(header, GenRegister::ud8grf(0,0));
-
- // First half
- // Update the header with the coord
- p->curr.execWidth = 1;
- p->MOV(offsetx, coordx);
- p->MOV(offsety, coordy);
- // Update block width and height
- p->MOV(blocksizereg, GenRegister::immud(blocksize));
- // Now read the data
- p->curr.execWidth = 8;
- p->MBREAD(tmp, header, insn.getbti(), vec_size);
- for (uint32_t i = 0; i < vec_size; i++)
- p->MOV(GenRegister::retype(ra->genReg(insn.dst(i + 1)),GEN_TYPE_UD), GenRegister::offset(tmp, i));
-
- if (typesize == 4)
- {
- // Second half
- // Update the header with the coord
- p->curr.execWidth = 1;
- p->ADD(offsetx, offsetx, GenRegister::immud(32));
-
- // Now read the data
- p->curr.execWidth = 8;
- p->MBREAD(tmp, header, insn.getbti(), vec_size);
-
- // Move the reg to fit vector rule.
- for (uint32_t i = 0; i < vec_size; i++)
- p->MOV(GenRegister::offset(ra->genReg(insn.dst(i + 1)), 1),
- GenRegister::offset(tmp, i));
- }
- p->pop();
- } else NOT_IMPLEMENTED;
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ const GenRegister header = ra->genReg(insn.src(0));
+ const size_t rsize = insn.extra.elem;
+ p->MBREAD(dst, header, insn.getbti(), rsize);
}
void GenContext::emitMBWriteInstruction(const SelectionInstruction &insn) {
- const GenRegister coordx = GenRegister::toUniform(ra->genReg(insn.src(0)), GEN_TYPE_D);
- const GenRegister coordy = GenRegister::toUniform(ra->genReg(insn.src(1)), GEN_TYPE_D);
- const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
- const GenRegister tmp = GenRegister::offset(header, 1);
- GenRegister offsetx, offsety, blocksizereg;
- size_t vec_size = insn.extra.elem;
- uint32_t type = ra->genReg(insn.src(2)).type;
- uint32_t typesize = typeSize(type);
- uint32_t block_width = typesize * simdWidth;
- uint32_t blocksize = (block_width - 1) % 32 | (vec_size-1) << 16;
-
- offsetx = GenRegister::offset(header, 0, 0*4);
- offsety = GenRegister::offset(header, 0, 1*4);
- blocksizereg = GenRegister::offset(header, 0, 2*4);
-
- if (simdWidth == 8)
- {
- p->push();
- // Copy r0 into the header first
- p->curr.execWidth = 8;
- p->curr.predicate = GEN_PREDICATE_NONE;
- p->curr.noMask = 1;
- p->MOV(header, GenRegister::ud8grf(0,0));
-
- // Update the header with the coord
- p->curr.execWidth = 1;
- p->MOV(offsetx, coordx);
- p->MOV(offsety, coordy);
- // Update block width and height
- p->MOV(blocksizereg, GenRegister::immud(blocksize));
- p->curr.execWidth = 8;
- // Mov what we need into msgs
- for(uint32_t i = 0; i < vec_size; i++)
- p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, i * block_width), type),
- ra->genReg(insn.src(2 + i)));
- // ushort in simd8 will have half reg, but reponse lenght is still 1
- uint32_t rsize = vec_size * typesize / 4;
- rsize = rsize ? rsize : 1;
- // Now read the data
- p->MBWRITE(header, insn.getbti(), rsize);
- p->pop();
-
- }
- else
- {
- p->push();
- // Copy r0 into the header first
- p->curr.execWidth = 8;
- p->curr.predicate = GEN_PREDICATE_NONE;
- p->curr.noMask = 1;
- p->MOV(header, GenRegister::ud8grf(0,0));
-
- // First half
- // Update the header with the coord
- p->curr.execWidth = 1;
- p->MOV(offsetx, coordx);
- p->MOV(offsety, coordy);
- // Update block width and height
- p->MOV(blocksizereg, GenRegister::immud(blocksize));
- // Now read the data
- p->curr.execWidth = 8;
- // Mov what we need into msgs
- for(uint32_t i = 0; i < vec_size; i++)
- p->MOV(GenRegister::offset(tmp, i), GenRegister::retype(ra->genReg(insn.src(2 + i)), GEN_TYPE_UD));
- p->MBWRITE(header, insn.getbti(), vec_size);
-
- if (typesize == 4)
- {
- // Second half
- // Update the header with the coord
- p->curr.execWidth = 1;
- p->ADD(offsetx, offsetx, GenRegister::immud(32));
-
- p->curr.execWidth = 8;
- // Mov what we need into msgs
- for(uint32_t i = 0; i < vec_size; i++)
- p->MOV(GenRegister::offset(header, 1 + i), GenRegister::Qn(ra->genReg(insn.src(2 + i)), 1));
- // Now write the data
- p->MBWRITE(header, insn.getbti(), vec_size);
- }
-
- p->pop();
- }
+ const GenRegister header = ra->genReg(insn.dst(0));
+ const size_t msgsize = insn.extra.elem;
+ p->MBWRITE(header, insn.getbti(), msgsize);
}
BVAR(OCL_OUTPUT_REG_ALLOC, false);
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 1cd6137..31effd1 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -759,13 +759,13 @@ namespace gbe
void SUBGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src,
GenRegister tmpData1, GenRegister tmpData2);
/*! Oblock read */
- void OBREAD(GenRegister* dsts, uint32_t vec_size, GenRegister addr, GenRegister header, uint32_t bti, GenRegister* tmp, uint32_t tmp_size);
+ void OBREAD(GenRegister* dsts, uint32_t tmp_size, GenRegister header, uint32_t bti, uint32_t ow_size);
/*! Oblock write */
- void OBWRITE(GenRegister addr, GenRegister* values, uint32_t vec_size, GenRegister header, uint32_t bti, GenRegister* tmp, uint32_t tmp_size);
+ void OBWRITE(GenRegister header, GenRegister* values, uint32_t tmp_size, uint32_t bti, uint32_t ow_size);
/*! Media block read */
- void MBREAD(GenRegister* dsts, GenRegister coordx, GenRegister coordy, GenRegister header, GenRegister* tmp, uint32_t bti, uint32_t vec_size);
+ void MBREAD(GenRegister* dsts, uint32_t tmp_size, GenRegister header, uint32_t bti, uint32_t block_size);
/*! Media block write */
- void MBWRITE(GenRegister coordx, GenRegister coordy, GenRegister* values, GenRegister header, GenRegister* tmp, uint32_t bti, uint32_t vec_size);
+ void MBWRITE(GenRegister header, GenRegister* values, uint32_t tmp_size, uint32_t bti, uint32_t block_size);
/* common functions for both binary instruction and sel_cmp and compare instruction.
It will handle the IMM or normal register assignment, and will try to avoid LOADI
@@ -2267,118 +2267,84 @@ namespace gbe
}
void Selection::Opaque::OBREAD(GenRegister* dsts,
uint32_t vec_size,
- GenRegister addr,
GenRegister header,
uint32_t bti,
- GenRegister* tmp,
- uint32_t tmp_size) {
- SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, 1 + vec_size + tmp_size, 1);
+ uint32_t ow_size) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, vec_size, 1);
SelectionVector *vector = this->appendVector();
- insn->dst(0) = header;
+ insn->src(0) = header;
for (uint32_t i = 0; i < vec_size; ++i)
- insn->dst(1 + i) = dsts[i];
- for (uint32_t i = 0; i < tmp_size; ++i)
- insn->dst(1 + i + vec_size) = tmp[i];
- insn->src(0) = addr;
+ insn->dst(i) = dsts[i];
insn->setbti(bti);
- insn->extra.elem = vec_size; // number of vector size
+ insn->extra.elem = ow_size; // number of OWord size
// tmp regs for OWORD read dst
- vector->regNum = tmp_size;
- vector->reg = &insn->dst(1 + vec_size);
- vector->offsetID = 1 + vec_size;
+ vector->regNum = vec_size;
+ vector->reg = &insn->dst(0);
+ vector->offsetID = 0;
vector->isSrc = 0;
}
- void Selection::Opaque::OBWRITE(GenRegister addr,
+ void Selection::Opaque::OBWRITE(GenRegister header,
GenRegister* values,
uint32_t vec_size,
- GenRegister header,
uint32_t bti,
- GenRegister* tmp,
- uint32_t tmp_size) {
- SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE, tmp_size + 1, vec_size + 1);
+ uint32_t ow_size) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE, 0, vec_size + 1);
SelectionVector *vector = this->appendVector();
- insn->src(0) = addr;
+ insn->src(0) = header;
for (uint32_t i = 0; i < vec_size; ++i)
insn->src(i + 1) = values[i];
- insn->dst(0) = header;
- for (uint32_t i = 0; i < tmp_size; ++i)
- insn->dst(i + 1) = tmp[i];
insn->setbti(bti);
- insn->extra.elem = vec_size; // number of vector_size
+ insn->extra.elem = ow_size; // number of OWord_size
- // tmp regs for OWORD read dst
- vector->regNum = tmp_size + 1;
- vector->reg = &insn->dst(0);
+ // tmp regs for OWORD write header and values
+ vector->regNum = vec_size + 1;
+ vector->reg = &insn->src(0);
vector->offsetID = 0;
- vector->isSrc = 0;
+ vector->isSrc = 1;
+
}
void Selection::Opaque::MBREAD(GenRegister* dsts,
- GenRegister coordx,
- GenRegister coordy,
+ uint32_t tmp_size,
GenRegister header,
- GenRegister* tmp,
uint32_t bti,
- uint32_t vec_size) {
-
- uint32_t simdWidth = curr.execWidth;
- SelectionInstruction *insn = this->appendInsn(SEL_OP_MBREAD, vec_size * simdWidth / 8 + 1, 2);
- insn->dst(0) = header;
- for (uint32_t i = 0; i < vec_size; ++i) {
- insn->dst(i + 1) = dsts[i];
- if(simdWidth == 16)
- insn->dst(i + vec_size + 1) = tmp[i];
- }
- insn->src(0) = coordx;
- insn->src(1) = coordy;
+ uint32_t block_size) {
+
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_MBREAD, tmp_size, 1);
+ insn->src(0) = header;
insn->setbti(bti);
- insn->extra.elem = vec_size; // vector size
+ insn->extra.elem = block_size; // vector size
- // Only in simd 8 the data is in vector form
- if(simdWidth == 8) {
- SelectionVector *vector = this->appendVector();
- vector->regNum = vec_size;
- vector->reg = &insn->dst(1);
- vector->offsetID = 1;
- vector->isSrc = 0;
- }
- if(simdWidth == 16)
- {
- SelectionVector *vectortmp = this->appendVector();
- vectortmp->regNum = vec_size;
- vectortmp->reg = &insn->dst(vec_size + 1);
- vectortmp->offsetID = vec_size + 1;
- vectortmp->isSrc = 0;
+ for (uint32_t i = 0; i < tmp_size; ++i) {
+ insn->dst(i) = dsts[i];
}
+ SelectionVector *vector = this->appendVector();
+ vector->regNum = tmp_size;
+ vector->reg = &insn->dst(0);
+ vector->offsetID = 0;
+ vector->isSrc = 0;
}
- void Selection::Opaque::MBWRITE(GenRegister coordx,
- GenRegister coordy,
+ void Selection::Opaque::MBWRITE(GenRegister header,
GenRegister* values,
- GenRegister header,
- GenRegister* tmp,
+ uint32_t tmp_size,
uint32_t bti,
- uint32_t vec_size) {
- SelectionInstruction *insn = this->appendInsn(SEL_OP_MBWRITE, 1 + vec_size, 2 + vec_size);
+ uint32_t block_size) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_MBWRITE, 0, 1 + tmp_size);
SelectionVector *vector = this->appendVector();
- insn->src(0) = coordx;
- insn->src(1) = coordy;
- for (uint32_t i = 0; i < vec_size; ++i)
- insn->src(2 + i) = values[i];
- insn->dst(0) = header;
- for (uint32_t i = 0; i < vec_size; ++i)
- insn->dst(1 + i) = tmp[i];
- insn->state = this->curr;
+ insn->src(0) = header;
+ for (uint32_t i = 0; i < tmp_size; ++i)
+ insn->src(1 + i) = values[i];
insn->setbti(bti);
- insn->extra.elem = vec_size; // vector size
+ insn->extra.elem = block_size; // msg size
// We need to put the header and the data together
- vector->regNum = 1 + vec_size;
+ vector->regNum = 1 + tmp_size;
vector->reg = &insn->dst(0);
vector->offsetID = 0;
- vector->isSrc = 0;
+ vector->isSrc = 1;
}
// Boiler plate to initialize the selection library at c++ pre-main
@@ -4715,18 +4681,79 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
const uint32_t simdWidth = sel.ctx.getSimdWidth();
const Type type = insn.getValueType();
const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
- const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
+ const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_UW;
+ const RegisterFamily family = getFamily(type);
+ bool isA64 = SI == 255;
+
+ const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG));
vector<GenRegister> valuesVec;
+ vector<GenRegister> tmpVec;
for(uint32_t i = 0; i < vec_size; i++)
valuesVec.push_back(sel.selReg(insn.getValue(i), type));
- // check tmp_size for OWORD read need, max 8 OWROD thus 4 regs
- uint32_t tmp_size = simdWidth * vec_size * typeSize / 32;
- tmp_size = tmp_size == 0 ? 1 : tmp_size;
- tmp_size = tmp_size > 4 ? 4 : tmp_size;
- vector<GenRegister> tmpVec;
+
+ GenRegister headeraddr;
+ if (isA64)
+ headeraddr = GenRegister::retype(sel.getOffsetReg(header, 0, 0), GEN_TYPE_UL);
+ else
+ headeraddr = sel.getOffsetReg(header, 0, 2 * 4);
+ // Make header
+ sel.push();
+ {
+ // Copy r0 into the header first
+ sel.curr.execWidth = 8;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ sel.MOV(header, GenRegister::ud8grf(0, 0));
+
+ // Update the header with the current address
+ sel.curr.execWidth = 1;
+
+ // Put zero in the general state base address
+ if (isA64)
+ sel.MOV(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UL));
+ else {
+ sel.MOV(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UD));
+ sel.MOV(sel.getOffsetReg(header, 0, 5 * 4), GenRegister::immud(0));
+ }
+ }
+ sel.pop();
+
+ /* For block read we need to unpack the block data into values, and for different
+ * simdwidth and vector size with different type size, we may need to split the
+ * block read send message.
+ * We can only get a send message with 5 reg length
+ * so for different combination we have different message length and tmp vector size
+ * | simd8 | simd16 | simd8 | simd16
+ * r0 |header | | | |
+ * r1 |data | w0,w1 | w0 | dw0 | dw0
+ * r2 |data | w2,w3 | w1 | dw1 | dw0
+ * r3 |data | ...... | ...... | ...... | dw1
+ * r4 |data | ....... | ...... | ...... | dw1
+ */
+
+ uint32_t totalSize = simdWidth * typeSize * vec_size;
+ uint32_t valueSize = simdWidth * typeSize;
+ uint32_t tmp_size = totalSize > 128 ? (128 / valueSize) : vec_size;
+ uint32_t msg_num = vec_size / tmp_size;
+ uint32_t ow_size = msg_num > 1 ? 8 : (totalSize / 16);
+
for(uint32_t i = 0; i < tmp_size; i++)
- tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD));
- sel.OBREAD(&valuesVec[0], vec_size, address, header, SI, &tmpVec[0], tmp_size);
+ tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)), genType));
+ for (uint32_t i = 0; i < msg_num; i++) {
+ if (i > 0) {
+ sel.push();
+ {
+ // Update the address in header
+ sel.curr.execWidth = 1;
+ sel.ADD(headeraddr, headeraddr, GenRegister::immud(128));
+ }
+ sel.pop();
+ }
+ sel.OBREAD(&tmpVec[0], tmp_size, header, SI, ow_size);
+ for (uint32_t j = 0; j < tmp_size; j++)
+ sel.MOV(valuesVec[j + i * tmp_size], tmpVec[j]);
+ }
+
}
// check whether all binded table index point to constant memory
@@ -5161,18 +5188,80 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
const uint32_t simdWidth = sel.ctx.getSimdWidth();
const Type type = insn.getValueType();
const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
- const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
+ const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_UW;
+ const RegisterFamily family = getFamily(type);
+ bool isA64 = SI == 255;
+
+ const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG));
vector<GenRegister> valuesVec;
+ vector<GenRegister> tmpVec;
for(uint32_t i = 0; i < vec_size; i++)
valuesVec.push_back(sel.selReg(insn.getValue(i), type));
- // check tmp_size for OWORD write need, max 8 OWROD thus 4 regs
- uint32_t tmp_size = simdWidth * vec_size * typeSize / 32;
- tmp_size = tmp_size == 0 ? 1 : tmp_size;
- tmp_size = tmp_size > 4 ? 4 : tmp_size;
- vector<GenRegister> tmpVec;
+
+ GenRegister headeraddr;
+ if (isA64)
+ headeraddr = GenRegister::retype(sel.getOffsetReg(header, 0, 0), GEN_TYPE_UL);
+ else
+ headeraddr = sel.getOffsetReg(header, 0, 2 * 4);
+ // Make header
+ sel.push();
+ {
+ // Copy r0 into the header first
+ sel.curr.execWidth = 8;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ sel.MOV(header, GenRegister::ud8grf(0, 0));
+
+ // Update the header with the current address
+ sel.curr.execWidth = 1;
+
+ // Put zero in the general state base address
+ if (isA64)
+ sel.MOV(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UL));
+ else {
+ sel.SHR(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UD), GenRegister::immud(4));
+ sel.MOV(sel.getOffsetReg(header, 0, 5 * 4), GenRegister::immud(0));
+ }
+ }
+ sel.pop();
+
+ /* For block write we need to pack the block data into the tmp, and for different
+ * simdwidth and vector size with different type size, we may need to split the
+ * block write send message.
+ * We can only get a send message with 5 reg length
+ * so for different combination we have different message length and tmp vector size
+ * | simd8 | simd16 | simd8 | simd16
+ * r0 |header | | | |
+ * r1 |data | w0,w1 | w0 | dw0 | dw0
+ * r2 |data | w2,w3 | w1 | dw1 | dw0
+ * r3 |data | ...... | ...... | ...... | dw1
+ * r4 |data | ....... | ...... | ...... | dw1
+ */
+
+ uint32_t totalSize = simdWidth * typeSize * vec_size;
+ uint32_t valueSize = simdWidth * typeSize;
+ uint32_t tmp_size = totalSize > 128 ? (128 / valueSize) : vec_size;
+ uint32_t msg_num = vec_size / tmp_size;
+ uint32_t ow_size = msg_num > 1 ? 8 : (totalSize / 16);
+
for(uint32_t i = 0; i < tmp_size; i++)
- tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD));
- sel.OBWRITE(address, &valuesVec[0], vec_size, header, SI, &tmpVec[0], tmp_size);
+ tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)), genType));
+ for (uint32_t i = 0; i < msg_num; i++) {
+ for (uint32_t j = 0; j < tmp_size; j++)
+ sel.MOV(tmpVec[j], valuesVec[j + i * tmp_size]);
+ if (i > 0) {
+ sel.push();
+ {
+ // Update the address in header
+ sel.curr.execWidth = 1;
+ sel.ADD(headeraddr, headeraddr, GenRegister::immud(8));
+ }
+ sel.pop();
+ }
+ sel.OBWRITE(header, &tmpVec[0], tmp_size, SI, ow_size);
+ }
+
+
}
virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
@@ -7662,20 +7751,77 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
uint32_t vec_size = insn.getVectorSize();
uint32_t simdWidth = sel.curr.execWidth;
const Type type = insn.getType();
+ const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
+ // ushort in simd8 will have half reg, but response length is still 1
+ uint32_t rsize = simdWidth * vec_size * typeSize / 32;
+ rsize = rsize ? rsize : 1;
+ uint32_t block_width = typeSize * simdWidth;
+ uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16;
+
+
vector<GenRegister> valuesVec;
vector<GenRegister> tmpVec;
for (uint32_t i = 0; i < vec_size; ++i) {
valuesVec.push_back(sel.selReg(insn.getDst(i), type));
- if(simdWidth == 16)
- tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD));
+ if(simdWidth == 16 && typeSize == 4)
+ tmpVec.push_back(GenRegister::ud8grf(sel.reg(FAMILY_REG)));
}
- const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32);
- const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32);
- const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
- GenRegister *tmp = NULL;
- if(simdWidth == 16)
- tmp = &tmpVec[0];
- sel.MBREAD(&valuesVec[0], coordx, coordy, header, tmp, insn.getImageIndex(), insn.getVectorSize());
+ const GenRegister coordx = GenRegister::toUniform(sel.selReg(insn.getSrc(0), TYPE_U32), GEN_TYPE_UD);
+ const GenRegister coordy = GenRegister::toUniform(sel.selReg(insn.getSrc(1), TYPE_U32), GEN_TYPE_UD);
+ const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG));
+ const GenRegister offsetx = GenRegister::toUniform(sel.getOffsetReg(header, 0, 0 * 4), GEN_TYPE_UD);
+ const GenRegister offsety = GenRegister::toUniform(sel.getOffsetReg(header, 0, 1 * 4), GEN_TYPE_UD);
+ const GenRegister blocksizereg = sel.getOffsetReg(header, 0, 2 * 4);
+
+ // Make header
+ sel.push();
+ // Copy r0 into the header first
+ sel.curr.execWidth = 8;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ sel.MOV(header, GenRegister::ud8grf(0, 0));
+
+ // Update the header with the coord
+ sel.curr.execWidth = 1;
+ sel.MOV(offsetx, coordx);
+ sel.MOV(offsety, coordy);
+ // Update block width and height
+ sel.MOV(blocksizereg, GenRegister::immud(blocksize));
+ sel.pop();
+
+ if (simdWidth * typeSize < 64) {
+ sel.push();
+ sel.curr.execWidth = 8;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ // Now read the data
+ sel.MBREAD(&valuesVec[0], vec_size, header, insn.getImageIndex(), rsize);
+ sel.pop();
+ } else if (simdWidth * typeSize == 64) {
+ sel.push();
+ sel.curr.execWidth = 8;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ sel.MBREAD(&tmpVec[0], vec_size ,header, insn.getImageIndex(), vec_size);
+ for (uint32_t i = 0; i < vec_size; i++)
+ sel.MOV(valuesVec[i], tmpVec[i]);
+
+ // Second half
+ // Update the header with the coord
+ sel.curr.execWidth = 1;
+ sel.ADD(offsetx, offsetx, GenRegister::immud(32));
+
+ // Now read the data
+ sel.curr.execWidth = 8;
+ sel.MBREAD(&tmpVec[0], vec_size, header, insn.getImageIndex(), vec_size);
+
+ // Move the reg to fit vector rule.
+ for (uint32_t i = 0; i < vec_size; i++)
+ sel.MOV(sel.getOffsetReg(valuesVec[i], 0, 32) , tmpVec[i]);
+ sel.pop();
+ } else NOT_IMPLEMENTED;
+
+
return true;
}
DECL_CTOR(MediaBlockReadInstruction, 1, 1);
@@ -7689,17 +7835,84 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
using namespace ir;
uint32_t vec_size = insn.getVectorSize();
const Type type = insn.getType();
- const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32);
- const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32);
+ uint32_t simdWidth = sel.curr.execWidth;
+ const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_UW;
+ const RegisterFamily family = getFamily(type);
+ const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
+ // ushort in simd8 will have half reg, but response length is still 1
+ uint32_t msgsize = simdWidth * vec_size * typeSize / 32;
+ msgsize = msgsize ? msgsize : 1;
+ uint32_t block_width = typeSize * simdWidth;
+ uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16;
+
+
vector<GenRegister> valuesVec;
vector<GenRegister> tmpVec;
- for(uint32_t i = 0; i < vec_size; i++)
- {
- valuesVec.push_back(sel.selReg(insn.getSrc(2 + i), type));
- tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD));
- }
- const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
- sel.MBWRITE(coordx, coordy, &valuesVec[0], header, &tmpVec[0], insn.getImageIndex(), vec_size);
+ for (uint32_t i = 0; i < vec_size; ++i) {
+ valuesVec.push_back(sel.selReg(insn.getSrc(2 + i), type));
+ if(simdWidth == 16 && typeSize == 4)
+ tmpVec.push_back(GenRegister::ud8grf(sel.reg(FAMILY_REG)));
+ else
+ tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)), genType));
+ }
+ const GenRegister coordx = GenRegister::toUniform(sel.selReg(insn.getSrc(0), TYPE_U32), GEN_TYPE_UD);
+ const GenRegister coordy = GenRegister::toUniform(sel.selReg(insn.getSrc(1), TYPE_U32), GEN_TYPE_UD);
+ const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG));
+ const GenRegister offsetx = GenRegister::toUniform(sel.getOffsetReg(header, 0, 0*4), GEN_TYPE_UD);
+ const GenRegister offsety = GenRegister::toUniform(sel.getOffsetReg(header, 0, 1*4), GEN_TYPE_UD);
+ const GenRegister blocksizereg = sel.getOffsetReg(header, 0, 2*4);
+
+ // Make header
+ sel.push();
+ // Copy r0 into the header first
+ sel.curr.execWidth = 8;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ sel.MOV(header, GenRegister::ud8grf(0, 0));
+
+ // Update the header with the coord
+ sel.curr.execWidth = 1;
+ sel.MOV(offsetx, coordx);
+ sel.MOV(offsety, coordy);
+ // Update block width and height
+ sel.MOV(blocksizereg, GenRegister::immud(blocksize));
+ sel.pop();
+
+ if (simdWidth * typeSize < 64) {
+ for (uint32_t i = 0; i < vec_size; ++i) {
+ sel.MOV(tmpVec[i], valuesVec[i]);
+ }
+ sel.push();
+ sel.curr.execWidth = 8;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ // Now write the data
+ sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), msgsize);
+ sel.pop();
+ } else if (simdWidth * typeSize == 64) {
+ sel.push();
+ sel.curr.execWidth = 8;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ for (uint32_t i = 0; i < vec_size; i++)
+ sel.MOV(tmpVec[i], valuesVec[i]);
+ sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), vec_size);
+
+ // Second half
+ // Update the header with the coord
+ sel.curr.execWidth = 1;
+ sel.ADD(offsetx, offsetx, GenRegister::immud(32));
+
+ // Now write the data
+ sel.curr.execWidth = 8;
+ for (uint32_t i = 0; i < vec_size; i++)
+ sel.MOV(tmpVec[i], sel.getOffsetReg(valuesVec[i], 0, 32));
+ sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), vec_size);
+
+ // Move the reg to fit vector rule.
+ sel.pop();
+ } else NOT_IMPLEMENTED;
+
return true;
}
DECL_CTOR(MediaBlockWriteInstruction, 1, 1);
--
2.7.4
More information about the Beignet
mailing list