[Beignet] [PATCH V2 15/15] Backend: Add A64 subgroup block read/write support
Xiuli Pan
xiuli.pan at intel.com
Wed Oct 19 06:37:24 UTC 2016
From: Pan Xiuli <xiuli.pan at intel.com>
For GEN8+ with OpenCL 2.0 we use stateless buffers and therefore need A64
buffer reads and writes. Add A64 encoder support for OWord block read and write.
Signed-off-by: Pan Xiuli <xiuli.pan at intel.com>
---
backend/src/backend/gen8_encoder.cpp | 70 +++++++++++++++++++++
backend/src/backend/gen8_encoder.hpp | 4 ++
backend/src/backend/gen8_instruction.hpp | 13 ++++
backend/src/backend/gen_context.cpp | 103 ++++++++++++++++++++++++-------
backend/src/backend/gen_defs.hpp | 3 +
backend/src/backend/gen_encoder.cpp | 8 +++
backend/src/backend/gen_encoder.hpp | 4 ++
backend/src/ir/instruction.cpp | 8 +--
8 files changed, 184 insertions(+), 29 deletions(-)
diff --git a/backend/src/backend/gen8_encoder.cpp b/backend/src/backend/gen8_encoder.cpp
index 277260f..2f69116 100644
--- a/backend/src/backend/gen8_encoder.cpp
+++ b/backend/src/backend/gen8_encoder.cpp
@@ -637,4 +637,74 @@ namespace gbe
gen8_insn->bits1.da3srcacc.src2_abs = src2.absolute;
gen8_insn->bits1.da3srcacc.src2_negate = src2.negation;
}
+
+ static void setOBlockRWA64(GenEncoder *p,
+ GenNativeInstruction *insn,
+ uint32_t bti,
+ uint32_t size,
+ uint32_t msg_type,
+ uint32_t msg_length,
+ uint32_t response_length)
+ {
+ const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA;
+ p->setMessageDescriptor(insn, sfid, msg_length, response_length);
+ assert(size == 0 || size == 1 || size == 2 || size == 4 || size == 8); // 0 and 1 both denote a single OWord (see callers)
+ Gen8NativeInstruction *gen8_insn = &insn->gen8_insn;
+
+ gen8_insn->bits3.gen8_block_rw_a64.msg_type = msg_type;
+ gen8_insn->bits3.gen8_block_rw_a64.bti = bti;
+ // Reads use sub-type 1 (unaligned OWord block read); writes use sub-type 0 (OWord block R/W)
+ gen8_insn->bits3.gen8_block_rw_a64.msg_sub_type = msg_type == GEN8_P1_BLOCK_READ_A64 ? 1 : 0;
+ gen8_insn->bits3.gen8_block_rw_a64.block_size = size <= 2 ? size : (size == 4 ? 3 : 4); // 0, 1, 2 pass through; 4 -> 3; 8 -> 4
+ gen8_insn->bits3.gen8_block_rw_a64.header_present = 1;
+ }
+
+ void Gen8Encoder::OBREADA64(GenRegister dst, GenRegister header, uint32_t bti, uint32_t size) {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ const uint32_t msg_length = 1; // only the header register is sent
+ uint32_t rsize = size / 2; // OWords -> registers (one OWord is half a reg)
+ uint32_t msgsize = size;
+ // A single OWord fills only half a register: encode 0 when dst starts at sub-register 0, otherwise 1
+ if (size == 1) {
+ if (dst.subnr == 0)
+ msgsize = 0;
+ else
+ msgsize = 1;
+ }
+ rsize = rsize == 0 ? 1 : rsize; // a single OWord still needs one whole register
+ const uint32_t response_length = rsize; // Response length is counted in registers
+ this->setHeader(insn);
+ this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+ this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
+ this->setSrc1(insn, GenRegister::immud(0));
+ setOBlockRWA64(this,
+ insn,
+ bti,
+ msgsize,
+ GEN8_P1_BLOCK_READ_A64,
+ msg_length,
+ response_length);
+
+ }
+
+ void Gen8Encoder::OBWRITEA64(GenRegister header, uint32_t bti, uint32_t size) {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ uint32_t rsize = size / 2; // OWords -> registers
+ rsize = rsize == 0 ? 1 : rsize; // a single OWord still needs one whole register
+ const uint32_t msg_length = 1 + rsize; // One header register plus rsize payload registers
+ const uint32_t response_length = 0; // nothing is returned; dst is the null register
+ uint32_t msgsize = size;
+ msgsize = msgsize == 1 ? 0 : msgsize; // a 1-OWord write is always encoded as block size 0
+ this->setHeader(insn);
+ this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
+ this->setSrc1(insn, GenRegister::immud(0));
+ this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
+ setOBlockRWA64(this,
+ insn,
+ bti,
+ msgsize,
+ GEN8_P1_BLOCK_WRITE_A64,
+ msg_length,
+ response_length);
+ }
} /* End of the name space. */
diff --git a/backend/src/backend/gen8_encoder.hpp b/backend/src/backend/gen8_encoder.hpp
index 12b3765..b0aec3a 100644
--- a/backend/src/backend/gen8_encoder.hpp
+++ b/backend/src/backend/gen8_encoder.hpp
@@ -71,6 +71,10 @@ namespace gbe
uint32_t dstAcc, uint32_t src0Acc, uint32_t src1Acc);
void MADM(GenRegister dst, GenRegister src0, GenRegister src1, GenRegister src2,
uint32_t dstAcc, uint32_t src0Acc, uint32_t src1Acc, uint32_t src2Acc);
+ /*! A64 OWord block read; elemSize is the block size in OWords */
+ virtual void OBREADA64(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize);
+ /*! A64 OWord block write; elemSize is the block size in OWords */
+ virtual void OBWRITEA64(GenRegister header, uint32_t bti, uint32_t elemSize);
};
}
#endif /* __GBE_GEN8_ENCODER_HPP__ */
diff --git a/backend/src/backend/gen8_instruction.hpp b/backend/src/backend/gen8_instruction.hpp
index 549948a..e76ecaa 100644
--- a/backend/src/backend/gen8_instruction.hpp
+++ b/backend/src/backend/gen8_instruction.hpp
@@ -604,6 +604,19 @@ union Gen8NativeInstruction
uint32_t end_of_thread:1;
} gen7_msg_gw;
+ struct {
+ uint32_t bti:8;
+ uint32_t block_size:3; // encoded block size in OWords
+ uint32_t msg_sub_type:2; // 00: OWord block R/W, 01: unaligned OWord block read, 10: OWord dual block R/W, 11: HWord block R/W
+ uint32_t ignored:1;
+ uint32_t msg_type:5; // 10100: A64 block read, 10101: A64 block write
+ uint32_t header_present:1;
+ uint32_t response_length:5;
+ uint32_t msg_length:4;
+ uint32_t pad2:2;
+ uint32_t end_of_thread:1;
+ } gen8_block_rw_a64;
+
struct {
uint32_t jip:32;
} gen8_branch;
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 6bb0f22..e10d89b 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -3502,14 +3502,20 @@ namespace gbe
void GenContext::emitOBReadInstruction(const SelectionInstruction &insn) {
const GenRegister dst= ra->genReg(insn.dst(1));
+ const GenRegister addrreg = ra->genReg(insn.src(0));
uint32_t type = dst.type;
uint32_t typesize = typeSize(type);
- const GenRegister addr = GenRegister::toUniform(ra->genReg(insn.src(0)), GEN_TYPE_UD);
- const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
- const GenRegister headeraddr = GenRegister::offset(header, 0, 2*4);
const uint32_t vec_size = insn.extra.elem;
const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1 + vec_size)), type);
const uint32_t simdWidth = p->curr.execWidth;
+ const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
+ const GenRegister addr = GenRegister::toUniform(addrreg, addrreg.type);
+ GenRegister headeraddr;
+ bool isA64 = insn.getbti() == 255;
+ if (isA64)
+ headeraddr = GenRegister::retype(GenRegister::offset(header, 0, 0), GEN_TYPE_UL);
+ else
+ headeraddr = GenRegister::offset(header, 0, 2*4);
// Make header
p->push();
@@ -3525,7 +3531,9 @@ namespace gbe
p->MOV(headeraddr, addr);
// Put zero in the general state base address
- p->MOV(GenRegister::offset(header, 0, 5 * 4), GenRegister::immud(0));
+ if (!isA64)
+ p->MOV(GenRegister::offset(header, 0, 5 * 4), GenRegister::immud(0));
+
}
p->pop();
// Now read the data, oword block read can only work with simd16 and no mask
@@ -3534,7 +3542,12 @@ namespace gbe
{
p->curr.execWidth = 16;
p->curr.noMask = 1;
- p->OBREAD(dst, header, insn.getbti(), simdWidth * typesize / 16);
+ if (isA64) {
+ //p->curr.execWidth = 8;
+ p->OBREADA64(dst, header, insn.getbti(), simdWidth * typesize / 16);
+ }
+ else
+ p->OBREAD(dst, header, insn.getbti(), simdWidth * typesize / 16);
}
p->pop();
} else if (vec_size == 2) {
@@ -3542,7 +3555,10 @@ namespace gbe
{
p->curr.execWidth = 16;
p->curr.noMask = 1;
- p->OBREAD(tmp, header, insn.getbti(), simdWidth * typesize / 8);
+ if (isA64)
+ p->OBREADA64(tmp, header, insn.getbti(), simdWidth * typesize / 8);
+ else
+ p->OBREAD(tmp, header, insn.getbti(), simdWidth * typesize / 8);
}
p->pop();
p->MOV(ra->genReg(insn.dst(1)), GenRegister::offset(tmp, 0));
@@ -3553,7 +3569,10 @@ namespace gbe
{
p->curr.execWidth = 16;
p->curr.noMask = 1;
- p->OBREAD(tmp, header, insn.getbti(), 2 * typesize);
+ if (isA64)
+ p->OBREADA64(tmp, header, insn.getbti(), 2 * typesize);
+ else
+ p->OBREAD(tmp, header, insn.getbti(), 2 * typesize);
}
p->pop();
for (uint32_t j = 0; j < 4; j++)
@@ -3569,7 +3588,10 @@ namespace gbe
}
p->pop();
}
- p->OBREAD(tmp, header, insn.getbti(), 8);
+ if (isA64)
+ p->OBREADA64(tmp, header, insn.getbti(), 8);
+ else
+ p->OBREAD(tmp, header, insn.getbti(), 8);
for (uint32_t j = 0; j < 8 / typesize ; j++)
p->MOV(ra->genReg(insn.dst(1 + j + i * 2)), GenRegister::offset(tmp, 0 ,j * simdWidth * typesize ));
}
@@ -3590,7 +3612,10 @@ namespace gbe
{
p->curr.execWidth = 16;
p->curr.noMask = 1;
- p->OBREAD(tmp, header, insn.getbti(), 8);
+ if (isA64)
+ p->OBREADA64(tmp, header, insn.getbti(), 8);
+ else
+ p->OBREAD(tmp, header, insn.getbti(), 8);
}
p->pop();
for (uint32_t j = 0; j < 16 / typesize; j++)
@@ -3607,7 +3632,10 @@ namespace gbe
}
p->pop();
}
- p->OBREAD(tmp, header, insn.getbti(), 8);
+ if (isA64)
+ p->OBREADA64(tmp, header, insn.getbti(), 8);
+ else
+ p->OBREAD(tmp, header, insn.getbti(), 8);
for (uint32_t j = 0; j < 8 / typesize; j++)
p->MOV(ra->genReg(insn.dst(1 + j + i * 8 / typesize)), GenRegister::offset(tmp, 0 ,j * simdWidth * typesize ));
}
@@ -3616,16 +3644,23 @@ namespace gbe
}
void GenContext::emitOBWriteInstruction(const SelectionInstruction &insn) {
- const GenRegister addr = GenRegister::toUniform(ra->genReg(insn.src(0)), GEN_TYPE_UD);
+ const GenRegister addrreg = ra->genReg(insn.src(0));
const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
- const GenRegister headeraddr = GenRegister::offset(header, 0, 2*4);
uint32_t type = ra->genReg(insn.src(1)).type;
uint32_t typesize = typeSize(type);
const uint32_t vec_size = insn.extra.elem;
const GenRegister tmp = GenRegister::offset(header, 1);
+ const GenRegister addr = GenRegister::toUniform(addrreg, addrreg.type);
+ GenRegister headeraddr;
+ bool isA64 = insn.getbti() == 255;
+ if (isA64)
+ headeraddr = GenRegister::retype(GenRegister::offset(header, 0, 0), GEN_TYPE_UL);
+ else
+ headeraddr = GenRegister::offset(header, 0, 2*4);
const uint32_t simdWidth = p->curr.execWidth;
uint32_t tmp_size = simdWidth * vec_size / 8;
tmp_size = tmp_size > 4 ? 4 : tmp_size;
+ uint32_t offset_size = isA64 ? 128 : 8;
p->push();
// Copy r0 into the header first
@@ -3636,10 +3671,14 @@ namespace gbe
// Update the header with the current address
p->curr.execWidth = 1;
- p->SHR(headeraddr, addr, GenRegister::immud(4));
+ if (isA64)
+ p->MOV(headeraddr, addr);
+ else
+ p->SHR(headeraddr, addr, GenRegister::immud(4));
// Put zero in the general state base address
- p->MOV(GenRegister::offset(header, 0, 5*4), GenRegister::immud(0));
+ if (!isA64)
+ p->MOV(GenRegister::offset(header, 0, 5*4), GenRegister::immud(0));
p->pop();
// Now write the data, oword block write can only work with simd16 and no mask
@@ -3649,7 +3688,10 @@ namespace gbe
{
p->curr.execWidth = 16;
p->curr.noMask = 1;
- p->OBWRITE(header, insn.getbti(), simdWidth * typesize / 16);
+ if (isA64)
+ p->OBWRITEA64(header, insn.getbti(), simdWidth * typesize / 16);
+ else
+ p->OBWRITE(header, insn.getbti(), simdWidth * typesize / 16);
}
p->pop();
} else if (vec_size == 2) {
@@ -3659,7 +3701,10 @@ namespace gbe
{
p->curr.execWidth = 16;
p->curr.noMask = 1;
- p->OBWRITE(header, insn.getbti(), simdWidth * typesize / 8);
+ if (isA64)
+ p->OBWRITEA64(header, insn.getbti(), simdWidth * typesize / 8);
+ else
+ p->OBWRITE(header, insn.getbti(), simdWidth * typesize / 8);
}
p->pop();
} else if (vec_size == 4) {
@@ -3670,7 +3715,10 @@ namespace gbe
{
p->curr.execWidth = 16;
p->curr.noMask = 1;
- p->OBWRITE(header, insn.getbti(), 2 * typesize);
+ if (isA64)
+ p->OBWRITEA64(header, insn.getbti(), 2 * typesize);
+ else
+ p->OBWRITE(header, insn.getbti(), 2 * typesize);
}
p->pop();
} else {
@@ -3682,11 +3730,14 @@ namespace gbe
{
// Update the address in header
p->curr.execWidth = 1;
- p->ADD(headeraddr, headeraddr, GenRegister::immud(8));
+ p->ADD(headeraddr, headeraddr, GenRegister::immud(offset_size));
}
p->pop();
}
- p->OBWRITE(header, insn.getbti(), 8);
+ if (isA64)
+ p->OBWRITEA64(header, insn.getbti(), 8);
+ else
+ p->OBWRITE(header, insn.getbti(), 8);
}
}
} else if (vec_size == 8) {
@@ -3699,7 +3750,7 @@ namespace gbe
{
// Update the address in header
p->curr.execWidth = 1;
- p->ADD(headeraddr, headeraddr, GenRegister::immud(8));
+ p->ADD(headeraddr, headeraddr, GenRegister::immud(offset_size));
}
p->pop();
}
@@ -3707,7 +3758,10 @@ namespace gbe
{
p->curr.execWidth = 16;
p->curr.noMask = 1;
- p->OBWRITE(header, insn.getbti(), 8);
+ if (isA64)
+ p->OBWRITEA64(header, insn.getbti(), 8);
+ else
+ p->OBWRITE(header, insn.getbti(), 8);
}
p->pop();
}
@@ -3720,11 +3774,14 @@ namespace gbe
{
// Update the address in header
p->curr.execWidth = 1;
- p->ADD(headeraddr, headeraddr, GenRegister::immud(8));
+ p->ADD(headeraddr, headeraddr, GenRegister::immud(offset_size));
}
p->pop();
}
- p->OBWRITE(header, insn.getbti(), 8);
+ if (isA64)
+ p->OBWRITEA64(header, insn.getbti(), 8);
+ else
+ p->OBWRITE(header, insn.getbti(), 8);
}
}
} else NOT_SUPPORTED;
diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
index bcbb23f..de88e11 100644
--- a/backend/src/backend/gen_defs.hpp
+++ b/backend/src/backend/gen_defs.hpp
@@ -357,6 +357,9 @@ enum GenMessageTarget {
#define GEN75_P1_ATOMIC_COUNTER_4X2 12 //1100: Atomic Counter Operation 4X2
#define GEN75_P1_TYPED_SURFACE_WRITE 13 //1101: Typed Surface Write
+#define GEN8_P1_BLOCK_READ_A64 20 //10100
+#define GEN8_P1_BLOCK_WRITE_A64 21 //10101
+
/* Data port data cache scratch messages*/
#define GEN_SCRATCH_READ 0
#define GEN_SCRATCH_WRITE 1
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index a6f8db8..5d5f564 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -1338,6 +1338,14 @@ namespace gbe
response_length);
}
+ void GenEncoder::OBREADA64(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize) {
+ NOT_SUPPORTED; // base encoder has no A64 block read; Gen8Encoder provides the implementation
+ }
+
+ void GenEncoder::OBWRITEA64(GenRegister header, uint32_t bti, uint32_t elemSize) {
+ NOT_SUPPORTED; // base encoder has no A64 block write; Gen8Encoder provides the implementation
+ }
+
void GenEncoder::EOT(uint32_t msg) {
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index 0f835ca..963c811 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -275,6 +275,10 @@ namespace gbe
virtual void MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize);
/*! MBlock write */
virtual void MBWRITE(GenRegister header, uint32_t bti, uint32_t elemSize);
+ /*! A64 OWord block read (NOT_SUPPORTED in this base encoder) */
+ virtual void OBREADA64(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize);
+ /*! A64 OWord block write (NOT_SUPPORTED in this base encoder) */
+ virtual void OBWRITEA64(GenRegister header, uint32_t bti, uint32_t elemSize);
GBE_CLASS(GenEncoder); //!< Use custom allocators
virtual void alu3(uint32_t opcode, GenRegister dst,
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 512055c..e722dbe 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -1652,12 +1652,8 @@ namespace ir {
whyNot = "Wrong number of source.";
return false;
} else {
- const RegisterFamily fam = fn.getPointerFamily();
- for (uint32_t srcID = 1; srcID < this->srcNum; ++srcID) {
- const Register regID = fn.getRegister(src, srcID);
- if (UNLIKELY(checkRegisterData(fam, regID, fn, whyNot) == false))
- return false;
- }
+ if (UNLIKELY(checkRegisterData(FAMILY_DWORD, fn.getRegister(src, 1), fn, whyNot) == false))
+ return false;
}
break;
default:
--
2.7.4
More information about the Beignet
mailing list