[Beignet] [ocl2.0 4/4] GBE: new Load/Store Instruction Selection pattern
Ruiling Song
ruiling.song at intel.com
Thu Oct 29 00:19:18 PDT 2015
This patch adds support for stateless load/store in instruction selection.
Because stateless messages currently only support 64-bit addresses (A64) and
SIMD8 mode, we have to handle the SIMD16 split and the A32/A64 address conversion.
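For readers unfamiliar with the restriction, the standalone C++ sketch below
(illustrative only, not Beignet code; the helper names gatherA64_simd8 and
loadSimd16Stateless are hypothetical) shows the general idea the selection
patterns implement: 32-bit per-lane offsets are widened to 64-bit addresses,
and a SIMD16 operation is issued as two SIMD8 halves, mirroring the Q1/Q2
quarter-control split used in the patch.

    // Minimal sketch of the A32->A64 widening and SIMD16 -> 2 x SIMD8 split.
    #include <cstdint>
    #include <cstring>
    #include <cstdio>

    // Pretend "hardware" primitive: an A64 gather that only handles 8 lanes.
    static void gatherA64_simd8(const uint64_t addr[8], uint32_t dst[8]) {
      for (int lane = 0; lane < 8; ++lane)
        std::memcpy(&dst[lane], reinterpret_cast<const void *>(addr[lane]), 4);
    }

    // A SIMD16 stateless load built on the SIMD8-only primitive: widen the
    // 32-bit lane offsets to 64-bit absolute addresses, then issue one
    // message per quarter (lanes 0..7, then lanes 8..15).
    static void loadSimd16Stateless(const void *base, const uint32_t offs[16],
                                    uint32_t dst[16]) {
      uint64_t addrQ[16];
      for (int lane = 0; lane < 16; ++lane)       // A32 -> A64 conversion
        addrQ[lane] = reinterpret_cast<uint64_t>(base) + offs[lane];
      gatherA64_simd8(&addrQ[0], &dst[0]);        // first quarter  (Q1)
      gatherA64_simd8(&addrQ[8], &dst[8]);        // second quarter (Q2)
    }

    int main() {
      uint32_t buf[16], offs[16], dst[16];
      for (int i = 0; i < 16; ++i) { buf[i] = 100 + i; offs[i] = i * 4; }
      loadSimd16Stateless(buf, offs, dst);
      printf("%u %u\n", dst[0], dst[15]);         // expect "100 115"
      return 0;
    }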
Signed-off-by: Ruiling Song <ruiling.song at intel.com>
---
backend/src/backend/gen_insn_selection.cpp | 652 ++++++++++++++++++++++++-----
1 file changed, 536 insertions(+), 116 deletions(-)
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index b160db9..8089c97 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -754,9 +754,9 @@ namespace gbe
GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
}
- INLINE vector<GenRegister> getBTITemps(const ir::BTI &bti) {
+ INLINE vector<GenRegister> getBTITemps(const ir::AddressMode &AM) {
vector<GenRegister> temps;
- if (!bti.isConst) {
+ if (AM == ir::AM_DynamicBti) {
temps.push_back(selReg(reg(ir::FAMILY_WORD, true), ir::TYPE_U16));
temps.push_back(selReg(reg(ir::FAMILY_DWORD, true), ir::TYPE_U32));
}
@@ -3531,36 +3531,121 @@ namespace gbe
LoadInstructionPattern(void) : SelectionPattern(1, 1) {
this->opcodes.push_back(ir::OP_LOAD);
}
- void readDWord(Selection::Opaque &sel,
+ void untypedReadStateless(Selection::Opaque &sel,
+ GenRegister addr,
+ vector<GenRegister> &dst
+ ) const {
+ using namespace ir;
+ GenRegister addrQ;
+ unsigned simdWidth = sel.curr.execWidth;
+ unsigned addrBytes = typeSize(addr.type);
+ unsigned valueNum = dst.size();
+ bool isUniform = sel.isScalarReg(dst[0].reg());
+ if (addrBytes == 4) {
+ addrQ = sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64);
+ sel.MOV(addrQ, addr);
+ } else if (addrBytes == 8) {
+ addrQ = addr;
+ } else
+ NOT_IMPLEMENTED;
+
+ if (simdWidth == 8) {
+ sel.UNTYPED_READA64(addrQ, dst.data(), valueNum, valueNum);
+ } else if (simdWidth == 16) {
+ std::vector<GenRegister> tmpData;
+ for (unsigned i = 0; i < (valueNum+1)/2; i++) {
+ tmpData.push_back(sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32));
+ }
+ sel.push();
+ /* first quarter */
+ sel.curr.execWidth = 8;
+ sel.curr.quarterControl = GEN_COMPRESSION_Q1;
+ sel.UNTYPED_READA64(GenRegister::Qn(addrQ, 0), tmpData.data(), (valueNum+1)/2, valueNum);
+
+ sel.push();
+ if (isUniform)
+ sel.curr.execWidth = 1;
+ for (unsigned k = 0; k < valueNum; k++) {
+ sel.MOV(GenRegister::Qn(dst[k], 0), GenRegister::Qn(tmpData[k/2], k%2));
+ }
+ sel.pop();
+
+ /* second quarter */
+ sel.curr.execWidth = 8;
+ sel.curr.quarterControl = GEN_COMPRESSION_Q2;
+ sel.UNTYPED_READA64(GenRegister::Qn(addrQ, 1), tmpData.data(), (valueNum+1)/2, valueNum);
+ if (isUniform)
+ sel.curr.execWidth = 1;
+ for (unsigned k = 0; k < valueNum; k++) {
+ sel.MOV(GenRegister::Qn(dst[k], 1), GenRegister::Qn(tmpData[k/2], k%2));
+ }
+ sel.pop();
+ }
+ }
+
+ void shootUntypedReadMsg(Selection::Opaque &sel,
+ const ir::LoadInstruction &insn,
vector<GenRegister> &dst,
GenRegister addr,
uint32_t valueNum,
- ir::BTI bti) const
+ ir::AddressSpace addrSpace) const
{
- GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
- sel.UNTYPED_READ(addr, dst.data(), valueNum, b, sel.getBTITemps(bti));
+ using namespace ir;
+ unsigned addrBytes = typeSize(addr.type);
+ AddressMode AM = insn.getAddressMode();
+
+ /* Note on uniform LoadInstruction: the all-lanes-active (noMask, noPredicate)
+ * property only needs to be handled when the value is UNIFORM; if the
+ * value is not uniform, just do things under predication or mask. */
+ bool isUniform = sel.isScalarReg(dst[0].reg());
+ sel.push();
+ if (isUniform) {
+ sel.curr.noMask = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ }
+
+ if (AM == AM_DynamicBti || AM == AM_StaticBti) {
+ vector<GenRegister> btiTemp = sel.getBTITemps(AM);
+ if (AM == AM_DynamicBti) {
+ Register btiReg = insn.getBtiReg();
+ sel.UNTYPED_READ(addr, dst.data(), valueNum, sel.selReg(btiReg, TYPE_U32), btiTemp);
+ } else {
+ unsigned SI = insn.getSurfaceIndex();
+ sel.UNTYPED_READ(addr, dst.data(), valueNum, GenRegister::immud(SI), btiTemp);
+ }
+ } else if (addrSpace == ir::MEM_LOCAL || addrSpace == ir::MEM_CONSTANT ) {
+ // In stateless mode, local/constant accesses still go through BTI.
+ unsigned bti = addrSpace == ir::MEM_CONSTANT ? BTI_CONSTANT : 0xfe;
+ GenRegister addrDW = addr;
+ if (addrBytes == 8)
+ addrDW = convertU64ToU32(sel, addr);
+ vector<GenRegister> btiTemp;
+ sel.UNTYPED_READ(addrDW, dst.data(), valueNum, GenRegister::immud(bti), btiTemp);
+ } else {
+ untypedReadStateless(sel, addr, dst);
+ }
+ sel.pop();
}
void emitUntypedRead(Selection::Opaque &sel,
const ir::LoadInstruction &insn,
GenRegister addr,
- ir::BTI bti) const
+ ir::AddressSpace addrSpace) const
{
using namespace ir;
const uint32_t valueNum = insn.getValueNum();
vector<GenRegister> dst(valueNum);
for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
dst[dstID] = sel.selReg(insn.getValue(dstID), TYPE_U32);
- readDWord(sel, dst, addr, valueNum, bti);
+ shootUntypedReadMsg(sel, insn, dst, addr, valueNum, addrSpace);
}
void emitDWordGather(Selection::Opaque &sel,
const ir::LoadInstruction &insn,
GenRegister addr,
- ir::BTI bti) const
+ ir::AddressSpace addrSpace) const
{
using namespace ir;
- GBE_ASSERT(bti.isConst == 1);
GBE_ASSERT(insn.getValueNum() == 1);
const uint32_t isUniform = sel.isScalarReg(insn.getValue(0));
@@ -3568,7 +3653,7 @@ namespace gbe
GenRegister dst = sel.selReg(insn.getValue(0), ir::TYPE_U32);
sel.push();
sel.curr.noMask = 1;
- sel.SAMPLE(&dst, 1, &addr, 1, bti.imm, 0, true, true);
+ sel.SAMPLE(&dst, 1, &addr, 1, BTI_CONSTANT, 0, true, true);
sel.pop();
return;
}
@@ -3584,50 +3669,117 @@ namespace gbe
sel.SHR(addrDW, GenRegister::retype(addr, GEN_TYPE_UD), GenRegister::immud(2));
sel.pop();
- sel.DWORD_GATHER(dst, addrDW, bti.imm);
+ sel.DWORD_GATHER(dst, addrDW, BTI_CONSTANT);
+ }
+
+ void read64Legacy(Selection::Opaque &sel,
+ GenRegister addr,
+ vector<GenRegister> &dst,
+ GenRegister bti,
+ vector<GenRegister> &btiTemp) const {
+ const uint32_t valueNum = dst.size();
+ if (sel.hasLongType()) {
+ vector<GenRegister> tmp(valueNum);
+ for (uint32_t valueID = 0; valueID < valueNum; ++valueID) {
+ tmp[valueID] = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64), GEN_TYPE_UL);
+ }
+
+ sel.READ64(addr, dst.data(), tmp.data(), valueNum, bti, true, btiTemp);
+ } else {
+ sel.READ64(addr, dst.data(), NULL, valueNum, bti, false, btiTemp);
+ }
+ }
+ void read64Stateless(Selection::Opaque &sel,
+ const GenRegister addr,
+ vector<GenRegister> dst) const {
+ using namespace ir;
+ unsigned simdWidth = sel.ctx.getSimdWidth();
+ unsigned valueNum = dst.size();
+ vector<GenRegister> tmp(valueNum);
+ for (uint32_t valueID = 0; valueID < valueNum; ++valueID) {
+ tmp[valueID] = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64), GEN_TYPE_UL);
+ }
+ unsigned addrBytes = typeSize(addr.type);
+ GenRegister addrQ;
+
+ sel.push();
+ if (addrBytes == 4) {
+ addrQ = sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64);
+ sel.MOV(addrQ, addr);
+ } else {
+ addrQ = addr;
+ }
+
+ if (simdWidth == 8) {
+ sel.READ64A64(addrQ, dst.data(), tmp.data(), valueNum);
+ } else {
+ assert(valueNum == 1);
+ GenRegister tmpAddr, tmpDst;
+ tmpAddr = GenRegister::Qn(addrQ, 0);
+ tmpDst = GenRegister::Qn(dst[0], 0);
+ sel.curr.execWidth = 8;
+ sel.curr.quarterControl = GEN_COMPRESSION_Q1;
+ sel.READ64A64(tmpAddr, &tmpDst, tmp.data(), valueNum);
+
+ tmpAddr = GenRegister::Qn(addrQ, 1);
+ tmpDst = GenRegister::Qn(dst[0], 1);
+ sel.curr.quarterControl = GEN_COMPRESSION_Q2;
+ sel.READ64A64(tmpAddr, &tmpDst, tmp.data(), valueNum);
+ }
+ sel.pop();
}
void emitRead64(Selection::Opaque &sel,
const ir::LoadInstruction &insn,
GenRegister addr,
- ir::BTI bti) const
+ ir::AddressSpace addrSpace) const
{
using namespace ir;
const uint32_t valueNum = insn.getValueNum();
/* XXX support scalar only right now. */
GBE_ASSERT(valueNum == 1);
- GBE_ASSERT(bti.isConst == 1);
vector<GenRegister> dst(valueNum);
- GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
for ( uint32_t dstID = 0; dstID < valueNum; ++dstID)
dst[dstID] = sel.selReg(insn.getValue(dstID), ir::TYPE_U64);
- if (sel.hasLongType()) {
- vector<GenRegister> tmp(valueNum);
- for (uint32_t valueID = 0; valueID < valueNum; ++valueID) {
- tmp[valueID] = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64), GEN_TYPE_UL);
+ bool isUniform = sel.isScalarReg(insn.getValue(0));
+ AddressMode AM = insn.getAddressMode();
+ vector<GenRegister> btiTemp = sel.getBTITemps(AM);
+ sel.push();
+ if (isUniform) {
+ sel.curr.noMask = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
}
-
- sel.READ64(addr, dst.data(), tmp.data(), valueNum, b, true, sel.getBTITemps(bti));
- } else {
- sel.READ64(addr, dst.data(), NULL, valueNum, b, false, sel.getBTITemps(bti));
- }
+ if (AM != AM_Stateless) {
+ GenRegister b;
+ if (AM == AM_DynamicBti) {
+ b = sel.selReg(insn.getBtiReg(), TYPE_U32);
+ } else {
+ b = GenRegister::immud(insn.getSurfaceIndex());
+ }
+ read64Legacy(sel, addr, dst, b, btiTemp);
+ } else if (addrSpace == MEM_LOCAL || addrSpace == MEM_CONSTANT) {
+ GenRegister b = GenRegister::immud(addrSpace == MEM_LOCAL? 0xfe : BTI_CONSTANT);
+ read64Legacy(sel, addr, dst, b, btiTemp);
+ } else {
+ read64Stateless(sel, addr, dst);
+ }
+ sel.pop();
}
void readByteAsDWord(Selection::Opaque &sel,
+ const ir::LoadInstruction &insn,
const uint32_t elemSize,
GenRegister address,
GenRegister dst,
bool isUniform,
- ir::BTI bti) const
+ ir::AddressSpace addrSpace) const
{
using namespace ir;
- Register tmpReg = sel.reg(FAMILY_DWORD, isUniform);
+ Register tmpReg = sel.reg(FAMILY_DWORD);
GenRegister tmpAddr = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
GenRegister tmpData = sel.selReg(tmpReg, ir::TYPE_U32);
- GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
-
// Get dword aligned addr
sel.push();
if (isUniform) {
@@ -3637,9 +3789,11 @@ namespace gbe
sel.AND(tmpAddr, GenRegister::retype(address,GEN_TYPE_UD), GenRegister::immud(0xfffffffc));
sel.pop();
sel.push();
+ vector<GenRegister> tmp;
+ tmp.push_back(tmpData);
+ shootUntypedReadMsg(sel, insn, tmp, tmpAddr, 1, addrSpace);
if (isUniform)
sel.curr.noMask = 1;
- sel.UNTYPED_READ(tmpAddr, &tmpData, 1, b, sel.getBTITemps(bti));
if (isUniform)
sel.curr.execWidth = 1;
@@ -3649,9 +3803,9 @@ namespace gbe
sel.SHR(tmpData, tmpData, tmpAddr);
if (elemSize == GEN_BYTE_SCATTER_WORD)
- sel.MOV(GenRegister::retype(dst, GEN_TYPE_UW), sel.unpacked_uw(tmpReg));
+ sel.MOV(GenRegister::retype(dst, GEN_TYPE_UW), GenRegister::unpacked_uw(tmpReg, isUniform, sel.isLongReg(tmpReg)));
else if (elemSize == GEN_BYTE_SCATTER_BYTE)
- sel.MOV(GenRegister::retype(dst, GEN_TYPE_UB), sel.unpacked_ub(tmpReg));
+ sel.MOV(GenRegister::retype(dst, GEN_TYPE_UB), GenRegister::unpacked_ub(tmpReg, isUniform));
sel.pop();
}
@@ -3660,7 +3814,7 @@ namespace gbe
const ir::LoadInstruction &insn,
const uint32_t elemSize,
GenRegister address,
- ir::BTI bti) const
+ ir::AddressSpace addrSpace) const
{
using namespace ir;
const uint32_t valueNum = insn.getValueNum();
@@ -3679,7 +3833,7 @@ namespace gbe
tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
}
- readDWord(sel, tmp, address, tmpRegNum, bti);
+ shootUntypedReadMsg(sel, insn, tmp, address, tmpRegNum, addrSpace);
for(uint32_t i = 0; i < tmpRegNum; i++) {
unsigned int elemNum = (valueNum - i * (4 / typeSize)) > 4/typeSize ?
@@ -3736,11 +3890,81 @@ namespace gbe
sel.pop();
}
+ /* Used to transform an address from 64-bit to 32-bit. Note that dataport
+ * messages cannot accept a scalar register, so we also convert to a
+ * non-uniform register here. */
+ GenRegister convertU64ToU32(Selection::Opaque &sel,
+ GenRegister addr) const {
+ GenRegister unpacked = GenRegister::retype(sel.unpacked_ud(addr.reg()), GEN_TYPE_UD);
+ GenRegister dst = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32);
+ sel.MOV(dst, unpacked);
+ return dst;
+ }
+
+ void byteGatherStateless(Selection::Opaque &sel,
+ GenRegister addr,
+ GenRegister dst,
+ unsigned elemSize) const {
+ using namespace ir;
+ GenRegister addrQ;
+ unsigned simdWidth = sel.ctx.getSimdWidth();
+ unsigned addrBytes = typeSize(addr.type);
+ if (addrBytes == 4) {
+ addrQ = sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64);
+ sel.MOV(addrQ, addr);
+ } else {
+ addrQ = addr;
+ }
+
+ sel.push();
+ if (simdWidth == 8) {
+ sel.BYTE_GATHERA64(dst, addrQ, elemSize);
+ } else if (simdWidth == 16) {
+ sel.curr.execWidth = 8;
+ sel.curr.quarterControl = GEN_COMPRESSION_Q1;
+ sel.BYTE_GATHERA64(GenRegister::Qn(dst, 0), GenRegister::Qn(addrQ, 0), elemSize);
+ sel.curr.quarterControl = GEN_COMPRESSION_Q2;
+ sel.BYTE_GATHERA64(GenRegister::Qn(dst, 1), GenRegister::Qn(addrQ, 1), elemSize);
+ }
+ sel.pop();
+ }
+ void shootByteGatherMsg(Selection::Opaque &sel,
+ const ir::LoadInstruction &insn,
+ GenRegister dst,
+ GenRegister addr,
+ unsigned elemSize,
+ bool isUniform,
+ ir::AddressSpace addrSpace) const {
+ using namespace ir;
+ unsigned addrBytes = typeSize(addr.type);
+ AddressMode AM = insn.getAddressMode();
+ vector<GenRegister> btiTemp;
+ if (AM == AM_DynamicBti || AM == AM_StaticBti) {
+ if (AM == AM_DynamicBti) {
+ Register btiReg = insn.getBtiReg();
+ sel.BYTE_GATHER(dst, addr, elemSize, sel.selReg(btiReg, TYPE_U32), btiTemp);
+ } else {
+ unsigned SI = insn.getSurfaceIndex();
+ sel.BYTE_GATHER(dst, addr, elemSize, GenRegister::immud(SI), btiTemp);
+ }
+ } else if (addrSpace == ir::MEM_LOCAL || addrSpace == ir::MEM_CONSTANT) {
+ unsigned bti = addrSpace == ir::MEM_CONSTANT ? BTI_CONSTANT : 0xfe;
+ GenRegister addrDW = addr;
+ if (addrBytes == 8) {
+ addrDW = convertU64ToU32(sel, addr);
+ }
+
+ sel.BYTE_GATHER(dst, addrDW, elemSize, GenRegister::immud(bti), btiTemp);
+ } else {
+ byteGatherStateless(sel, addr, dst, elemSize);
+ }
+ }
+
void emitUnalignedByteGather(Selection::Opaque &sel,
const ir::LoadInstruction &insn,
const uint32_t elemSize,
GenRegister address,
- ir::BTI bti) const
+ ir::AddressSpace addrSpace) const
{
using namespace ir;
const uint32_t valueNum = insn.getValueNum();
@@ -3759,10 +3983,9 @@ namespace gbe
uint32_t effectDataNum = (typeSize*valueNum + 3) / 4;
vector<GenRegister> tmp(effectDataNum + 1);
- vector<GenRegister> tmp2(effectDataNum + 1);
vector<GenRegister> effectData(effectDataNum);
for(uint32_t i = 0; i < effectDataNum + 1; i++)
- tmp2[i] = tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
+ tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
GenRegister alignedAddr = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
sel.push();
@@ -3776,7 +3999,6 @@ namespace gbe
do {
uint32_t width = remainedReg > 4 ? 4 : remainedReg;
vector<GenRegister> t1(tmp.begin() + pos, tmp.begin() + pos + width);
- vector<GenRegister> t2(tmp2.begin() + pos, tmp2.begin() + pos + width);
if (pos != 0) {
sel.push();
if (isUniform)
@@ -3784,7 +4006,7 @@ namespace gbe
sel.ADD(alignedAddr, alignedAddr, GenRegister::immud(pos * 4));
sel.pop();
}
- readDWord(sel, t1, alignedAddr, width, bti);
+ shootUntypedReadMsg(sel, insn, t1, alignedAddr, width, addrSpace);
remainedReg -= width;
pos += width;
} while(remainedReg);
@@ -3803,23 +4025,22 @@ namespace gbe
GBE_ASSERT(insn.getValueNum() == 1);
const GenRegister value = sel.selReg(insn.getValue(0), insn.getValueType());
GBE_ASSERT(elemSize == GEN_BYTE_SCATTER_WORD || elemSize == GEN_BYTE_SCATTER_BYTE);
- if(sel.getSlowByteGather())
- readByteAsDWord(sel, elemSize, address, value, isUniform, bti);
+ if (sel.getSlowByteGather())
+ readByteAsDWord(sel, insn, elemSize, address, value, isUniform, addrSpace);
else {
- GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
-
// We need a temporary register if we read bytes or words
- Register dst = sel.reg(FAMILY_DWORD, isUniform);
+ Register dst = sel.reg(FAMILY_DWORD);
sel.push();
if (isUniform)
sel.curr.noMask = 1;
- sel.BYTE_GATHER(sel.selReg(dst, ir::TYPE_U32), address, elemSize, b, sel.getBTITemps(bti));
+ shootByteGatherMsg(sel, insn, sel.selReg(dst, ir::TYPE_U32), address, elemSize, isUniform, addrSpace);
sel.pop();
sel.push();
if (isUniform) {
sel.curr.noMask = 1;
sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
}
if (elemSize == GEN_BYTE_SCATTER_WORD)
sel.MOV(GenRegister::retype(value, GEN_TYPE_UW), GenRegister::unpacked_uw(dst));
@@ -3830,13 +4051,6 @@ namespace gbe
}
}
- // check whether all binded table index point to constant memory
- INLINE bool isAllConstant(const ir::BTI &bti) const {
- if (bti.isConst && bti.imm == BTI_CONSTANT)
- return true;
- return false;
- }
-
/*! Implements base class */
virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
{
@@ -3850,49 +4064,39 @@ namespace gbe
insn.getAddressSpace() == MEM_MIXED);
//GBE_ASSERT(sel.isScalarReg(insn.getValue(0)) == false);
- BTI bti;
- AddressMode am = insn.getAddressMode();
- if (am == AM_StaticBti) {
- bti.isConst = 1;
- bti.imm = insn.getSurfaceIndex();
- } else if (am == AM_DynamicBti) {
- bti.isConst = 0;
- bti.reg = insn.getBtiReg();
- } else {
- assert(0 && "stateless not supported yet");
- }
+ AddressSpace addrSpace = insn.getAddressSpace();
+
const Type type = insn.getValueType();
const uint32_t elemSize = getByteScatterGatherSize(sel, type);
- bool allConstant = isAllConstant(bti);
- if (allConstant) {
+ if (addrSpace == MEM_CONSTANT) {
// XXX TODO read 64bit constant through constant cache
// Per HW Spec, constant cache messages can read at least DWORD data.
// So, byte/short data type, we have to read through data cache.
if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
- this->emitRead64(sel, insn, address, bti);
+ this->emitRead64(sel, insn, address, addrSpace);
else if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
- this->emitDWordGather(sel, insn, address, bti);
+ this->emitDWordGather(sel, insn, address, addrSpace);
else if (insn.isAligned() == true)
- this->emitAlignedByteGather(sel, insn, elemSize, address, bti);
+ this->emitAlignedByteGather(sel, insn, elemSize, address, addrSpace);
else
- this->emitUnalignedByteGather(sel, insn, elemSize, address, bti);
+ this->emitUnalignedByteGather(sel, insn, elemSize, address, addrSpace);
} else {
if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
- this->emitRead64(sel, insn, address, bti);
+ this->emitRead64(sel, insn, address, addrSpace);
else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
- this->emitUntypedRead(sel, insn, address, bti);
+ this->emitUntypedRead(sel, insn, address, addrSpace);
else if (insn.isAligned())
- this->emitAlignedByteGather(sel, insn, elemSize, address, bti);
+ this->emitAlignedByteGather(sel, insn, elemSize, address, addrSpace);
else
- this->emitUnalignedByteGather(sel, insn, elemSize, address, bti);
+ this->emitUnalignedByteGather(sel, insn, elemSize, address, addrSpace);
}
markAllChildren(dag);
+
return true;
}
};
-
class StoreInstructionPattern : public SelectionPattern
{
public:
@@ -3900,44 +4104,278 @@ namespace gbe
StoreInstructionPattern(void) : SelectionPattern(1, 1) {
this->opcodes.push_back(ir::OP_STORE);
}
+ GenRegister convertU64ToU32(Selection::Opaque &sel,
+ GenRegister addr) const {
+ GenRegister unpacked = GenRegister::retype(sel.unpacked_ud(addr.reg()), GEN_TYPE_UD);
+ GenRegister dst = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32);
+ sel.MOV(dst, unpacked);
+ return dst;
+ }
+
+ void untypedWriteStateless(Selection::Opaque &sel,
+ GenRegister address,
+ vector<GenRegister> &value) const
+ {
+ using namespace ir;
+ unsigned simdWidth = sel.ctx.getSimdWidth();
+ unsigned int addrBytes = typeSize(address.type);
+ unsigned valueNum = value.size();
+ GenRegister addrQ;
+ if (addrBytes == 4) {
+ if (simdWidth == 8) {
+ addrQ = sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64);
+ sel.MOV(addrQ, address);
+ } else if (simdWidth == 16) {
+ addrQ = address;
+ }
+ } else if (addrBytes == 8) {
+ addrQ = address;
+ }
+
+ if (simdWidth == 8) {
+ vector<GenRegister> msg;
+ msg.push_back(addrQ);
+ for (unsigned k = 0; k < valueNum; k++)
+ msg.push_back(value[k]);
+
+ sel.UNTYPED_WRITEA64(msg.data(), valueNum+1, valueNum);
+ } else if (simdWidth == 16) {
+ vector<GenRegister> msgs;
+ for (unsigned k = 0; k < (valueNum+1)/2+1; k++) {
+ msgs.push_back(sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32));
+ }
+ bool valueScalar = sel.isScalarReg(value[0].reg());
+ sel.push();
+ /* do first quarter */
+ sel.curr.execWidth = 8;
+ sel.curr.quarterControl = GEN_COMPRESSION_Q1;
+ sel.MOV(GenRegister::retype(msgs[0], GEN_TYPE_UL), GenRegister::Qn(addrQ, 0));
+ for (unsigned k = 0; k < valueNum; k++) {
+ sel.MOV(GenRegister::Qn(msgs[k/2+1], k%2), GenRegister::Qn(value[k], 0));
+ }
+ sel.UNTYPED_WRITEA64(msgs.data(), (valueNum+1)/2+1, valueNum);
+
+ /* do second quarter */
+ sel.curr.execWidth = 8;
+ sel.curr.quarterControl = GEN_COMPRESSION_Q2;
+ sel.MOV(GenRegister::retype(msgs[0], GEN_TYPE_UL), GenRegister::Qn(addrQ, 1));
+ for (unsigned k = 0; k < valueNum; k++)
+ sel.MOV(GenRegister::Qn(msgs[k/2+1], k%2), GenRegister::Qn(value[k], 1));
+ sel.UNTYPED_WRITEA64(msgs.data(), (valueNum+1)/2+1, valueNum);
+ sel.pop();
+ }
+ }
+
+ void shootUntypedWriteMsg(Selection::Opaque &sel,
+ const ir::StoreInstruction &insn,
+ GenRegister &address,
+ vector<GenRegister> &value,
+ ir::AddressSpace addrSpace) const
+ {
+ using namespace ir;
+ unsigned int addrBytes = typeSize(address.type);
+ unsigned valueNum = value.size();
+ AddressMode AM = insn.getAddressMode();
+ vector<GenRegister> btiTemp = sel.getBTITemps(AM);
+
+ bool addrScalar = sel.isScalarReg(address.reg());
+
+ if (AM == AM_DynamicBti || AM == AM_StaticBti) {
+ if (AM == AM_DynamicBti) {
+ Register btiReg = insn.getBtiReg();
+ sel.UNTYPED_WRITE(address, value.data(), valueNum, sel.selReg(btiReg, TYPE_U32), btiTemp);
+ } else {
+ unsigned SI = insn.getSurfaceIndex();
+ sel.UNTYPED_WRITE(address, value.data(), valueNum, GenRegister::immud(SI), btiTemp);
+ }
+ } else if (addrSpace == ir::MEM_LOCAL) {
+ GenRegister addr = address;
+ if (addrBytes == 8) {
+ addr = convertU64ToU32(sel, address);
+ }
+ sel.UNTYPED_WRITE(addr, value.data(), valueNum, GenRegister::immud(0xfe), btiTemp);
+ } else {
+ untypedWriteStateless(sel, address, value);
+ }
+ }
+
void emitUntypedWrite(Selection::Opaque &sel,
const ir::StoreInstruction &insn,
GenRegister address,
- ir::BTI &bti) const
+ ir::AddressSpace addrSpace) const
{
using namespace ir;
const uint32_t valueNum = insn.getValueNum();
- vector<GenRegister> value(valueNum), tmps;
- GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
+ vector<GenRegister> value(valueNum);
for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_UD);
- sel.UNTYPED_WRITE(address, value.data(), valueNum, b, sel.getBTITemps(bti));
+
+ shootUntypedWriteMsg(sel, insn, address, value, addrSpace);
}
+ void write64Legacy(Selection::Opaque &sel,
+ GenRegister address,
+ vector<GenRegister> &value,
+ GenRegister bti,
+ vector<GenRegister> &btiTemp) const
+ {
+ using namespace ir;
+ const uint32_t valueNum = value.size();
+ if (sel.hasLongType()) {
+ vector<GenRegister> tmp(valueNum);
+ for (uint32_t valueID = 0; valueID < valueNum; ++valueID) {
+ tmp[valueID] = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64), GEN_TYPE_UL);
+ }
+ sel.WRITE64(address, value.data(), tmp.data(), valueNum, bti, true, btiTemp);
+ } else {
+ sel.WRITE64(address, value.data(), NULL, valueNum, bti, false, btiTemp);
+ }
+ }
+
+ void write64Stateless(Selection::Opaque &sel,
+ GenRegister address,
+ vector<GenRegister> &value) const
+ {
+ using namespace ir;
+ unsigned simdWidth = sel.ctx.getSimdWidth();
+ unsigned int addrBytes = typeSize(address.type);
+ unsigned valueNum = value.size();
+ vector<GenRegister> tmp(valueNum);
+ for (uint32_t valueID = 0; valueID < valueNum; ++valueID) {
+ tmp[valueID] = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64), GEN_TYPE_UL);
+ }
+ GenRegister addrQ;
+ if (addrBytes == 4) {
+ addrQ = sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64);
+ sel.MOV(addrQ, address);
+ } else {
+ addrQ = address;
+ }
+
+ sel.push();
+ if (simdWidth == 8) {
+ sel.WRITE64A64(addrQ, value.data(), tmp.data(), valueNum);
+ } else {
+ GenRegister tmpAddr, tmpSrc;
+ tmpAddr = GenRegister::Qn(addrQ, 0);
+ tmpSrc = GenRegister::Qn(value[0], 0);
+ GenRegister tmp = sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64);
+
+ /* A SIMD16 long register is just big enough for (SIMD8 A64 addr + SIMD8 long) */
+ sel.curr.execWidth = 8;
+ sel.curr.quarterControl = GEN_COMPRESSION_Q1;
+ sel.MOV(GenRegister::Qn(tmp, 0), tmpAddr);
+ sel.UNPACK_LONG(GenRegister::Qn(tmp, 1), tmpSrc);
+ sel.UNTYPED_WRITEA64(&tmp, 1, 2);
+
+ tmpAddr = GenRegister::Qn(addrQ, 1);
+ tmpSrc = GenRegister::Qn(value[0], 1);
+ sel.curr.quarterControl = GEN_COMPRESSION_Q2;
+ sel.MOV(GenRegister::Qn(tmp, 0), tmpAddr);
+ sel.UNPACK_LONG(GenRegister::Qn(tmp, 1), tmpSrc);
+ sel.UNTYPED_WRITEA64(&tmp, 1, 2);
+ }
+ sel.pop();
+ }
void emitWrite64(Selection::Opaque &sel,
const ir::StoreInstruction &insn,
GenRegister address,
- ir::BTI &bti) const
+ ir::AddressSpace addrSpace) const
{
using namespace ir;
const uint32_t valueNum = insn.getValueNum();
/* XXX support scalar only right now. */
GBE_ASSERT(valueNum == 1);
- GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
vector<GenRegister> src(valueNum);
for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
src[valueID] = sel.selReg(insn.getValue(valueID), ir::TYPE_U64);
- if (sel.hasLongType()) {
- vector<GenRegister> tmp(valueNum);
- for (uint32_t valueID = 0; valueID < valueNum; ++valueID) {
- tmp[valueID] = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64), GEN_TYPE_UL);
+ AddressMode AM = insn.getAddressMode();
+ vector<GenRegister> btiTemp = sel.getBTITemps(AM);
+ if (AM != AM_Stateless) {
+ GenRegister b;
+ if (AM == AM_DynamicBti) {
+ b = sel.selReg(insn.getBtiReg(), TYPE_U32);
+ } else {
+ b = GenRegister::immud(insn.getSurfaceIndex());
}
- sel.WRITE64(address, src.data(), tmp.data(), valueNum, b, true, sel.getBTITemps(bti));
+ write64Legacy(sel, address, src, b, btiTemp);
+ } else if (addrSpace == MEM_CONSTANT || addrSpace == MEM_LOCAL) {
+ GenRegister b = GenRegister::immud(addrSpace == MEM_CONSTANT ? BTI_CONSTANT : 0xfe);
+ write64Legacy(sel, address, src, b, btiTemp);
} else {
- sel.WRITE64(address, src.data(), NULL, valueNum, b, false, sel.getBTITemps(bti));
+ GBE_ASSERT(sel.hasLongType());
+ write64Stateless(sel, address, src);
+ }
+ }
+
+ void byteScatterStateless(Selection::Opaque &sel,
+ GenRegister address,
+ GenRegister data,
+ unsigned elemSize) const {
+ using namespace ir;
+ unsigned addrBytes = typeSize(address.type);
+ unsigned simdWidth = sel.ctx.getSimdWidth();
+ GenRegister addrQ;
+ if (addrBytes == 4) {
+ addrQ = sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64);
+ sel.MOV(addrQ, address);
+ } else {
+ addrQ = address;
+ }
+ if (simdWidth == 8) {
+ GenRegister msg[2];
+ msg[0] = addrQ;
+ msg[1] = data;
+ sel.BYTE_SCATTERA64(msg, 2, elemSize);
+ } else if (simdWidth == 16) {
+ GenRegister msgs[2];
+ msgs[0] = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32);
+ msgs[1] = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32);
+ sel.push();
+ sel.curr.execWidth = 8;
+ /* do first quarter */
+ sel.curr.quarterControl = GEN_COMPRESSION_Q1;
+ sel.MOV(GenRegister::retype(msgs[0], GEN_TYPE_UL), GenRegister::Qn(addrQ, 0));
+ sel.MOV(GenRegister::Qn(msgs[1], 0), GenRegister::Qn(data, 0));
+ sel.BYTE_SCATTERA64(msgs, 2, elemSize);
+ /* do second quarter */
+ sel.curr.quarterControl = GEN_COMPRESSION_Q2;
+ sel.MOV(GenRegister::retype(msgs[0], GEN_TYPE_UL), GenRegister::Qn(addrQ, 1));
+ sel.MOV(GenRegister::Qn(msgs[1], 0), GenRegister::Qn(data, 1));
+ sel.BYTE_SCATTERA64(msgs, 2, elemSize);
+ sel.pop();
+ }
+ }
+ void shootByteScatterMsg(Selection::Opaque &sel,
+ const ir::StoreInstruction &insn,
+ GenRegister address,
+ GenRegister data,
+ unsigned elemSize,
+ ir::AddressSpace addrSpace) const
+ {
+ using namespace ir;
+ unsigned addrBytes = typeSize(address.type);
+ AddressMode AM = insn.getAddressMode();
+ vector<GenRegister> btiTemp = sel.getBTITemps(AM);
+ if (AM != AM_Stateless) {
+ if (AM == AM_DynamicBti) {
+ Register btiReg = insn.getBtiReg();
+ sel.BYTE_SCATTER(address, data, elemSize, sel.selReg(btiReg, TYPE_U32), btiTemp);
+ } else {
+ unsigned SI = insn.getSurfaceIndex();
+ sel.BYTE_SCATTER(address, data, elemSize, GenRegister::immud(SI), btiTemp);
+ }
+ } else if (addrSpace == ir::MEM_LOCAL) {
+ GenRegister addr = address;
+ if (addrBytes == 8) {
+ addr = convertU64ToU32(sel, address);
+ }
+ sel.BYTE_SCATTER(addr, data, elemSize, GenRegister::immud(0xfe), btiTemp);
+ } else {
+ byteScatterStateless(sel, address, data, elemSize);
}
}
@@ -3945,13 +4383,12 @@ namespace gbe
const ir::StoreInstruction &insn,
const uint32_t elemSize,
GenRegister address,
- ir::BTI &bti,
+ ir::AddressSpace addrSpace,
bool isUniform) const
{
using namespace ir;
uint32_t valueNum = insn.getValueNum();
- GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
if(valueNum > 1) {
const uint32_t typeSize = getFamilySize(getFamily(insn.getValueType()));
vector<GenRegister> value(valueNum);
@@ -3971,54 +4408,37 @@ namespace gbe
sel.PACK_BYTE(tmp[i], value.data() + i * 4/typeSize, typeSize, 4/typeSize);
}
- sel.UNTYPED_WRITE(address, tmp.data(), tmpRegNum, b, sel.getBTITemps(bti));
+ shootUntypedWriteMsg(sel, insn, address, tmp, addrSpace);
} else {
const GenRegister value = sel.selReg(insn.getValue(0));
GBE_ASSERT(insn.getValueNum() == 1);
- const GenRegister tmp = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
+ const GenRegister tmp = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
- sel.push();
- if (isUniform) {
- sel.curr.noMask = 1;
- sel.curr.execWidth = 1;
- }
+ if (elemSize == GEN_BYTE_SCATTER_WORD)
+ sel.MOV(tmp, GenRegister::retype(value, GEN_TYPE_UW));
+ else if (elemSize == GEN_BYTE_SCATTER_BYTE)
+ sel.MOV(tmp, GenRegister::retype(value, GEN_TYPE_UB));
- if (elemSize == GEN_BYTE_SCATTER_WORD)
- sel.MOV(tmp, GenRegister::retype(value, GEN_TYPE_UW));
- else if (elemSize == GEN_BYTE_SCATTER_BYTE)
- sel.MOV(tmp, GenRegister::retype(value, GEN_TYPE_UB));
- sel.pop();
- sel.BYTE_SCATTER(address, tmp, elemSize, b, sel.getBTITemps(bti));
+ shootByteScatterMsg(sel, insn, address, tmp, elemSize, addrSpace);
}
}
-
virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
{
using namespace ir;
const ir::StoreInstruction &insn = cast<ir::StoreInstruction>(dag.insn);
GenRegister address = sel.selReg(insn.getAddressRegister(), ir::TYPE_U32);
+ AddressSpace addrSpace = insn.getAddressSpace();
const Type type = insn.getValueType();
const uint32_t elemSize = getByteScatterGatherSize(sel, type);
const bool isUniform = sel.isScalarReg(insn.getAddressRegister()) && sel.isScalarReg(insn.getValue(0));
- BTI bti;
- AddressMode am = insn.getAddressMode();
- if (am == AM_StaticBti) {
- bti.isConst = 1;
- bti.imm = insn.getSurfaceIndex();
- } else if (am == AM_DynamicBti) {
- bti.isConst = 0;
- bti.reg = insn.getBtiReg();
- } else {
- assert(0 && "stateless not supported yet");
- }
if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
- this->emitWrite64(sel, insn, address, bti);
+ this->emitWrite64(sel, insn, address, addrSpace);
else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
- this->emitUntypedWrite(sel, insn, address, bti);
+ this->emitUntypedWrite(sel, insn, address, addrSpace);
else {
- this->emitByteScatter(sel, insn, elemSize, address, bti, isUniform);
+ this->emitByteScatter(sel, insn, elemSize, address, addrSpace, isUniform);
}
markAllChildren(dag);
@@ -4911,7 +5331,7 @@ namespace gbe
if(msgPayload > 2) src2 = sel.selReg(insn.getSrc(2), TYPE_U32);
GenAtomicOpCode genAtomicOp = (GenAtomicOpCode)atomicOp;
- sel.ATOMIC(dst, genAtomicOp, msgPayload, src0, src1, src2, bti, sel.getBTITemps(b));
+ sel.ATOMIC(dst, genAtomicOp, msgPayload, src0, src1, src2, bti, sel.getBTITemps(AM));
markAllChildren(dag);
return true;
--
2.3.1