[Beignet] [PATCH 1/2] GBE: Refine uniform load logic.
Ruiling Song
ruiling.song at intel.com
Fri Jul 11 01:23:07 PDT 2014
Currently many dataport messages cannot support uniform
load/store. So we need to add a move instruction before
or after a dataport message to move data between uniform
and varying registers. The idea behind the patch is that
if a dataport message cannot handle a uniform register,
we explicitly handle the uniform register before emitting the message.
Signed-off-by: Ruiling Song <ruiling.song at intel.com>
---
backend/src/backend/gen_insn_selection.cpp | 174 ++++++++++++++++++++--------
backend/src/backend/gen_reg_allocation.cpp | 2 +
2 files changed, 127 insertions(+), 49 deletions(-)
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index fb041de..565f203 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -904,7 +904,7 @@ namespace gbe
ir::Register Selection::Opaque::replaceSrc(SelectionInstruction *insn, uint32_t regID, ir::Type type, bool needMov) {
SelectionBlock *block = insn->parent;
- const uint32_t simdWidth = insn->state.execWidth;
+ const uint32_t simdWidth =this->isScalarReg(insn->src(regID).reg()) ? 1 : insn->state.execWidth;
ir::Register tmp;
GenRegister gr;
@@ -939,7 +939,7 @@ namespace gbe
ir::Register tmp;
GenRegister gr;
this->block = block;
- tmp = this->reg(ir::getFamily(type));
+ tmp = this->reg(ir::getFamily(type),simdWidth == 1);
gr = this->selReg(tmp, type);
if (needMov) {
// Generate the MOV instruction and replace the register in the instruction
@@ -2693,6 +2693,29 @@ namespace gbe
}
}
+ GenRegister replaceUniformSource(Selection::Opaque &sel, ir::Register reg, ir::Type type) {
+ const bool isUniform = sel.isScalarReg(reg);
+ ir::RegisterFamily family = ir::getFamily(type);
+ const GenRegister uniform = sel.selReg(reg, type);
+ if(isUniform) {
+ const GenRegister varying = sel.selReg(sel.reg(family), type);
+ sel.push();
+ sel.curr.noMask = 1;
+ sel.MOV(varying, uniform);
+ sel.pop();
+ return varying;
+ }
+ return uniform;
+ }
+
+ void moveVarying2Uniform(Selection::Opaque &sel, GenRegister uniform, GenRegister varying) {
+ sel.push();
+ sel.curr.noMask = 1;
+ sel.curr.execWidth = 1;
+ sel.MOV(uniform, varying);
+ sel.pop();
+ }
+
/*! Load instruction pattern */
DECL_PATTERN(LoadInstruction)
{
@@ -2704,9 +2727,31 @@ namespace gbe
using namespace ir;
const uint32_t valueNum = insn.getValueNum();
vector<GenRegister> dst(valueNum);
- for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
- dst[dstID] = GenRegister::retype(sel.selReg(insn.getValue(dstID)), GEN_TYPE_F);
- sel.UNTYPED_READ(addr, dst.data(), valueNum, bti);
+ bool isUniformDst = sel.isScalarReg(insn.getValue(0));
+ GenRegister tmpAddr = replaceUniformSource(sel, addr.reg(), ir::TYPE_FLOAT);
+
+ if(isUniformDst) {
+ for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
+ dst[dstID] = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_FLOAT);
+ } else {
+ for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
+ dst[dstID] = sel.selReg(insn.getValue(dstID), ir::TYPE_FLOAT);
+ }
+
+ sel.push();
+ if(isUniformDst)
+ sel.curr.noMask = 1;
+ sel.UNTYPED_READ(tmpAddr, dst.data(), valueNum, bti);
+ sel.pop();
+
+ if(isUniformDst) {
+ sel.push();
+ sel.curr.noMask = 1;
+ sel.curr.execWidth = 1;
+ for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
+ sel.MOV(sel.selReg(insn.getValue(dstID), ir::TYPE_FLOAT), GenRegister::vec1(dst[dstID]));
+ sel.pop();
+ }
}
void emitDWordGather(Selection::Opaque &sel,
@@ -2715,21 +2760,32 @@ namespace gbe
uint32_t bti) const
{
using namespace ir;
- const uint32_t simdWidth = sel.isScalarReg(insn.getValue(0)) ? 1 : sel.ctx.getSimdWidth();
+ const bool isUniformDst = sel.isScalarReg(insn.getValue(0));
+ const bool isUniformSrc = sel.isScalarReg(addr.reg());
+
GBE_ASSERT(insn.getValueNum() == 1);
GenRegister dst = GenRegister::retype(sel.selReg(insn.getValue(0)), GEN_TYPE_F);
- // get dword based address
- GenRegister addrDW = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD, simdWidth == 1));
+ // Get dword based address
+ GenRegister addrDW = GenRegister::udxgrf(sel.ctx.getSimdWidth(), sel.reg(FAMILY_DWORD));
sel.push();
- if (simdWidth == 1) {
+ if (isUniformSrc) {
sel.curr.noMask = 1;
- sel.curr.execWidth = 1;
}
sel.SHR(addrDW, GenRegister::retype(addr, GEN_TYPE_UD), GenRegister::immud(2));
sel.pop();
- sel.DWORD_GATHER(dst, addrDW, bti);
+ if(isUniformDst) {
+ GenRegister tmp = GenRegister::fxgrf(sel.ctx.getSimdWidth(), sel.reg(FAMILY_DWORD));
+ sel.push();
+ sel.curr.noMask = 1;
+ sel.DWORD_GATHER(tmp, addrDW, bti);
+ sel.curr.execWidth = 1;
+ sel.MOV(dst, GenRegister::vec1(tmp));
+ sel.pop();
+ } else
+ sel.DWORD_GATHER(dst, addrDW, bti);
+
}
void emitRead64(Selection::Opaque &sel,
@@ -2741,24 +2797,29 @@ namespace gbe
const uint32_t valueNum = insn.getValueNum();
/* XXX support scalar only right now. */
GBE_ASSERT(valueNum == 1);
+ GenRegister tmpAddr = replaceUniformSource(sel, addr.reg(), ir::TYPE_FLOAT);
+ /// TODO support uniform load for int64
+ GBE_ASSERT(sel.isScalarReg(insn.getValue(0)) == false);
GenRegister dst[valueNum];
for ( uint32_t dstID = 0; dstID < valueNum; ++dstID)
dst[dstID] = sel.selReg(insn.getValue(dstID), ir::TYPE_U64);
- sel.READ64(addr, dst, valueNum, bti);
+ sel.READ64(tmpAddr, dst, valueNum, bti);
}
void emitByteGather(Selection::Opaque &sel,
const ir::LoadInstruction &insn,
const uint32_t elemSize,
- GenRegister address,
+ GenRegister addr,
uint32_t bti) const
{
using namespace ir;
const uint32_t valueNum = insn.getValueNum();
- const uint32_t simdWidth = sel.isScalarReg(insn.getValue(0)) ?
- 1 : sel.ctx.getSimdWidth();
+ const bool isUniformDst = sel.isScalarReg(insn.getValue(0));
+
if(valueNum > 1) {
+ GenRegister tmpAddr = replaceUniformSource(sel, addr.reg(), ir::TYPE_FLOAT);
+
vector<GenRegister> dst(valueNum);
const uint32_t typeSize = getFamilySize(getFamily(insn.getValueType()));
@@ -2773,10 +2834,13 @@ namespace gbe
uint32_t tmpRegNum = typeSize*valueNum / 4;
vector<GenRegister> tmp(tmpRegNum);
for(uint32_t i = 0; i < tmpRegNum; i++) {
- tmp[i] = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+ tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
}
-
- sel.UNTYPED_READ(address, tmp.data(), tmpRegNum, bti);
+ sel.push();
+ if(isUniformDst)
+ sel.curr.noMask = 1;
+ sel.UNTYPED_READ(tmpAddr, tmp.data(), tmpRegNum, bti);
+ sel.pop();
for(uint32_t i = 0; i < tmpRegNum; i++) {
sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, tmp[i], 4/typeSize);
@@ -2784,35 +2848,39 @@ namespace gbe
} else {
GBE_ASSERT(insn.getValueNum() == 1);
const GenRegister value = sel.selReg(insn.getValue(0));
+ const bool isUniformAddr = sel.isScalarReg(addr.reg());
GBE_ASSERT(elemSize == GEN_BYTE_SCATTER_WORD || elemSize == GEN_BYTE_SCATTER_BYTE);
- Register tmpReg = sel.reg(FAMILY_DWORD, simdWidth == 1);
- GenRegister tmpAddr = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD, simdWidth == 1));
- GenRegister tmpData = GenRegister::udxgrf(simdWidth, tmpReg);
+ Register tmpReg = sel.reg(FAMILY_DWORD);
+ GenRegister tmpAddr = GenRegister::udxgrf(sel.ctx.getSimdWidth(), sel.reg(FAMILY_DWORD));
+ GenRegister tmpData = GenRegister::udxgrf(sel.ctx.getSimdWidth(), tmpReg);
// Get dword aligned addr
sel.push();
- if (simdWidth == 1) {
- sel.curr.execWidth = 1;
+ if (isUniformAddr)
sel.curr.noMask = 1;
- }
- sel.AND(tmpAddr, GenRegister::retype(address,GEN_TYPE_UD), GenRegister::immud(0xfffffffc));
+ sel.AND(tmpAddr, addr, GenRegister::immud(0xfffffffc));
sel.pop();
+
sel.push();
- if (simdWidth == 1)
+ if (isUniformDst)
sel.curr.noMask = 1;
sel.UNTYPED_READ(tmpAddr, &tmpData, 1, bti);
- if (simdWidth == 1)
+ if (isUniformDst) {
sel.curr.execWidth = 1;
+ addr = GenRegister::vec1(addr);
+ tmpAddr = GenRegister::vec1(tmpAddr);
+ tmpData = GenRegister::vec1(tmpData);
+ }
// Get the remaining offset from aligned addr
- sel.AND(tmpAddr, GenRegister::retype(address,GEN_TYPE_UD), GenRegister::immud(0x3));
+ sel.AND(tmpAddr, addr, GenRegister::immud(0x3));
sel.SHL(tmpAddr, tmpAddr, GenRegister::immud(0x3));
sel.SHR(tmpData, tmpData, tmpAddr);
if (elemSize == GEN_BYTE_SCATTER_WORD)
- sel.MOV(GenRegister::retype(value, GEN_TYPE_UW), sel.unpacked_uw(tmpReg));
+ sel.MOV(GenRegister::retype(value, GEN_TYPE_UW), GenRegister::unpacked_uw(tmpReg, isUniformDst));
else if (elemSize == GEN_BYTE_SCATTER_BYTE)
- sel.MOV(GenRegister::retype(value, GEN_TYPE_UB), sel.unpacked_ub(tmpReg));
+ sel.MOV(GenRegister::retype(value, GEN_TYPE_UB), GenRegister::unpacked_ub(tmpReg, isUniformDst));
sel.pop();
}
}
@@ -2837,7 +2905,7 @@ namespace gbe
insn.getAddressSpace() == MEM_CONSTANT ||
insn.getAddressSpace() == MEM_PRIVATE ||
insn.getAddressSpace() == MEM_LOCAL);
- //GBE_ASSERT(sel.isScalarReg(insn.getValue(0)) == false);
+
const Type type = insn.getValueType();
const uint32_t elemSize = getByteScatterGatherSize(type);
if(space == MEM_LOCAL && sel.needPatchSLMAddr()) {
@@ -2880,11 +2948,12 @@ namespace gbe
using namespace ir;
const uint32_t valueNum = insn.getValueNum();
vector<GenRegister> value(valueNum);
+ GenRegister tmpAddr = replaceUniformSource(sel, addr.reg(), ir::TYPE_FLOAT);
- addr = GenRegister::retype(addr, GEN_TYPE_F);
for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
- value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_F);
- sel.UNTYPED_WRITE(addr, value.data(), valueNum, bti);
+ value[valueID] = replaceUniformSource(sel, insn.getValue(valueID), ir::TYPE_FLOAT);
+
+ sel.UNTYPED_WRITE(tmpAddr, value.data(), valueNum, bti);
}
void emitWrite64(Selection::Opaque &sel,
@@ -2896,12 +2965,12 @@ namespace gbe
const uint32_t valueNum = insn.getValueNum();
/* XXX support scalar only right now. */
GBE_ASSERT(valueNum == 1);
- addr = GenRegister::retype(addr, GEN_TYPE_UD);
GenRegister src[valueNum];
+ GenRegister tmpAddr = replaceUniformSource(sel, addr.reg(), TYPE_U32);
for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
- src[valueID] = sel.selReg(insn.getValue(valueID), ir::TYPE_U64);
- sel.WRITE64(addr, src, valueNum, bti);
+ src[valueID] = replaceUniformSource(sel, insn.getValue(valueID), ir::TYPE_U64);
+ sel.WRITE64(tmpAddr, src, valueNum, bti);
}
void emitByteScatter(Selection::Opaque &sel,
@@ -2914,6 +2983,8 @@ namespace gbe
const uint32_t simdWidth = sel.ctx.getSimdWidth();
uint32_t valueNum = insn.getValueNum();
+ GenRegister tmpAddr = replaceUniformSource(sel, addr.reg(), ir::TYPE_FLOAT);
+
if(valueNum > 1) {
const uint32_t typeSize = getFamilySize(getFamily(insn.getValueType()));
vector<GenRegister> value(valueNum);
@@ -2933,7 +3004,7 @@ namespace gbe
sel.PACK_BYTE(tmp[i], value.data() + i * 4/typeSize, 4/typeSize);
}
- sel.UNTYPED_WRITE(addr, tmp.data(), tmpRegNum, bti);
+ sel.UNTYPED_WRITE(tmpAddr, tmp.data(), tmpRegNum, bti);
} else {
const GenRegister value = sel.selReg(insn.getValue(0));
GBE_ASSERT(insn.getValueNum() == 1);
@@ -2943,7 +3014,7 @@ namespace gbe
} else if (elemSize == GEN_BYTE_SCATTER_BYTE) {
sel.MOV(tmp, GenRegister::retype(value, GEN_TYPE_UB));
}
- sel.BYTE_SCATTER(addr, tmp, elemSize, bti);
+ sel.BYTE_SCATTER(tmpAddr, tmp, elemSize, bti);
}
}
@@ -3367,18 +3438,23 @@ namespace gbe
const AddressSpace space = insn.getAddressSpace();
const uint32_t bti = space == MEM_LOCAL ? 0xfe : 0x01;
const uint32_t srcNum = insn.getSrcNum();
- GenRegister src0 = sel.selReg(insn.getSrc(0), TYPE_U32); //address
+ GenRegister src0 = replaceUniformSource(sel, insn.getSrc(0), TYPE_U32); //address
GenRegister src1 = src0, src2 = src0;
- if(srcNum > 1) src1 = sel.selReg(insn.getSrc(1), TYPE_U32);
- if(srcNum > 2) src2 = sel.selReg(insn.getSrc(2), TYPE_U32);
+ if(srcNum > 1) src1 = replaceUniformSource(sel, insn.getSrc(1), TYPE_U32);
+ if(srcNum > 2) src2 = replaceUniformSource(sel, insn.getSrc(2), TYPE_U32);
+
GenRegister dst = sel.selReg(insn.getDst(0), TYPE_U32);
+ const bool isUniformDst = sel.isScalarReg(insn.getDst(0));
+ GenRegister tmpDst = isUniformDst ? sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32) : dst;
+
GenAtomicOpCode genAtomicOp = (GenAtomicOpCode)atomicOp;
if(space == MEM_LOCAL && sel.needPatchSLMAddr()){
GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
sel.ADD(temp, src0, sel.selReg(ocl::slmoffset, ir::TYPE_U32));
src0 = temp;
}
- sel.ATOMIC(dst, genAtomicOp, srcNum, src0, src1, src2, bti);
+ sel.ATOMIC(tmpDst, genAtomicOp, srcNum, src0, src1, src2, bti);
+ if(isUniformDst) moveVarying2Uniform(sel, dst, tmpDst);
return true;
}
DECL_CTOR(AtomicInstruction, 1, 1);
@@ -3589,12 +3665,12 @@ namespace gbe
if (insn.getSamplerOffset() != 0) {
// U, lod, [V], [W]
GBE_ASSERT(insn.getSrcType() != TYPE_FLOAT);
- msgPayloads[0] = sel.selReg(insn.getSrc(0), insn.getSrcType());
+ msgPayloads[0] = replaceUniformSource(sel, insn.getSrc(0), insn.getSrcType());
msgPayloads[1] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
if (srcNum > 1)
- msgPayloads[2] = sel.selReg(insn.getSrc(1), insn.getSrcType());
+ msgPayloads[2] = replaceUniformSource(sel, insn.getSrc(1), insn.getSrcType());
if (srcNum > 2)
- msgPayloads[3] = sel.selReg(insn.getSrc(2), insn.getSrcType());
+ msgPayloads[3] = replaceUniformSource(sel, insn.getSrc(2), insn.getSrcType());
// Clear the lod to zero.
sel.MOV(msgPayloads[1], GenRegister::immud(0));
msgLen = srcNum + 1;
@@ -3602,7 +3678,7 @@ namespace gbe
// U, V, [W]
GBE_ASSERT(insn.getSrcType() == TYPE_FLOAT);
for (valueID = 0; valueID < srcNum; ++valueID)
- msgPayloads[valueID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
+ msgPayloads[valueID] = replaceUniformSource(sel, insn.getSrc(valueID), insn.getSrcType());
msgLen = srcNum;
}
// We switch to a fixup bti for linear filter on a image1d array sampling.
@@ -3637,7 +3713,7 @@ namespace gbe
uint32_t valueID = 0;
msgs[0] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
for(uint32_t msgID = 1; msgID < 1 + coordNum; msgID++, valueID++)
- msgs[msgID] = sel.selReg(insn.getSrc(msgID - 1), insn.getCoordType());
+ msgs[msgID] = replaceUniformSource(sel, insn.getSrc(msgID - 1), insn.getCoordType());
// fake u.
if (insn.getSrc(1) == ir::ocl::invalid)
@@ -3648,7 +3724,7 @@ namespace gbe
// LOD.
msgs[4] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
for(uint32_t msgID = 5; valueID < insn.getSrcNum(); msgID++, valueID++)
- msgs[msgID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
+ msgs[msgID] = replaceUniformSource(sel, insn.getSrc(valueID), insn.getSrcType());
}
sel.push();
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index b7fbc93..e1e5b93 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -315,6 +315,8 @@ namespace gbe
else {
ir::Register tmp;
ir::Type type = getIRType(vector->reg[regID].type);
+ // We currently don't have scalar register in SelectionVector
+ GBE_ASSERT(ctx.sel->isScalarReg(reg) == false);
tmp = this->replaceReg(selection, vector->insn, regID, vector->isSrc, type);
const VectorLocation location = std::make_pair(vector, regID);
this->vectorMap.insert(std::make_pair(tmp, location));
--
1.7.10.4
More information about the Beignet
mailing list