[Beignet] [PATCH 1/2] GBE: relax the batch byte/short load vector size restriction.
Zhigang Gong
zhigang.gong at intel.com
Wed Aug 27 19:46:03 PDT 2014
The previous restriction was that the vector size must be a multiple
of a DWORD. This restriction prevents the vload2/3 of char or the
vload3 of ushort from being optimized. This patch relaxes this
restriction on the vload path.
Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
backend/src/backend/gen_context.cpp | 6 ++--
backend/src/backend/gen_insn_selection.cpp | 39 +++++++++++-------------
backend/src/llvm/llvm_gen_backend.cpp | 3 +-
backend/src/llvm/llvm_loadstore_optimization.cpp | 3 +-
4 files changed, 24 insertions(+), 27 deletions(-)
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index ba4a8f8..883fa39 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -1693,7 +1693,7 @@ namespace gbe
void GenContext::emitUnpackByteInstruction(const SelectionInstruction &insn) {
const GenRegister src = ra->genReg(insn.src(0));
for(uint32_t i = 0; i < insn.dstNum; i++) {
- p->MOV(ra->genReg(insn.dst(i)), GenRegister::splitReg(src, insn.dstNum, i));
+ p->MOV(ra->genReg(insn.dst(i)), GenRegister::splitReg(src, insn.extra.elem, i));
}
}
@@ -1702,12 +1702,12 @@ namespace gbe
p->push();
if(simdWidth == 8) {
for(uint32_t i = 0; i < insn.srcNum; i++)
- p->MOV(GenRegister::splitReg(dst, insn.srcNum, i), ra->genReg(insn.src(i)));
+ p->MOV(GenRegister::splitReg(dst, insn.extra.elem, i), ra->genReg(insn.src(i)));
} else {
// when destination expands two registers, the source must span two registers.
p->curr.execWidth = 8;
for(uint32_t i = 0; i < insn.srcNum; i++) {
- GenRegister dsti = GenRegister::splitReg(dst, insn.srcNum, i);
+ GenRegister dsti = GenRegister::splitReg(dst, insn.extra.elem, i);
GenRegister src = ra->genReg(insn.src(i));
p->curr.quarterControl = 0;
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 8478616..1258e54 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -575,10 +575,10 @@ namespace gbe
void BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, uint32_t bti);
/*! DWord scatter (for constant cache read) */
void DWORD_GATHER(Reg dst, Reg addr, uint32_t bti);
- /*! Unpack the uint to char4 */
- void UNPACK_BYTE(const GenRegister *dst, const GenRegister src, uint32_t elemNum);
- /*! pack the char4 to uint */
- void PACK_BYTE(const GenRegister dst, const GenRegister *src, uint32_t elemNum);
+ /*! Unpack the uint to charN */
+ void UNPACK_BYTE(const GenRegister *dst, const GenRegister src, uint32_t elemSize, uint32_t elemNum);
+ /*! pack the charN to uint */
+ void PACK_BYTE(const GenRegister dst, const GenRegister *src, uint32_t elemSize, uint32_t elemNum);
/*! Extended math function (2 arguments) */
void MATH(Reg dst, uint32_t function, Reg src0, Reg src1);
/*! Extended math function (1 argument) */
@@ -1255,16 +1255,18 @@ namespace gbe
srcVector->reg = &insn->src(0);
}
- void Selection::Opaque::UNPACK_BYTE(const GenRegister *dst, const GenRegister src, uint32_t elemNum) {
+ void Selection::Opaque::UNPACK_BYTE(const GenRegister *dst, const GenRegister src, uint32_t elemSize, uint32_t elemNum) {
SelectionInstruction *insn = this->appendInsn(SEL_OP_UNPACK_BYTE, elemNum, 1);
insn->src(0) = src;
+ insn->extra.elem = 4 / elemSize;
for(uint32_t i = 0; i < elemNum; i++)
insn->dst(i) = dst[i];
}
- void Selection::Opaque::PACK_BYTE(const GenRegister dst, const GenRegister *src, uint32_t elemNum) {
+ void Selection::Opaque::PACK_BYTE(const GenRegister dst, const GenRegister *src, uint32_t elemSize, uint32_t elemNum) {
SelectionInstruction *insn = this->appendInsn(SEL_OP_PACK_BYTE, 1, elemNum);
for(uint32_t i = 0; i < elemNum; i++)
insn->src(i) = src[i];
+ insn->extra.elem = 4 / elemSize;
insn->dst(0) = dst;
}
@@ -2862,9 +2864,7 @@ namespace gbe
for(uint32_t i = 0; i < valueNum; i++)
dst[i] = sel.selReg(insn.getValue(i), getType(family));
- uint32_t tmpRegNum = typeSize*valueNum / 4;
- if (tmpRegNum == 0)
- tmpRegNum = 1;
+ uint32_t tmpRegNum = (typeSize*valueNum + 3) / 4;
vector<GenRegister> tmp(tmpRegNum);
vector<GenRegister> tmp2(tmpRegNum);
vector<Register> tmpReg(tmpRegNum);
@@ -2875,15 +2875,10 @@ namespace gbe
readDWord(sel, tmp, tmp2, address, tmpRegNum, insn.getAddressSpace(), bti);
- if (valueNum > 1) {
- for(uint32_t i = 0; i < tmpRegNum; i++)
- sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, tmp[i], 4/typeSize);
- }
- else {
- if (elemSize == GEN_BYTE_SCATTER_WORD)
- sel.MOV(GenRegister::retype(dst[0], GEN_TYPE_UW), sel.unpacked_uw(tmpReg[0]));
- else if (elemSize == GEN_BYTE_SCATTER_BYTE)
- sel.MOV(GenRegister::retype(dst[0], GEN_TYPE_UB), sel.unpacked_ub(tmpReg[0]));
+ for(uint32_t i = 0; i < tmpRegNum; i++) {
+ unsigned int elemNum = (valueNum - i * (4 / typeSize)) > 4/typeSize ?
+ 4/typeSize : (valueNum - i * (4 / typeSize));
+ sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, tmp[i], typeSize, elemNum);
}
}
@@ -2948,7 +2943,7 @@ namespace gbe
for(uint32_t i = 0; i < valueNum; i++)
dst[i] = sel.selReg(insn.getValue(i), getType(family));
- uint32_t effectDataNum = typeSize*valueNum / 4;
+ uint32_t effectDataNum = (typeSize*valueNum + 3) / 4;
vector<GenRegister> tmp(effectDataNum + 1);
vector<GenRegister> tmp2(effectDataNum + 1);
vector<GenRegister> effectData(effectDataNum);
@@ -2986,7 +2981,9 @@ namespace gbe
getEffectByteData(sel, effectData, tmp, effectDataNum, address, simdWidth);
for(uint32_t i = 0; i < effectDataNum; i++) {
- sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, effectData[i], 4/typeSize);
+ unsigned int elemNum = (valueNum - i * (4 / typeSize)) > 4/typeSize ?
+ 4/typeSize : (valueNum - i * (4 / typeSize));
+ sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, effectData[i], typeSize, elemNum);
}
} else {
GBE_ASSERT(insn.getValueNum() == 1);
@@ -3148,7 +3145,7 @@ namespace gbe
vector<GenRegister> tmp(tmpRegNum);
for(uint32_t i = 0; i < tmpRegNum; i++) {
tmp[i] = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
- sel.PACK_BYTE(tmp[i], value.data() + i * 4/typeSize, 4/typeSize);
+ sel.PACK_BYTE(tmp[i], value.data() + i * 4/typeSize, typeSize, 4/typeSize);
}
sel.UNTYPED_WRITE(addr, tmp.data(), tmpRegNum, bti);
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index b956bc6..8f0d5c2 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -3515,7 +3515,8 @@ handle_write_image:
emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding, dwAligned);
}
}
- else if((dataFamily==ir::FAMILY_WORD && elemNum%2==0) || (dataFamily == ir::FAMILY_BYTE && elemNum%4 == 0)) {
+ else if((dataFamily == ir::FAMILY_WORD && (isLoad || elemNum % 2 == 0)) ||
+ (dataFamily == ir::FAMILY_BYTE && (isLoad || elemNum % 4 == 0))) {
emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding, dwAligned);
} else {
for (uint32_t elemID = 0; elemID < elemNum; elemID++) {
diff --git a/backend/src/llvm/llvm_loadstore_optimization.cpp b/backend/src/llvm/llvm_loadstore_optimization.cpp
index 19726b0..ae91af7 100644
--- a/backend/src/llvm/llvm_loadstore_optimization.cpp
+++ b/backend/src/llvm/llvm_loadstore_optimization.cpp
@@ -259,8 +259,7 @@ namespace gbe {
while(size > 1) {
unsigned vecSize = (size >= 16) ? 16 :
(size >= 8 ? 8 :
- (size >= 4 ? 4 :
- (size >= 2 ? 2 : size)));
+ (size >= 4 ? 4 : size));
SmallVector<Instruction*, 16> mergedVec(merged.begin() + pos, merged.begin() + pos + vecSize);
if(isLoad)
mergeLoad(BB, mergedVec);
--
1.8.3.2
More information about the Beignet
mailing list