[Beignet] [Patch V2 2/2] Chv: Add chv backend support.
Yang Rong
rong.r.yang at intel.com
Wed Apr 29 00:35:15 PDT 2015
The chv backend is almost the same as bdw's, but chv has some restrictions on long (64-bit) registers:
1. ARF registers must never be used with a 64b datatype.
2. Source and Destination horizontal stride must be aligned to the same qword.
3. Source and Destination offset must be the same, except in the case of a scalar source.
Add ChvContext in gen8_context.cpp to handle them. The chv encoder is the same as Gen8Encoder.
Signed-off-by: Yang Rong <rong.r.yang at intel.com>
---
backend/src/backend/gen8_context.cpp | 124 +++++++++++++++++++++++++++--
backend/src/backend/gen8_context.hpp | 23 ++++++
backend/src/backend/gen_insn_selection.cpp | 47 ++++++++++-
backend/src/backend/gen_insn_selection.hpp | 7 ++
backend/src/backend/gen_program.cpp | 2 +-
5 files changed, 193 insertions(+), 10 deletions(-)
diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp
index 2cdb248..2346f2a 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -328,7 +328,7 @@ namespace gbe
return GenRegister::unpacked_ud(reg.nr, reg.subnr + offset);
}
- static void calculateFullU64MUL(GenEncoder* p, GenRegister src0, GenRegister src1, GenRegister dst_h,
+ void Gen8Context::calculateFullU64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
GenRegister dst_l, GenRegister s0l_s1h, GenRegister s0h_s1l)
{
src0.type = src1.type = GEN_TYPE_UD;
@@ -374,7 +374,7 @@ namespace gbe
p->ADD(dst_h, dst_h, tmp);
}
- static void calculateFullS64MUL(GenEncoder* p, GenRegister src0, GenRegister src1, GenRegister dst_h,
+ void Gen8Context::calculateFullS64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
GenRegister dst_l, GenRegister s0_abs, GenRegister s1_abs,
GenRegister tmp0, GenRegister tmp1, GenRegister sign, GenRegister flagReg)
{
@@ -392,7 +392,7 @@ namespace gbe
s0_abs.type = s1_abs.type = GEN_TYPE_L;
p->MOV(s0_abs, GenRegister::abs(src0));
p->MOV(s1_abs, GenRegister::abs(src1));
- calculateFullU64MUL(p, s0_abs, s1_abs, dst_h, dst_l, tmp0, tmp1);
+ calculateFullU64MUL(s0_abs, s1_abs, dst_h, dst_l, tmp0, tmp1);
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
@@ -429,11 +429,11 @@ namespace gbe
if(src0.type == GEN_TYPE_UL) {
GBE_ASSERT(src1.type == GEN_TYPE_UL);
- calculateFullU64MUL(p, src0, src1, dst_h, dst_l, tmp0, tmp1);
+ calculateFullU64MUL(src0, src1, dst_h, dst_l, tmp0, tmp1);
} else {
GBE_ASSERT(src0.type == GEN_TYPE_L);
GBE_ASSERT(src1.type == GEN_TYPE_L);
- calculateFullS64MUL(p, src0, src1, dst_h, dst_l, s0_abs, s1_abs, tmp0,
+ calculateFullS64MUL(src0, src1, dst_h, dst_l, s0_abs, s1_abs, tmp0,
tmp1, sign, flagReg);
}
}
@@ -458,7 +458,7 @@ namespace gbe
GBE_ASSERT(src2.type == GEN_TYPE_UL);
dst_l.type = dst_h.type = GEN_TYPE_UL;
tmp0.type = tmp1.type = GEN_TYPE_UL;
- calculateFullU64MUL(p, src0, src1, dst_h, dst_l, tmp0, tmp1);
+ calculateFullU64MUL(src0, src1, dst_h, dst_l, tmp0, tmp1);
/* Inplement the logic:
dst_l += src2;
@@ -493,7 +493,7 @@ namespace gbe
GBE_ASSERT(src1.type == GEN_TYPE_L);
GBE_ASSERT(src2.type == GEN_TYPE_L);
- calculateFullS64MUL(p, src0, src1, dst_h, dst_l, s0_abs, s1_abs, tmp0,
+ calculateFullS64MUL(src0, src1, dst_h, dst_l, s0_abs, s1_abs, tmp0,
tmp1, sign, flagReg);
GenRegister sum = sign;
@@ -876,4 +876,114 @@ namespace gbe
}
p->pop();
}
+
+ void ChvContext::newSelection(void) { // installs the CHV-specific instruction selection for this context
+ this->sel = GBE_NEW(SelectionChv, *this); // SelectionChv's constructor turns on the long-register restriction
+ }
+
+ void ChvContext::calculateFullU64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
+ GenRegister dst_l, GenRegister s0l_s1h, GenRegister s0h_s1l)
+ {
+ src0.type = src1.type = GEN_TYPE_UD;
+ dst_h.type = dst_l.type = GEN_TYPE_UL;
+ s0l_s1h.type = s0h_s1l.type = GEN_TYPE_UL;
+ // CHV override of the full 64x64 unsigned multiply: the high 64 bits of the product end up in dst_h, the low 64 bits in dst_l.
+ // s0l_s1h and s0h_s1l are scratch registers supplied by the caller; both are overwritten.
+ // NOTE(review): the high dwords are presumably staged at dword slot 0 so the MULs meet CHV's same-offset rule -- confirm.
+ GenRegister s0l = unpacked_ud(src0);
+ GenRegister s1l = unpacked_ud(src1);
+ GenRegister s0h = unpacked_ud(s0l_s1h); //s0h only used before s0l_s1h, reuse s0l_s1h
+ GenRegister s1h = unpacked_ud(dst_l); //s1h only used before dst_l, reuse dst_l
+
+ p->MOV(s0h, GenRegister::offset(s0l, 0, 4)); // s0h <- dword at sub-offset 4 of s0l (presumably src0's high 32 bits)
+ p->MOV(s1h, GenRegister::offset(s1l, 0, 4)); // s1h <- dword at sub-offset 4 of s1l (presumably src1's high 32 bits)
+
+ /* High 32 bits X High 32 bits. */
+ p->MUL(dst_h, s0h, s1h);
+ /* High 32 bits X low 32 bits. */
+ p->MUL(s0h_s1l, s0h, s1l);
+ /* Low 32 bits X high 32 bits. */
+ p->MUL(s0l_s1h, s0l, s1h);
+ /* Low 32 bits X low 32 bits. */
+ p->MUL(dst_l, s0l, s1l);
+
+ /* The largest 32x32 product is (2^N - 1) * (2^N - 1) = 2^2N - 2^(N+1) + 1, with N = 32.
+ Adding two 32-bit integers to it gives at most
+ 2^2N - 2^(N+1) + 1 + 2*(2^N - 1) = 2^2N - 1,
+ so adding the low 32 bits of s0l_s1h and then the high 32 bits of dst_l to the product s0h_s1l
+ cannot overflow 64 bits and produces no carry.
+ This lets us avoid the acc register, which has a lot of restrictions. */
+
+ GenRegister s0l_s1h_l = unpacked_ud(s0l_s1h);
+ p->ADD(s0h_s1l, s0h_s1l, s0l_s1h_l);
+
+ p->SHR(s0l_s1h, s0l_s1h, GenRegister::immud(32)); // keep only the upper 32 bits of s0l*s1h
+ GenRegister s0l_s1h_h = unpacked_ud(s0l_s1h);
+ p->ADD(dst_h, dst_h, s0l_s1h_h);
+
+ GenRegister dst_l_h = unpacked_ud(s0l_s1h); // s0l_s1h has been consumed; reuse it as scratch for dst_l's high dword
+ p->MOV(dst_l_h, unpacked_ud(dst_l, 1));
+ p->ADD(s0h_s1l, s0h_s1l, dst_l_h);
+
+ // No longer need s0l_s1h
+ GenRegister tmp = s0l_s1h;
+
+ p->SHL(tmp, s0h_s1l, GenRegister::immud(32)); // low 32 bits of the middle sum, moved into tmp's upper dword
+ GenRegister tmp_unpacked = unpacked_ud(tmp, 1);
+ p->MOV(unpacked_ud(dst_l, 1), tmp_unpacked); // ...which replaces the high dword of dst_l
+
+ p->SHR(tmp, s0h_s1l, GenRegister::immud(32)); // upper 32 bits of the middle sum carry into dst_h
+ p->ADD(dst_h, dst_h, tmp);
+ }
+
+ void ChvContext::emitI64MULInstruction(const SelectionInstruction &insn)
+ {
+ GenRegister src0 = ra->genReg(insn.src(0));
+ GenRegister src1 = ra->genReg(insn.src(1));
+ GenRegister dst = ra->genReg(insn.dst(0));
+ GenRegister res = ra->genReg(insn.dst(1));
+ // Emits dst = s0l*s1l + ((s0l*s1h) << 32) + ((s0h*s1l) << 32), i.e. the low 64 bits of src0*src1; res (dst(1)) is scratch.
+ src0.type = src1.type = GEN_TYPE_UD;
+ dst.type = GEN_TYPE_UL;
+ res.type = GEN_TYPE_UL;
+
+ /* Low 32 bits X low 32 bits: the only partial product added to dst without a shift. */
+ GenRegister s0l = unpacked_ud(src0);
+ GenRegister s1l = unpacked_ud(src1);
+ p->MUL(dst, s0l, s1l);
+
+ /* Low 32 bits X high 32 bits. */
+ GenRegister s1h = unpacked_ud(res); // stage src1's dword 1 in res before multiplying
+ p->MOV(s1h, unpacked_ud(src1, 1));
+
+ p->MUL(res, s0l, s1h); // res is overwritten here, so s1h must not be read afterwards
+ p->SHL(res, res, GenRegister::immud(32)); // only the low 32 bits of this product reach the 64-bit result
+ p->ADD(dst, dst, res);
+
+ /* High 32 bits X low 32 bits. */
+ GenRegister s0h = unpacked_ud(res); // res is free again; stage src0's dword 1 in it
+ p->MOV(s0h, unpacked_ud(src0, 1));
+
+ p->MUL(res, s0h, s1l);
+ p->SHL(res, res, GenRegister::immud(32));
+ p->ADD(dst, dst, res);
+ }
+
+ void ChvContext::setA0Content(uint16_t new_a0[16], uint16_t max_offset, int sz) { // NOTE(review): max_offset is never read in this override
+ if (sz == 0) // a size of 0 selects the default of 8 entries
+ sz = 8;
+ GBE_ASSERT(sz%4 == 0);
+ GBE_ASSERT(new_a0[0] >= 0 && new_a0[0] < 4096); // NOTE(review): new_a0[0] is uint16_t, so the ">= 0" half is always true
+ // Write the entries in pairs: two 16-bit values are packed into one 32-bit immediate and stored with a UD-typed MOV.
+ p->push();
+ p->curr.execWidth = 1;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ for (int i = 0; i < sz/2; i++) { // one MOV per pair of entries
+ p->MOV(GenRegister::retype(GenRegister::addr1(i*2), GEN_TYPE_UD), // presumably UD (not 64-bit) because of CHV's ARF rule -- confirm
+ GenRegister::immud(new_a0[i*2 + 1] << 16 | new_a0[i*2])); // NOTE(review): operands promote to int before "<< 16"
+ }
+ p->pop();
+ }
+
}
diff --git a/backend/src/backend/gen8_context.hpp b/backend/src/backend/gen8_context.hpp
index b296a3d..8827955 100644
--- a/backend/src/backend/gen8_context.hpp
+++ b/backend/src/backend/gen8_context.hpp
@@ -82,6 +82,29 @@ namespace gbe
virtual void newSelection(void);
void packLongVec(GenRegister unpacked, GenRegister packed, uint32_t simd);
void unpackLongVec(GenRegister packed, GenRegister unpacked, uint32_t simd);
+ void calculateFullS64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
+ GenRegister dst_l, GenRegister s0_abs, GenRegister s1_abs,
+ GenRegister tmp0, GenRegister tmp1, GenRegister sign, GenRegister flagReg);
+ virtual void calculateFullU64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
+ GenRegister dst_l, GenRegister s0l_s1h, GenRegister s0h_s1l);
+ };
+
+ class ChvContext : public Gen8Context // Gen8 context specialised for Cherryview (selected via IS_CHERRYVIEW in gen_program.cpp)
+ {
+ public:
+ virtual ~ChvContext(void) { }
+ ChvContext(const ir::Unit &unit, const std::string &name, uint32_t deviceID, bool relaxMath = false)
+ : Gen8Context(unit, name, deviceID, relaxMath) { // all arguments are forwarded unchanged to Gen8Context
+ };
+ virtual void emitI64MULInstruction(const SelectionInstruction &insn); // 64-bit multiply built from 32x32 partial products
+ // Overrides below replace the Gen8Context versions for CHV.
+ protected:
+ virtual void setA0Content(uint16_t new_a0[16], uint16_t max_offset = 0, int sz = 0); // writes a0 with 32-bit (UD) MOVs
+ // newSelection creates a SelectionChv; calculateFullU64MUL computes the full 128-bit product into dst_h/dst_l.
+ private:
+ virtual void newSelection(void);
+ virtual void calculateFullU64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
+ GenRegister dst_l, GenRegister s0l_s1h, GenRegister s0h_s1l);
 };
}
#endif /* __GBE_GEN8_CONTEXT_HPP__ */
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 19a3c24..9e15ae0 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -361,6 +361,8 @@ namespace gbe
void setHas32X32Mul(bool b) { bHas32X32Mul = b; }
bool hasLongType() const { return bHasLongType; }
void setHasLongType(bool b) { bHasLongType = b; }
+ bool hasLongRegRestrict() const { return bLongRegRestrict; } /*! true when 64-bit operands must obey the long-register restrictions (set for CHV) */
+ void setLongRegRestrict(bool b) { bLongRegRestrict = b; } /*! enable/disable the long-register restriction; defaults to false in the constructor */
void setLdMsgOrder(uint32_t type) { ldMsgOrder = type; }
uint32_t getLdMsgOrder() const { return ldMsgOrder; }
/*! indicate whether a register is a scalar/uniform register. */
@@ -720,6 +722,7 @@ namespace gbe
uint32_t currAuxLabel;
bool bHas32X32Mul;
bool bHasLongType;
+ bool bLongRegRestrict;
uint32_t ldMsgOrder;
INLINE ir::LabelIndex newAuxLabel()
{
@@ -760,7 +763,7 @@ namespace gbe
curr(ctx.getSimdWidth()), file(ctx.getFunction().getRegisterFile()),
maxInsnNum(ctx.getFunction().getLargestBlockSize()), dagPool(maxInsnNum),
stateNum(0), vectorNum(0), bwdCodeGeneration(false), currAuxLabel(ctx.getFunction().labelNum()),
- bHas32X32Mul(false), bHasLongType(false), ldMsgOrder(LD_MSG_ORDER_IVB)
+ bHas32X32Mul(false), bHasLongType(false), bLongRegRestrict(false), ldMsgOrder(LD_MSG_ORDER_IVB)
{
const ir::Function &fn = ctx.getFunction();
this->regNum = fn.regNum();
@@ -1918,6 +1921,12 @@ namespace gbe
this->opaque->setHasLongType(true);
}
+ SelectionChv::SelectionChv(GenContext &ctx) : Selection(ctx) { // selection setup used by ChvContext::newSelection
+ this->opaque->setHas32X32Mul(true);
+ this->opaque->setHasLongType(true);
+ this->opaque->setLongRegRestrict(true); // enables the hasLongRegRestrict() conversion path added by this patch
+ }
+
Selection9::Selection9(GenContext &ctx) : Selection(ctx) {
this->opaque->setHas32X32Mul(true);
this->opaque->setHasLongType(true);
@@ -4137,7 +4146,41 @@ namespace gbe
sel.MOV(dst, unpacked);
}
}
- } else if ((dst.isdf() && srcType == ir::TYPE_FLOAT) ||
+ } else if (sel.hasLongType() && sel.hasLongRegRestrict() && dstFamily == FAMILY_QWORD && srcFamily != FAMILY_QWORD) {
+ // Convert i32/i16/i8/float to i64/double if hasLongRegRestrict(src and dst hstride must be aligned to the same qword).
+ GenRegister unpacked;
+ GenRegister unpacked_src = src;
+
+ sel.push();
+ if (sel.isScalarReg(insn.getSrc(0))) {
+ sel.curr.execWidth = 1;
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ }
+
+ if(srcType == ir::TYPE_FLOAT) {
+ unpacked = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+ unpacked = GenRegister::retype(unpacked, GEN_TYPE_F);
+ } else if(srcFamily == FAMILY_DWORD) {
+ unpacked = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+ unpacked = GenRegister::retype(unpacked, dstType == TYPE_U64 ? GEN_TYPE_UD : GEN_TYPE_D);
+ } else if(srcFamily == FAMILY_WORD) {
+ unpacked = sel.unpacked_uw(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+ unpacked = GenRegister::retype(unpacked, dstType == TYPE_U64 ? GEN_TYPE_UW : GEN_TYPE_W);
+ } else if(srcFamily == FAMILY_BYTE) {
+ GenRegister tmp = sel.selReg(sel.reg(FAMILY_WORD, sel.isScalarReg(insn.getSrc(0))));
+ tmp = GenRegister::retype(tmp, dstType == TYPE_U64 ? GEN_TYPE_UW : GEN_TYPE_W);
+ unpacked = sel.unpacked_uw(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+ unpacked = GenRegister::retype(unpacked, dstType == TYPE_U64 ? GEN_TYPE_UW : GEN_TYPE_W);
+ sel.MOV(tmp, src);
+ unpacked_src = tmp;
+ } else
+ GBE_ASSERT(0);
+
+ sel.MOV(unpacked, unpacked_src);
+ sel.pop();
+ sel.MOV(dst, unpacked);
+ }else if ((dst.isdf() && srcType == ir::TYPE_FLOAT) ||
(src.isdf() && dstType == ir::TYPE_FLOAT)) { // float and double conversion
ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
sel.MOV_DF(dst, src, sel.selReg(r, TYPE_U64));
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index 7c9bce5..dee35bb 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -293,6 +293,13 @@ namespace gbe
Selection8(GenContext &ctx);
};
+ class SelectionChv: public Selection // instruction selection variant created by ChvContext
+ {
+ public:
+ /*! Initialize the selection with 32x32 mul, long type and the long-register restriction enabled */
+ SelectionChv(GenContext &ctx);
+ };
+
class Selection9: public Selection
{
public:
diff --git a/backend/src/backend/gen_program.cpp b/backend/src/backend/gen_program.cpp
index f53d5fb..c761a2f 100644
--- a/backend/src/backend/gen_program.cpp
+++ b/backend/src/backend/gen_program.cpp
@@ -167,7 +167,7 @@ namespace gbe {
} else if (IS_BROADWELL(deviceID)) {
ctx = GBE_NEW(Gen8Context, unit, name, deviceID, relaxMath);
} else if (IS_CHERRYVIEW(deviceID)) {
- ctx = GBE_NEW(Gen8Context, unit, name, deviceID, relaxMath);
+ ctx = GBE_NEW(ChvContext, unit, name, deviceID, relaxMath);
} else if (IS_SKYLAKE(deviceID)) {
ctx = GBE_NEW(Gen9Context, unit, name, deviceID, relaxMath);
}
--
1.8.3.2
More information about the Beignet
mailing list