[Beignet] [PATCH 2/2] Use the Byte Gather after HSW when read byte/shor.
Yang, Rong R
rong.r.yang at intel.com
Mon Jun 15 20:34:25 PDT 2015
Yes. With the merged vector-load optimization, an aligned byte/short vector load may perform about the same as a split load, whereas for an unaligned byte/short vector load the split load may be better than the merged load.
I will send a new patch to handle unaligned byte/short vector loads.
> -----Original Message-----
> From: Zou, Nanhai
> Sent: Tuesday, June 16, 2015 07:01
> To: Yang, Rong R; beignet at lists.freedesktop.org
> Cc: Yang, Rong R
> Subject: RE: [Beignet] [PATCH 2/2] Use the Byte Gather after HSW when
> read byte/shor.
>
> Should the unaligned optimization we did in vload/vstore also go away after
> HSW?
>
> Thanks
> Zou Nanhai
>
> > -----Original Message-----
> > From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf
> > Of Yang Rong
> > Sent: Monday, June 15, 2015 2:46 PM
> > To: beignet at lists.freedesktop.org
> > Cc: Yang, Rong R
> > Subject: [Beignet] [PATCH 2/2] Use the Byte Gather after HSW when read
> > byte/shor.
> >
> > After HSW, the byte gather's performance issue is gone, so there is no
> > need to read a dword and extract the byte/short from it.
> > For multi-dst loads, however, combining the loads reduces the address
> > calculation but still requires extracting each dst, so performance is
> > probably about the same; those loads still use the old logic.
> >
> > Signed-off-by: Yang Rong <rong.r.yang at intel.com>
> > ---
> > backend/src/backend/gen_insn_selection.cpp | 36
> > ++++++++++++++++++++++++++++--
> > 1 file changed, 34 insertions(+), 2 deletions(-)
> >
> > diff --git a/backend/src/backend/gen_insn_selection.cpp
> > b/backend/src/backend/gen_insn_selection.cpp
> > index d63c7e3..d289e8e 100644
> > --- a/backend/src/backend/gen_insn_selection.cpp
> > +++ b/backend/src/backend/gen_insn_selection.cpp
> > @@ -365,6 +365,8 @@ namespace gbe
> > void setLongRegRestrict(bool b) { bLongRegRestrict = b; }
> > void setLdMsgOrder(uint32_t type) { ldMsgOrder = type; }
> > uint32_t getLdMsgOrder() const { return ldMsgOrder; }
> > + void setSlowByteGather(bool b) { slowByteGather = b; }
> > + bool getSlowByteGather() { return slowByteGather; }
> > /*! indicate whether a register is a scalar/uniform register. */
> > INLINE bool isPartialWrite(const ir::Register &reg) const {
> > return partialWriteRegs.find(reg.value()) != partialWriteRegs.end();
> > @@ -740,6 +742,7 @@ namespace gbe
> > bool bHasLongType;
> > bool bLongRegRestrict;
> > uint32_t ldMsgOrder;
> > + bool slowByteGather;
> > INLINE ir::LabelIndex newAuxLabel()
> > {
> > currAuxLabel++;
> > @@ -779,7 +782,8 @@ namespace gbe
> > curr(ctx.getSimdWidth()), file(ctx.getFunction().getRegisterFile()),
> > maxInsnNum(ctx.getFunction().getLargestBlockSize()),
> > dagPool(maxInsnNum),
> > stateNum(0), vectorNum(0), bwdCodeGeneration(false),
> > currAuxLabel(ctx.getFunction().labelNum()),
> > - bHas32X32Mul(false), bHasLongType(false), bLongRegRestrict(false),
> > ldMsgOrder(LD_MSG_ORDER_IVB)
> > + bHas32X32Mul(false), bHasLongType(false),
> > + bLongRegRestrict(false),
> > ldMsgOrder(LD_MSG_ORDER_IVB),
> > + slowByteGather(false)
> > {
> > const ir::Function &fn = ctx.getFunction();
> > this->regNum = fn.regNum();
> > @@ -2025,26 +2029,31 @@ namespace gbe
> > Selection::Selection(GenContext &ctx) {
> > this->blockList = NULL;
> > this->opaque = GBE_NEW(Selection::Opaque, ctx);
> > + this->opaque->setSlowByteGather(true);
> > }
> >
> > Selection75::Selection75(GenContext &ctx) : Selection(ctx) {
> > + this->opaque->setSlowByteGather(false);
> > }
> >
> > Selection8::Selection8(GenContext &ctx) : Selection(ctx) {
> > this->opaque->setHas32X32Mul(true);
> > this->opaque->setHasLongType(true);
> > + this->opaque->setSlowByteGather(false);
> > }
> >
> > SelectionChv::SelectionChv(GenContext &ctx) : Selection(ctx) {
> > this->opaque->setHas32X32Mul(true);
> > this->opaque->setHasLongType(true);
> > this->opaque->setLongRegRestrict(true);
> > + this->opaque->setSlowByteGather(false);
> > }
> >
> > Selection9::Selection9(GenContext &ctx) : Selection(ctx) {
> > this->opaque->setHas32X32Mul(true);
> > this->opaque->setHasLongType(true);
> > this->opaque->setLdMsgOrder(LD_MSG_ORDER_SKL);
> > + this->opaque->setSlowByteGather(false);
> > }
> >
> > void Selection::Opaque::TYPED_WRITE(GenRegister *msgs, uint32_t msgNum,
> > @@ -3519,8 +3528,31 @@ namespace gbe
> > GBE_ASSERT(insn.getValueNum() == 1);
> > const GenRegister value = sel.selReg(insn.getValue(0),
> > insn.getValueType());
> > GBE_ASSERT(elemSize == GEN_BYTE_SCATTER_WORD || elemSize ==
> > GEN_BYTE_SCATTER_BYTE);
> > + if(sel.getSlowByteGather())
> > + readByteAsDWord(sel, elemSize, address, value, isUniform, bti);
> > + else {
> > + GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) :
> > sel.selReg(bti.reg, ir::TYPE_U32);
> > + GenRegister tmpFlag = sel.selReg(sel.reg(ir::FAMILY_WORD,
> > + true), ir::TYPE_U16);
> > +
> > + // We need a temporary register if we read bytes or words
> > + Register dst = sel.reg(FAMILY_DWORD, isUniform);
> > + sel.push();
> > + if (isUniform)
> > + sel.curr.noMask = 1;
> > + sel.BYTE_GATHER(sel.selReg(dst, ir::TYPE_U32), address,
> > elemSize, b, bti.isConst ? NULL : & tmpFlag);
> > + sel.pop();
> >
> > - readByteAsDWord(sel, elemSize, address, value, isUniform, bti);
> > + sel.push();
> > + if (isUniform) {
> > + sel.curr.noMask = 1;
> > + sel.curr.execWidth = 1;
> > + }
> > + if (elemSize == GEN_BYTE_SCATTER_WORD)
> > + sel.MOV(GenRegister::retype(value, GEN_TYPE_UW),
> > GenRegister::unpacked_uw(dst));
> > + else if (elemSize == GEN_BYTE_SCATTER_BYTE)
> > + sel.MOV(GenRegister::retype(value, GEN_TYPE_UB),
> > GenRegister::unpacked_ub(dst));
> > + sel.pop();
> > + }
> > }
> > }
> >
> > --
> > 1.8.3.2
> >
> > _______________________________________________
> > Beignet mailing list
> > Beignet at lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/beignet
More information about the Beignet
mailing list