[Beignet] [PATCH 2/2] Use the Byte Gather after HSW when read byte/short.
Zou, Nanhai
nanhai.zou at intel.com
Mon Jun 15 16:00:52 PDT 2015
Should the unaligned optimization we did in vload/vstore also go away after HSW?
Thanks
Zou Nanhai
> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> Yang Rong
> Sent: Monday, June 15, 2015 2:46 PM
> To: beignet at lists.freedesktop.org
> Cc: Yang, Rong R
> Subject: [Beignet] [PATCH 2/2] Use the Byte Gather after HSW when read
> byte/short.
>
> After HSW, the byte gather performance issue is gone, so there is no need to
> read a dword and extract the byte/short from it.
> For multi-dst loads, combining the reads reduces the address calculation but
> still requires extracting each dst, so performance is probably about the same;
> those cases keep the old logic.
>
> Signed-off-by: Yang Rong <rong.r.yang at intel.com>
> ---
> backend/src/backend/gen_insn_selection.cpp | 36
> ++++++++++++++++++++++++++++--
> 1 file changed, 34 insertions(+), 2 deletions(-)
>
> diff --git a/backend/src/backend/gen_insn_selection.cpp
> b/backend/src/backend/gen_insn_selection.cpp
> index d63c7e3..d289e8e 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -365,6 +365,8 @@ namespace gbe
> void setLongRegRestrict(bool b) { bLongRegRestrict = b; }
> void setLdMsgOrder(uint32_t type) { ldMsgOrder = type; }
> uint32_t getLdMsgOrder() const { return ldMsgOrder; }
> + void setSlowByteGather(bool b) { slowByteGather = b; }
> + bool getSlowByteGather() { return slowByteGather; }
> /*! indicate whether a register is a scalar/uniform register. */
> INLINE bool isPartialWrite(const ir::Register &reg) const {
> return partialWriteRegs.find(reg.value()) != partialWriteRegs.end();
> @@ -740,6 +742,7 @@ namespace gbe
> bool bHasLongType;
> bool bLongRegRestrict;
> uint32_t ldMsgOrder;
> + bool slowByteGather;
> INLINE ir::LabelIndex newAuxLabel()
> {
> currAuxLabel++;
> @@ -779,7 +782,8 @@ namespace gbe
> curr(ctx.getSimdWidth()), file(ctx.getFunction().getRegisterFile()),
> maxInsnNum(ctx.getFunction().getLargestBlockSize()),
> dagPool(maxInsnNum),
> stateNum(0), vectorNum(0), bwdCodeGeneration(false),
> currAuxLabel(ctx.getFunction().labelNum()),
> - bHas32X32Mul(false), bHasLongType(false), bLongRegRestrict(false),
> ldMsgOrder(LD_MSG_ORDER_IVB)
> + bHas32X32Mul(false), bHasLongType(false), bLongRegRestrict(false),
> ldMsgOrder(LD_MSG_ORDER_IVB),
> + slowByteGather(false)
> {
> const ir::Function &fn = ctx.getFunction();
> this->regNum = fn.regNum();
> @@ -2025,26 +2029,31 @@ namespace gbe
> Selection::Selection(GenContext &ctx) {
> this->blockList = NULL;
> this->opaque = GBE_NEW(Selection::Opaque, ctx);
> + this->opaque->setSlowByteGather(true);
> }
>
> Selection75::Selection75(GenContext &ctx) : Selection(ctx) {
> + this->opaque->setSlowByteGather(false);
> }
>
> Selection8::Selection8(GenContext &ctx) : Selection(ctx) {
> this->opaque->setHas32X32Mul(true);
> this->opaque->setHasLongType(true);
> + this->opaque->setSlowByteGather(false);
> }
>
> SelectionChv::SelectionChv(GenContext &ctx) : Selection(ctx) {
> this->opaque->setHas32X32Mul(true);
> this->opaque->setHasLongType(true);
> this->opaque->setLongRegRestrict(true);
> + this->opaque->setSlowByteGather(false);
> }
>
> Selection9::Selection9(GenContext &ctx) : Selection(ctx) {
> this->opaque->setHas32X32Mul(true);
> this->opaque->setHasLongType(true);
> this->opaque->setLdMsgOrder(LD_MSG_ORDER_SKL);
> + this->opaque->setSlowByteGather(false);
> }
>
> void Selection::Opaque::TYPED_WRITE(GenRegister *msgs, uint32_t
> msgNum, @@ -3519,8 +3528,31 @@ namespace gbe
> GBE_ASSERT(insn.getValueNum() == 1);
> const GenRegister value = sel.selReg(insn.getValue(0),
> insn.getValueType());
> GBE_ASSERT(elemSize == GEN_BYTE_SCATTER_WORD || elemSize
> == GEN_BYTE_SCATTER_BYTE);
> + if(sel.getSlowByteGather())
> + readByteAsDWord(sel, elemSize, address, value, isUniform, bti);
> + else {
> + GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) :
> sel.selReg(bti.reg, ir::TYPE_U32);
> + GenRegister tmpFlag = sel.selReg(sel.reg(ir::FAMILY_WORD,
> + true), ir::TYPE_U16);
> +
> + // We need a temporary register if we read bytes or words
> + Register dst = sel.reg(FAMILY_DWORD, isUniform);
> + sel.push();
> + if (isUniform)
> + sel.curr.noMask = 1;
> + sel.BYTE_GATHER(sel.selReg(dst, ir::TYPE_U32), address,
> elemSize, b, bti.isConst ? NULL : & tmpFlag);
> + sel.pop();
>
> - readByteAsDWord(sel, elemSize, address, value, isUniform, bti);
> + sel.push();
> + if (isUniform) {
> + sel.curr.noMask = 1;
> + sel.curr.execWidth = 1;
> + }
> + if (elemSize == GEN_BYTE_SCATTER_WORD)
> + sel.MOV(GenRegister::retype(value, GEN_TYPE_UW),
> GenRegister::unpacked_uw(dst));
> + else if (elemSize == GEN_BYTE_SCATTER_BYTE)
> + sel.MOV(GenRegister::retype(value, GEN_TYPE_UB),
> GenRegister::unpacked_ub(dst));
> + sel.pop();
> + }
> }
> }
>
> --
> 1.8.3.2
>
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet
More information about the Beignet
mailing list