[Beignet] [PATCH 2/2] [OCL20] gbe: add AtomicA64 instructions with stateless access.

Song, Ruiling ruiling.song at intel.com
Mon Dec 28 00:42:45 PST 2015


A64 messages are designed for stateless access.
"Stateless" means a memory access that does not specify a binding table index.
So some of the code in your patch makes things unclear.

Ruiling
> --- a/backend/src/backend/gen8_context.cpp
> +++ b/backend/src/backend/gen8_context.cpp
> @@ -1029,6 +1029,33 @@ namespace gbe
> 
>      p->UNTYPED_WRITEA64(addr, elemNum*2);
>    }
> +  void Gen8Context::emitAtomicA64Instruction(const SelectionInstruction
> &insn)
> +  {
> +    const GenRegister src = ra->genReg(insn.src(0));
> +    const GenRegister dst = ra->genReg(insn.dst(0));
> +    const uint32_t function = insn.extra.function;
> +    unsigned srcNum = insn.extra.elem;
> +
> +    const GenRegister bti = ra->genReg(insn.src(srcNum));
> +
> +    if (bti.file == GEN_IMMEDIATE_VALUE) {
> +      p->ATOMICA64(dst, function, src, bti, srcNum);
> +    } else {

Execution should never arrive here. An A64 message's address is always an absolute GPU virtual address.
The beforeMessage/afterMessage handling is meaningless for stateless access.
> +      GenRegister flagTemp = ra->genReg(insn.dst(1));
> +      GenRegister btiTmp = ra->genReg(insn.dst(2));
> +
> +      unsigned desc = p->generateAtomicMessageDesc(function, 0, srcNum);
> +
> +      unsigned jip0 = beforeMessage(insn, bti, flagTemp, btiTmp, desc);
> +      p->push();
> +        p->curr.predicate = GEN_PREDICATE_NORMAL;
> +        p->curr.useFlag(insn.state.flag, insn.state.subFlag);
> +        p->ATOMICA64(dst, function, src, GenRegister::addr1(0), srcNum);
> +      p->pop();
> +      afterMessage(insn, bti, flagTemp, btiTmp, jip0);
> +    }
> +  }
> +

> +
> +  void Gen8Encoder::ATOMICA64(GenRegister dst, uint32_t function,
> GenRegister src, GenRegister bti, uint32_t srcNum) {
> +    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> +
> +    this->setHeader(insn);
> +    insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
> +
> +    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
> +    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
> +    if (bti.file == GEN_IMMEDIATE_VALUE) {
> +      this->setSrc1(insn, GenRegister::immud(0));
> +      setAtomicA64MessageDesc(insn, function, bti.value.ud, srcNum);
> +    } else {

Same as above comment.
> +      this->setSrc1(insn, bti);
> +    }
> +  }
> +


> 
> +  void Selection::Opaque::ATOMICA64(Reg dst, uint32_t function,
> +                                 uint32_t msgPayload, vector<GenRegister> src,
> +                                 GenRegister bti,
> +                                 vector<GenRegister> temps) {
> +    unsigned dstNum = 1 + temps.size();
> +    SelectionInstruction *insn = this->appendInsn(SEL_OP_ATOMICA64,
> dstNum, msgPayload + 1);
> +

It seems that stateless messages don't use the flag register.
> +    if (bti.file != GEN_IMMEDIATE_VALUE) {
> +      insn->state.flag = 0;
> +      insn->state.subFlag = 1;
> +    }
> +
> +    insn->dst(0) = dst;
> +    if(temps.size()) {
> +      insn->dst(1) = temps[0];
> +      insn->dst(2) = temps[1];
> +    }
> +
> +    for (uint32_t elemID = 0; elemID < msgPayload; ++elemID)
> +      insn->src(elemID) = src[elemID];
> +    insn->src(msgPayload) = bti;
> +
> +    insn->extra.function = function;
> +    insn->extra.elem = msgPayload;
> +
> +    SelectionVector *vector = this->appendVector();
> +    vector->regNum = msgPayload; //bti not included in SelectionVector
> +    vector->offsetID = 0;
> +    vector->reg = &insn->src(0);
> +    vector->isSrc = 1;
> +  }
> +
>    void Selection::Opaque::EOT(void) { this->appendInsn(SEL_OP_EOT, 0, 0); }
>    void Selection::Opaque::NOP(void) { this->appendInsn(SEL_OP_NOP, 0, 0); }
>    void Selection::Opaque::WAIT(void) { this->appendInsn(SEL_OP_WAIT, 0,
> 0); }
> @@ -5481,6 +5517,76 @@ namespace gbe
>            this->opcodes.push_back(ir::Opcode(op));
>      }
> 
> +    /* Used to transform address from 64bit to 32bit, note as dataport
> messages
> +     * cannot accept scalar register, so here to convert to non-uniform
> +     * register here. */
> +    GenRegister convertU64ToU32(Selection::Opaque &sel,
> +                                GenRegister addr) const {
> +      GenRegister unpacked =
> GenRegister::retype(sel.unpacked_ud(addr.reg()), GEN_TYPE_UD);
> +      GenRegister dst = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32);
> +      sel.MOV(dst, unpacked);
> +      return dst;
> +    }
> +
> +    void untypedAtomicA64Stateless(Selection::Opaque &sel,
> +                              const ir::AtomicInstruction &insn,
> +                              unsigned msgPayload,
> +                              GenRegister dst,
> +                              GenRegister addr,
> +                              GenRegister src1,
> +                              GenRegister src2,
> +                              GenRegister bti) const {
> +      using namespace ir;
> +      GenRegister addrQ;
> +      const AtomicOps atomicOp = insn.getAtomicOpcode();
> +      GenAtomicOpCode genAtomicOp = (GenAtomicOpCode)atomicOp;
> +      unsigned addrBytes = typeSize(addr.type);
> +      GBE_ASSERT(msgPayload <= 3);
> +
> +      unsigned simdWidth = sel.curr.execWidth;
> +      AddressMode AM = insn.getAddressMode();
> +      if (addrBytes == 4) {
> +        addrQ = sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64);
> +        sel.MOV(addrQ, addr);
> +      } else {
> +        addrQ = addr;
> +      }
> +
> +      if (simdWidth == 8) {
> +        vector<GenRegister> msgs;
> +        msgs.push_back(addr);
> +        msgs.push_back(src1);
> +        msgs.push_back(src2);
> +        sel.ATOMICA64(dst, genAtomicOp, msgPayload, msgs, bti,
> sel.getBTITemps(AM));
> +      } else if (simdWidth == 16) {
> +        vector<GenRegister> msgs;
> +        for (unsigned k = 0; k < msgPayload; k++) {
> +          msgs.push_back(sel.selReg(sel.reg(ir::FAMILY_DWORD),
> ir::TYPE_U32));
> +        }
> +        sel.push();
> +        /* first quarter */
> +        sel.curr.execWidth = 8;
> +        sel.curr.quarterControl = GEN_COMPRESSION_Q1;
> +        sel.MOV(GenRegister::retype(msgs[0], GEN_TYPE_UL),
> GenRegister::Qn(addrQ, 0));
> +        if(msgPayload > 1)
> +          sel.MOV(GenRegister::Qn(msgs[1], 0), GenRegister::Qn(src1, 0));
> +        if(msgPayload > 2)
> +          sel.MOV(GenRegister::Qn(msgs[1], 1), GenRegister::Qn(src2, 0));
> +        sel.ATOMICA64(GenRegister::Qn(dst, 0), genAtomicOp, msgPayload,
> msgs, bti, sel.getBTITemps(AM));
> +
> +        /* second quarter */
> +        sel.curr.execWidth = 8;
> +        sel.curr.quarterControl = GEN_COMPRESSION_Q2;
> +        sel.MOV(GenRegister::retype(msgs[0], GEN_TYPE_UL),
> GenRegister::Qn(addrQ, 1));
> +        if(msgPayload > 1)
> +          sel.MOV(GenRegister::Qn(msgs[1], 0), GenRegister::Qn(src1, 1));
> +        if(msgPayload > 2)
> +          sel.MOV(GenRegister::Qn(msgs[1], 1), GenRegister::Qn(src2, 1));
> +        sel.ATOMICA64(GenRegister::Qn(dst, 1), genAtomicOp, msgPayload,
> msgs, bti, sel.getBTITemps(AM));
> +        sel.pop();
> +      }
> +    }
> +
>      INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const {
>        using namespace ir;
>        const ir::AtomicInstruction &insn = cast<ir::AtomicInstruction>(dag.insn);
> @@ -5489,6 +5595,15 @@ namespace gbe
>        const AtomicOps atomicOp = insn.getAtomicOpcode();
>        unsigned srcNum = insn.getSrcNum();
>        unsigned msgPayload;
> +      Register reg = insn.getAddressRegister();
> +      GenRegister address = sel.selReg(reg,
> getType(sel.getRegisterFamily(reg)));
> +      AddressSpace addrSpace = insn.getAddressSpace();
> +      GBE_ASSERT(insn.getAddressSpace() == MEM_GLOBAL ||
> +                 insn.getAddressSpace() == MEM_PRIVATE ||
> +                 insn.getAddressSpace() == MEM_LOCAL ||
> +                 insn.getAddressSpace() == MEM_GENERIC ||
> +                 insn.getAddressSpace() == MEM_MIXED);
> +      unsigned addrBytes = typeSize(address.type);
> 
>        AddressMode AM = insn.getAddressMode();
>        if (AM == AM_DynamicBti) {
> @@ -5508,7 +5623,16 @@ namespace gbe
>        if(msgPayload > 2) src2 = sel.selReg(insn.getSrc(2), TYPE_U32);
> 
>        GenAtomicOpCode genAtomicOp = (GenAtomicOpCode)atomicOp;
> -      sel.ATOMIC(dst, genAtomicOp, msgPayload, src0, src1, src2, bti,
> sel.getBTITemps(AM));
> +      if (AM == AM_DynamicBti) {
> +        sel.ATOMIC(dst, genAtomicOp, msgPayload, address, src1, src2, bti,
> sel.getBTITemps(AM));
> +      } else if (addrSpace == ir::MEM_LOCAL) {
> +        // stateless mode, local still use bti access
> +        GenRegister addrDW = address;
> +        if (addrBytes == 8)
> +          addrDW = convertU64ToU32(sel, address);
> +        sel.ATOMIC(dst, genAtomicOp, msgPayload, addrDW, src1, src2,
> GenRegister::immud(0xfe), sel.getBTITemps(AM));
> +      } else
It looks like you also let the StaticBti address mode go through here? The stateless path should only handle the stateless case. That would make the code easier to maintain.
> +        untypedAtomicA64Stateless(sel, insn, msgPayload, dst, address, src1,
> src2, bti);
> 
>        markAllChildren(dag);
>        return true;
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet


More information about the Beignet mailing list