[Beignet] [PATCH 05/13] Backend: Establishing the thread/TID-EUID map.
Yang, Rong R
rong.r.yang at intel.com
Wed Dec 9 00:15:54 PST 2015
> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> junyan.he at inbox.com
> Sent: Tuesday, December 1, 2015 16:11
> To: beignet at lists.freedesktop.org
> Subject: [Beignet] [PATCH 05/13] Backend: Establishing the thread/TID-EUID
> map.
>
> From: Junyan He <junyan.he at linux.intel.com>
>
> We need to use forward messages to send data and sync threads within the
> same work group. The HW lacks the feature to get the TID and EUID of other
> threads. So we need to establish a map for this usage.
>
> Signed-off-by: Junyan He <junyan.he at linux.intel.com>
> ---
> backend/src/backend/gen_insn_selection.cpp | 128
> +++++++++++++++++++++++++++-
> backend/src/llvm/llvm_gen_backend.cpp | 35 +++++++-
> 2 files changed, 157 insertions(+), 6 deletions(-)
>
> diff --git a/backend/src/backend/gen_insn_selection.cpp
> b/backend/src/backend/gen_insn_selection.cpp
> index 884f89d..5b08958 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -495,6 +495,8 @@ namespace gbe
> uint32_t vectorNum;
> /*! If true, generate code backward */
> bool bwdCodeGeneration;
> + /*! If true, the thread map has already been stored */
> + bool storeThreadMap;
> /*! To make function prototypes more readable */
> typedef const GenRegister &Reg;
>
> @@ -806,8 +808,9 @@ namespace gbe
> ctx(ctx), block(NULL),
> curr(ctx.getSimdWidth()), file(ctx.getFunction().getRegisterFile()),
> maxInsnNum(ctx.getFunction().getLargestBlockSize()),
> dagPool(maxInsnNum),
> - stateNum(0), vectorNum(0), bwdCodeGeneration(false),
> currAuxLabel(ctx.getFunction().labelNum()),
> - bHas32X32Mul(false), bHasLongType(false), bHasDoubleType(false),
> bHasHalfType(false), bLongRegRestrict(false),
> + stateNum(0), vectorNum(0), bwdCodeGeneration(false),
> storeThreadMap(false),
> + currAuxLabel(ctx.getFunction().labelNum()), bHas32X32Mul(false),
> bHasLongType(false),
> + bHasDoubleType(false), bHasHalfType(false),
> + bLongRegRestrict(false),
> ldMsgOrder(LD_MSG_ORDER_IVB), slowByteGather(false)
> {
> const ir::Function &fn = ctx.getFunction(); @@ -5967,6 +5970,112 @@
> namespace gbe
> /*! WorkGroup instruction pattern */
> DECL_PATTERN(WorkGroupInstruction)
> {
> + INLINE bool storeThreadID(Selection::Opaque &sel, uint32_t slmAddr)
> const
> + {
> + using namespace ir;
> + GenRegister sr0_0 = GenRegister::retype(GenRegister::sr(0),
> GEN_TYPE_UW);
> + const uint32_t simdWidth = sel.ctx.getSimdWidth();
> + GenRegister tmp;
> + GenRegister addr;
> + vector<GenRegister> fakeTemps;
> + fakeTemps.push_back(GenRegister::null());
> + fakeTemps.push_back(GenRegister::null());
No need to push here. I think using an empty vector is OK.
> +
> + if (simdWidth == 16) {
> + tmp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_WORD),
> ir::TYPE_U16), GEN_TYPE_UD);
> + addr = GenRegister::retype(sel.selReg(sel.reg(FAMILY_WORD),
> ir::TYPE_U16), GEN_TYPE_UD);
> + } else {
> + tmp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD),
> ir::TYPE_U32), GEN_TYPE_UD);
> + addr = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD),
> ir::TYPE_U32), GEN_TYPE_UD);
> + }
> +
> + sr0_0.vstride = GEN_VERTICAL_STRIDE_0;
> + sr0_0.hstride = GEN_HORIZONTAL_STRIDE_0;
> + sr0_0.width = GEN_WIDTH_1;
Use GenRegister::vec1 here?
> + sel.push(); {
> + sel.curr.predicate = GEN_PREDICATE_NONE;
> + sel.curr.noMask = 1;
> + sel.curr.execWidth = 8;
> +
> + sel.MOV(tmp, sr0_0);
> +
> + sel.MUL(addr, sel.selReg(ocl::threadid, ir::TYPE_U32),
> GenRegister::immud(2));
> + sel.ADD(addr, addr, GenRegister::immud(slmAddr));
> +
> + sel.push(); {
> + sel.curr.predicate = GEN_PREDICATE_NONE;
> + sel.curr.noMask = 1;
> + sel.push(); {
> + sel.curr.execWidth = 1;
> + sel.MOV(GenRegister::flag(0, 1), GenRegister::immuw(0x01));
> + } sel.pop();
> + sel.curr.flag = 0;
> + sel.curr.subFlag = 1;
> + sel.curr.predicate = GEN_PREDICATE_NORMAL;
> + sel.BYTE_SCATTER(addr, tmp, 1, GenRegister::immw(0xfe),
> fakeTemps);
> + } sel.pop();
> + } sel.pop();
> + return true;
> + }
> +
> + INLINE GenRegister getNextThreadID(Selection::Opaque &sel, uint32_t
> slmAddr) const
> + {
> + using namespace ir;
> + const uint32_t simdWidth = sel.ctx.getSimdWidth();
> + GenRegister addr;
> + GenRegister nextThread;
> + GenRegister tid;
> + vector<GenRegister> fakeTemps;
> + fakeTemps.push_back(GenRegister::null());
> + fakeTemps.push_back(GenRegister::null());
Same as storeThreadID.
> +
> + if (simdWidth == 16) {
> + addr = GenRegister::retype(sel.selReg(sel.reg(FAMILY_WORD),
> ir::TYPE_U16), GEN_TYPE_UD);
> + nextThread = GenRegister::retype(sel.selReg(sel.reg(FAMILY_WORD),
> ir::TYPE_U16), GEN_TYPE_UD);
> + tid = GenRegister::retype(sel.selReg(sel.reg(FAMILY_WORD),
> ir::TYPE_U16), GEN_TYPE_UD);
> + } else {
> + addr = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
> + nextThread = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
> + tid = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
> + }
> +
> + sel.push(); {
> + sel.curr.execWidth = 8;
> + sel.curr.predicate = GEN_PREDICATE_NONE;
> + sel.curr.noMask = 1;
> + sel.ADD(nextThread, sel.selReg(ocl::threadid, ir::TYPE_U32),
> + GenRegister::immud(1));
> +
> + /* Wrap the next thread id. */
> + sel.push(); {
> + sel.curr.predicate = GEN_PREDICATE_NONE;
> + sel.curr.noMask = 1;
> + sel.curr.flag = 0;
> + sel.curr.subFlag = 1;
> + sel.CMP(GEN_CONDITIONAL_EQ, nextThread, sel.selReg(ocl::threadn,
> ir::TYPE_U32), GenRegister::null());
> + sel.curr.predicate = GEN_PREDICATE_NORMAL;
> + sel.MOV(nextThread, GenRegister::immud(0));
> + } sel.pop();
> +
> + sel.MUL(addr, nextThread, GenRegister::immud(2));
> + sel.ADD(addr, addr, GenRegister::immud(slmAddr));
> +
> + sel.push(); {
> + sel.curr.predicate = GEN_PREDICATE_NONE;
> + sel.curr.noMask = 1;
> + sel.push(); {
> + sel.curr.execWidth = 1;
> + sel.MOV(GenRegister::flag(0, 1), GenRegister::immuw(0x010));
> + } sel.pop();
> + sel.curr.flag = 0;
> + sel.curr.subFlag = 1;
> + sel.curr.predicate = GEN_PREDICATE_NORMAL;
> + sel.BYTE_GATHER(tid, addr, 1, GenRegister::immw(0xfe), fakeTemps);
> + } sel.pop();
> +
> + } sel.pop();
> + return tid;
> + }
> +
> INLINE bool emitWGBroadcast(Selection::Opaque &sel, const
> ir::WorkGroupInstruction &insn) const {
> /* 1. BARRIER Ensure all the threads have set the correct value for the
> var which will be broadcasted.
> 2. CMP IDs Compare the local IDs with the specified ones in the
> function call.
> @@ -6042,6 +6151,21 @@ namespace gbe
>
> if (workGroupOp == WORKGROUP_OP_BROADCAST) {
> return emitWGBroadcast(sel, insn);
> + } else if (workGroupOp >= WORKGROUP_OP_REDUCE_ADD &&
> workGroupOp <= WORKGROUP_OP_EXCLUSIVE_MAX) {
> + const uint32_t slmAddr = insn.getSlmAddr();
> + /* First, we create the TheadID/localID map, in order to get
> + which thread hold the next 16 workitems. */
> +
> + if (!sel.storeThreadMap) {
> + this->storeThreadID(sel, slmAddr);
> + sel.storeThreadMap = true;
> + }
> +
> + /* Then we insert a barrier to make sure all the var we are interested in
> + have been assigned the final value. */
> + sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)),
> + sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);
> +
> + /* Third, get the next thread ID which we will Forward MSG to. */
> + GenRegister nextThreadID = getNextThreadID(sel, slmAddr);
> } else {
> GBE_ASSERT(0);
> }
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp
> b/backend/src/llvm/llvm_gen_backend.cpp
> index 2137814..d50ed42 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -3711,6 +3711,20 @@ namespace gbe
> GBE_ASSERT(f.getwgBroadcastSLM() >= 0);
> }
>
> + if (f.gettidMapSLM() < 0 && opcode >=
> ir::WORKGROUP_OP_REDUCE_ADD && opcode <=
> ir::WORKGROUP_OP_EXCLUSIVE_MAX) {
> + /* Because we can not know the thread ID and the EUID for every
> physical
> + thead which the work items execute on before the run time. We need
> to
> + sync the thread execution order when using work group functions. We
> + create the workitems/threadID map table in slm.
> + When we come to here, the global thread local vars should have all
> been
> + allocated, so it's safe for us to steal a piece of SLM for this usage. */
> + uint32_t mapSize = sizeof(uint16_t) * 64;// at most 64 thread for one
> subslice.
> + f.setUseSLM(true);
> + uint32_t oldSlm = f.getSLMSize();
> + f.setSLMSize(oldSlm + mapSize);
> + f.settidMapSLM(oldSlm);
> + GBE_ASSERT(f.gettidMapSLM() >= 0);
> + }
>
> CallSite::arg_iterator AI = CS.arg_begin();
> CallSite::arg_iterator AE = CS.arg_end(); @@ -3731,10 +3745,23 @@
> namespace gbe
> ctx.WORKGROUP(ir::WORKGROUP_OP_BROADCAST,
> (uint32_t)f.getwgBroadcastSLM(), getRegister(&I), srcTuple, argNum,
> getType(ctx, (*CS.arg_begin())->getType()));
> } else {
> - const ir::Register src = this->getRegister(*(AI++));
> - const ir::Tuple srcTuple = ctx.arrayTuple(&src, 1);
> - ctx.WORKGROUP(opcode, (uint32_t)0, getRegister(&I), srcTuple, 1,
> - getType(ctx, (*CS.arg_begin())->getType()));
> + ConstantInt *sign = dyn_cast<ConstantInt>(AI);
> + GBE_ASSERT(sign);
> + bool isSign = sign->getZExtValue();
> + AI++;
> + ir::Type ty;
> + if (isSign) {
> + ty = getType(ctx, (*AI)->getType());
> + } else {
> + ty = getUnsignedType(ctx, (*AI)->getType());
> + }
> +
> + ir::Register src[3];
> + src[0] = ir::ocl::threadn;
> + src[1] = ir::ocl::threadid;
> + src[2] = this->getRegister(*(AI++));
> + const ir::Tuple srcTuple = ctx.arrayTuple(&src[0], 3);
> + ctx.WORKGROUP(opcode, (uint32_t)f.gettidMapSLM(),
> + getRegister(&I), srcTuple, 3, ty);
> }
>
> GBE_ASSERT(AI == AE);
> --
> 1.7.9.5
>
>
>
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet
More information about the Beignet
mailing list