[Beignet] [Patch v2] GBE: use soft mask to handle the barrier call.
Zhigang Gong
zhigang.gong at linux.intel.com
Mon Dec 30 23:55:45 PST 2013
On Tue, Dec 31, 2013 at 03:35:28PM +0800, Zhigang Gong wrote:
> As the GPU is running under predication control, the following IR
> may lead one single barrier be called twice at runtime.
>
> A:
> barrier()
> instructions after barrier()
>
> B:
> ...
> BR(cond) A
>
> C:
> ...
> BR A
>
> When it runs to B's BR instruction, and if any of the condition bits is
> true, it will jump to block A to execute the barrier. Then latter, if
> any of the condition bits is false, it will continue to execute the
> block C's code and at the end of the C block, it jump to A to execute
> the barrier again.
>
> If on the other thread, all the condition bits are true, then it triggers
> a hang.
>
> And even if all the threads run the same count of barrier, it may cause
> incorrect result, as it executes the instructions after barrier() in block
> A before all the work items hit the barrier point.
>
> The solution to fix this issue is to use a soft mask register. The register
> is shared by all barrier call. We initialize it to !emask at the beginning
> of the program.
>
> barrierMask = !emask.
>
> Then when it runs into the barrier call, we set current predication bits
> to the mask register, and check whether all the lanes are set. If any of
> the lanes is disabled, we simply jump to next basic block. Then latter
> when it runs into barrier again, we can set more bits/lanes to 1, and
> check it again, if all the bits are 1, then we set the preciation flag 0,0
> to all 1 and execute the barrier call and after the wait, we reinitialize
> the barrierMask to !emask, and run all the other instructions after the
> barrier() in block A with all lanes enabled.
>
> After this patch, we can fix the hang issue when testing the opencv's
> transpose test cases.
>
> v2:
> 1. If there are still some lanes not reach the barrier, we need to set all
> the finished lanes' block ip to FFFF, and we also need to clear all the
> flag0 to zero. Thus we can avoid to execute those instructions after the
> barrier too early.
> 2. fix some typos.
>
> Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
> ---
> backend/src/backend/context.cpp | 5 ++-
> backend/src/backend/gen_context.cpp | 59 ++++++++++++++++++++++++++++
> backend/src/backend/gen_insn_selection.cpp | 40 +++++++++----------
> backend/src/backend/gen_insn_selection.hpp | 1 +
> backend/src/backend/gen_reg_allocation.cpp | 7 +++-
> backend/src/backend/program.h | 1 +
> backend/src/ir/profile.cpp | 4 +-
> backend/src/ir/profile.hpp | 5 ++-
> 8 files changed, 95 insertions(+), 27 deletions(-)
>
> diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
> index 88375ad..d389d01 100644
> --- a/backend/src/backend/context.cpp
> +++ b/backend/src/backend/context.cpp
> @@ -432,6 +432,7 @@ namespace gbe
> this->insertCurbeReg(ir::ocl::blockip, this->newCurbeEntry(GBE_CURBE_BLOCK_IP, 0, this->simdWidth*sizeof(uint16_t)));
> this->insertCurbeReg(ir::ocl::emask, this->newCurbeEntry(GBE_CURBE_EMASK, 0, sizeof(uint32_t)));
> this->insertCurbeReg(ir::ocl::notemask, this->newCurbeEntry(GBE_CURBE_NOT_EMASK, 0, sizeof(uint32_t)));
> + this->insertCurbeReg(ir::ocl::barriermask, this->newCurbeEntry(GBE_CURBE_BARRIER_MASK, 0, sizeof(uint32_t)));
>
> // Go over the arguments and find the related patch locations
> const uint32_t argNum = fn.argNum();
> @@ -687,7 +688,9 @@ namespace gbe
> reg == ir::ocl::goffset2 ||
> reg == ir::ocl::workdim ||
> reg == ir::ocl::emask ||
> - reg == ir::ocl::notemask)
> + reg == ir::ocl::notemask ||
> + reg == ir::ocl::barriermask
> + )
> return true;
> return false;
> }
> diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
> index 0028c73..6f33e84 100644
> --- a/backend/src/backend/gen_context.cpp
> +++ b/backend/src/backend/gen_context.cpp
> @@ -100,6 +100,7 @@ namespace gbe
> /* clear all the bit in f0.0. */
> p->curr.execWidth = 1;
> p->MOV(GenRegister::retype(GenRegister::flag(0, 0), GEN_TYPE_UW), GenRegister::immud(0x0000));
> + /* clear the barrier mask bits to all zero0*/
> p->curr.noMask = 0;
> p->curr.useFlag(0, 0);
> p->curr.execWidth = execWidth;
> @@ -109,6 +110,7 @@ namespace gbe
> p->curr.execWidth = 1;
> p->MOV(emaskReg, GenRegister::retype(GenRegister::flag(0, 0), GEN_TYPE_UW));
> p->XOR(notEmaskReg, emaskReg, GenRegister::immud(0xFFFF));
> + p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::barriermask)), notEmaskReg);
> p->pop();
> }
>
> @@ -1502,7 +1504,64 @@ namespace gbe
>
> void GenContext::emitBarrierInstruction(const SelectionInstruction &insn) {
> const GenRegister src = ra->genReg(insn.src(0));
> + const GenRegister fenceDst = ra->genReg(insn.dst(0));
> + uint32_t barrierType = insn.extra.barrierType;
> + const GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid));
> + GenRegister blockIP;
> + uint32_t exeWidth = p->curr.execWidth;
> + uint32_t label = insn.parent->bb->getNextBlock()->getLabelIndex();
> +
> + if (exeWidth == 16)
> + blockIP = ra->genReg(GenRegister::uw16grf(ir::ocl::blockip));
> + else if (exeWidth == 8)
> + blockIP = ra->genReg(GenRegister::uw8grf(ir::ocl::blockip));
> +
> + p->push();
> + /* Set block IP to 0xFFFF and clear the flag0's all bits. to skip all the instructions
> + after the barrier, If there is any lane still remains zero. */
> + p->MOV(blockIP, GenRegister::immuw(0xFFFF));
> + p->curr.noMask = 1;
> + p->curr.execWidth = 1;
> + p->MOV(GenRegister::flag(0, 0), GenRegister::immuw(0));
No need to clear flag0 here, as we will jump to next BB directly.
> + this->branchPos2.push_back(std::make_pair(label, p->n_instruction()));
> + if (exeWidth == 16)
> + p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
> + else if (exeWidth == 8)
> + p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
> + else
> + NOT_IMPLEMENTED;
> + p->curr.inversePredicate = 1;
> + // If not all channel is set to 1, the barrier is still waiting for other lanes to complete,
> + // jump to next basic block.
> + p->JMPI(GenRegister::immud(0));
> + p->curr.predicate = GEN_PREDICATE_NONE;
> + p->MOV(GenRegister::flag(0, 0), ra->genReg(GenRegister::uw1grf(ir::ocl::emask)));
> + p->pop();
> +
> + p->push();
> + p->curr.useFlag(0, 0);
> + /* Restore the blockIP to current label. */
> + p->MOV(blockIP, GenRegister::immuw(insn.parent->bb->getLabelIndex()));
> + if (barrierType == ir::syncGlobalBarrier) {
> + p->FENCE(fenceDst);
> + p->MOV(fenceDst, fenceDst);
> + }
> + p->curr.predicate = GEN_PREDICATE_NONE;
> + // As only the payload.2 is used and all the other regions are ignored
> + // SIMD8 mode here is safe.
> + p->curr.execWidth = 8;
> + p->curr.physicalFlag = 0;
> + p->curr.noMask = 1;
> + // Copy barrier id from r0.
> + p->AND(src, barrierId, GenRegister::immud(0x0f000000));
> + // A barrier is OK to start the thread synchronization *and* SLM fence
> p->BARRIER(src);
> + // Now we wait for the other threads
> + p->curr.execWidth = 1;
> + p->WAIT();
> + // we executed the barrier then restore the barrier soft mask to initial value.
> + p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::barriermask)), ra->genReg(GenRegister::uw1grf(ir::ocl::notemask)));
> + p->pop();
> }
>
> void GenContext::emitFenceInstruction(const SelectionInstruction &insn) {
> diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
> index 1e28843..8e6586b 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -493,7 +493,7 @@ namespace gbe
> /*! Saturated subtraction of 64-bit integer */
> void I64SATSUB(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]);
> /*! Encode a barrier instruction */
> - void BARRIER(GenRegister src);
> + void BARRIER(GenRegister src, GenRegister fence, uint32_t barrierType);
> /*! Encode a barrier instruction */
> void FENCE(GenRegister dst);
> /*! Encode a label instruction */
> @@ -818,9 +818,11 @@ namespace gbe
> insn->index = uint16_t(index);
> }
>
> - void Selection::Opaque::BARRIER(GenRegister src) {
> - SelectionInstruction *insn = this->appendInsn(SEL_OP_BARRIER, 0, 1);
> + void Selection::Opaque::BARRIER(GenRegister src, GenRegister fence, uint32_t barrierType) {
> + SelectionInstruction *insn = this->appendInsn(SEL_OP_BARRIER, 1, 1);
> insn->src(0) = src;
> + insn->dst(0) = fence;
> + insn->extra.barrierType = barrierType;
> }
>
> void Selection::Opaque::FENCE(GenRegister dst) {
> @@ -2235,29 +2237,25 @@ namespace gbe
> {
> using namespace ir;
> const ir::Register reg = sel.reg(FAMILY_DWORD);
> -
> + const GenRegister barrierMask = sel.selReg(ocl::barriermask, TYPE_BOOL);
> + const GenRegister tempFlag = sel.selReg(sel.reg(FAMILY_BOOL), TYPE_BOOL);
> + const GenRegister flagReg = GenRegister::flag(0, 0);
> const uint32_t params = insn.getParameters();
> - if(params == syncGlobalBarrier) {
> - const ir::Register fenceDst = sel.reg(FAMILY_DWORD);
> - sel.FENCE(sel.selReg(fenceDst, ir::TYPE_U32));
> - }
>
> sel.push();
> sel.curr.predicate = GEN_PREDICATE_NONE;
> -
> - // As only the payload.2 is used and all the other regions are ignored
> - // SIMD8 mode here is safe.
> - sel.curr.execWidth = 8;
> - sel.curr.physicalFlag = 0;
> sel.curr.noMask = 1;
> - // Copy barrier id from r0.
> - sel.AND(GenRegister::ud8grf(reg), GenRegister::ud1grf(ir::ocl::barrierid), GenRegister::immud(0x0f000000));
> -
> - // A barrier is OK to start the thread synchronization *and* SLM fence
> - sel.BARRIER(GenRegister::f8grf(reg));
> - // Now we wait for the other threads
> sel.curr.execWidth = 1;
> - sel.WAIT();
> + sel.OR(barrierMask, flagReg, barrierMask);
> + sel.MOV(tempFlag, barrierMask);
> + sel.pop();
> +
> + // A barrier is OK to start the thread synchronization *and* SLM fence
> + sel.push();
> + //sel.curr.predicate = GEN_PREDICATE_NONE;
> + sel.curr.flagIndex = (uint16_t)tempFlag.value.reg;
> + sel.curr.physicalFlag = 0;
> + sel.BARRIER(GenRegister::ud8grf(reg), sel.selReg(sel.reg(FAMILY_DWORD)), params);
> sel.pop();
> return true;
> }
> @@ -3110,7 +3108,7 @@ namespace gbe
> sel.curr.flagIndex = uint16_t(pred);
> sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
>
> - // We clear all the inactive channel to 0 as the GEN_PREDICATE_ALIGN1_ALL8/16
> + // We clear all the inactive channel to 0 as the GEN_PREDICATE_ALIGN1_ANY8/16
> // will check those bits as well.
> sel.curr.predicate = GEN_PREDICATE_NONE;
> sel.curr.execWidth = 1;
> diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
> index 2422b2b..7566928 100644
> --- a/backend/src/backend/gen_insn_selection.hpp
> +++ b/backend/src/backend/gen_insn_selection.hpp
> @@ -111,6 +111,7 @@ namespace gbe
> uint16_t scratchOffset;
> uint16_t scratchMsgHeader;
> };
> + uint32_t barrierType;
> } extra;
> /*! Gen opcode */
> uint8_t opcode;
> diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
> index 02fc534..2e2be04 100644
> --- a/backend/src/backend/gen_reg_allocation.cpp
> +++ b/backend/src/backend/gen_reg_allocation.cpp
> @@ -385,9 +385,10 @@ namespace gbe
> grfBooleans.insert(spill.reg);
> spill = interval;
> }
> - // We will a grf for the current register
> - else
> + // We will use a grf for the current register
> + else {
> grfBooleans.insert(reg);
> + }
> }
> else
> allocatedFlags.insert(std::make_pair(reg, freeFlags[--freeNum]));
> @@ -638,6 +639,8 @@ namespace gbe
> this->intervals[ocl::emask].maxID = INT_MAX;
> this->intervals[ocl::notemask].minID = 0;
> this->intervals[ocl::notemask].maxID = INT_MAX;
> +// this->intervals[ocl::barriermask].minID = 0;
> +// this->intervals[ocl::barriermask].maxID = INT_MAX;
> this->intervals[ocl::retVal].minID = INT_MAX;
> this->intervals[ocl::retVal].maxID = -INT_MAX;
>
> diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
> index 9a3570e..4705578 100644
> --- a/backend/src/backend/program.h
> +++ b/backend/src/backend/program.h
> @@ -79,6 +79,7 @@ enum gbe_curbe_type {
> GBE_CURBE_THREAD_NUM,
> GBE_CURBE_EMASK,
> GBE_CURBE_NOT_EMASK,
> + GBE_CURBE_BARRIER_MASK,
> };
>
> /*! Extra arguments use the negative range of sub-values */
> diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
> index 9c3f333..ef3ea28 100644
> --- a/backend/src/ir/profile.cpp
> +++ b/backend/src/ir/profile.cpp
> @@ -40,7 +40,8 @@ namespace ir {
> "stack_pointer",
> "block_ip",
> "barrier_id", "thread_number",
> - "work_dimension", "sampler_info", "emask", "notemask", "retVal"
> + "work_dimension", "sampler_info",
> + "emask", "notemask", "barriermask", "retVal"
> };
>
> #if GBE_DEBUG
> @@ -79,6 +80,7 @@ namespace ir {
> DECL_NEW_REG(FAMILY_WORD, samplerinfo);
> DECL_NEW_REG(FAMILY_WORD, emask);
> DECL_NEW_REG(FAMILY_WORD, notemask);
> + DECL_NEW_REG(FAMILY_WORD, barriermask);
> DECL_NEW_REG(FAMILY_WORD, retVal);
> }
> #undef DECL_NEW_REG
> diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
> index 90c504f..d84c48a 100644
> --- a/backend/src/ir/profile.hpp
> +++ b/backend/src/ir/profile.hpp
> @@ -67,8 +67,9 @@ namespace ir {
> static const Register samplerinfo = Register(23); // store sampler info.
> static const Register emask = Register(24); // store the emask bits for the branching fix.
> static const Register notemask = Register(25); // store the !emask bits for the branching fix.
> - static const Register retVal = Register(26); // helper register to do data flow analysis.
> - static const uint32_t regNum = 27; // number of special registers
> + static const Register barriermask = Register(26); // software mask for barrier.
> + static const Register retVal = Register(27); // helper register to do data flow analysis.
> + static const uint32_t regNum = 28; // number of special registers
> extern const char *specialRegMean[]; // special register name.
> } /* namespace ocl */
>
> --
> 1.7.9.5
>
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet
More information about the Beignet
mailing list