[Beignet] [Patch v2] GBE: use soft mask to handle the barrier call.

Mon Dec 30 23:55:45 PST 2013

On Tue, Dec 31, 2013 at 03:35:28PM +0800, Zhigang Gong wrote:
> As the GPU is running under predication control, the following IR
> may lead one single barrier be called twice at runtime.
> 
> A:
>   barrier()
>   instructions after barrier()
> 
> B:
>   ...
>   BR(cond) A
> 
> C:
>   ...
>   BR A
> 
> When it runs to B's BR instruction, and if any of the condition bits is
> true, it will jump to block A to execute the barrier. Then latter, if
> any of the condition bits is false, it will continue to execute the
> block C's code and at the end of the C block, it jump to A to execute
> the barrier again.
> 
> If on the other thread, all the condition bits are true, then it triggers
> a hang.
> 
> And even if all the threads run the same count of barrier, it may cause
> incorrect result, as it executes the instructions after barrier() in block
> A before all the work items hit the barrier point.
> 
> The solution to fix this issue is to use a soft mask register. The register
> is shared by all barrier call. We initialize it to !emask at the beginning
> of the program.
> 
> barrierMask = !emask.
> 
> Then when it runs into the barrier call, we set current predication bits
> to the mask register, and check whether all the lanes are set. If any of
> the lanes is disabled, we simply jump to next basic block. Then latter
> when it runs into barrier again, we can set more bits/lanes to 1, and
> check it again, if all the bits are 1, then we set the preciation flag 0,0
> to all 1 and execute the barrier call and after the wait, we reinitialize
> the barrierMask to !emask, and run all the other instructions after the
> barrier() in block A with all lanes enabled.
> 
> After this patch, we can fix the hang issue when testing the opencv's
> transpose test cases.
> 
> v2:
> 1. If there are still some lanes not reach the barrier, we need to set all
>    the finished lanes' block ip to FFFF, and we also need to clear all the
>    flag0 to zero. Thus we can avoid to execute those instructions after the
>    barrier too early.
> 2. fix some typos.
> 
> Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
> ---
>  backend/src/backend/context.cpp            |    5 ++-
>  backend/src/backend/gen_context.cpp        |   59 ++++++++++++++++++++++++++++
>  backend/src/backend/gen_insn_selection.cpp |   40 +++++++++----------
>  backend/src/backend/gen_insn_selection.hpp |    1 +
>  backend/src/backend/gen_reg_allocation.cpp |    7 +++-
>  backend/src/backend/program.h              |    1 +
>  backend/src/ir/profile.cpp                 |    4 +-
>  backend/src/ir/profile.hpp                 |    5 ++-
>  8 files changed, 95 insertions(+), 27 deletions(-)
> 
> diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
> index 88375ad..d389d01 100644
> --- a/backend/src/backend/context.cpp
> +++ b/backend/src/backend/context.cpp
> @@ -432,6 +432,7 @@ namespace gbe
>      this->insertCurbeReg(ir::ocl::blockip, this->newCurbeEntry(GBE_CURBE_BLOCK_IP, 0, this->simdWidth*sizeof(uint16_t)));
>      this->insertCurbeReg(ir::ocl::emask, this->newCurbeEntry(GBE_CURBE_EMASK, 0,  sizeof(uint32_t)));
>      this->insertCurbeReg(ir::ocl::notemask, this->newCurbeEntry(GBE_CURBE_NOT_EMASK, 0, sizeof(uint32_t)));
> +    this->insertCurbeReg(ir::ocl::barriermask, this->newCurbeEntry(GBE_CURBE_BARRIER_MASK, 0, sizeof(uint32_t)));
>  
>      // Go over the arguments and find the related patch locations
>      const uint32_t argNum = fn.argNum();
> @@ -687,7 +688,9 @@ namespace gbe
>          reg == ir::ocl::goffset2  ||
>          reg == ir::ocl::workdim   ||
>          reg == ir::ocl::emask     ||
> -        reg == ir::ocl::notemask)
> +        reg == ir::ocl::notemask  ||
> +        reg == ir::ocl::barriermask
> +      )
>        return true;
>      return false;
>    }
> diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
> index 0028c73..6f33e84 100644
> --- a/backend/src/backend/gen_context.cpp
> +++ b/backend/src/backend/gen_context.cpp
> @@ -100,6 +100,7 @@ namespace gbe
>      /* clear all the bit in f0.0. */
>      p->curr.execWidth = 1;
>      p->MOV(GenRegister::retype(GenRegister::flag(0, 0), GEN_TYPE_UW), GenRegister::immud(0x0000));
> +    /* clear the barrier mask bits to all zero0*/
>      p->curr.noMask = 0;
>      p->curr.useFlag(0, 0);
>      p->curr.execWidth = execWidth;
> @@ -109,6 +110,7 @@ namespace gbe
>      p->curr.execWidth = 1;
>      p->MOV(emaskReg, GenRegister::retype(GenRegister::flag(0, 0), GEN_TYPE_UW));
>      p->XOR(notEmaskReg, emaskReg, GenRegister::immud(0xFFFF));
> +    p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::barriermask)), notEmaskReg);
>      p->pop();
>    }
>  
> @@ -1502,7 +1504,64 @@ namespace gbe
>  
>    void GenContext::emitBarrierInstruction(const SelectionInstruction &insn) {
>      const GenRegister src = ra->genReg(insn.src(0));
> +    const GenRegister fenceDst = ra->genReg(insn.dst(0));
> +    uint32_t barrierType = insn.extra.barrierType;
> +    const GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid));
> +    GenRegister blockIP;
> +    uint32_t exeWidth = p->curr.execWidth;
> +    uint32_t label = insn.parent->bb->getNextBlock()->getLabelIndex();
> +
> +    if (exeWidth == 16)
> +      blockIP = ra->genReg(GenRegister::uw16grf(ir::ocl::blockip));
> +    else if (exeWidth == 8)
> +      blockIP = ra->genReg(GenRegister::uw8grf(ir::ocl::blockip));
> +
> +    p->push();
> +    /* Set block IP to 0xFFFF and clear the flag0's all bits. to skip all the instructions
> +       after the barrier, If there is any lane still remains zero. */
> +    p->MOV(blockIP, GenRegister::immuw(0xFFFF));
> +    p->curr.noMask = 1;
> +    p->curr.execWidth = 1;
> +    p->MOV(GenRegister::flag(0, 0), GenRegister::immuw(0));
No need to clear flag0 here, as we will jump to next BB directly. 

> +    this->branchPos2.push_back(std::make_pair(label, p->n_instruction()));
> +    if (exeWidth == 16)
> +      p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
> +    else if (exeWidth == 8)
> +      p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
> +    else
> +      NOT_IMPLEMENTED;
> +    p->curr.inversePredicate = 1;
> +    // If not all channel is set to 1, the barrier is still waiting for other lanes to complete,
> +    // jump to next basic block.
> +    p->JMPI(GenRegister::immud(0));
> +    p->curr.predicate = GEN_PREDICATE_NONE;
> +    p->MOV(GenRegister::flag(0, 0), ra->genReg(GenRegister::uw1grf(ir::ocl::emask)));
> +    p->pop();
> +
> +    p->push();
> +    p->curr.useFlag(0, 0);
> +    /* Restore the blockIP to current label. */
> +    p->MOV(blockIP, GenRegister::immuw(insn.parent->bb->getLabelIndex()));
> +    if (barrierType == ir::syncGlobalBarrier) {
> +      p->FENCE(fenceDst);
> +      p->MOV(fenceDst, fenceDst);
> +    }
> +    p->curr.predicate = GEN_PREDICATE_NONE;
> +    // As only the payload.2 is used and all the other regions are ignored
> +    // SIMD8 mode here is safe.
> +    p->curr.execWidth = 8;
> +    p->curr.physicalFlag = 0;
> +    p->curr.noMask = 1;
> +    // Copy barrier id from r0.
> +    p->AND(src, barrierId, GenRegister::immud(0x0f000000));
> +    // A barrier is OK to start the thread synchronization *and* SLM fence
>      p->BARRIER(src);
> +    // Now we wait for the other threads
> +    p->curr.execWidth = 1;
> +    p->WAIT();
> +    // we executed the barrier then restore the barrier soft mask to initial value.
> +    p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::barriermask)), ra->genReg(GenRegister::uw1grf(ir::ocl::notemask)));
> +    p->pop();
>    }
>  
>    void GenContext::emitFenceInstruction(const SelectionInstruction &insn) {
> diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
> index 1e28843..8e6586b 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -493,7 +493,7 @@ namespace gbe
>      /*! Saturated subtraction of 64-bit integer */
>      void I64SATSUB(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]);
>      /*! Encode a barrier instruction */
> -    void BARRIER(GenRegister src);
> +    void BARRIER(GenRegister src, GenRegister fence, uint32_t barrierType);
>      /*! Encode a barrier instruction */
>      void FENCE(GenRegister dst);
>      /*! Encode a label instruction */
> @@ -818,9 +818,11 @@ namespace gbe
>      insn->index = uint16_t(index);
>    }
>  
> -  void Selection::Opaque::BARRIER(GenRegister src) {
> -    SelectionInstruction *insn = this->appendInsn(SEL_OP_BARRIER, 0, 1);
> +  void Selection::Opaque::BARRIER(GenRegister src, GenRegister fence, uint32_t barrierType) {
> +    SelectionInstruction *insn = this->appendInsn(SEL_OP_BARRIER, 1, 1);
>      insn->src(0) = src;
> +    insn->dst(0) = fence;
> +    insn->extra.barrierType = barrierType;
>    }
>  
>    void Selection::Opaque::FENCE(GenRegister dst) {
> @@ -2235,29 +2237,25 @@ namespace gbe
>      {
>        using namespace ir;
>        const ir::Register reg = sel.reg(FAMILY_DWORD);
> -
> +      const GenRegister barrierMask = sel.selReg(ocl::barriermask, TYPE_BOOL);
> +      const GenRegister tempFlag = sel.selReg(sel.reg(FAMILY_BOOL), TYPE_BOOL);
> +      const GenRegister flagReg = GenRegister::flag(0, 0);
>        const uint32_t params = insn.getParameters();
> -      if(params == syncGlobalBarrier) {
> -        const ir::Register fenceDst = sel.reg(FAMILY_DWORD);
> -        sel.FENCE(sel.selReg(fenceDst, ir::TYPE_U32));
> -      }
>  
>        sel.push();
>          sel.curr.predicate = GEN_PREDICATE_NONE;
> -
> -        // As only the payload.2 is used and all the other regions are ignored
> -        // SIMD8 mode here is safe.
> -        sel.curr.execWidth = 8;
> -        sel.curr.physicalFlag = 0;
>          sel.curr.noMask = 1;
> -        // Copy barrier id from r0.
> -        sel.AND(GenRegister::ud8grf(reg), GenRegister::ud1grf(ir::ocl::barrierid), GenRegister::immud(0x0f000000));
> -
> -        // A barrier is OK to start the thread synchronization *and* SLM fence
> -        sel.BARRIER(GenRegister::f8grf(reg));
> -        // Now we wait for the other threads
>          sel.curr.execWidth = 1;
> -        sel.WAIT();
> +        sel.OR(barrierMask, flagReg, barrierMask);
> +        sel.MOV(tempFlag, barrierMask);
> +      sel.pop();
> +
> +      // A barrier is OK to start the thread synchronization *and* SLM fence
> +      sel.push();
> +      //sel.curr.predicate = GEN_PREDICATE_NONE;
> +      sel.curr.flagIndex = (uint16_t)tempFlag.value.reg;
> +      sel.curr.physicalFlag = 0;
> +      sel.BARRIER(GenRegister::ud8grf(reg), sel.selReg(sel.reg(FAMILY_DWORD)), params);
>        sel.pop();
>        return true;
>      }
> @@ -3110,7 +3108,7 @@ namespace gbe
>            sel.curr.flagIndex = uint16_t(pred);
>            sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
>  
> -        // We clear all the inactive channel to 0 as the GEN_PREDICATE_ALIGN1_ALL8/16
> +        // We clear all the inactive channel to 0 as the GEN_PREDICATE_ALIGN1_ANY8/16
>          // will check those bits as well.
>            sel.curr.predicate = GEN_PREDICATE_NONE;
>            sel.curr.execWidth = 1;
> diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
> index 2422b2b..7566928 100644
> --- a/backend/src/backend/gen_insn_selection.hpp
> +++ b/backend/src/backend/gen_insn_selection.hpp
> @@ -111,6 +111,7 @@ namespace gbe
>          uint16_t scratchOffset;
>          uint16_t scratchMsgHeader;
>        };
> +      uint32_t barrierType;
>      } extra;
>      /*! Gen opcode */
>      uint8_t opcode;
> diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
> index 02fc534..2e2be04 100644
> --- a/backend/src/backend/gen_reg_allocation.cpp
> +++ b/backend/src/backend/gen_reg_allocation.cpp
> @@ -385,9 +385,10 @@ namespace gbe
>              grfBooleans.insert(spill.reg);
>              spill = interval;
>            }
> -          // We will a grf for the current register
> -          else
> +          // We will use a grf for the current register
> +          else {
>              grfBooleans.insert(reg);
> +          }
>          }
>          else
>            allocatedFlags.insert(std::make_pair(reg, freeFlags[--freeNum]));
> @@ -638,6 +639,8 @@ namespace gbe
>      this->intervals[ocl::emask].maxID = INT_MAX;
>      this->intervals[ocl::notemask].minID = 0;
>      this->intervals[ocl::notemask].maxID = INT_MAX;
> +//    this->intervals[ocl::barriermask].minID = 0;
> +//    this->intervals[ocl::barriermask].maxID = INT_MAX;
>      this->intervals[ocl::retVal].minID = INT_MAX;
>      this->intervals[ocl::retVal].maxID = -INT_MAX;
>  
> diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
> index 9a3570e..4705578 100644
> --- a/backend/src/backend/program.h
> +++ b/backend/src/backend/program.h
> @@ -79,6 +79,7 @@ enum gbe_curbe_type {
>    GBE_CURBE_THREAD_NUM,
>    GBE_CURBE_EMASK,
>    GBE_CURBE_NOT_EMASK,
> +  GBE_CURBE_BARRIER_MASK,
>  };
>  
>  /*! Extra arguments use the negative range of sub-values */
> diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
> index 9c3f333..ef3ea28 100644
> --- a/backend/src/ir/profile.cpp
> +++ b/backend/src/ir/profile.cpp
> @@ -40,7 +40,8 @@ namespace ir {
>          "stack_pointer",
>          "block_ip",
>          "barrier_id", "thread_number",
> -        "work_dimension", "sampler_info", "emask", "notemask", "retVal"
> +        "work_dimension", "sampler_info",
> +        "emask", "notemask", "barriermask", "retVal"
>      };
>  
>  #if GBE_DEBUG
> @@ -79,6 +80,7 @@ namespace ir {
>        DECL_NEW_REG(FAMILY_WORD, samplerinfo);
>        DECL_NEW_REG(FAMILY_WORD, emask);
>        DECL_NEW_REG(FAMILY_WORD, notemask);
> +      DECL_NEW_REG(FAMILY_WORD, barriermask);
>        DECL_NEW_REG(FAMILY_WORD, retVal);
>      }
>  #undef DECL_NEW_REG
> diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
> index 90c504f..d84c48a 100644
> --- a/backend/src/ir/profile.hpp
> +++ b/backend/src/ir/profile.hpp
> @@ -67,8 +67,9 @@ namespace ir {
>      static const Register samplerinfo = Register(23); // store sampler info.
>      static const Register emask = Register(24);    // store the emask bits for the branching fix.
>      static const Register notemask = Register(25); // store the !emask bits for the branching fix.
> -    static const Register retVal = Register(26);   // helper register to do data flow analysis.
> -    static const uint32_t regNum = 27;             // number of special registers
> +    static const Register barriermask = Register(26); // software mask for barrier.
> +    static const Register retVal = Register(27);   // helper register to do data flow analysis.
> +    static const uint32_t regNum = 28;             // number of special registers
>      extern const char *specialRegMean[];           // special register name.
>    } /* namespace ocl */
>  
> -- 
> 1.7.9.5
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet