[Beignet] [Patch v2] GBE: use soft mask to handle the barrier call.

Mon Dec 30 23:35:28 PST 2013

As the GPU is running under predication control, the following IR
may lead one single barrier be called twice at runtime.

A:
  barrier()
  instructions after barrier()

B:
  ...
  BR(cond) A

C:
  ...
  BR A

When it runs to B's BR instruction, and if any of the condition bits is
true, it will jump to block A to execute the barrier. Then latter, if
any of the condition bits is false, it will continue to execute the
block C's code and at the end of the C block, it jump to A to execute
the barrier again.

If on the other thread, all the condition bits are true, then it triggers
a hang.

And even if all the threads run the same count of barrier, it may cause
incorrect result, as it executes the instructions after barrier() in block
A before all the work items hit the barrier point.

The solution to fix this issue is to use a soft mask register. The register
is shared by all barrier call. We initialize it to !emask at the beginning
of the program.

barrierMask = !emask.

Then when it runs into the barrier call, we set current predication bits
to the mask register, and check whether all the lanes are set. If any of
the lanes is disabled, we simply jump to next basic block. Then latter
when it runs into barrier again, we can set more bits/lanes to 1, and
check it again, if all the bits are 1, then we set the preciation flag 0,0
to all 1 and execute the barrier call and after the wait, we reinitialize
the barrierMask to !emask, and run all the other instructions after the
barrier() in block A with all lanes enabled.

After this patch, we can fix the hang issue when testing the opencv's
transpose test cases.

v2:
1. If there are still some lanes not reach the barrier, we need to set all
   the finished lanes' block ip to FFFF, and we also need to clear all the
   flag0 to zero. Thus we can avoid to execute those instructions after the
   barrier too early.
2. fix some typos.

Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
 backend/src/backend/context.cpp            |    5 ++-
 backend/src/backend/gen_context.cpp        |   59 ++++++++++++++++++++++++++++
 backend/src/backend/gen_insn_selection.cpp |   40 +++++++++----------
 backend/src/backend/gen_insn_selection.hpp |    1 +
 backend/src/backend/gen_reg_allocation.cpp |    7 +++-
 backend/src/backend/program.h              |    1 +
 backend/src/ir/profile.cpp                 |    4 +-
 backend/src/ir/profile.hpp                 |    5 ++-
 8 files changed, 95 insertions(+), 27 deletions(-)

diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 88375ad..d389d01 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -432,6 +432,7 @@ namespace gbe
     this->insertCurbeReg(ir::ocl::blockip, this->newCurbeEntry(GBE_CURBE_BLOCK_IP, 0, this->simdWidth*sizeof(uint16_t)));
     this->insertCurbeReg(ir::ocl::emask, this->newCurbeEntry(GBE_CURBE_EMASK, 0,  sizeof(uint32_t)));
     this->insertCurbeReg(ir::ocl::notemask, this->newCurbeEntry(GBE_CURBE_NOT_EMASK, 0, sizeof(uint32_t)));
+    this->insertCurbeReg(ir::ocl::barriermask, this->newCurbeEntry(GBE_CURBE_BARRIER_MASK, 0, sizeof(uint32_t)));
 
     // Go over the arguments and find the related patch locations
     const uint32_t argNum = fn.argNum();
@@ -687,7 +688,9 @@ namespace gbe
         reg == ir::ocl::goffset2  ||
         reg == ir::ocl::workdim   ||
         reg == ir::ocl::emask     ||
-        reg == ir::ocl::notemask)
+        reg == ir::ocl::notemask  ||
+        reg == ir::ocl::barriermask
+      )
       return true;
     return false;
   }
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 0028c73..6f33e84 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -100,6 +100,7 @@ namespace gbe
     /* clear all the bit in f0.0. */
     p->curr.execWidth = 1;
     p->MOV(GenRegister::retype(GenRegister::flag(0, 0), GEN_TYPE_UW), GenRegister::immud(0x0000));
+    /* clear the barrier mask bits to all zero0*/
     p->curr.noMask = 0;
     p->curr.useFlag(0, 0);
     p->curr.execWidth = execWidth;
@@ -109,6 +110,7 @@ namespace gbe
     p->curr.execWidth = 1;
     p->MOV(emaskReg, GenRegister::retype(GenRegister::flag(0, 0), GEN_TYPE_UW));
     p->XOR(notEmaskReg, emaskReg, GenRegister::immud(0xFFFF));
+    p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::barriermask)), notEmaskReg);
     p->pop();
   }
 
@@ -1502,7 +1504,64 @@ namespace gbe
 
   void GenContext::emitBarrierInstruction(const SelectionInstruction &insn) {
     const GenRegister src = ra->genReg(insn.src(0));
+    const GenRegister fenceDst = ra->genReg(insn.dst(0));
+    uint32_t barrierType = insn.extra.barrierType;
+    const GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid));
+    GenRegister blockIP;
+    uint32_t exeWidth = p->curr.execWidth;
+    uint32_t label = insn.parent->bb->getNextBlock()->getLabelIndex();
+
+    if (exeWidth == 16)
+      blockIP = ra->genReg(GenRegister::uw16grf(ir::ocl::blockip));
+    else if (exeWidth == 8)
+      blockIP = ra->genReg(GenRegister::uw8grf(ir::ocl::blockip));
+
+    p->push();
+    /* Set block IP to 0xFFFF and clear the flag0's all bits. to skip all the instructions
+       after the barrier, If there is any lane still remains zero. */
+    p->MOV(blockIP, GenRegister::immuw(0xFFFF));
+    p->curr.noMask = 1;
+    p->curr.execWidth = 1;
+    p->MOV(GenRegister::flag(0, 0), GenRegister::immuw(0));
+    this->branchPos2.push_back(std::make_pair(label, p->n_instruction()));
+    if (exeWidth == 16)
+      p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
+    else if (exeWidth == 8)
+      p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
+    else
+      NOT_IMPLEMENTED;
+    p->curr.inversePredicate = 1;
+    // If not all channel is set to 1, the barrier is still waiting for other lanes to complete,
+    // jump to next basic block.
+    p->JMPI(GenRegister::immud(0));
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->MOV(GenRegister::flag(0, 0), ra->genReg(GenRegister::uw1grf(ir::ocl::emask)));
+    p->pop();
+
+    p->push();
+    p->curr.useFlag(0, 0);
+    /* Restore the blockIP to current label. */
+    p->MOV(blockIP, GenRegister::immuw(insn.parent->bb->getLabelIndex()));
+    if (barrierType == ir::syncGlobalBarrier) {
+      p->FENCE(fenceDst);
+      p->MOV(fenceDst, fenceDst);
+    }
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    // As only the payload.2 is used and all the other regions are ignored
+    // SIMD8 mode here is safe.
+    p->curr.execWidth = 8;
+    p->curr.physicalFlag = 0;
+    p->curr.noMask = 1;
+    // Copy barrier id from r0.
+    p->AND(src, barrierId, GenRegister::immud(0x0f000000));
+    // A barrier is OK to start the thread synchronization *and* SLM fence
     p->BARRIER(src);
+    // Now we wait for the other threads
+    p->curr.execWidth = 1;
+    p->WAIT();
+    // we executed the barrier then restore the barrier soft mask to initial value.
+    p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::barriermask)), ra->genReg(GenRegister::uw1grf(ir::ocl::notemask)));
+    p->pop();
   }
 
   void GenContext::emitFenceInstruction(const SelectionInstruction &insn) {
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 1e28843..8e6586b 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -493,7 +493,7 @@ namespace gbe
     /*! Saturated subtraction of 64-bit integer */
     void I64SATSUB(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]);
     /*! Encode a barrier instruction */
-    void BARRIER(GenRegister src);
+    void BARRIER(GenRegister src, GenRegister fence, uint32_t barrierType);
     /*! Encode a barrier instruction */
     void FENCE(GenRegister dst);
     /*! Encode a label instruction */
@@ -818,9 +818,11 @@ namespace gbe
     insn->index = uint16_t(index);
   }
 
-  void Selection::Opaque::BARRIER(GenRegister src) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_BARRIER, 0, 1);
+  void Selection::Opaque::BARRIER(GenRegister src, GenRegister fence, uint32_t barrierType) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_BARRIER, 1, 1);
     insn->src(0) = src;
+    insn->dst(0) = fence;
+    insn->extra.barrierType = barrierType;
   }
 
   void Selection::Opaque::FENCE(GenRegister dst) {
@@ -2235,29 +2237,25 @@ namespace gbe
     {
       using namespace ir;
       const ir::Register reg = sel.reg(FAMILY_DWORD);
-
+      const GenRegister barrierMask = sel.selReg(ocl::barriermask, TYPE_BOOL);
+      const GenRegister tempFlag = sel.selReg(sel.reg(FAMILY_BOOL), TYPE_BOOL);
+      const GenRegister flagReg = GenRegister::flag(0, 0);
       const uint32_t params = insn.getParameters();
-      if(params == syncGlobalBarrier) {
-        const ir::Register fenceDst = sel.reg(FAMILY_DWORD);
-        sel.FENCE(sel.selReg(fenceDst, ir::TYPE_U32));
-      }
 
       sel.push();
         sel.curr.predicate = GEN_PREDICATE_NONE;
-
-        // As only the payload.2 is used and all the other regions are ignored
-        // SIMD8 mode here is safe.
-        sel.curr.execWidth = 8;
-        sel.curr.physicalFlag = 0;
         sel.curr.noMask = 1;
-        // Copy barrier id from r0.
-        sel.AND(GenRegister::ud8grf(reg), GenRegister::ud1grf(ir::ocl::barrierid), GenRegister::immud(0x0f000000));
-
-        // A barrier is OK to start the thread synchronization *and* SLM fence
-        sel.BARRIER(GenRegister::f8grf(reg));
-        // Now we wait for the other threads
         sel.curr.execWidth = 1;
-        sel.WAIT();
+        sel.OR(barrierMask, flagReg, barrierMask);
+        sel.MOV(tempFlag, barrierMask);
+      sel.pop();
+
+      // A barrier is OK to start the thread synchronization *and* SLM fence
+      sel.push();
+      //sel.curr.predicate = GEN_PREDICATE_NONE;
+      sel.curr.flagIndex = (uint16_t)tempFlag.value.reg;
+      sel.curr.physicalFlag = 0;
+      sel.BARRIER(GenRegister::ud8grf(reg), sel.selReg(sel.reg(FAMILY_DWORD)), params);
       sel.pop();
       return true;
     }
@@ -3110,7 +3108,7 @@ namespace gbe
           sel.curr.flagIndex = uint16_t(pred);
           sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
 
-        // We clear all the inactive channel to 0 as the GEN_PREDICATE_ALIGN1_ALL8/16
+        // We clear all the inactive channel to 0 as the GEN_PREDICATE_ALIGN1_ANY8/16
         // will check those bits as well.
           sel.curr.predicate = GEN_PREDICATE_NONE;
           sel.curr.execWidth = 1;
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index 2422b2b..7566928 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -111,6 +111,7 @@ namespace gbe
         uint16_t scratchOffset;
         uint16_t scratchMsgHeader;
       };
+      uint32_t barrierType;
     } extra;
     /*! Gen opcode */
     uint8_t opcode;
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index 02fc534..2e2be04 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -385,9 +385,10 @@ namespace gbe
             grfBooleans.insert(spill.reg);
             spill = interval;
           }
-          // We will a grf for the current register
-          else
+          // We will use a grf for the current register
+          else {
             grfBooleans.insert(reg);
+          }
         }
         else
           allocatedFlags.insert(std::make_pair(reg, freeFlags[--freeNum]));
@@ -638,6 +639,8 @@ namespace gbe
     this->intervals[ocl::emask].maxID = INT_MAX;
     this->intervals[ocl::notemask].minID = 0;
     this->intervals[ocl::notemask].maxID = INT_MAX;
+//    this->intervals[ocl::barriermask].minID = 0;
+//    this->intervals[ocl::barriermask].maxID = INT_MAX;
     this->intervals[ocl::retVal].minID = INT_MAX;
     this->intervals[ocl::retVal].maxID = -INT_MAX;
 
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index 9a3570e..4705578 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -79,6 +79,7 @@ enum gbe_curbe_type {
   GBE_CURBE_THREAD_NUM,
   GBE_CURBE_EMASK,
   GBE_CURBE_NOT_EMASK,
+  GBE_CURBE_BARRIER_MASK,
 };
 
 /*! Extra arguments use the negative range of sub-values */
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index 9c3f333..ef3ea28 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -40,7 +40,8 @@ namespace ir {
         "stack_pointer",
         "block_ip",
         "barrier_id", "thread_number",
-        "work_dimension", "sampler_info", "emask", "notemask", "retVal"
+        "work_dimension", "sampler_info",
+        "emask", "notemask", "barriermask", "retVal"
     };
 
 #if GBE_DEBUG
@@ -79,6 +80,7 @@ namespace ir {
       DECL_NEW_REG(FAMILY_WORD, samplerinfo);
       DECL_NEW_REG(FAMILY_WORD, emask);
       DECL_NEW_REG(FAMILY_WORD, notemask);
+      DECL_NEW_REG(FAMILY_WORD, barriermask);
       DECL_NEW_REG(FAMILY_WORD, retVal);
     }
 #undef DECL_NEW_REG
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
index 90c504f..d84c48a 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -67,8 +67,9 @@ namespace ir {
     static const Register samplerinfo = Register(23); // store sampler info.
     static const Register emask = Register(24);    // store the emask bits for the branching fix.
     static const Register notemask = Register(25); // store the !emask bits for the branching fix.
-    static const Register retVal = Register(26);   // helper register to do data flow analysis.
-    static const uint32_t regNum = 27;             // number of special registers
+    static const Register barriermask = Register(26); // software mask for barrier.
+    static const Register retVal = Register(27);   // helper register to do data flow analysis.
+    static const uint32_t regNum = 28;             // number of special registers
     extern const char *specialRegMean[];           // special register name.
   } /* namespace ocl */
 
-- 
1.7.9.5