[Beignet] [PATCH 1/2] GBE: Clear flag register to fix a GPU hang.
Ruiling Song
ruiling.song at intel.com
Thu Aug 22 01:52:04 PDT 2013
When the group size is not aligned to simdWidth, the any8h/any16h predication
modes compute the predicate mask from all flag register bits, including the
bits mapped to non-active lanes. Since the flag register is not cleared by
default, any8h/any16h predication on a jmpi instruction may take the wrong
branch and possibly spin in an infinite loop. So, we clear the flag registers
to 0 so that any8h/any16h predication works correctly.
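For illustration only (this sketch is not part of the patch): the predication
modes reduce the whole 16-bit flag sub-register, no matter which lanes are
active, roughly like the hypothetical helpers below. Stale 1-bits left in the
inactive-lane positions make an ANY-predicated jmpi take the branch even when
every active lane compared false, which is the hang described above; zeroed
inactive-lane bits are also why the ALL variants remain questionable for
unaligned group sizes (see the XXX TODO further down in the diff).

  #include <cstdint>

  // Hypothetical helpers, only to illustrate how the predicate is reduced.
  // ANY16H is taken if any of the 16 flag bits is set; ALL16H only if all
  // 16 bits are set, regardless of which lanes are actually enabled.
  static bool any16h(uint16_t flag) { return flag != 0; }
  static bool all16h(uint16_t flag) { return flag == 0xffff; }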
Signed-off-by: Ruiling Song <ruiling.song at intel.com>
---
backend/src/backend/gen_context.cpp | 13 +++++++++++++
backend/src/backend/gen_context.hpp | 1 +
backend/src/backend/gen_insn_selection.cpp | 3 +++
3 files changed, 17 insertions(+)
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 6eeab51..a029719 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -88,6 +88,18 @@ namespace gbe
}
}
+ void GenContext::clearFlagRegister(void) {
+ // When the group size is not aligned to simdWidth, the flag registers need to
+ // be cleared so that predication (any8h/any16h) works correctly.
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->curr.execWidth = 1;
+ p->MOV(GenRegister::retype(GenRegister::flag(0,0), GEN_TYPE_UD), GenRegister::immud(0x0));
+ p->MOV(GenRegister::retype(GenRegister::flag(1,0), GEN_TYPE_UD), GenRegister::immud(0x0));
+ p->pop();
+ }
+
void GenContext::emitStackPointer(void) {
using namespace ir;
@@ -1091,6 +1103,7 @@ namespace gbe
schedulePostRegAllocation(*this, *this->sel);
if (OCL_OUTPUT_REG_ALLOC)
ra->outputAllocation();
+ this->clearFlagRegister();
this->emitStackPointer();
this->emitInstructionStream();
this->patchBranches();
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 8b481d0..f66ec95 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -61,6 +61,7 @@ namespace gbe
INLINE const ir::Function &getFunction(void) const { return fn; }
/*! Simd width chosen for the current function */
INLINE uint32_t getSimdWidth(void) const { return simdWidth; }
+ void clearFlagRegister(void);
/*! Emit the per-lane stack pointer computation */
void emitStackPointer(void);
/*! Emit the instructions */
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 55db48e..bca08ba 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -2621,6 +2621,9 @@ namespace gbe
sel.CMP(GEN_CONDITIONAL_G, ip, GenRegister::immuw(nextLabel));
// Branch to the jump target
+ // XXX TODO: For a group size not aligned to simdWidth, ALL8H/ALL16H may not
+ // work correctly, as the flag register bits mapped to non-active lanes tend
+ // to be zero.
if (simdWidth == 8)
sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
else if (simdWidth == 16)
--
1.7.9.5