[Beignet] [PATCH 4/8] HSW: Workaround the slm address issue.

Mon May 12 08:12:01 PDT 2014

Each work group has its own SLM offset, and when dispatching threads,
the TSG handles it automatically on IVB. But this fails on HSW.
After checking, every work group's SLM offset is 0, even though the SLM
index is correct in R0.0. So calculate the SLM offset from the SLM index,
and add it to the SLM address.
TODO: need to find the root cause.

Signed-off-by: Yang Rong <rong.r.yang at intel.com>
---
 backend/src/backend/gen75_context.cpp      | 32 +++++++++++++++++++
 backend/src/backend/gen75_context.hpp      |  4 +++
 backend/src/backend/gen_context.cpp        |  8 ++++-
 backend/src/backend/gen_context.hpp        | 11 +++++--
 backend/src/backend/gen_encoder.cpp        |  3 ++
 backend/src/backend/gen_encoder.hpp        |  1 +
 backend/src/backend/gen_insn_selection.cpp | 50 +++++++++++++++++++++---------
 backend/src/backend/gen_insn_selection.hpp |  7 +++++
 backend/src/backend/program.h              |  1 +
 backend/src/ir/profile.cpp                 |  3 +-
 backend/src/ir/profile.hpp                 |  3 +-
 11 files changed, 102 insertions(+), 21 deletions(-)

diff --git a/backend/src/backend/gen75_context.cpp b/backend/src/backend/gen75_context.cpp
index d9933f2..f22a6ab 100644
--- a/backend/src/backend/gen75_context.cpp
+++ b/backend/src/backend/gen75_context.cpp
@@ -22,7 +22,39 @@
 
 #include "backend/gen75_context.hpp"
 #include "backend/gen75_encoder.hpp"
+#include "backend/gen_program.hpp"
+#include "backend/gen_defs.hpp"
+#include "backend/gen_encoder.hpp"
+#include "backend/gen_insn_selection.hpp"
+#include "backend/gen_insn_scheduling.hpp"
+#include "backend/gen_reg_allocation.hpp"
+#include "sys/cvar.hpp"
+#include "ir/function.hpp"
+#include "ir/value.hpp"
+#include <cstring>
 
 namespace gbe
 {
+  void Gen75Context::emitSLMOffset(void) { // issues one scalar SHR through the encoder p to fill ocl::slmoffset
+    if(kernel->getUseSLM() == false) // nothing to emit when the kernel does not use SLM
+      return;
+
+    const GenRegister slm_offset = ra->genReg(GenRegister::ud1grf(ir::ocl::slmoffset));
+    const GenRegister slm_index = GenRegister::ud1grf(0, 0);
+    // The SLM index is held in bits 24-27 of r0.0, in 4K units. Shifting the dword right by 12 moves it to bits 12-15, i.e. index * 4096 bytes. NOTE(review): the other r0.0 bits are not masked here - confirm that is intended.
+    p->push();
+      p->curr.execWidth = 1;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->SHR(slm_offset, slm_index, GenRegister::immud(12));
+    p->pop();
+  }
+
+  void Gen75Context::allocSLMOffsetCurbe(void) { // replaces the empty default declared in gen_context.hpp
+    if(fn.getUseSLM()) // only reserve the curbe entry when the function uses SLM
+      allocCurbeReg(ir::ocl::slmoffset, GBE_CURBE_SLM_OFFSET);
+  }
+
+  void Gen75Context::newSelection(void) { // installs the Gen75 instruction selector instead of the generic Selection
+    this->sel = GBE_NEW(Selection75, *this); // Selection75's constructor calls setPatchSLMAddr(true)
+  }
 }
diff --git a/backend/src/backend/gen75_context.hpp b/backend/src/backend/gen75_context.hpp
index 2bb6e17..bd0986c 100644
--- a/backend/src/backend/gen75_context.hpp
+++ b/backend/src/backend/gen75_context.hpp
@@ -41,6 +41,10 @@ namespace gbe
       return GBE_NEW(Gen75Encoder, this->simdWidth, 75, deviceID);
     }
 
+  private:
+    virtual void emitSLMOffset(void);
+    virtual void allocSLMOffsetCurbe(void);
+    virtual void newSelection(void);
   };
 }
 #endif /* __GBE_GEN75_CONTEXT_HPP__ */
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 689136f..adeb852 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -67,7 +67,7 @@ namespace gbe
     GBE_SAFE_DELETE(sel);
     GBE_SAFE_DELETE(p);
     this->p = generateEncoder();
-    this->sel = GBE_NEW(Selection, *this);
+    this->newSelection();
     this->ra = GBE_NEW(GenRegAllocator, *this);
     this->branchPos2.clear();
     this->branchPos3.clear();
@@ -75,6 +75,10 @@ namespace gbe
     this->errCode = NO_ERROR;
   }
 
+  void GenContext::newSelection(void) { // default selector creation; declared virtual so a device context can substitute its own
+    this->sel = GBE_NEW(Selection, *this);
+  }
+
   void GenContext::emitInstructionStream(void) {
     // Emit Gen ISA
     for (auto &block : *sel->blockList)
@@ -1921,6 +1925,7 @@ namespace gbe
     allocCurbeReg(one, GBE_CURBE_ONE);
     if (stackUse.size() != 0)
       allocCurbeReg(stackbuffer, GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER);
+    allocSLMOffsetCurbe();
     // Go over the arguments and find the related patch locations
     const uint32_t argNum = fn.argNum();
     for (uint32_t argID = 0u; argID < argNum; ++argID) {
@@ -2002,6 +2007,7 @@ namespace gbe
       ra->outputAllocation();
     this->clearFlagRegister();
     this->emitStackPointer();
+    this->emitSLMOffset();
     this->emitInstructionStream();
     if (this->patchBranches() == false)
       return false;
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 21ee28d..ba4cdc6 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -194,15 +194,20 @@ namespace gbe
     virtual GenEncoder* generateEncoder(void) {
       return GBE_NEW(GenEncoder, this->simdWidth, 7, deviceID);
     }
+    /*! allocate a new curbe register and insert to curbe pool. */
+    void allocCurbeReg(ir::Register reg, gbe_curbe_type value, uint32_t subValue = 0);
 
   private:
     CompileErrorCode errCode;
     bool ifEndifFix;
     /*! Build the curbe patch list for the given kernel */
     void buildPatchList(void);
-    /*! allocate a new curbe register and insert to curbe pool. */
-    void allocCurbeReg(ir::Register reg, gbe_curbe_type value, uint32_t subValue = 0);
-
+    /*! Calc the group's slm offset from R0.0, to work around HSW SLM bug*/
+    virtual void emitSLMOffset(void) { };
+    /*! allocate group's slm offset in curbe, only for HSW */
+    virtual void allocSLMOffsetCurbe(void) { };
+    /*! new selection of device */
+    virtual void newSelection(void);
     friend class GenRegAllocator;               //!< need to access errCode directly.
 
   };
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index ceaa199..a4d353d 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -225,6 +225,9 @@ namespace gbe
     this->curr.inversePredicate = 0;
   }
 
+  GenEncoder::~GenEncoder() { // empty body; the destructor is declared virtual in gen_encoder.hpp
+  }
+
   void GenEncoder::push(void) {
     assert(stateNum < MAX_STATE_NUM);
     stack[stateNum++] = curr;
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index 5129d99..5af69cb 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -66,6 +66,7 @@ namespace gbe
   public:
     /*! simdWidth is the default width for the instructions */
     GenEncoder(uint32_t simdWidth, uint32_t gen, uint32_t deviceID, int jump_width = 1);
+    virtual ~GenEncoder();
     /*! Size of the stack (should be large enough) */
     enum { MAX_STATE_NUM = 16 };
     /*! Push the current instruction state */
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 88ec408..77a2bc2 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -316,6 +316,9 @@ namespace gbe
     INLINE ir::Register replaceDst(SelectionInstruction *insn, uint32_t regID);
     /*! spill a register (insert spill/unspill instructions) */
     INLINE bool spillRegs(const SpilledRegs &spilledRegs, uint32_t registerPool);
+    /*! whether the work group's SLM offset must be added to the local memory address on load/store/atomic */
+    bool needPatchSLMAddr() const { return patchSLMAddr; }
+    void setPatchSLMAddr(bool b) { patchSLMAddr = b; } // initialized to false in the Opaque constructor; Selection75 sets it to true
     /*! indicate whether a register is a scalar/uniform register. */
     INLINE bool isScalarReg(const ir::Register &reg) const {
       const ir::RegisterData &regData = getRegisterData(reg);
@@ -577,6 +580,7 @@ namespace gbe
   private:
     /*! Auxiliary label for if/endif. */ 
     uint16_t currAuxLabel;
+    bool patchSLMAddr;
     INLINE ir::LabelIndex newAuxLabel()
     {
       currAuxLabel++;
@@ -610,12 +614,11 @@ namespace gbe
     return src0DAG->child[src0ID] == src1DAG->child[src1ID];
   }
 
-
   Selection::Opaque::Opaque(GenContext &ctx) :
     ctx(ctx), block(NULL),
     curr(ctx.getSimdWidth()), file(ctx.getFunction().getRegisterFile()),
     maxInsnNum(ctx.getFunction().getLargestBlockSize()), dagPool(maxInsnNum),
-    stateNum(0), vectorNum(0), bwdCodeGeneration(false), currAuxLabel(ctx.getFunction().labelNum())
+    stateNum(0), vectorNum(0), bwdCodeGeneration(false), currAuxLabel(ctx.getFunction().labelNum()), patchSLMAddr(false)
   {
     const ir::Function &fn = ctx.getFunction();
     this->regNum = fn.regNum();
@@ -1569,6 +1572,10 @@ namespace gbe
     this->opaque = GBE_NEW(Selection::Opaque, ctx);
   }
 
+  Selection75::Selection75(GenContext &ctx) : Selection(ctx) { // same construction as Selection, plus the flag below
+    this->opaque->setPatchSLMAddr(true); // needPatchSLMAddr() now returns true, so MEM_LOCAL load/store/atomic addresses get ocl::slmoffset added
+  }
+
   void Selection::Opaque::TYPED_WRITE(GenRegister *msgs, uint32_t msgNum,
                                       uint32_t bti, bool is3D) {
     uint32_t elemID = 0;
@@ -2654,7 +2661,7 @@ namespace gbe
 
     INLINE bool emitOne(Selection::Opaque &sel, const ir::LoadInstruction &insn) const {
       using namespace ir;
-      const GenRegister address = sel.selReg(insn.getAddress());
+      GenRegister address = sel.selReg(insn.getAddress(), ir::TYPE_U32);
       const AddressSpace space = insn.getAddressSpace();
       GBE_ASSERT(insn.getAddressSpace() == MEM_GLOBAL ||
                  insn.getAddressSpace() == MEM_CONSTANT ||
@@ -2663,6 +2670,11 @@ namespace gbe
       GBE_ASSERT(sel.isScalarReg(insn.getValue(0)) == false);
       const Type type = insn.getValueType();
       const uint32_t elemSize = getByteScatterGatherSize(type);
+      if(space == MEM_LOCAL && sel.needPatchSLMAddr()) {
+        GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
+        sel.ADD(temp, address, sel.selReg(ocl::slmoffset, ir::TYPE_U32));
+        address = temp;
+      }
       if (insn.getAddressSpace() == MEM_CONSTANT) {
         // XXX TODO read 64bit constant through constant cache
         // Per HW Spec, constant cache messages can read at least DWORD data.
@@ -2692,32 +2704,30 @@ namespace gbe
   {
     void emitUntypedWrite(Selection::Opaque &sel,
                           const ir::StoreInstruction &insn,
+                          GenRegister addr,
                           uint32_t bti) const
     {
       using namespace ir;
       const uint32_t valueNum = insn.getValueNum();
-      const uint32_t addrID = ir::StoreInstruction::addressIndex;
-      GenRegister addr;
       vector<GenRegister> value(valueNum);
 
-      addr = GenRegister::retype(sel.selReg(insn.getSrc(addrID)), GEN_TYPE_F);;
+      addr = GenRegister::retype(addr, GEN_TYPE_F);
       for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
         value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_F);
       sel.UNTYPED_WRITE(addr, value.data(), valueNum, bti);
     }
 
     void emitWrite64(Selection::Opaque &sel,
-                          const ir::StoreInstruction &insn,
-                          uint32_t bti) const
+                     const ir::StoreInstruction &insn,
+                     GenRegister addr,
+                     uint32_t bti) const
     {
       using namespace ir;
       const uint32_t valueNum = insn.getValueNum();
-      const uint32_t addrID = ir::StoreInstruction::addressIndex;
-      GenRegister addr;
       uint32_t srcID;
       /* XXX support scalar only right now. */
       GBE_ASSERT(valueNum == 1);
-      addr = GenRegister::retype(sel.selReg(insn.getSrc(addrID)), GEN_TYPE_F);
+      addr = GenRegister::retype(addr, GEN_TYPE_F);
       // The first 16 DWORD register space is for temporary usage at encode stage.
       uint32_t tmpRegNum = (sel.ctx.getSimdWidth() == 8) ? valueNum * 2 : valueNum;
       GenRegister src[valueNum];
@@ -2782,12 +2792,17 @@ namespace gbe
       const uint32_t bti = space == MEM_LOCAL ? 0xfe : 0x01;
       const Type type = insn.getValueType();
       const uint32_t elemSize = getByteScatterGatherSize(type);
+      GenRegister address = sel.selReg(insn.getAddress(), ir::TYPE_U32);
+      if(space == MEM_LOCAL && sel.needPatchSLMAddr()) {
+        GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
+        sel.ADD(temp, address, sel.selReg(ocl::slmoffset, ir::TYPE_U32));
+        address = temp;
+      }
       if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
-        this->emitWrite64(sel, insn, bti);
+        this->emitWrite64(sel, insn, address, bti);
       else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
-        this->emitUntypedWrite(sel, insn, bti);
+        this->emitUntypedWrite(sel, insn, address, bti);
       else {
-        const GenRegister address = sel.selReg(insn.getAddress());
         this->emitByteScatter(sel, insn, elemSize, address, bti);
       }
       return true;
@@ -3055,12 +3070,17 @@ namespace gbe
       const AddressSpace space = insn.getAddressSpace();
       const uint32_t bti = space == MEM_LOCAL ? 0xfe : 0x01;
       const uint32_t srcNum = insn.getSrcNum();
-      const GenRegister src0 = sel.selReg(insn.getSrc(0), TYPE_U32);   //address
+      GenRegister src0 = sel.selReg(insn.getSrc(0), TYPE_U32);   //address
       GenRegister src1 = src0, src2 = src0;
       if(srcNum > 1) src1 = sel.selReg(insn.getSrc(1), TYPE_U32);
       if(srcNum > 2) src2 = sel.selReg(insn.getSrc(2), TYPE_U32);
       GenRegister dst  = sel.selReg(insn.getDst(0), TYPE_U32);
       GenAtomicOpCode genAtomicOp = (GenAtomicOpCode)atomicOp;
+      if(space == MEM_LOCAL && sel.needPatchSLMAddr()){
+        GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+        sel.ADD(temp, src0, sel.selReg(ocl::slmoffset, ir::TYPE_U32));
+        src0 = temp;
+      }
       sel.ATOMIC(dst, genAtomicOp, srcNum, src0, src1, src2, bti);
       return true;
     }
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index 150feb5..c703b51 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -241,6 +241,13 @@ namespace gbe
     GBE_CLASS(Selection);
   };
 
+  class Selection75: public Selection // Selection variant whose constructor enables SLM address patching
+  {
+    public:
+      /*! Build a Selection on ctx and turn on SLM address patching */
+      Selection75(GenContext &ctx);
+  };
+
 } /* namespace gbe */
 
 #endif /*  __GEN_INSN_SELECTION_HPP__ */
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index d90ada3..8727966 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -78,6 +78,7 @@ enum gbe_curbe_type {
   GBE_CURBE_THREAD_NUM,
   GBE_CURBE_ZERO,
   GBE_CURBE_ONE,
+  GBE_CURBE_SLM_OFFSET,
 };
 
 /*! Extra arguments use the negative range of sub-values */
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index ddf53d1..d583df9 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -41,7 +41,7 @@ namespace ir {
         "block_ip",
         "barrier_id", "thread_number", "work_dimension",
         "zero", "one",
-        "retVal"
+        "retVal", "slm_offset"
     };
 
 #if GBE_DEBUG
@@ -81,6 +81,7 @@ namespace ir {
       DECL_NEW_REG(FAMILY_DWORD, zero, 1);
       DECL_NEW_REG(FAMILY_DWORD, one, 1);
       DECL_NEW_REG(FAMILY_WORD, retVal, 1);
+      DECL_NEW_REG(FAMILY_WORD, slmoffset, 1);
     }
 #undef DECL_NEW_REG
 
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
index 0652a25..4a8062b 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -68,7 +68,8 @@ namespace ir {
     static const Register zero = Register(24);     //  scalar register holds zero.
     static const Register one = Register(25);     //  scalar register holds one. 
     static const Register retVal = Register(26);   // helper register to do data flow analysis.
-    static const uint32_t regNum = 27;             // number of special registers
+    static const Register slmoffset = Register(27);  // Group's SLM offset in total 64K SLM
+    static const uint32_t regNum = 28;             // number of special registers
     extern const char *specialRegMean[];           // special register name.
   } /* namespace ocl */
 
-- 
1.8.3.2