[Beignet] [PATCH] GBE: remove stacksize 64KB limitation.

Sun Feb 14 06:42:16 UTC 2016

If stacksize large 64KB, the formula of calculate the stackptr should
change, form "threadId * perThreadSize + laneId*perLaneSize" to
"(threadId * simdWidth + laneId)*perLaneSize", to avoid Dword * Dword.

Signed-off-by: Yang Rong <rong.r.yang at intel.com>
---
 backend/src/backend/context.cpp       |  2 +-
 backend/src/backend/gen75_context.cpp | 33 +++++++++++++--------------
 backend/src/backend/gen_context.cpp   | 42 +++++++++++++++--------------------
 backend/src/backend/gen_context.hpp   |  3 ++-
 4 files changed, 36 insertions(+), 44 deletions(-)

diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 5adeabc..0991786 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -398,7 +398,7 @@ namespace gbe
     uint32_t stackSize = 128;
     while (stackSize < fn.getStackSize()) {
       stackSize *= 3;
-      GBE_ASSERT(stackSize <= 64*KB);
+      //GBE_ASSERT(stackSize <= 64*KB);
     }
     this->kernel->stackSize = stackSize;
   }
diff --git a/backend/src/backend/gen75_context.cpp b/backend/src/backend/gen75_context.cpp
index fa8b029..37063d7 100644
--- a/backend/src/backend/gen75_context.cpp
+++ b/backend/src/backend/gen75_context.cpp
@@ -66,37 +66,34 @@ namespace gbe
 
     // Check that everything is consistent in the kernel code
     const uint32_t perLaneSize = kernel->getStackSize();
-    const uint32_t perThreadSize = perLaneSize * this->simdWidth;
     GBE_ASSERT(perLaneSize > 0);
 
     const GenRegister selStatckPtr = this->simdWidth == 8 ?
       GenRegister::ud8grf(ir::ocl::stackptr) :
       GenRegister::ud16grf(ir::ocl::stackptr);
     const GenRegister stackptr = ra->genReg(selStatckPtr);
-
-    loadLaneID(stackptr);
+    // borrow block ip as temporary register as we will
+    // initialize block ip latter.
+    const GenRegister tmpReg = GenRegister::retype(GenRegister::vec1(getBlockIP()), GEN_TYPE_UW);
 
     // We compute the per-lane stack pointer here
-    // private address start from zero
+    // threadId * perThreadSize + laneId*perLaneSize or
+    // (threadId * simdWidth + laneId)*perLaneSize
     p->push();
       p->curr.execWidth = 1;
       p->curr.predicate = GEN_PREDICATE_NONE;
       //p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff));
-      p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x7f));
-      p->AND(GenRegister::ud1grf(126,4), GenRegister::ud1grf(0,5), GenRegister::immud(0x180));
-      p->SHR(GenRegister::ud1grf(126,4), GenRegister::ud1grf(126, 4), GenRegister::immud(7));
-      p->curr.execWidth = this->simdWidth;
-      p->MUL(stackptr, stackptr, GenRegister::immuw(perLaneSize));  //perLaneSize < 64K
-      p->curr.execWidth = 1;
-      p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(2));
-      p->ADD(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::ud1grf(126, 4));
-      if(perThreadSize > 0xffff) {
-        p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(perLaneSize));
-        p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(this->simdWidth));  //Only support W * D, perLaneSize < 64K
-      } else
-        p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(perThreadSize));
+      p->AND(tmpReg, GenRegister::ud1grf(0,5), GenRegister::immud(0x7f));
+      p->AND(stackptr, GenRegister::ud1grf(0,5), GenRegister::immud(0x180));
+      p->SHR(stackptr, stackptr, GenRegister::immud(7));
+      p->SHL(tmpReg, tmpReg, GenRegister::immud(2));
+      p->ADD(tmpReg, tmpReg, stackptr); //threadId
+
+      p->MUL(tmpReg, tmpReg, GenRegister::immuw(this->simdWidth));  //threadId * simdWidth
       p->curr.execWidth = this->simdWidth;
-      p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0));
+      loadLaneID(stackptr);
+      p->ADD(stackptr, GenRegister::unpacked_uw(stackptr), tmpReg);  //threadId * simdWidth + laneId, must < 64K
+      p->MUL(stackptr, stackptr, GenRegister::immud(perLaneSize)); // (threadId * simdWidth + laneId)*perLaneSize
     p->pop();
   }
 
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 0ea0dd0..99190d3 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -148,33 +148,33 @@ namespace gbe
   }
 
   /* Get proper block ip register according to current label width. */
-  static GenRegister getBlockIP(GenContext &ctx) {
+  GenRegister GenContext::getBlockIP(void) {
     GenRegister blockip;
-    if (!ctx.isDWLabel())
-      blockip = ctx.ra->genReg(GenRegister::uw8grf(ir::ocl::blockip));
+    if (!isDWLabel())
+      blockip = ra->genReg(GenRegister::uw8grf(ir::ocl::blockip));
     else
-      blockip = ctx.ra->genReg(GenRegister::ud8grf(ir::ocl::dwblockip));
+      blockip = ra->genReg(GenRegister::ud8grf(ir::ocl::dwblockip));
     return blockip;
   }
 
   /* Set current block ip register to a specified constant label value. */
-  static void setBlockIP(GenContext &ctx, GenRegister blockip, uint32_t label) {
-    if (!ctx.isDWLabel())
-      ctx.p->MOV(blockip, GenRegister::immuw(label));
+  void GenContext::setBlockIP(GenRegister blockip, uint32_t label) {
+    if (!isDWLabel())
+      p->MOV(blockip, GenRegister::immuw(label));
     else
-      ctx.p->MOV(blockip, GenRegister::immud(label));
+      p->MOV(blockip, GenRegister::immud(label));
   }
 
   void GenContext::clearFlagRegister(void) {
     // when group size not aligned to simdWidth, flag register need clear to
     // make prediction(any8/16h) work correctly
-    const GenRegister blockip = getBlockIP(*this);
+    const GenRegister blockip = getBlockIP();
     p->push();
       p->curr.noMask = 1;
       p->curr.predicate = GEN_PREDICATE_NONE;
-      setBlockIP(*this, blockip, getMaxLabel());
+      setBlockIP(blockip, getMaxLabel());
       p->curr.noMask = 0;
-      setBlockIP(*this, blockip, 0);
+      setBlockIP(blockip, 0);
       p->curr.execWidth = 1;
       if (ra->isAllocated(ir::ocl::zero))
         p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::zero)), GenRegister::immuw(0));
@@ -219,7 +219,6 @@ namespace gbe
 
     // Check that everything is consistent in the kernel code
     const uint32_t perLaneSize = kernel->getStackSize();
-    const uint32_t perThreadSize = perLaneSize * this->simdWidth;
     GBE_ASSERT(perLaneSize > 0);
 
     const GenRegister selStatckPtr = this->simdWidth == 8 ?
@@ -228,28 +227,23 @@ namespace gbe
     const GenRegister stackptr = ra->genReg(selStatckPtr);
     // borrow block ip as temporary register as we will
     // initialize block ip latter.
-    const GenRegister tmpReg = GenRegister::retype(GenRegister::vec1(getBlockIP(*this)), GEN_TYPE_UD);
+    const GenRegister tmpReg = GenRegister::retype(GenRegister::vec1(getBlockIP()), GEN_TYPE_UW);
 
     loadLaneID(stackptr);
 
     // We compute the per-lane stack pointer here
-    // threadId * perThreadSize + laneId*perLaneSize
+    // threadId * perThreadSize + laneId*perLaneSize or
+    // (threadId * simdWidth + laneId)*perLaneSize
     // let private address start from zero
     //p->MOV(stackptr, GenRegister::immud(0));
     p->push();
       p->curr.execWidth = 1;
       p->curr.predicate = GEN_PREDICATE_NONE;
-      p->AND(tmpReg, GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff));
+      p->AND(tmpReg, GenRegister::ud1grf(0,5), GenRegister::immuw(0x1ff)); //threadId
+      p->MUL(tmpReg, tmpReg, GenRegister::immuw(this->simdWidth));  //threadId * simdWidth
       p->curr.execWidth = this->simdWidth;
-      p->MUL(stackptr, stackptr, GenRegister::immuw(perLaneSize));  //perLaneSize < 64K
-      p->curr.execWidth = 1;
-      if(perThreadSize > 0xffff) {
-        p->MUL(tmpReg, tmpReg, GenRegister::immuw(perLaneSize));
-        p->MUL(tmpReg, tmpReg, GenRegister::immuw(this->simdWidth));  //Only support W * D, perLaneSize < 64K
-      } else
-        p->MUL(tmpReg, tmpReg, GenRegister::immuw(perThreadSize));
-      p->curr.execWidth = this->simdWidth;
-      p->ADD(stackptr, stackptr, tmpReg);
+      p->ADD(stackptr, GenRegister::unpacked_uw(stackptr), tmpReg);  //threadId * simdWidth + laneId, must < 64K
+      p->MUL(stackptr, stackptr, GenRegister::immud(perLaneSize)); // (threadId * simdWidth + laneId)*perLaneSize
     p->pop();
   }
 
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 22ec0ea..25cce85 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -110,7 +110,8 @@ namespace gbe
     }
 
     void loadLaneID(GenRegister dst);
-
+    GenRegister getBlockIP(void);
+    void setBlockIP(GenRegister blockip, uint32_t label);
     void collectShifter(GenRegister dest, GenRegister src);
     void loadTopHalf(GenRegister dest, GenRegister src);
     void storeTopHalf(GenRegister dest, GenRegister src);
-- 
2.1.4