[Beignet] [PATCH V2] remove GBE_CURBE_STACK_POINTER in payload

Mon Aug 24 13:03:51 PDT 2015

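Initialize the per-lane IDs used by the stack pointer computation inside the
kernel with a packed integer vector immediate, instead of uploading them from
the host through the curbe payload.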

V2: call functions from ctx, instead of ctx.registerAllocator
Signed-off-by: Guo Yejun <yejun.guo at intel.com>
---
 backend/src/backend/context.cpp            | 10 ++++-----
 backend/src/backend/context.hpp            |  2 +-
 backend/src/backend/gen75_context.cpp      |  4 +++-
 backend/src/backend/gen_context.cpp        | 33 ++++++++++++++++++++++++++++--
 backend/src/backend/gen_context.hpp        |  2 ++
 backend/src/backend/gen_reg_allocation.cpp | 27 +++++++++++++++---------
 backend/src/backend/program.h              |  1 -
 backend/src/backend/program.hpp            |  2 +-
 src/cl_command_queue_gen7.c                |  9 --------
 9 files changed, 60 insertions(+), 30 deletions(-)

diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index b8dfa8c..33b2409 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -373,8 +373,8 @@ namespace gbe
     return this->kernel;
   }
 
-  int16_t Context::allocate(int16_t size, int16_t alignment) {
-    return registerAllocator->allocate(size, alignment);
+  int16_t Context::allocate(int16_t size, int16_t alignment, bool bFwd) {
+    return registerAllocator->allocate(size, alignment, bFwd);
   }
 
   void Context::deallocate(int16_t offset) { registerAllocator->deallocate(offset); }
@@ -396,10 +396,10 @@ namespace gbe
 
   void Context::buildStack(void) {
     const auto &stackUse = dag->getUse(ir::ocl::stackptr);
-    if (stackUse.size() == 0)  // no stack is used if stackptr is unused
+    if (stackUse.size() == 0) {  // no stack is used if stackptr is unused
+      this->kernel->stackSize = 0;
       return;
-    // Be sure that the stack pointer is set
-    // GBE_ASSERT(this->kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) >= 0);
+    }
     uint32_t stackSize = 128;
     while (stackSize < fn.getStackSize()) {
       stackSize *= 3;
diff --git a/backend/src/backend/context.hpp b/backend/src/backend/context.hpp
index faa7c8a..079967d 100644
--- a/backend/src/backend/context.hpp
+++ b/backend/src/backend/context.hpp
@@ -85,7 +85,7 @@ namespace gbe
       return JIPs.find(insn) != JIPs.end();
     }
     /*! Allocate some memory in the register file */
-    int16_t allocate(int16_t size, int16_t alignment);
+    int16_t allocate(int16_t size, int16_t alignment, bool bFwd=0);
     /*! Deallocate previously allocated memory */
     void deallocate(int16_t offset);
     /*! Spilt a block into 2 blocks, for some registers allocate together but  deallocate seperate */
diff --git a/backend/src/backend/gen75_context.cpp b/backend/src/backend/gen75_context.cpp
index b9dfb18..7d407c3 100644
--- a/backend/src/backend/gen75_context.cpp
+++ b/backend/src/backend/gen75_context.cpp
@@ -67,7 +67,7 @@ namespace gbe
     using namespace ir;
 
     // Only emit stack pointer computation if we use a stack
-    if (kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) <= 0)
+    if (kernel->getStackSize() == 0)
       return;
 
     // Check that everything is consistent in the kernel code
@@ -80,6 +80,8 @@ namespace gbe
       GenRegister::ud16grf(ir::ocl::stackptr);
     const GenRegister stackptr = ra->genReg(selStatckPtr);
 
+    loadLaneID(stackptr);
+
     // We compute the per-lane stack pointer here
     // private address start from zero
     p->push();
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 0c301dd..25fdf08 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -176,11 +176,39 @@ namespace gbe
     p->pop();
   }
 
+  void GenContext::loadLaneID(GenRegister dst) {
+    const GenRegister laneID = GenRegister::immv(0x76543210);
+    GenRegister dst_;
+    if (dst.type == GEN_TYPE_UW)
+      dst_ = dst;
+    else
+      dst_ = GenRegister::uw16grf(126,0);
+
+    p->push();
+      uint32_t execWidth = p->curr.execWidth;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      if (execWidth == 8)
+        p->MOV(dst_, laneID);
+      else {
+        p->curr.execWidth = 8;
+        p->MOV(dst_, laneID);
+        //Packed Unsigned Half-Byte Integer Vector does not work
+        //have to mock by adding 8 to the signed vector
+        const GenRegister eight = GenRegister::immuw(8);
+        p->ADD(GenRegister::offset(dst_, 0, 16), dst_, eight);
+        p->curr.execWidth = 16;
+      }
+      if (dst.type != GEN_TYPE_UW)
+        p->MOV(dst, dst_);
+    p->pop();
+  }
+
   void GenContext::emitStackPointer(void) {
     using namespace ir;
 
     // Only emit stack pointer computation if we use a stack
-    if (kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) <= 0)
+    if (kernel->getStackSize() == 0)
       return;
 
     // Check that everything is consistent in the kernel code
@@ -193,6 +221,8 @@ namespace gbe
       GenRegister::ud16grf(ir::ocl::stackptr);
     const GenRegister stackptr = ra->genReg(selStatckPtr);
 
+    loadLaneID(stackptr);
+
     // We compute the per-lane stack pointer here
     // threadId * perThreadSize + laneId*perLaneSize
     // let private address start from zero
@@ -2254,7 +2284,6 @@ namespace gbe
         INSERT_REG(numgroup0, GROUP_NUM_X)
         INSERT_REG(numgroup1, GROUP_NUM_Y)
         INSERT_REG(numgroup2, GROUP_NUM_Z)
-        INSERT_REG(stackptr, STACK_POINTER)
         INSERT_REG(printfbptr, PRINTF_BUF_POINTER)
         INSERT_REG(printfiptr, PRINTF_INDEX_POINTER)
         do {} while(0);
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 8ef725f..34f9293 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -107,6 +107,8 @@ namespace gbe
       return this->liveness->getLiveIn(bb);
     }
 
+    void loadLaneID(GenRegister dst);
+
     void collectShifter(GenRegister dest, GenRegister src);
     void loadTopHalf(GenRegister dest, GenRegister src);
     void storeTopHalf(GenRegister dest, GenRegister src);
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index 4cb88e9..39f1934 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -133,8 +133,8 @@ namespace gbe
     void validateFlag(Selection &selection, SelectionInstruction &insn);
     /*! Allocate the GRF registers */
     bool allocateGRFs(Selection &selection);
-    /*! Create gen registers for all preallocated curbe registers. */
-    void allocatePayloadRegs(void);
+    /*! Create gen registers for all preallocated special registers. */
+    void allocateSpecialRegs(void);
     /*! Create a Gen register from a register set in the payload */
     void allocatePayloadReg(ir::Register, uint32_t offset, uint32_t subOffset = 0);
     /*! Create the intervals for each register */
@@ -228,7 +228,7 @@ namespace gbe
     this->intervals[reg].maxID = 0;
   }
 
-  INLINE void GenRegAllocator::Opaque::allocatePayloadRegs(void) {
+  INLINE void GenRegAllocator::Opaque::allocateSpecialRegs(void) {
     using namespace ir;
     for(auto &it : this->ctx.curbeRegs)
       allocatePayloadReg(it.first, it.second);
@@ -248,6 +248,19 @@ namespace gbe
       allocatePayloadReg(reg, it->second, subOffset);
       ctx.splitBlock(it->second, subOffset);
     }
+
+    if (RA.contains(ocl::stackbuffer)) {
+      uint32_t regSize = 0;
+      this->getRegAttrib(ocl::stackptr, regSize);
+      uint32_t offset = this->ctx.allocate(regSize, regSize, 1);
+      RA.insert(std::make_pair(ocl::stackptr, offset));
+    }
+
+    // Group and barrier IDs are always allocated by the hardware in r0
+    RA.insert(std::make_pair(ocl::groupid0,  1*sizeof(float))); // r0.1
+    RA.insert(std::make_pair(ocl::groupid1,  6*sizeof(float))); // r0.6
+    RA.insert(std::make_pair(ocl::groupid2,  7*sizeof(float))); // r0.7
+    RA.insert(std::make_pair(ocl::barrierid, 2*sizeof(float))); // r0.2
   }
 
   bool GenRegAllocator::Opaque::createGenReg(const Selection &selection, const GenRegInterval &interval) {
@@ -1001,13 +1014,7 @@ namespace gbe
       this->intervals.push_back(ir::Register(regID));
 
     // Allocate the special registers (only those which are actually used)
-    this->allocatePayloadRegs();
-
-    // Group and barrier IDs are always allocated by the hardware in r0
-    RA.insert(std::make_pair(ocl::groupid0,  1*sizeof(float))); // r0.1
-    RA.insert(std::make_pair(ocl::groupid1,  6*sizeof(float))); // r0.6
-    RA.insert(std::make_pair(ocl::groupid2,  7*sizeof(float))); // r0.7
-    RA.insert(std::make_pair(ocl::barrierid, 2*sizeof(float))); // r0.2
+    this->allocateSpecialRegs();
 
     // block IP used to handle the mask in SW is always allocated
 
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index fa75052..af19732 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -91,7 +91,6 @@ enum gbe_curbe_type {
   GBE_CURBE_GROUP_NUM_Z,
   GBE_CURBE_WORK_DIM,
   GBE_CURBE_IMAGE_INFO,
-  GBE_CURBE_STACK_POINTER,
   GBE_CURBE_PRINTF_BUF_POINTER,
   GBE_CURBE_PRINTF_INDEX_POINTER,
   GBE_CURBE_KERNEL_ARGUMENT,
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
index cff2463..efe192f 100644
--- a/backend/src/backend/program.hpp
+++ b/backend/src/backend/program.hpp
@@ -223,7 +223,7 @@ namespace gbe {
     uint32_t argNum;           //!< Number of function arguments
     uint32_t curbeSize;        //!< Size of the data to push
     uint32_t simdWidth;        //!< SIMD size for the kernel (lane number)
-    uint32_t stackSize;        //!< Stack size (may be 0 if unused)
+    uint32_t stackSize;        //!< Stack size (0 if unused)
     uint32_t scratchSize;      //!< Scratch memory size (may be 0 if unused)
     bool useSLM;               //!< SLM requires a special HW config
     uint32_t slmSize;          //!< slm size for kernel variable
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 4adbd2b..0e60528 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -210,15 +210,6 @@ cl_curbe_fill(cl_kernel ker,
   UPLOAD(GBE_CURBE_WORK_DIM, work_dim);
 #undef UPLOAD
 
-  /* Write identity for the stack pointer. This is required by the stack pointer
-   * computation in the kernel
-   */
-  if ((offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_STACK_POINTER, 0)) >= 0) {
-    const uint32_t simd_sz = interp_kernel_get_simd_width(ker->opaque);
-    uint32_t *stackptr = (uint32_t *) (ker->curbe + offset);
-    int32_t i;
-    for (i = 0; i < (int32_t) simd_sz; ++i) stackptr[i] = i;
-  }
   /* Handle the various offsets to SLM */
   const int32_t arg_n = interp_kernel_get_arg_num(ker->opaque);
   int32_t arg, slm_offset = interp_kernel_get_slm_size(ker->opaque);
-- 
1.9.1