[Beignet] [PATCH] remove GBE_CURBE_STACK_POINTER in payload

Thu Aug 13 11:52:29 PDT 2015

Initialize the stack pointer lane IDs inside the kernel with a packed integer vector instead of uploading them from the host through the curbe payload.

Signed-off-by: Guo Yejun <yejun.guo at intel.com>
---
 backend/src/backend/context.cpp            | 66 ++----------------------------
 backend/src/backend/context.hpp            | 61 ++++++++++++++++++++++++++-
 backend/src/backend/gen75_context.cpp      |  4 +-
 backend/src/backend/gen_context.cpp        | 33 ++++++++++++++-
 backend/src/backend/gen_context.hpp        |  2 +
 backend/src/backend/gen_reg_allocation.cpp | 26 +++++++-----
 backend/src/backend/program.h              |  1 -
 backend/src/backend/program.hpp            |  2 +-
 src/cl_command_queue_gen7.c                |  9 ----
 9 files changed, 116 insertions(+), 88 deletions(-)

diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index b8dfa8c..16b5961 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -35,66 +35,6 @@
 
 namespace gbe
 {
-  class SimpleAllocator
-  {
-  public:
-    SimpleAllocator(int16_t startOffset, int16_t size, bool _assertFail);
-    ~SimpleAllocator(void);
-
-    /*! Allocate some memory from the pool.
-     */
-    int16_t allocate(int16_t size, int16_t alignment, bool bFwd=false);
-
-    /*! Free the given register file piece */
-    void deallocate(int16_t offset);
-
-    /*! Spilt a block into 2 blocks */
-    void splitBlock(int16_t offset, int16_t subOffset);
-
-  protected:
-    /*! Double chained list of free spaces */
-    struct Block {
-      Block(int16_t offset, int16_t size) :
-        prev(NULL), next(NULL), offset(offset), size(size) {}
-      Block *prev, *next; //!< Previous and next free blocks
-      int16_t offset;        //!< Where the free block starts
-      int16_t size;          //!< Size of the free block
-    };
-
-    /*! Try to coalesce two blocks (left and right). They must be in that order.
-     *  If the colascing was done, the left block is deleted
-     */
-    void coalesce(Block *left, Block *right);
-    /*! the maximum offset */
-    int16_t maxOffset;
-    /*! whether trigger an assertion on allocation failure */
-    bool assertFail;
-    /*! Head and tail of the free list */
-    Block *head;
-    Block *tail;
-    /*! Handle free list element allocation */
-    DECL_POOL(Block, blockPool);
-    /*! Track allocated memory blocks <offset, size> */
-    map<int16_t, int16_t> allocatedBlocks;
-    /*! Use custom allocators */
-    GBE_CLASS(SimpleAllocator);
-  };
-
-  /*! Structure that keeps track of allocation in the register file. This is
-   *  actually needed by Context (and not only by GenContext) because both
-   *  simulator and hardware have to deal with constant pushing which uses the
-   *  register file
-   *
-   *  Since Gen is pretty flexible, we just reuse the Simpleallocator
-   */
-
-  class RegisterAllocator: public SimpleAllocator {
-  public:
-    RegisterAllocator(int16_t offset, int16_t size): SimpleAllocator(offset, size, false) {}
-
-    GBE_CLASS(RegisterAllocator);
-  };
-
   /*!
    * an allocator for scratch memory allocation. Scratch memory are used for register spilling.
    * You can query how much scratch memory needed through getMaxScatchMemUsed().
@@ -396,10 +336,10 @@ namespace gbe
 
   void Context::buildStack(void) {
     const auto &stackUse = dag->getUse(ir::ocl::stackptr);
-    if (stackUse.size() == 0)  // no stack is used if stackptr is unused
+    if (stackUse.size() == 0) {  // no stack is used if stackptr is unused
+      this->kernel->stackSize = 0;
       return;
-    // Be sure that the stack pointer is set
-    // GBE_ASSERT(this->kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) >= 0);
+    }
     uint32_t stackSize = 128;
     while (stackSize < fn.getStackSize()) {
       stackSize *= 3;
diff --git a/backend/src/backend/context.hpp b/backend/src/backend/context.hpp
index faa7c8a..48b46ed 100644
--- a/backend/src/backend/context.hpp
+++ b/backend/src/backend/context.hpp
@@ -42,9 +42,68 @@ namespace ir {
 namespace gbe
 {
   class Kernel;                 // context creates Kernel
-  class RegisterAllocator;      // allocator for physical register allocation
   class ScratchAllocator;       // allocator for scratch memory allocation
 
+  class SimpleAllocator
+  {
+  public:
+    SimpleAllocator(int16_t startOffset, int16_t size, bool _assertFail);
+    ~SimpleAllocator(void);
+
+    /*! Allocate some memory from the pool.
+     */
+    int16_t allocate(int16_t size, int16_t alignment, bool bFwd=false);
+
+    /*! Free the given register file piece */
+    void deallocate(int16_t offset);
+
+    /*! Split a block into 2 blocks */
+    void splitBlock(int16_t offset, int16_t subOffset);
+
+  protected:
+    /*! Double chained list of free spaces */
+    struct Block {
+      Block(int16_t offset, int16_t size) :
+        prev(NULL), next(NULL), offset(offset), size(size) {}
+      Block *prev, *next; //!< Previous and next free blocks
+      int16_t offset;        //!< Where the free block starts
+      int16_t size;          //!< Size of the free block
+    };
+
+    /*! Try to coalesce two blocks (left and right). They must be in that order.
+     *  If the coalescing was done, the left block is deleted
+     */
+    void coalesce(Block *left, Block *right);
+    /*! the maximum offset */
+    int16_t maxOffset;
+    /*! whether trigger an assertion on allocation failure */
+    bool assertFail;
+    /*! Head and tail of the free list */
+    Block *head;
+    Block *tail;
+    /*! Handle free list element allocation */
+    DECL_POOL(Block, blockPool);
+    /*! Track allocated memory blocks <offset, size> */
+    map<int16_t, int16_t> allocatedBlocks;
+    /*! Use custom allocators */
+    GBE_CLASS(SimpleAllocator);
+  };
+
+  /*! Structure that keeps track of allocation in the register file. This is
+   *  actually needed by Context (and not only by GenContext) because both
+   *  simulator and hardware have to deal with constant pushing which uses the
+   *  register file
+   *
+   *  Since Gen is pretty flexible, we just reuse the Simpleallocator
+   */
+
+  class RegisterAllocator: public SimpleAllocator {
+  public:
+    RegisterAllocator(int16_t offset, int16_t size): SimpleAllocator(offset, size, false) {}
+
+    GBE_CLASS(RegisterAllocator);
+  };
+
   /*! Context is the helper structure to build the Gen ISA or simulation code
    *  from GenIR
    */
diff --git a/backend/src/backend/gen75_context.cpp b/backend/src/backend/gen75_context.cpp
index b9dfb18..7d407c3 100644
--- a/backend/src/backend/gen75_context.cpp
+++ b/backend/src/backend/gen75_context.cpp
@@ -67,7 +67,7 @@ namespace gbe
     using namespace ir;
 
     // Only emit stack pointer computation if we use a stack
-    if (kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) <= 0)
+    if (kernel->getStackSize() == 0)
       return;
 
     // Check that everything is consistent in the kernel code
@@ -80,6 +80,8 @@ namespace gbe
       GenRegister::ud16grf(ir::ocl::stackptr);
     const GenRegister stackptr = ra->genReg(selStatckPtr);
 
+    loadLaneID(stackptr);
+
     // We compute the per-lane stack pointer here
     // private address start from zero
     p->push();
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 0c301dd..25fdf08 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -176,11 +176,39 @@ namespace gbe
     p->pop();
   }
 
+  void GenContext::loadLaneID(GenRegister dst) {
+    const GenRegister laneID = GenRegister::immv(0x76543210);
+    GenRegister dst_;
+    if (dst.type == GEN_TYPE_UW)
+      dst_ = dst;
+    else
+      dst_ = GenRegister::uw16grf(126,0);
+
+    p->push();
+      uint32_t execWidth = p->curr.execWidth;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      if (execWidth == 8)
+        p->MOV(dst_, laneID);
+      else {
+        p->curr.execWidth = 8;
+        p->MOV(dst_, laneID);
+        // The packed unsigned half-byte integer vector does not work, so we
+        // have to emulate it by adding 8 to the signed vector.
+        const GenRegister eight = GenRegister::immuw(8);
+        p->ADD(GenRegister::offset(dst_, 0, 16), dst_, eight);
+        p->curr.execWidth = 16;
+      }
+      if (dst.type != GEN_TYPE_UW)
+        p->MOV(dst, dst_);
+    p->pop();
+  }
+
   void GenContext::emitStackPointer(void) {
     using namespace ir;
 
     // Only emit stack pointer computation if we use a stack
-    if (kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) <= 0)
+    if (kernel->getStackSize() == 0)
       return;
 
     // Check that everything is consistent in the kernel code
@@ -193,6 +221,8 @@ namespace gbe
       GenRegister::ud16grf(ir::ocl::stackptr);
     const GenRegister stackptr = ra->genReg(selStatckPtr);
 
+    loadLaneID(stackptr);
+
     // We compute the per-lane stack pointer here
     // threadId * perThreadSize + laneId*perLaneSize
     // let private address start from zero
@@ -2254,7 +2284,6 @@ namespace gbe
         INSERT_REG(numgroup0, GROUP_NUM_X)
         INSERT_REG(numgroup1, GROUP_NUM_Y)
         INSERT_REG(numgroup2, GROUP_NUM_Z)
-        INSERT_REG(stackptr, STACK_POINTER)
         INSERT_REG(printfbptr, PRINTF_BUF_POINTER)
         INSERT_REG(printfiptr, PRINTF_INDEX_POINTER)
         do {} while(0);
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 8ef725f..34f9293 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -107,6 +107,8 @@ namespace gbe
       return this->liveness->getLiveIn(bb);
     }
 
+    void loadLaneID(GenRegister dst);
+
     void collectShifter(GenRegister dest, GenRegister src);
     void loadTopHalf(GenRegister dest, GenRegister src);
     void storeTopHalf(GenRegister dest, GenRegister src);
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index 4cb88e9..36ee80a 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -133,8 +133,8 @@ namespace gbe
     void validateFlag(Selection &selection, SelectionInstruction &insn);
     /*! Allocate the GRF registers */
     bool allocateGRFs(Selection &selection);
-    /*! Create gen registers for all preallocated curbe registers. */
-    void allocatePayloadRegs(void);
+    /*! Create gen registers for all preallocated special registers. */
+    void allocateSpecialRegs(void);
     /*! Create a Gen register from a register set in the payload */
     void allocatePayloadReg(ir::Register, uint32_t offset, uint32_t subOffset = 0);
     /*! Create the intervals for each register */
@@ -228,7 +228,7 @@ namespace gbe
     this->intervals[reg].maxID = 0;
   }
 
-  INLINE void GenRegAllocator::Opaque::allocatePayloadRegs(void) {
+  INLINE void GenRegAllocator::Opaque::allocateSpecialRegs(void) {
     using namespace ir;
     for(auto &it : this->ctx.curbeRegs)
       allocatePayloadReg(it.first, it.second);
@@ -248,6 +248,18 @@ namespace gbe
       allocatePayloadReg(reg, it->second, subOffset);
       ctx.splitBlock(it->second, subOffset);
     }
+
+    if (RA.contains(ocl::stackbuffer)) {
+      uint32_t regSize = this->ctx.ra->getRegSize(ocl::stackptr);
+      uint32_t offset = this->ctx.registerAllocator->allocate(regSize, regSize, 1);
+      RA.insert(std::make_pair(ocl::stackptr, offset));
+    }
+
+    // Group and barrier IDs are always allocated by the hardware in r0
+    RA.insert(std::make_pair(ocl::groupid0,  1*sizeof(float))); // r0.1
+    RA.insert(std::make_pair(ocl::groupid1,  6*sizeof(float))); // r0.6
+    RA.insert(std::make_pair(ocl::groupid2,  7*sizeof(float))); // r0.7
+    RA.insert(std::make_pair(ocl::barrierid, 2*sizeof(float))); // r0.2
   }
 
   bool GenRegAllocator::Opaque::createGenReg(const Selection &selection, const GenRegInterval &interval) {
@@ -1001,13 +1013,7 @@ namespace gbe
       this->intervals.push_back(ir::Register(regID));
 
     // Allocate the special registers (only those which are actually used)
-    this->allocatePayloadRegs();
-
-    // Group and barrier IDs are always allocated by the hardware in r0
-    RA.insert(std::make_pair(ocl::groupid0,  1*sizeof(float))); // r0.1
-    RA.insert(std::make_pair(ocl::groupid1,  6*sizeof(float))); // r0.6
-    RA.insert(std::make_pair(ocl::groupid2,  7*sizeof(float))); // r0.7
-    RA.insert(std::make_pair(ocl::barrierid, 2*sizeof(float))); // r0.2
+    this->allocateSpecialRegs();
 
     // block IP used to handle the mask in SW is always allocated
 
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index fa75052..af19732 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -91,7 +91,6 @@ enum gbe_curbe_type {
   GBE_CURBE_GROUP_NUM_Z,
   GBE_CURBE_WORK_DIM,
   GBE_CURBE_IMAGE_INFO,
-  GBE_CURBE_STACK_POINTER,
   GBE_CURBE_PRINTF_BUF_POINTER,
   GBE_CURBE_PRINTF_INDEX_POINTER,
   GBE_CURBE_KERNEL_ARGUMENT,
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
index cff2463..efe192f 100644
--- a/backend/src/backend/program.hpp
+++ b/backend/src/backend/program.hpp
@@ -223,7 +223,7 @@ namespace gbe {
     uint32_t argNum;           //!< Number of function arguments
     uint32_t curbeSize;        //!< Size of the data to push
     uint32_t simdWidth;        //!< SIMD size for the kernel (lane number)
-    uint32_t stackSize;        //!< Stack size (may be 0 if unused)
+    uint32_t stackSize;        //!< Stack size (0 if unused)
     uint32_t scratchSize;      //!< Scratch memory size (may be 0 if unused)
     bool useSLM;               //!< SLM requires a special HW config
     uint32_t slmSize;          //!< slm size for kernel variable
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 4adbd2b..0e60528 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -210,15 +210,6 @@ cl_curbe_fill(cl_kernel ker,
   UPLOAD(GBE_CURBE_WORK_DIM, work_dim);
 #undef UPLOAD
 
-  /* Write identity for the stack pointer. This is required by the stack pointer
-   * computation in the kernel
-   */
-  if ((offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_STACK_POINTER, 0)) >= 0) {
-    const uint32_t simd_sz = interp_kernel_get_simd_width(ker->opaque);
-    uint32_t *stackptr = (uint32_t *) (ker->curbe + offset);
-    int32_t i;
-    for (i = 0; i < (int32_t) simd_sz; ++i) stackptr[i] = i;
-  }
   /* Handle the various offsets to SLM */
   const int32_t arg_n = interp_kernel_get_arg_num(ker->opaque);
   int32_t arg, slm_offset = interp_kernel_get_slm_size(ker->opaque);
-- 
1.9.1