[Beignet] [PATCH] remove GBE_CURBE_STACK_POINTER in payload

Mon Aug 24 01:59:44 PDT 2015

Some comments.

> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> Guo Yejun
> Sent: Friday, August 14, 2015 02:52
> To: beignet at lists.freedesktop.org
> Cc: Guo, Yejun
> Subject: [Beignet] [PATCH] remove GBE_CURBE_STACK_POINTER in payload
> 
> initialize the data inside kernel with packed integer vector
> 
> Signed-off-by: Guo Yejun <yejun.guo at intel.com>
> ---
>  backend/src/backend/context.cpp            | 66 ++----------------------------
>  backend/src/backend/context.hpp            | 61
> ++++++++++++++++++++++++++-
>  backend/src/backend/gen75_context.cpp      |  4 +-
>  backend/src/backend/gen_context.cpp        | 33 ++++++++++++++-
>  backend/src/backend/gen_context.hpp        |  2 +
>  backend/src/backend/gen_reg_allocation.cpp | 26 +++++++-----
>  backend/src/backend/program.h              |  1 -
>  backend/src/backend/program.hpp            |  2 +-
>  src/cl_command_queue_gen7.c                |  9 ----
>  9 files changed, 116 insertions(+), 88 deletions(-)
> 
> diff --git a/backend/src/backend/context.cpp
> b/backend/src/backend/context.cpp index b8dfa8c..16b5961 100644
> --- a/backend/src/backend/context.cpp
> +++ b/backend/src/backend/context.cpp
> @@ -35,66 +35,6 @@
> 
>  namespace gbe
>  {
> -  class SimpleAllocator
> -  {
> -  public:
> -    SimpleAllocator(int16_t startOffset, int16_t size, bool _assertFail);
> -    ~SimpleAllocator(void);
> -
> -    /*! Allocate some memory from the pool.
> -     */
> -    int16_t allocate(int16_t size, int16_t alignment, bool bFwd=false);
> -
> -    /*! Free the given register file piece */
> -    void deallocate(int16_t offset);
> -
> -    /*! Spilt a block into 2 blocks */
> -    void splitBlock(int16_t offset, int16_t subOffset);
> -
> -  protected:
> -    /*! Double chained list of free spaces */
> -    struct Block {
> -      Block(int16_t offset, int16_t size) :
> -        prev(NULL), next(NULL), offset(offset), size(size) {}
> -      Block *prev, *next; //!< Previous and next free blocks
> -      int16_t offset;        //!< Where the free block starts
> -      int16_t size;          //!< Size of the free block
> -    };
> -
> -    /*! Try to coalesce two blocks (left and right). They must be in that order.
> -     *  If the colascing was done, the left block is deleted
> -     */
> -    void coalesce(Block *left, Block *right);
> -    /*! the maximum offset */
> -    int16_t maxOffset;
> -    /*! whether trigger an assertion on allocation failure */
> -    bool assertFail;
> -    /*! Head and tail of the free list */
> -    Block *head;
> -    Block *tail;
> -    /*! Handle free list element allocation */
> -    DECL_POOL(Block, blockPool);
> -    /*! Track allocated memory blocks <offset, size> */
> -    map<int16_t, int16_t> allocatedBlocks;
> -    /*! Use custom allocators */
> -    GBE_CLASS(SimpleAllocator);
> -  };
> -
> -  /*! Structure that keeps track of allocation in the register file. This is
> -   *  actually needed by Context (and not only by GenContext) because both
> -   *  simulator and hardware have to deal with constant pushing which uses
> the
> -   *  register file
> -   *
> -   *  Since Gen is pretty flexible, we just reuse the Simpleallocator
> -   */
> -
> -  class RegisterAllocator: public SimpleAllocator {
> -  public:
> -    RegisterAllocator(int16_t offset, int16_t size): SimpleAllocator(offset, size,
> false) {}
> -
> -    GBE_CLASS(RegisterAllocator);
> -  };
> -
>    /*!
>     * an allocator for scratch memory allocation. Scratch memory are used for
> register spilling.
>     * You can query how much scratch memory needed through
> getMaxScatchMemUsed().
> @@ -396,10 +336,10 @@ namespace gbe
> 
>    void Context::buildStack(void) {
>      const auto &stackUse = dag->getUse(ir::ocl::stackptr);
> -    if (stackUse.size() == 0)  // no stack is used if stackptr is unused
> +    if (stackUse.size() == 0) {  // no stack is used if stackptr is unused
> +      this->kernel->stackSize = 0;
>        return;
> -    // Be sure that the stack pointer is set
> -    // GBE_ASSERT(this->kernel-
> >getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) >= 0);
> +    }
>      uint32_t stackSize = 128;
>      while (stackSize < fn.getStackSize()) {
>        stackSize *= 3;
> diff --git a/backend/src/backend/context.hpp
> b/backend/src/backend/context.hpp index faa7c8a..48b46ed 100644
> --- a/backend/src/backend/context.hpp
> +++ b/backend/src/backend/context.hpp
> @@ -42,9 +42,68 @@ namespace ir {
>  namespace gbe
>  {
>    class Kernel;                 // context creates Kernel
> -  class RegisterAllocator;      // allocator for physical register allocation
>    class ScratchAllocator;       // allocator for scratch memory allocation
> 
> +  class SimpleAllocator
> +  {
> +  public:
> +    SimpleAllocator(int16_t startOffset, int16_t size, bool _assertFail);
> +    ~SimpleAllocator(void);
> +
> +    /*! Allocate some memory from the pool.
> +     */
> +    int16_t allocate(int16_t size, int16_t alignment, bool bFwd=false);
> +
> +    /*! Free the given register file piece */
> +    void deallocate(int16_t offset);
> +
> +    /*! Spilt a block into 2 blocks */
> +    void splitBlock(int16_t offset, int16_t subOffset);
> +
> +  protected:
> +    /*! Double chained list of free spaces */
> +    struct Block {
> +      Block(int16_t offset, int16_t size) :
> +        prev(NULL), next(NULL), offset(offset), size(size) {}
> +      Block *prev, *next; //!< Previous and next free blocks
> +      int16_t offset;        //!< Where the free block starts
> +      int16_t size;          //!< Size of the free block
> +    };
> +
> +    /*! Try to coalesce two blocks (left and right). They must be in that order.
> +     *  If the colascing was done, the left block is deleted
> +     */
> +    void coalesce(Block *left, Block *right);
> +    /*! the maximum offset */
> +    int16_t maxOffset;
> +    /*! whether trigger an assertion on allocation failure */
> +    bool assertFail;
> +    /*! Head and tail of the free list */
> +    Block *head;
> +    Block *tail;
> +    /*! Handle free list element allocation */
> +    DECL_POOL(Block, blockPool);
> +    /*! Track allocated memory blocks <offset, size> */
> +    map<int16_t, int16_t> allocatedBlocks;
> +    /*! Use custom allocators */
> +    GBE_CLASS(SimpleAllocator);
> +  };
> +
> +  /*! Structure that keeps track of allocation in the register file. This is
> +   *  actually needed by Context (and not only by GenContext) because both
> +   *  simulator and hardware have to deal with constant pushing which uses
> the
> +   *  register file
> +   *
> +   *  Since Gen is pretty flexible, we just reuse the Simpleallocator
> +   */
> +
> +  class RegisterAllocator: public SimpleAllocator {
> +  public:
> +    RegisterAllocator(int16_t offset, int16_t size):
> + SimpleAllocator(offset, size, false) {}
> +
> +    GBE_CLASS(RegisterAllocator);
> +  };
> +
>    /*! Context is the helper structure to build the Gen ISA or simulation code
>     *  from GenIR
>     */
> diff --git a/backend/src/backend/gen75_context.cpp
> b/backend/src/backend/gen75_context.cpp
> index b9dfb18..7d407c3 100644
> --- a/backend/src/backend/gen75_context.cpp
> +++ b/backend/src/backend/gen75_context.cpp
> @@ -67,7 +67,7 @@ namespace gbe
>      using namespace ir;
> 
>      // Only emit stack pointer computation if we use a stack
> -    if (kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) <= 0)
> +    if (kernel->getStackSize() == 0)
>        return;
> 
>      // Check that everything is consistent in the kernel code @@ -80,6 +80,8
> @@ namespace gbe
>        GenRegister::ud16grf(ir::ocl::stackptr);
>      const GenRegister stackptr = ra->genReg(selStatckPtr);
> 
> +    loadLaneID(stackptr);
> +
>      // We compute the per-lane stack pointer here
>      // private address start from zero
>      p->push();
> diff --git a/backend/src/backend/gen_context.cpp
> b/backend/src/backend/gen_context.cpp
> index 0c301dd..25fdf08 100644
> --- a/backend/src/backend/gen_context.cpp
> +++ b/backend/src/backend/gen_context.cpp
> @@ -176,11 +176,39 @@ namespace gbe
>      p->pop();
>    }
> 
> +  void GenContext::loadLaneID(GenRegister dst) {
> +    const GenRegister laneID = GenRegister::immv(0x76543210);
> +    GenRegister dst_;
> +    if (dst.type == GEN_TYPE_UW)
> +      dst_ = dst;
> +    else
> +      dst_ = GenRegister::uw16grf(126,0);
Does GEN only support move immv register to word or unsigned word?

> +
> +    p->push();
> +      uint32_t execWidth = p->curr.execWidth;
> +      p->curr.predicate = GEN_PREDICATE_NONE;
> +      p->curr.noMask = 1;
> +      if (execWidth == 8)
> +        p->MOV(dst_, laneID);
> +      else {
> +        p->curr.execWidth = 8;
> +        p->MOV(dst_, laneID);
> +        //Packed Unsigned Half-Byte Integer Vector does not work
> +        //have to mock by adding 8 to the singed vector
> +        const GenRegister eight = GenRegister::immuw(8);
> +        p->ADD(GenRegister::offset(dst_, 0, 16), dst_, eight);
> +        p->curr.execWidth = 16;
> +      }
> +      if (dst.type != GEN_TYPE_UW)
> +        p->MOV(dst, dst_);
> +    p->pop();
> +  }
> +
>    void GenContext::emitStackPointer(void) {
>      using namespace ir;
> 
>      // Only emit stack pointer computation if we use a stack
> -    if (kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) <= 0)
> +    if (kernel->getStackSize() == 0)
>        return;
> 
>      // Check that everything is consistent in the kernel code @@ -193,6 +221,8
> @@ namespace gbe
>        GenRegister::ud16grf(ir::ocl::stackptr);
>      const GenRegister stackptr = ra->genReg(selStatckPtr);
> 
> +    loadLaneID(stackptr);
> +
>      // We compute the per-lane stack pointer here
>      // threadId * perThreadSize + laneId*perLaneSize
>      // let private address start from zero @@ -2254,7 +2284,6 @@ namespace
> gbe
>          INSERT_REG(numgroup0, GROUP_NUM_X)
>          INSERT_REG(numgroup1, GROUP_NUM_Y)
>          INSERT_REG(numgroup2, GROUP_NUM_Z)
> -        INSERT_REG(stackptr, STACK_POINTER)
>          INSERT_REG(printfbptr, PRINTF_BUF_POINTER)
>          INSERT_REG(printfiptr, PRINTF_INDEX_POINTER)
>          do {} while(0);
> diff --git a/backend/src/backend/gen_context.hpp
> b/backend/src/backend/gen_context.hpp
> index 8ef725f..34f9293 100644
> --- a/backend/src/backend/gen_context.hpp
> +++ b/backend/src/backend/gen_context.hpp
> @@ -107,6 +107,8 @@ namespace gbe
>        return this->liveness->getLiveIn(bb);
>      }
> 
> +    void loadLaneID(GenRegister dst);
> +
>      void collectShifter(GenRegister dest, GenRegister src);
>      void loadTopHalf(GenRegister dest, GenRegister src);
>      void storeTopHalf(GenRegister dest, GenRegister src); diff --git
> a/backend/src/backend/gen_reg_allocation.cpp
> b/backend/src/backend/gen_reg_allocation.cpp
> index 4cb88e9..36ee80a 100644
> --- a/backend/src/backend/gen_reg_allocation.cpp
> +++ b/backend/src/backend/gen_reg_allocation.cpp
> @@ -133,8 +133,8 @@ namespace gbe
>      void validateFlag(Selection &selection, SelectionInstruction &insn);
>      /*! Allocate the GRF registers */
>      bool allocateGRFs(Selection &selection);
> -    /*! Create gen registers for all preallocated curbe registers. */
> -    void allocatePayloadRegs(void);
> +    /*! Create gen registers for all preallocated special registers. */
> +    void allocateSpecialRegs(void);
>      /*! Create a Gen register from a register set in the payload */
>      void allocatePayloadReg(ir::Register, uint32_t offset, uint32_t subOffset =
> 0);
>      /*! Create the intervals for each register */ @@ -228,7 +228,7 @@
> namespace gbe
>      this->intervals[reg].maxID = 0;
>    }
> 
> -  INLINE void GenRegAllocator::Opaque::allocatePayloadRegs(void) {
> +  INLINE void GenRegAllocator::Opaque::allocateSpecialRegs(void) {
>      using namespace ir;
>      for(auto &it : this->ctx.curbeRegs)
>        allocatePayloadReg(it.first, it.second); @@ -248,6 +248,18 @@
> namespace gbe
>        allocatePayloadReg(reg, it->second, subOffset);
>        ctx.splitBlock(it->second, subOffset);
>      }
> +
> +    if (RA.contains(ocl::stackbuffer)) {
> +      uint32_t regSize = this->ctx.ra->getRegSize(ocl::stackptr);
You can use this->getRegAttrib directly.

> +      uint32_t offset = this->ctx.registerAllocator->allocate(regSize, regSize, 1);
Should not call ctx.registerAllocator->allocate, you can use ctx.allocate. 

> +      RA.insert(std::make_pair(ocl::stackptr, offset));
> +    }
> +
> +    // Group and barrier IDs are always allocated by the hardware in r0
> +    RA.insert(std::make_pair(ocl::groupid0,  1*sizeof(float))); // r0.1
> +    RA.insert(std::make_pair(ocl::groupid1,  6*sizeof(float))); // r0.6
> +    RA.insert(std::make_pair(ocl::groupid2,  7*sizeof(float))); // r0.7
> +    RA.insert(std::make_pair(ocl::barrierid, 2*sizeof(float))); // r0.2
>    }
> 
>    bool GenRegAllocator::Opaque::createGenReg(const Selection &selection,
> const GenRegInterval &interval) { @@ -1001,13 +1013,7 @@ namespace gbe
>        this->intervals.push_back(ir::Register(regID));
> 
>      // Allocate the special registers (only those which are actually used)
> -    this->allocatePayloadRegs();
> -
> -    // Group and barrier IDs are always allocated by the hardware in r0
> -    RA.insert(std::make_pair(ocl::groupid0,  1*sizeof(float))); // r0.1
> -    RA.insert(std::make_pair(ocl::groupid1,  6*sizeof(float))); // r0.6
> -    RA.insert(std::make_pair(ocl::groupid2,  7*sizeof(float))); // r0.7
> -    RA.insert(std::make_pair(ocl::barrierid, 2*sizeof(float))); // r0.2
> +    this->allocateSpecialRegs();
> 
>      // block IP used to handle the mask in SW is always allocated
> 
> diff --git a/backend/src/backend/program.h
> b/backend/src/backend/program.h index fa75052..af19732 100644
> --- a/backend/src/backend/program.h
> +++ b/backend/src/backend/program.h
> @@ -91,7 +91,6 @@ enum gbe_curbe_type {
>    GBE_CURBE_GROUP_NUM_Z,
>    GBE_CURBE_WORK_DIM,
>    GBE_CURBE_IMAGE_INFO,
> -  GBE_CURBE_STACK_POINTER,
>    GBE_CURBE_PRINTF_BUF_POINTER,
>    GBE_CURBE_PRINTF_INDEX_POINTER,
>    GBE_CURBE_KERNEL_ARGUMENT,
> diff --git a/backend/src/backend/program.hpp
> b/backend/src/backend/program.hpp index cff2463..efe192f 100644
> --- a/backend/src/backend/program.hpp
> +++ b/backend/src/backend/program.hpp
> @@ -223,7 +223,7 @@ namespace gbe {
>      uint32_t argNum;           //!< Number of function arguments
>      uint32_t curbeSize;        //!< Size of the data to push
>      uint32_t simdWidth;        //!< SIMD size for the kernel (lane number)
> -    uint32_t stackSize;        //!< Stack size (may be 0 if unused)
> +    uint32_t stackSize;        //!< Stack size (0 if unused)
>      uint32_t scratchSize;      //!< Scratch memory size (may be 0 if unused)
>      bool useSLM;               //!< SLM requires a special HW config
>      uint32_t slmSize;          //!< slm size for kernel variable
> diff --git a/src/cl_command_queue_gen7.c
> b/src/cl_command_queue_gen7.c index 4adbd2b..0e60528 100644
> --- a/src/cl_command_queue_gen7.c
> +++ b/src/cl_command_queue_gen7.c
> @@ -210,15 +210,6 @@ cl_curbe_fill(cl_kernel ker,
>    UPLOAD(GBE_CURBE_WORK_DIM, work_dim);  #undef UPLOAD
> 
> -  /* Write identity for the stack pointer. This is required by the stack pointer
> -   * computation in the kernel
> -   */
> -  if ((offset = interp_kernel_get_curbe_offset(ker->opaque,
> GBE_CURBE_STACK_POINTER, 0)) >= 0) {
> -    const uint32_t simd_sz = interp_kernel_get_simd_width(ker->opaque);
> -    uint32_t *stackptr = (uint32_t *) (ker->curbe + offset);
> -    int32_t i;
> -    for (i = 0; i < (int32_t) simd_sz; ++i) stackptr[i] = i;
> -  }
>    /* Handle the various offsets to SLM */
>    const int32_t arg_n = interp_kernel_get_arg_num(ker->opaque);
>    int32_t arg, slm_offset = interp_kernel_get_slm_size(ker->opaque);
> --
> 1.9.1
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet