[Beignet] [PATCH] remove GBE_CURBE_STACK_POINTER in payload
Yang, Rong R
rong.r.yang at intel.com
Mon Aug 24 01:59:44 PDT 2015
Some comments inline below.
> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> Guo Yejun
> Sent: Friday, August 14, 2015 02:52
> To: beignet at lists.freedesktop.org
> Cc: Guo, Yejun
> Subject: [Beignet] [PATCH] remove GBE_CURBE_STACK_POINTER in payload
>
> initialize the data inside kernel with packed integer vector
>
> Signed-off-by: Guo Yejun <yejun.guo at intel.com>
> ---
> backend/src/backend/context.cpp | 66 ++----------------------------
> backend/src/backend/context.hpp | 61
> ++++++++++++++++++++++++++-
> backend/src/backend/gen75_context.cpp | 4 +-
> backend/src/backend/gen_context.cpp | 33 ++++++++++++++-
> backend/src/backend/gen_context.hpp | 2 +
> backend/src/backend/gen_reg_allocation.cpp | 26 +++++++-----
> backend/src/backend/program.h | 1 -
> backend/src/backend/program.hpp | 2 +-
> src/cl_command_queue_gen7.c | 9 ----
> 9 files changed, 116 insertions(+), 88 deletions(-)
>
> diff --git a/backend/src/backend/context.cpp
> b/backend/src/backend/context.cpp index b8dfa8c..16b5961 100644
> --- a/backend/src/backend/context.cpp
> +++ b/backend/src/backend/context.cpp
> @@ -35,66 +35,6 @@
>
> namespace gbe
> {
> - class SimpleAllocator
> - {
> - public:
> - SimpleAllocator(int16_t startOffset, int16_t size, bool _assertFail);
> - ~SimpleAllocator(void);
> -
> - /*! Allocate some memory from the pool.
> - */
> - int16_t allocate(int16_t size, int16_t alignment, bool bFwd=false);
> -
> - /*! Free the given register file piece */
> - void deallocate(int16_t offset);
> -
> - /*! Spilt a block into 2 blocks */
> - void splitBlock(int16_t offset, int16_t subOffset);
> -
> - protected:
> - /*! Double chained list of free spaces */
> - struct Block {
> - Block(int16_t offset, int16_t size) :
> - prev(NULL), next(NULL), offset(offset), size(size) {}
> - Block *prev, *next; //!< Previous and next free blocks
> - int16_t offset; //!< Where the free block starts
> - int16_t size; //!< Size of the free block
> - };
> -
> - /*! Try to coalesce two blocks (left and right). They must be in that order.
> - * If the colascing was done, the left block is deleted
> - */
> - void coalesce(Block *left, Block *right);
> - /*! the maximum offset */
> - int16_t maxOffset;
> - /*! whether trigger an assertion on allocation failure */
> - bool assertFail;
> - /*! Head and tail of the free list */
> - Block *head;
> - Block *tail;
> - /*! Handle free list element allocation */
> - DECL_POOL(Block, blockPool);
> - /*! Track allocated memory blocks <offset, size> */
> - map<int16_t, int16_t> allocatedBlocks;
> - /*! Use custom allocators */
> - GBE_CLASS(SimpleAllocator);
> - };
> -
> - /*! Structure that keeps track of allocation in the register file. This is
> - * actually needed by Context (and not only by GenContext) because both
> - * simulator and hardware have to deal with constant pushing which uses
> the
> - * register file
> - *
> - * Since Gen is pretty flexible, we just reuse the Simpleallocator
> - */
> -
> - class RegisterAllocator: public SimpleAllocator {
> - public:
> - RegisterAllocator(int16_t offset, int16_t size): SimpleAllocator(offset, size,
> false) {}
> -
> - GBE_CLASS(RegisterAllocator);
> - };
> -
> /*!
> * an allocator for scratch memory allocation. Scratch memory are used for
> register spilling.
> * You can query how much scratch memory needed through
> getMaxScatchMemUsed().
> @@ -396,10 +336,10 @@ namespace gbe
>
> void Context::buildStack(void) {
> const auto &stackUse = dag->getUse(ir::ocl::stackptr);
> - if (stackUse.size() == 0) // no stack is used if stackptr is unused
> + if (stackUse.size() == 0) { // no stack is used if stackptr is unused
> + this->kernel->stackSize = 0;
> return;
> - // Be sure that the stack pointer is set
> - // GBE_ASSERT(this->kernel-
> >getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) >= 0);
> + }
> uint32_t stackSize = 128;
> while (stackSize < fn.getStackSize()) {
> stackSize *= 3;
> diff --git a/backend/src/backend/context.hpp
> b/backend/src/backend/context.hpp index faa7c8a..48b46ed 100644
> --- a/backend/src/backend/context.hpp
> +++ b/backend/src/backend/context.hpp
> @@ -42,9 +42,68 @@ namespace ir {
> namespace gbe
> {
> class Kernel; // context creates Kernel
> - class RegisterAllocator; // allocator for physical register allocation
> class ScratchAllocator; // allocator for scratch memory allocation
>
> + class SimpleAllocator
> + {
> + public:
> + SimpleAllocator(int16_t startOffset, int16_t size, bool _assertFail);
> + ~SimpleAllocator(void);
> +
> + /*! Allocate some memory from the pool.
> + */
> + int16_t allocate(int16_t size, int16_t alignment, bool bFwd=false);
> +
> + /*! Free the given register file piece */
> + void deallocate(int16_t offset);
> +
> + /*! Spilt a block into 2 blocks */
> + void splitBlock(int16_t offset, int16_t subOffset);
> +
> + protected:
> + /*! Double chained list of free spaces */
> + struct Block {
> + Block(int16_t offset, int16_t size) :
> + prev(NULL), next(NULL), offset(offset), size(size) {}
> + Block *prev, *next; //!< Previous and next free blocks
> + int16_t offset; //!< Where the free block starts
> + int16_t size; //!< Size of the free block
> + };
> +
> + /*! Try to coalesce two blocks (left and right). They must be in that order.
> + * If the colascing was done, the left block is deleted
> + */
> + void coalesce(Block *left, Block *right);
> + /*! the maximum offset */
> + int16_t maxOffset;
> + /*! whether trigger an assertion on allocation failure */
> + bool assertFail;
> + /*! Head and tail of the free list */
> + Block *head;
> + Block *tail;
> + /*! Handle free list element allocation */
> + DECL_POOL(Block, blockPool);
> + /*! Track allocated memory blocks <offset, size> */
> + map<int16_t, int16_t> allocatedBlocks;
> + /*! Use custom allocators */
> + GBE_CLASS(SimpleAllocator);
> + };
> +
> + /*! Structure that keeps track of allocation in the register file. This is
> + * actually needed by Context (and not only by GenContext) because both
> + * simulator and hardware have to deal with constant pushing which uses
> the
> + * register file
> + *
> + * Since Gen is pretty flexible, we just reuse the Simpleallocator
> + */
> +
> + class RegisterAllocator: public SimpleAllocator {
> + public:
> + RegisterAllocator(int16_t offset, int16_t size):
> + SimpleAllocator(offset, size, false) {}
> +
> + GBE_CLASS(RegisterAllocator);
> + };
> +
> /*! Context is the helper structure to build the Gen ISA or simulation code
> * from GenIR
> */
> diff --git a/backend/src/backend/gen75_context.cpp
> b/backend/src/backend/gen75_context.cpp
> index b9dfb18..7d407c3 100644
> --- a/backend/src/backend/gen75_context.cpp
> +++ b/backend/src/backend/gen75_context.cpp
> @@ -67,7 +67,7 @@ namespace gbe
> using namespace ir;
>
> // Only emit stack pointer computation if we use a stack
> - if (kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) <= 0)
> + if (kernel->getStackSize() == 0)
> return;
>
> // Check that everything is consistent in the kernel code @@ -80,6 +80,8
> @@ namespace gbe
> GenRegister::ud16grf(ir::ocl::stackptr);
> const GenRegister stackptr = ra->genReg(selStatckPtr);
>
> + loadLaneID(stackptr);
> +
> // We compute the per-lane stack pointer here
> // private address start from zero
> p->push();
> diff --git a/backend/src/backend/gen_context.cpp
> b/backend/src/backend/gen_context.cpp
> index 0c301dd..25fdf08 100644
> --- a/backend/src/backend/gen_context.cpp
> +++ b/backend/src/backend/gen_context.cpp
> @@ -176,11 +176,39 @@ namespace gbe
> p->pop();
> }
>
> + void GenContext::loadLaneID(GenRegister dst) {
> + const GenRegister laneID = GenRegister::immv(0x76543210);
> + GenRegister dst_;
> + if (dst.type == GEN_TYPE_UW)
> + dst_ = dst;
> + else
> + dst_ = GenRegister::uw16grf(126,0);
Does Gen only support moving an immv (packed immediate vector) into a word or unsigned word register?
> +
> + p->push();
> + uint32_t execWidth = p->curr.execWidth;
> + p->curr.predicate = GEN_PREDICATE_NONE;
> + p->curr.noMask = 1;
> + if (execWidth == 8)
> + p->MOV(dst_, laneID);
> + else {
> + p->curr.execWidth = 8;
> + p->MOV(dst_, laneID);
> + //Packed Unsigned Half-Byte Integer Vector does not work
> + //have to mock by adding 8 to the singed vector
> + const GenRegister eight = GenRegister::immuw(8);
> + p->ADD(GenRegister::offset(dst_, 0, 16), dst_, eight);
> + p->curr.execWidth = 16;
> + }
> + if (dst.type != GEN_TYPE_UW)
> + p->MOV(dst, dst_);
> + p->pop();
> + }
> +
> void GenContext::emitStackPointer(void) {
> using namespace ir;
>
> // Only emit stack pointer computation if we use a stack
> - if (kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) <= 0)
> + if (kernel->getStackSize() == 0)
> return;
>
> // Check that everything is consistent in the kernel code @@ -193,6 +221,8
> @@ namespace gbe
> GenRegister::ud16grf(ir::ocl::stackptr);
> const GenRegister stackptr = ra->genReg(selStatckPtr);
>
> + loadLaneID(stackptr);
> +
> // We compute the per-lane stack pointer here
> // threadId * perThreadSize + laneId*perLaneSize
> // let private address start from zero @@ -2254,7 +2284,6 @@ namespace
> gbe
> INSERT_REG(numgroup0, GROUP_NUM_X)
> INSERT_REG(numgroup1, GROUP_NUM_Y)
> INSERT_REG(numgroup2, GROUP_NUM_Z)
> - INSERT_REG(stackptr, STACK_POINTER)
> INSERT_REG(printfbptr, PRINTF_BUF_POINTER)
> INSERT_REG(printfiptr, PRINTF_INDEX_POINTER)
> do {} while(0);
> diff --git a/backend/src/backend/gen_context.hpp
> b/backend/src/backend/gen_context.hpp
> index 8ef725f..34f9293 100644
> --- a/backend/src/backend/gen_context.hpp
> +++ b/backend/src/backend/gen_context.hpp
> @@ -107,6 +107,8 @@ namespace gbe
> return this->liveness->getLiveIn(bb);
> }
>
> + void loadLaneID(GenRegister dst);
> +
> void collectShifter(GenRegister dest, GenRegister src);
> void loadTopHalf(GenRegister dest, GenRegister src);
> void storeTopHalf(GenRegister dest, GenRegister src); diff --git
> a/backend/src/backend/gen_reg_allocation.cpp
> b/backend/src/backend/gen_reg_allocation.cpp
> index 4cb88e9..36ee80a 100644
> --- a/backend/src/backend/gen_reg_allocation.cpp
> +++ b/backend/src/backend/gen_reg_allocation.cpp
> @@ -133,8 +133,8 @@ namespace gbe
> void validateFlag(Selection &selection, SelectionInstruction &insn);
> /*! Allocate the GRF registers */
> bool allocateGRFs(Selection &selection);
> - /*! Create gen registers for all preallocated curbe registers. */
> - void allocatePayloadRegs(void);
> + /*! Create gen registers for all preallocated special registers. */
> + void allocateSpecialRegs(void);
> /*! Create a Gen register from a register set in the payload */
> void allocatePayloadReg(ir::Register, uint32_t offset, uint32_t subOffset =
> 0);
> /*! Create the intervals for each register */ @@ -228,7 +228,7 @@
> namespace gbe
> this->intervals[reg].maxID = 0;
> }
>
> - INLINE void GenRegAllocator::Opaque::allocatePayloadRegs(void) {
> + INLINE void GenRegAllocator::Opaque::allocateSpecialRegs(void) {
> using namespace ir;
> for(auto &it : this->ctx.curbeRegs)
> allocatePayloadReg(it.first, it.second); @@ -248,6 +248,18 @@
> namespace gbe
> allocatePayloadReg(reg, it->second, subOffset);
> ctx.splitBlock(it->second, subOffset);
> }
> +
> + if (RA.contains(ocl::stackbuffer)) {
> + uint32_t regSize = this->ctx.ra->getRegSize(ocl::stackptr);
You can use this->getRegAttrib directly here instead of this->ctx.ra->getRegSize.
> + uint32_t offset = this->ctx.registerAllocator->allocate(regSize, regSize, 1);
This should not call ctx.registerAllocator->allocate directly; you can use ctx.allocate instead.
> + RA.insert(std::make_pair(ocl::stackptr, offset));
> + }
> +
> + // Group and barrier IDs are always allocated by the hardware in r0
> + RA.insert(std::make_pair(ocl::groupid0, 1*sizeof(float))); // r0.1
> + RA.insert(std::make_pair(ocl::groupid1, 6*sizeof(float))); // r0.6
> + RA.insert(std::make_pair(ocl::groupid2, 7*sizeof(float))); // r0.7
> + RA.insert(std::make_pair(ocl::barrierid, 2*sizeof(float))); // r0.2
> }
>
> bool GenRegAllocator::Opaque::createGenReg(const Selection &selection,
> const GenRegInterval &interval) { @@ -1001,13 +1013,7 @@ namespace gbe
> this->intervals.push_back(ir::Register(regID));
>
> // Allocate the special registers (only those which are actually used)
> - this->allocatePayloadRegs();
> -
> - // Group and barrier IDs are always allocated by the hardware in r0
> - RA.insert(std::make_pair(ocl::groupid0, 1*sizeof(float))); // r0.1
> - RA.insert(std::make_pair(ocl::groupid1, 6*sizeof(float))); // r0.6
> - RA.insert(std::make_pair(ocl::groupid2, 7*sizeof(float))); // r0.7
> - RA.insert(std::make_pair(ocl::barrierid, 2*sizeof(float))); // r0.2
> + this->allocateSpecialRegs();
>
> // block IP used to handle the mask in SW is always allocated
>
> diff --git a/backend/src/backend/program.h
> b/backend/src/backend/program.h index fa75052..af19732 100644
> --- a/backend/src/backend/program.h
> +++ b/backend/src/backend/program.h
> @@ -91,7 +91,6 @@ enum gbe_curbe_type {
> GBE_CURBE_GROUP_NUM_Z,
> GBE_CURBE_WORK_DIM,
> GBE_CURBE_IMAGE_INFO,
> - GBE_CURBE_STACK_POINTER,
> GBE_CURBE_PRINTF_BUF_POINTER,
> GBE_CURBE_PRINTF_INDEX_POINTER,
> GBE_CURBE_KERNEL_ARGUMENT,
> diff --git a/backend/src/backend/program.hpp
> b/backend/src/backend/program.hpp index cff2463..efe192f 100644
> --- a/backend/src/backend/program.hpp
> +++ b/backend/src/backend/program.hpp
> @@ -223,7 +223,7 @@ namespace gbe {
> uint32_t argNum; //!< Number of function arguments
> uint32_t curbeSize; //!< Size of the data to push
> uint32_t simdWidth; //!< SIMD size for the kernel (lane number)
> - uint32_t stackSize; //!< Stack size (may be 0 if unused)
> + uint32_t stackSize; //!< Stack size (0 if unused)
> uint32_t scratchSize; //!< Scratch memory size (may be 0 if unused)
> bool useSLM; //!< SLM requires a special HW config
> uint32_t slmSize; //!< slm size for kernel variable
> diff --git a/src/cl_command_queue_gen7.c
> b/src/cl_command_queue_gen7.c index 4adbd2b..0e60528 100644
> --- a/src/cl_command_queue_gen7.c
> +++ b/src/cl_command_queue_gen7.c
> @@ -210,15 +210,6 @@ cl_curbe_fill(cl_kernel ker,
> UPLOAD(GBE_CURBE_WORK_DIM, work_dim); #undef UPLOAD
>
> - /* Write identity for the stack pointer. This is required by the stack pointer
> - * computation in the kernel
> - */
> - if ((offset = interp_kernel_get_curbe_offset(ker->opaque,
> GBE_CURBE_STACK_POINTER, 0)) >= 0) {
> - const uint32_t simd_sz = interp_kernel_get_simd_width(ker->opaque);
> - uint32_t *stackptr = (uint32_t *) (ker->curbe + offset);
> - int32_t i;
> - for (i = 0; i < (int32_t) simd_sz; ++i) stackptr[i] = i;
> - }
> /* Handle the various offsets to SLM */
> const int32_t arg_n = interp_kernel_get_arg_num(ker->opaque);
> int32_t arg, slm_offset = interp_kernel_get_slm_size(ker->opaque);
> --
> 1.9.1
>
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet
More information about the Beignet
mailing list