[Beignet] [PATCH V3] GBE: Optimize scratch memory usage using register interval
Zhigang Gong
zhigang.gong at linux.intel.com
Thu Feb 27 21:53:08 PST 2014
LGTM, pushed, thanks.
On Fri, Feb 28, 2014 at 10:16:45AM +0800, Ruiling Song wrote:
> As scratch memory is a limited resource in HW. And different
> register have the opptunity to share same scratch memory. So
> I introduce an allocator for scratch memory management.
>
> v2:
> In order to reuse the registerFilePartitioner, I rename it as
> SimpleAllocator, and derive ScratchAllocator & RegisterAllocator
> from it.
>
> v3:
> fix a typo, scratch size is 12KB.
>
> Signed-off-by: Ruiling Song <ruiling.song at intel.com>
> ---
> backend/src/backend/context.cpp | 114 ++++++++++++++++++----------
> backend/src/backend/context.hpp | 11 ++-
> backend/src/backend/gen_reg_allocation.cpp | 50 +++++++++---
> 3 files changed, 120 insertions(+), 55 deletions(-)
>
> diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
> index 51a0628..b806586 100644
> --- a/backend/src/backend/context.cpp
> +++ b/backend/src/backend/context.cpp
> @@ -35,24 +35,13 @@
>
> namespace gbe
> {
> - /*! Structure that keeps track of allocation in the register file. This is
> - * actually needed by Context (and not only by GenContext) because both
> - * simulator and hardware have to deal with constant pushing which uses the
> - * register file
> - *
> - * Since Gen is pretty flexible, we just maintain a free list for the
> - * register file (as a classical allocator) and coalesce blocks when required
> - */
> - class RegisterFilePartitioner
> + class SimpleAllocator
> {
> public:
> - RegisterFilePartitioner(void);
> - ~RegisterFilePartitioner(void);
> + SimpleAllocator(int16_t startOffset, int16_t size, bool _assertFail);
> + ~SimpleAllocator(void);
>
> - /*! Allocate some memory in the register file. Return 0 if out-of-memory. By
> - * the way, zero is not a valid offset since r0 is always preallocated by
> - * the hardware. Note that we always use the left most block when
> - * allocating, so it makes sense for constant pushing
> + /*! Allocate some memory from the pool.
> */
> int16_t allocate(int16_t size, int16_t alignment, bool bFwd=false);
>
> @@ -62,10 +51,7 @@ namespace gbe
> /*! Spilt a block into 2 blocks */
> void splitBlock(int16_t offset, int16_t subOffset);
>
> - private:
> - /*! May need to make that run-time in the future */
> - static const int16_t RegisterFileSize = 4*KB;
> -
> + protected:
> /*! Double chained list of free spaces */
> struct Block {
> Block(int16_t offset, int16_t size) :
> @@ -79,6 +65,10 @@ namespace gbe
> * If the colascing was done, the left block is deleted
> */
> void coalesce(Block *left, Block *right);
> + /*! the maximum offset */
> + int16_t maxOffset;
> + /*! whether trigger an assertion on allocation failure */
> + bool assertFail;
> /*! Head and tail of the free list */
> Block *head;
> Block *tail;
> @@ -87,17 +77,46 @@ namespace gbe
> /*! Track allocated memory blocks <offset, size> */
> map<int16_t, int16_t> allocatedBlocks;
> /*! Use custom allocators */
> - GBE_CLASS(RegisterFilePartitioner);
> + GBE_CLASS(SimpleAllocator);
> + };
> +
> + /*! Structure that keeps track of allocation in the register file. This is
> + * actually needed by Context (and not only by GenContext) because both
> + * simulator and hardware have to deal with constant pushing which uses the
> + * register file
> + *
> + * Since Gen is pretty flexible, we just reuse the Simpleallocator
> + */
> +
> + class RegisterAllocator: public SimpleAllocator {
> + public:
> + RegisterAllocator(int16_t offset, int16_t size): SimpleAllocator(offset, size, false) {}
> +
> + GBE_CLASS(RegisterAllocator);
> + };
> +
> + /*!
> + * an allocator for scratch memory allocation. Scratch memory are used for register spilling.
> + * You can query how much scratch memory needed through getMaxScatchMemUsed().
> + */
> +
> + class ScratchAllocator: public SimpleAllocator {
> + public:
> + ScratchAllocator(int16_t size): SimpleAllocator(0, size, true) {}
> + int16_t getMaxScatchMemUsed() { return maxOffset; }
> +
> + GBE_CLASS(ScratchAllocator);
> };
>
> - RegisterFilePartitioner::RegisterFilePartitioner(void) {
> - // r0 is always set by the HW and used at the end by EOT
> - const int16_t offset = GEN_REG_SIZE;
> - const int16_t size = RegisterFileSize - offset;
> - tail = head = this->newBlock(offset, size);
> + SimpleAllocator::SimpleAllocator(int16_t startOffset,
> + int16_t size,
> + bool _assertFail)
> + : maxOffset(0),
> + assertFail(_assertFail){
> + tail = head = this->newBlock(startOffset, size);
> }
>
> - RegisterFilePartitioner::~RegisterFilePartitioner(void) {
> + SimpleAllocator::~SimpleAllocator(void) {
> while (this->head) {
> Block *next = this->head->next;
> this->deleteBlock(this->head);
> @@ -105,7 +124,7 @@ namespace gbe
> }
> }
>
> - int16_t RegisterFilePartitioner::allocate(int16_t size, int16_t alignment, bool bFwd)
> + int16_t SimpleAllocator::allocate(int16_t size, int16_t alignment, bool bFwd)
> {
> // Make it simple and just use the first block we find
> Block *list = bFwd ? head : tail;
> @@ -205,13 +224,16 @@ namespace gbe
>
> // Track the allocation to retrieve the size later
> allocatedBlocks.insert(std::make_pair(aligned, size));
> + // update max offset
> + if(aligned + size > maxOffset) maxOffset = aligned + size;
> // We have a valid offset now
> return aligned;
> }
> + GBE_ASSERT( !assertFail );
> return 0;
> }
>
> - void RegisterFilePartitioner::deallocate(int16_t offset)
> + void SimpleAllocator::deallocate(int16_t offset)
> {
> // Retrieve the size in the allocation map
> auto it = allocatedBlocks.find(offset);
> @@ -254,7 +276,7 @@ namespace gbe
> allocatedBlocks.erase(it);
> }
>
> - void RegisterFilePartitioner::coalesce(Block *left, Block *right) {
> + void SimpleAllocator::coalesce(Block *left, Block *right) {
> if (left == NULL || right == NULL) return;
> GBE_ASSERT(left->offset < right->offset);
> GBE_ASSERT(left->next == right);
> @@ -270,7 +292,7 @@ namespace gbe
> }
> }
>
> - void RegisterFilePartitioner::splitBlock(int16_t offset, int16_t subOffset) {
> + void SimpleAllocator::splitBlock(int16_t offset, int16_t subOffset) {
> // Retrieve the size in the allocation map
> auto it = allocatedBlocks.find(offset);
> GBE_ASSERT(it != allocatedBlocks.end());
> @@ -300,6 +322,7 @@ namespace gbe
>
> return i;
> }
> +
> ///////////////////////////////////////////////////////////////////////////
> // Generic Context (shared by the simulator and the HW context)
> ///////////////////////////////////////////////////////////////////////////
> @@ -311,16 +334,18 @@ namespace gbe
> GBE_ASSERT(unit.getPointerSize() == ir::POINTER_32_BITS);
> this->liveness = GBE_NEW(ir::Liveness, const_cast<ir::Function&>(fn));
> this->dag = GBE_NEW(ir::FunctionDAG, *this->liveness);
> - this->partitioner = GBE_NEW_NO_ARG(RegisterFilePartitioner);
> + // r0 (GEN_REG_SIZE) is always set by the HW and used at the end by EOT
> + this->registerAllocator = GBE_NEW(RegisterAllocator, GEN_REG_SIZE, 4*KB - GEN_REG_SIZE);
> + this->scratchAllocator = GBE_NEW(ScratchAllocator, 12*KB);
> if (fn.getSimdWidth() == 0 || OCL_SIMD_WIDTH != 15)
> this->simdWidth = nextHighestPowerOf2(OCL_SIMD_WIDTH);
> else
> this->simdWidth = fn.getSimdWidth();
> - this->scratchOffset = 0;
> }
>
> Context::~Context(void) {
> - GBE_SAFE_DELETE(this->partitioner);
> + GBE_SAFE_DELETE(this->registerAllocator);
> + GBE_SAFE_DELETE(this->scratchAllocator);
> GBE_SAFE_DELETE(this->dag);
> GBE_SAFE_DELETE(this->liveness);
> }
> @@ -339,20 +364,20 @@ namespace gbe
> this->kernel = NULL;
> }
> if(this->kernel != NULL) {
> - this->kernel->scratchSize = alignScratchSize(this->scratchOffset);
> + this->kernel->scratchSize = alignScratchSize(scratchAllocator->getMaxScatchMemUsed());
> this->kernel->ctx = this;
> }
> return this->kernel;
> }
>
> int16_t Context::allocate(int16_t size, int16_t alignment) {
> - return partitioner->allocate(size, alignment);
> + return registerAllocator->allocate(size, alignment);
> }
>
> - void Context::deallocate(int16_t offset) { partitioner->deallocate(offset); }
> + void Context::deallocate(int16_t offset) { registerAllocator->deallocate(offset); }
>
> void Context::splitBlock(int16_t offset, int16_t subOffset) {
> - partitioner->splitBlock(offset, subOffset);
> + registerAllocator->splitBlock(offset, subOffset);
> }
>
> int32_t Context::allocConstBuf(uint32_t argID) {
> @@ -376,10 +401,15 @@ namespace gbe
> return offset + GEN_REG_SIZE;
> }
>
> - uint32_t Context::allocateScratchMem(uint32_t size) {
> - uint32_t offset = scratchOffset;
> - scratchOffset += size;
> - return offset;
> + // FIXME TODO as we optimize scratch memory usage using the register interval.
> + // we need to add some dependency in post_reg_alloc scheduler, to keep scratch
> + // memory that are reused still keep the order
> +
> + int32_t Context::allocateScratchMem(uint32_t size) {
> + return scratchAllocator->allocate(size, 32, true);
> + }
> + void Context::deallocateScratchMem(int32_t offset) {
> + scratchAllocator->deallocate(offset);
> }
>
> void Context::buildStack(void) {
> @@ -402,7 +432,7 @@ namespace gbe
> uint32_t alignment)
> {
> alignment = alignment == 0 ? size : alignment;
> - const uint32_t offset = partitioner->allocate(size, alignment, 1);
> + const uint32_t offset = registerAllocator->allocate(size, alignment, 1);
> GBE_ASSERT(offset >= GEN_REG_SIZE);
> kernel->patches.push_back(PatchInfo(value, subValue, offset - GEN_REG_SIZE));
> kernel->curbeSize = std::max(kernel->curbeSize, offset + size - GEN_REG_SIZE);
> diff --git a/backend/src/backend/context.hpp b/backend/src/backend/context.hpp
> index 000612e..ac940bd 100644
> --- a/backend/src/backend/context.hpp
> +++ b/backend/src/backend/context.hpp
> @@ -41,7 +41,8 @@ namespace ir {
> namespace gbe
> {
> class Kernel; // context creates Kernel
> - class RegisterFilePartitioner; // Partition register file for reg allocation
> + class RegisterAllocator; // allocator for physical register allocation
> + class ScratchAllocator; // allocator for scratch memory allocation
>
> /*! Context is the helper structure to build the Gen ISA or simulation code
> * from GenIR
> @@ -94,7 +95,9 @@ namespace gbe
> /*! Get (search or allocate if fail to find one) image info curbeOffset.*/
> uint32_t getImageInfoCurbeOffset(ir::ImageInfoKey key, size_t size);
> /*! allocate size scratch memory and return start address */
> - uint32_t allocateScratchMem(uint32_t size);
> + int32_t allocateScratchMem(uint32_t size);
> + /*! deallocate scratch memory at offset */
> + void deallocateScratchMem(int32_t offset);
> /*! Preallocated curbe register set including special registers. */
> map<ir::Register, uint32_t> curbeRegs;
> protected:
> @@ -129,11 +132,11 @@ namespace gbe
> Kernel *kernel; //!< Kernel we are building
> ir::Liveness *liveness; //!< Liveness info for the variables
> ir::FunctionDAG *dag; //!< Graph of values on the function
> - RegisterFilePartitioner *partitioner; //!< Handle register file partionning
> + RegisterAllocator *registerAllocator; //!< physical register allocation
> + ScratchAllocator *scratchAllocator; //!< scratch memory allocator
> set<ir::LabelIndex> usedLabels; //!< Set of all used labels
> JIPMap JIPs; //!< Where to jump all labels/branches
> uint32_t simdWidth; //!< Number of lanes per HW threads
> - uint32_t scratchOffset; //!< scratch slot for next scratch memory request
> GBE_CLASS(Context); //!< Use custom allocators
> };
>
> diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
> index c282b36..fdc1d2f 100644
> --- a/backend/src/backend/gen_reg_allocation.cpp
> +++ b/backend/src/backend/gen_reg_allocation.cpp
> @@ -174,7 +174,7 @@ namespace gbe
> INLINE bool spillReg(GenRegInterval interval, bool isAllocated = false);
> INLINE bool spillReg(ir::Register reg, bool isAllocated = false);
> INLINE bool vectorCanSpill(SelectionVector *vector);
> -
> + INLINE void allocateScratchForSpilled();
> /*! Use custom allocator */
> GBE_CLASS(Opaque);
> };
> @@ -583,6 +583,8 @@ namespace gbe
> }
> if (!spilledRegs.empty()) {
> GBE_ASSERT(reservedReg != 0);
> + allocateScratchForSpilled();
> +
> bool success = selection.spillRegs(spilledRegs, reservedReg);
> if (!success) {
> std::cerr << "Fail to spill registers." << std::endl;
> @@ -592,6 +594,43 @@ namespace gbe
> return true;
> }
>
> + INLINE void GenRegAllocator::Opaque::allocateScratchForSpilled()
> + {
> + const uint32_t regNum = spilledRegs.size();
> + this->starting.resize(regNum);
> + this->ending.resize(regNum);
> + uint32_t regID = 0;
> + for(auto it = spilledRegs.begin(); it != spilledRegs.end(); ++it) {
> + this->starting[regID] = this->ending[regID] = &intervals[it->first];
> + regID++;
> + }
> + std::sort(this->starting.begin(), this->starting.end(), cmp<true>);
> + std::sort(this->ending.begin(), this->ending.end(), cmp<false>);
> + int toExpire = 0;
> + for(uint32_t i = 0; i < regNum; i++) {
> + const GenRegInterval * cur = starting[i];
> + const GenRegInterval * exp = ending[toExpire];
> + if(exp->maxID < cur->minID) {
> + auto it = spilledRegs.find(exp->reg);
> + GBE_ASSERT(it != spilledRegs.end());
> + if(it->second.addr != -1) {
> + ctx.deallocateScratchMem(it->second.addr);
> + }
> + toExpire++;
> + }
> + auto it = spilledRegs.find(cur->reg);
> + GBE_ASSERT(it != spilledRegs.end());
> + if(cur->minID == cur->maxID) {
> + it->second.addr = -1;
> + continue;
> + }
> +
> + ir::RegisterFamily family = ctx.sel->getRegisterFamily(cur->reg);
> + it->second.addr = ctx.allocateScratchMem(getFamilySize(family)
> + * ctx.getSimdWidth());
> + }
> + }
> +
> INLINE bool GenRegAllocator::Opaque::expireReg(ir::Register reg)
> {
> auto it = RA.find(reg);
> @@ -643,14 +682,7 @@ namespace gbe
> return false;
> SpillRegTag spillTag;
> spillTag.isTmpReg = interval.maxID == interval.minID;
> - if (!spillTag.isTmpReg) {
> - // FIXME, we can optimize scratch allocation according to
> - // the interval information.
> - ir::RegisterFamily family = ctx.sel->getRegisterFamily(interval.reg);
> - spillTag.addr = ctx.allocateScratchMem(getFamilySize(family)
> - * ctx.getSimdWidth());
> - } else
> - spillTag.addr = -1;
> +
> if (isAllocated) {
> // If this register is allocated, we need to expire it and erase it
> // from the RA map.
> --
> 1.7.9.5
>
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet
More information about the Beignet
mailing list