[Beignet] [PATCH V2] GBE: Optimize scratch memory usage using register interval
Ruiling Song
ruiling.song at intel.com
Wed Feb 26 23:56:29 PST 2014
As scratch memory is a limited resource in HW, and different
registers have the opportunity to share the same scratch memory,
I introduce an allocator for scratch memory management.
v2:
In order to reuse the RegisterFilePartitioner, I rename it to
SimpleAllocator and derive ScratchAllocator & RegisterAllocator
from it.
Signed-off-by: Ruiling Song <ruiling.song at intel.com>
---
backend/src/backend/context.cpp | 114 ++++++++++++++++++----------
backend/src/backend/context.hpp | 11 ++-
backend/src/backend/gen_reg_allocation.cpp | 50 +++++++++---
3 files changed, 120 insertions(+), 55 deletions(-)
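Before the diff, a minimal standalone sketch of the idea (illustrative only, not part of the patch): a first-fit free-list allocator that hands out offsets, takes them back when a spilled register's live interval ends, and tracks the high-water mark so the kernel reserves only as much scratch memory as is ever live at once. The PoolAllocator name and its details below are made up for the example; the real code is the SimpleAllocator/ScratchAllocator introduced by the patch.

// Sketch only: a first-fit free-list pool with a high-water mark.
// PoolAllocator is hypothetical, not the patch's SimpleAllocator,
// but it follows the same allocate/deallocate/maxOffset idea.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iostream>
#include <map>

class PoolAllocator {
public:
  PoolAllocator(int32_t start, int32_t size) : maxOffset(0) {
    freeBlocks[start] = size;                     // one big free block at the start
  }
  // First-fit allocation; 'alignment' must be a power of two. Returns -1 on failure.
  int32_t allocate(int32_t size, int32_t alignment) {
    for (auto &b : freeBlocks) {
      const int32_t aligned = (b.first + alignment - 1) & ~(alignment - 1);
      const int32_t pad = aligned - b.first;
      if (pad + size > b.second) continue;        // block too small
      const int32_t blockOff = b.first, blockSize = b.second;
      freeBlocks.erase(blockOff);
      if (pad > 0) freeBlocks[blockOff] = pad;    // unused space before alignment
      if (blockSize - pad - size > 0)             // remainder after the allocation
        freeBlocks[aligned + size] = blockSize - pad - size;
      allocated[aligned] = size;
      maxOffset = std::max(maxOffset, aligned + size);  // high-water mark
      return aligned;
    }
    return -1;                                    // out of scratch space
  }
  // Return a block to the pool so a later live interval can reuse it
  // (coalescing of neighbouring free blocks is omitted for brevity).
  void deallocate(int32_t offset) {
    auto it = allocated.find(offset);
    assert(it != allocated.end());
    freeBlocks[offset] = it->second;
    allocated.erase(it);
  }
  int32_t maxUsed() const { return maxOffset; }   // what the kernel must reserve
private:
  std::map<int32_t, int32_t> freeBlocks;          // offset -> size of free block
  std::map<int32_t, int32_t> allocated;           // offset -> size of live block
  int32_t maxOffset;
};

int main() {
  PoolAllocator scratch(0, 22 * 1024);
  const int32_t a = scratch.allocate(64, 32);     // spilled register A
  const int32_t b = scratch.allocate(64, 32);     // B overlaps A's interval
  scratch.deallocate(a);                          // A's interval has ended
  const int32_t c = scratch.allocate(64, 32);     // C reuses A's slot
  std::cout << a << " " << b << " " << c << " max=" << scratch.maxUsed() << "\n";
  // Prints "0 64 0 max=128": three spills, but only 128 bytes of scratch.
}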
diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 51a0628..c6f8064 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -35,24 +35,13 @@
namespace gbe
{
- /*! Structure that keeps track of allocation in the register file. This is
- * actually needed by Context (and not only by GenContext) because both
- * simulator and hardware have to deal with constant pushing which uses the
- * register file
- *
- * Since Gen is pretty flexible, we just maintain a free list for the
- * register file (as a classical allocator) and coalesce blocks when required
- */
- class RegisterFilePartitioner
+ class SimpleAllocator
{
public:
- RegisterFilePartitioner(void);
- ~RegisterFilePartitioner(void);
+ SimpleAllocator(int16_t startOffset, int16_t size, bool _assertFail);
+ ~SimpleAllocator(void);
- /*! Allocate some memory in the register file. Return 0 if out-of-memory. By
- * the way, zero is not a valid offset since r0 is always preallocated by
- * the hardware. Note that we always use the left most block when
- * allocating, so it makes sense for constant pushing
+ /*! Allocate some memory from the pool.
*/
int16_t allocate(int16_t size, int16_t alignment, bool bFwd=false);
@@ -62,10 +51,7 @@ namespace gbe
/*! Split a block into 2 blocks */
void splitBlock(int16_t offset, int16_t subOffset);
- private:
- /*! May need to make that run-time in the future */
- static const int16_t RegisterFileSize = 4*KB;
-
+ protected:
/*! Double chained list of free spaces */
struct Block {
Block(int16_t offset, int16_t size) :
@@ -79,6 +65,10 @@ namespace gbe
* If the coalescing was done, the left block is deleted
*/
void coalesce(Block *left, Block *right);
+ /*! the maximum offset */
+ int16_t maxOffset;
+ /*! whether trigger an assertion on allocation failure */
+ bool assertFail;
/*! Head and tail of the free list */
Block *head;
Block *tail;
@@ -87,17 +77,46 @@ namespace gbe
/*! Track allocated memory blocks <offset, size> */
map<int16_t, int16_t> allocatedBlocks;
/*! Use custom allocators */
- GBE_CLASS(RegisterFilePartitioner);
+ GBE_CLASS(SimpleAllocator);
+ };
+
+ /*! Structure that keeps track of allocation in the register file. This is
+ * actually needed by Context (and not only by GenContext) because both
+ * simulator and hardware have to deal with constant pushing which uses the
+ * register file
+ *
+ * Since Gen is pretty flexible, we just reuse the SimpleAllocator
+ */
+
+ class RegisterAllocator: public SimpleAllocator {
+ public:
+ RegisterAllocator(int16_t offset, int16_t size): SimpleAllocator(offset, size, false) {}
+
+ GBE_CLASS(RegisterAllocator);
+ };
+
+ /*!
+ * An allocator for scratch memory. Scratch memory is used for register spilling.
+ * You can query how much scratch memory is needed through getMaxScatchMemUsed().
+ */
+
+ class ScratchAllocator: public SimpleAllocator {
+ public:
+ ScratchAllocator(int16_t size): SimpleAllocator(0, size, true) {}
+ int16_t getMaxScatchMemUsed() { return maxOffset; }
+
+ GBE_CLASS(ScratchAllocator);
};
- RegisterFilePartitioner::RegisterFilePartitioner(void) {
- // r0 is always set by the HW and used at the end by EOT
- const int16_t offset = GEN_REG_SIZE;
- const int16_t size = RegisterFileSize - offset;
- tail = head = this->newBlock(offset, size);
+ SimpleAllocator::SimpleAllocator(int16_t startOffset,
+ int16_t size,
+ bool _assertFail)
+ : maxOffset(0),
+ assertFail(_assertFail){
+ tail = head = this->newBlock(startOffset, size);
}
- RegisterFilePartitioner::~RegisterFilePartitioner(void) {
+ SimpleAllocator::~SimpleAllocator(void) {
while (this->head) {
Block *next = this->head->next;
this->deleteBlock(this->head);
@@ -105,7 +124,7 @@ namespace gbe
}
}
- int16_t RegisterFilePartitioner::allocate(int16_t size, int16_t alignment, bool bFwd)
+ int16_t SimpleAllocator::allocate(int16_t size, int16_t alignment, bool bFwd)
{
// Make it simple and just use the first block we find
Block *list = bFwd ? head : tail;
@@ -205,13 +224,16 @@ namespace gbe
// Track the allocation to retrieve the size later
allocatedBlocks.insert(std::make_pair(aligned, size));
+ // update max offset
+ if(aligned + size > maxOffset) maxOffset = aligned + size;
// We have a valid offset now
return aligned;
}
+ GBE_ASSERT( !assertFail );
return 0;
}
- void RegisterFilePartitioner::deallocate(int16_t offset)
+ void SimpleAllocator::deallocate(int16_t offset)
{
// Retrieve the size in the allocation map
auto it = allocatedBlocks.find(offset);
@@ -254,7 +276,7 @@ namespace gbe
allocatedBlocks.erase(it);
}
- void RegisterFilePartitioner::coalesce(Block *left, Block *right) {
+ void SimpleAllocator::coalesce(Block *left, Block *right) {
if (left == NULL || right == NULL) return;
GBE_ASSERT(left->offset < right->offset);
GBE_ASSERT(left->next == right);
@@ -270,7 +292,7 @@ namespace gbe
}
}
- void RegisterFilePartitioner::splitBlock(int16_t offset, int16_t subOffset) {
+ void SimpleAllocator::splitBlock(int16_t offset, int16_t subOffset) {
// Retrieve the size in the allocation map
auto it = allocatedBlocks.find(offset);
GBE_ASSERT(it != allocatedBlocks.end());
@@ -300,6 +322,7 @@ namespace gbe
return i;
}
+
///////////////////////////////////////////////////////////////////////////
// Generic Context (shared by the simulator and the HW context)
///////////////////////////////////////////////////////////////////////////
@@ -311,16 +334,18 @@ namespace gbe
GBE_ASSERT(unit.getPointerSize() == ir::POINTER_32_BITS);
this->liveness = GBE_NEW(ir::Liveness, const_cast<ir::Function&>(fn));
this->dag = GBE_NEW(ir::FunctionDAG, *this->liveness);
- this->partitioner = GBE_NEW_NO_ARG(RegisterFilePartitioner);
+ // r0 (GEN_REG_SIZE) is always set by the HW and used at the end by EOT
+ this->registerAllocator = GBE_NEW(RegisterAllocator, GEN_REG_SIZE, 4*KB - GEN_REG_SIZE);
+ this->scratchAllocator = GBE_NEW(ScratchAllocator, 22*KB);
if (fn.getSimdWidth() == 0 || OCL_SIMD_WIDTH != 15)
this->simdWidth = nextHighestPowerOf2(OCL_SIMD_WIDTH);
else
this->simdWidth = fn.getSimdWidth();
- this->scratchOffset = 0;
}
Context::~Context(void) {
- GBE_SAFE_DELETE(this->partitioner);
+ GBE_SAFE_DELETE(this->registerAllocator);
+ GBE_SAFE_DELETE(this->scratchAllocator);
GBE_SAFE_DELETE(this->dag);
GBE_SAFE_DELETE(this->liveness);
}
@@ -339,20 +364,20 @@ namespace gbe
this->kernel = NULL;
}
if(this->kernel != NULL) {
- this->kernel->scratchSize = alignScratchSize(this->scratchOffset);
+ this->kernel->scratchSize = alignScratchSize(scratchAllocator->getMaxScatchMemUsed());
this->kernel->ctx = this;
}
return this->kernel;
}
int16_t Context::allocate(int16_t size, int16_t alignment) {
- return partitioner->allocate(size, alignment);
+ return registerAllocator->allocate(size, alignment);
}
- void Context::deallocate(int16_t offset) { partitioner->deallocate(offset); }
+ void Context::deallocate(int16_t offset) { registerAllocator->deallocate(offset); }
void Context::splitBlock(int16_t offset, int16_t subOffset) {
- partitioner->splitBlock(offset, subOffset);
+ registerAllocator->splitBlock(offset, subOffset);
}
int32_t Context::allocConstBuf(uint32_t argID) {
@@ -376,10 +401,15 @@ namespace gbe
return offset + GEN_REG_SIZE;
}
- uint32_t Context::allocateScratchMem(uint32_t size) {
- uint32_t offset = scratchOffset;
- scratchOffset += size;
- return offset;
+ // FIXME TODO: as we optimize scratch memory usage using register intervals,
+ // we need to add some dependencies in the post_reg_alloc scheduler so that
+ // accesses to reused scratch memory keep their order.
+
+ int32_t Context::allocateScratchMem(uint32_t size) {
+ return scratchAllocator->allocate(size, 32, true);
+ }
+ void Context::deallocateScratchMem(int32_t offset) {
+ scratchAllocator->deallocate(offset);
}
void Context::buildStack(void) {
@@ -402,7 +432,7 @@ namespace gbe
uint32_t alignment)
{
alignment = alignment == 0 ? size : alignment;
- const uint32_t offset = partitioner->allocate(size, alignment, 1);
+ const uint32_t offset = registerAllocator->allocate(size, alignment, 1);
GBE_ASSERT(offset >= GEN_REG_SIZE);
kernel->patches.push_back(PatchInfo(value, subValue, offset - GEN_REG_SIZE));
kernel->curbeSize = std::max(kernel->curbeSize, offset + size - GEN_REG_SIZE);
diff --git a/backend/src/backend/context.hpp b/backend/src/backend/context.hpp
index 000612e..ac940bd 100644
--- a/backend/src/backend/context.hpp
+++ b/backend/src/backend/context.hpp
@@ -41,7 +41,8 @@ namespace ir {
namespace gbe
{
class Kernel; // context creates Kernel
- class RegisterFilePartitioner; // Partition register file for reg allocation
+ class RegisterAllocator; // allocator for physical register allocation
+ class ScratchAllocator; // allocator for scratch memory allocation
/*! Context is the helper structure to build the Gen ISA or simulation code
* from GenIR
@@ -94,7 +95,9 @@ namespace gbe
/*! Get (search or allocate if fail to find one) image info curbeOffset.*/
uint32_t getImageInfoCurbeOffset(ir::ImageInfoKey key, size_t size);
/*! allocate size scratch memory and return start address */
- uint32_t allocateScratchMem(uint32_t size);
+ int32_t allocateScratchMem(uint32_t size);
+ /*! deallocate scratch memory at offset */
+ void deallocateScratchMem(int32_t offset);
/*! Preallocated curbe register set including special registers. */
map<ir::Register, uint32_t> curbeRegs;
protected:
@@ -129,11 +132,11 @@ namespace gbe
Kernel *kernel; //!< Kernel we are building
ir::Liveness *liveness; //!< Liveness info for the variables
ir::FunctionDAG *dag; //!< Graph of values on the function
- RegisterFilePartitioner *partitioner; //!< Handle register file partionning
+ RegisterAllocator *registerAllocator; //!< physical register allocation
+ ScratchAllocator *scratchAllocator; //!< scratch memory allocator
set<ir::LabelIndex> usedLabels; //!< Set of all used labels
JIPMap JIPs; //!< Where to jump all labels/branches
uint32_t simdWidth; //!< Number of lanes per HW threads
- uint32_t scratchOffset; //!< scratch slot for next scratch memory request
GBE_CLASS(Context); //!< Use custom allocators
};
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index c282b36..fdc1d2f 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -174,7 +174,7 @@ namespace gbe
INLINE bool spillReg(GenRegInterval interval, bool isAllocated = false);
INLINE bool spillReg(ir::Register reg, bool isAllocated = false);
INLINE bool vectorCanSpill(SelectionVector *vector);
-
+ INLINE void allocateScratchForSpilled();
/*! Use custom allocator */
GBE_CLASS(Opaque);
};
@@ -583,6 +583,8 @@ namespace gbe
}
if (!spilledRegs.empty()) {
GBE_ASSERT(reservedReg != 0);
+ allocateScratchForSpilled();
+
bool success = selection.spillRegs(spilledRegs, reservedReg);
if (!success) {
std::cerr << "Fail to spill registers." << std::endl;
@@ -592,6 +594,43 @@ namespace gbe
return true;
}
+ INLINE void GenRegAllocator::Opaque::allocateScratchForSpilled()
+ {
+ const uint32_t regNum = spilledRegs.size();
+ this->starting.resize(regNum);
+ this->ending.resize(regNum);
+ uint32_t regID = 0;
+ for(auto it = spilledRegs.begin(); it != spilledRegs.end(); ++it) {
+ this->starting[regID] = this->ending[regID] = &intervals[it->first];
+ regID++;
+ }
+ std::sort(this->starting.begin(), this->starting.end(), cmp<true>);
+ std::sort(this->ending.begin(), this->ending.end(), cmp<false>);
+ int toExpire = 0;
+ for(uint32_t i = 0; i < regNum; i++) {
+ const GenRegInterval * cur = starting[i];
+ const GenRegInterval * exp = ending[toExpire];
+ if(exp->maxID < cur->minID) {
+ auto it = spilledRegs.find(exp->reg);
+ GBE_ASSERT(it != spilledRegs.end());
+ if(it->second.addr != -1) {
+ ctx.deallocateScratchMem(it->second.addr);
+ }
+ toExpire++;
+ }
+ auto it = spilledRegs.find(cur->reg);
+ GBE_ASSERT(it != spilledRegs.end());
+ if(cur->minID == cur->maxID) {
+ it->second.addr = -1;
+ continue;
+ }
+
+ ir::RegisterFamily family = ctx.sel->getRegisterFamily(cur->reg);
+ it->second.addr = ctx.allocateScratchMem(getFamilySize(family)
+ * ctx.getSimdWidth());
+ }
+ }
+
INLINE bool GenRegAllocator::Opaque::expireReg(ir::Register reg)
{
auto it = RA.find(reg);
@@ -643,14 +682,7 @@ namespace gbe
return false;
SpillRegTag spillTag;
spillTag.isTmpReg = interval.maxID == interval.minID;
- if (!spillTag.isTmpReg) {
- // FIXME, we can optimize scratch allocation according to
- // the interval information.
- ir::RegisterFamily family = ctx.sel->getRegisterFamily(interval.reg);
- spillTag.addr = ctx.allocateScratchMem(getFamilySize(family)
- * ctx.getSimdWidth());
- } else
- spillTag.addr = -1;
+
if (isAllocated) {
// If this register is allocated, we need to expire it and erase it
// from the RA map.
--
1.7.9.5
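For reference, a standalone sketch (again illustrative, not the patch code) of the interval-driven assignment performed by allocateScratchForSpilled() in the diff above: spilled registers are walked in order of interval start, a slot is released once the interval holding it has expired, and registers whose interval spans a single instruction (pure temporaries) get no slot at all. allocSlot()/freeSlot() are stand-ins for Context::allocateScratchMem()/deallocateScratchMem(), and the expiration loop is simplified.

// Illustrative sketch of interval-driven scratch assignment; the real logic
// lives in GenRegAllocator::Opaque::allocateScratchForSpilled().
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

struct Interval { int minID, maxID; int32_t addr; };

static int32_t nextOffset = 0;                // grows only when no slot is free
static std::vector<int32_t> freeSlots;        // offsets returned by expired intervals
static int32_t allocSlot(int32_t size) {
  if (!freeSlots.empty()) { int32_t o = freeSlots.back(); freeSlots.pop_back(); return o; }
  int32_t o = nextOffset; nextOffset += size; return o;
}
static void freeSlot(int32_t offset) { freeSlots.push_back(offset); }

void assignScratch(std::vector<Interval> &spilled, int32_t slotSize) {
  std::vector<Interval*> starting, ending;
  for (auto &i : spilled) { starting.push_back(&i); ending.push_back(&i); }
  std::sort(starting.begin(), starting.end(),
            [](const Interval *a, const Interval *b) { return a->minID < b->minID; });
  std::sort(ending.begin(), ending.end(),
            [](const Interval *a, const Interval *b) { return a->maxID < b->maxID; });
  size_t toExpire = 0;
  for (Interval *cur : starting) {
    // Release the slot of every interval that ends before the current one starts.
    while (toExpire < ending.size() && ending[toExpire]->maxID < cur->minID) {
      if (ending[toExpire]->addr != -1) freeSlot(ending[toExpire]->addr);
      ++toExpire;
    }
    if (cur->minID == cur->maxID) { cur->addr = -1; continue; }  // pure temporary
    cur->addr = allocSlot(slotSize);          // may reuse an expired slot
  }
}

int main() {
  std::vector<Interval> regs = {{0, 10, -1}, {2, 4, -1}, {6, 12, -1}, {14, 20, -1}};
  assignScratch(regs, 64);
  for (auto &r : regs)
    std::cout << "[" << r.minID << "," << r.maxID << "] -> " << r.addr << "\n";
  // The [6,12] and [14,20] intervals reuse a slot freed by an earlier interval,
  // so four spilled registers need only 128 bytes of scratch, not 256.
}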