[Beignet] [PATCH] GBE: Optimize scratch memory usage using register interval
Ruiling Song
ruiling.song at intel.com
Thu Feb 20 23:23:49 PST 2014
As scratch memory is a limited resource in HW, and different
registers have the opportunity to share the same scratch memory, I
implemented a specially designed allocator to reduce scratch memory usage.
It is assumed that scratch memory is always allocated in multiples of 32 bytes.
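To illustrate the intended reuse pattern, here is a minimal standalone
sketch (a toy first-fit allocator over 32-byte slots with hypothetical
names, not the ScratchAllocator added below): once a spilled register's
interval ends, its scratch offset is released, so peak scratch usage
tracks the number of simultaneously live spills rather than the total
number of spills.

#include <cassert>
#include <iostream>
#include <vector>

class ToyScratchAllocator {
public:
  int allocate(int bytes) {
    assert(bytes % 32 == 0);
    const int slots = bytes / 32;
    // First-fit: look for 'slots' consecutive free entries.
    for (int start = 0; start + slots <= (int)used.size(); ++start) {
      bool fits = true;
      for (int i = 0; i < slots; ++i)
        if (used[start + i]) { fits = false; break; }
      if (fits) {
        for (int i = 0; i < slots; ++i) used[start + i] = true;
        return start * 32;
      }
    }
    // No hole found: grow the pool.
    const int start = (int)used.size();
    used.resize(start + slots, true);
    return start * 32;
  }
  // Unlike the patch's allocator, which records the block size in the
  // block head, this toy version asks the caller to pass the size back.
  void deallocate(int offset, int bytes) {
    for (int i = 0; i < bytes / 32; ++i) used[offset / 32 + i] = false;
  }
  int getMaxSize() const { return (int)used.size() * 32; }
private:
  std::vector<bool> used; // one flag per 32-byte slot
};

int main() {
  ToyScratchAllocator sa;
  int a = sa.allocate(64); // spill A, interval [0, 10)
  int b = sa.allocate(32); // spill B, interval [5, 20)
  sa.deallocate(a, 64);    // A's interval ended at 10...
  int c = sa.allocate(32); // ...so C (interval [12, 20)) reuses A's slots.
  std::cout << a << " " << b << " " << c
            << " peak=" << sa.getMaxSize() << "\n";
  // Prints "0 64 0 peak=96"; without reuse the peak would be 128 bytes.
  return 0;
}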
Signed-off-by: Ruiling Song <ruiling.song at intel.com>
---
backend/src/backend/context.cpp | 149 +++++++++++++++++++++++++++-
backend/src/backend/context.hpp | 28 +++++-
backend/src/backend/gen_reg_allocation.cpp | 50 ++++++++--
3 files changed, 212 insertions(+), 15 deletions(-)
diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 2125bd1..24f4f6f 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -301,6 +301,145 @@ namespace gbe
return i;
}
+ /*
+ * As scratch memory is allocated in multiples of fixedSize (32), we design this
+ * simple allocator to make allocation and free efficient. A ScratchBlock spans a
+ * multiple of fixedSize and has one head, and possibly one tail and internal
+ * elements. Each element is defined as [size, used, prev, next]. The block head's
+ * size records the real block size; the other elements in the block have size 0.
+ * When an element is a block head, its [prev/next] are used for the free block
+ * list. The tail element's prev points to the block head. When a block is freed,
+ * the left and right neighbor blocks are checked for a possible merge.
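+ * For example, a free three-slot block whose head sits at index 4 looks
+ * like this (indices are hypothetical, for illustration only):
+ *   blocks[4] = {size=3, used=0, prev/next = free-list links}  // head
+ *   blocks[5] = {size=0, used=0, ...}                          // internal
+ *   blocks[6] = {size=0, used=0, prev=4, ...}                  // tail -> head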
+ */
+ // XXX TODO: as we now optimize scratch memory usage based on register
+ // intervals, we need to add dependencies in the post_reg_alloc scheduler
+ // to keep accesses to reused scratch memory in their original order.
+ int ScratchAllocator::allocate(int bytes) {
+ GBE_ASSERT(bytes % fixedSize == 0);
+ int size = bytes / fixedSize;
+ int target = free;
+ while(target != -1 && blocks[target].size < size) {
+ target = blocks[target].next;
+ }
+
+ if(target == -1) {
+ // no suitable block found, simply allocate enough
+ int ret = maxOffset * fixedSize;
+ blocks.push_back(ScratchBlock(size, 1, -1, -1));
+ for(int i = 0; i < size-1; i++)
+ blocks.push_back(ScratchBlock(0, 1, maxOffset, -1));
+ maxOffset = blocks.size();
+ return ret;
+ } else {
+ // found one block large enough
+ int old_size = blocks[target].size;
+ for(int i = 0; i < size; i++) {
+ blocks[target + i].used = 1;
+ blocks[target + i].size = 0;
+ }
+ blocks[target].size = size;
+
+ if(old_size > size) {
+ // split the larger block
+ if(size > 1)
+ blocks[target + size - 1].prev = target;
+
+ // setup new free block
+ int new_block = target + size;
+ int new_size = old_size - size;
+ blocks[new_block].size = new_size;
+ blocks[new_block].prev = blocks[target].prev;
+ blocks[new_block].next = blocks[target].next;
+ if(new_size > 1)
+ blocks[new_block + new_size - 1].prev = new_block;
+
+ // setup free list
+ if(blocks[target].prev != -1)
+ blocks[blocks[target].prev].next = new_block;
+ else {
+ GBE_ASSERT(free == target);
+ free = new_block;
+ }
+ if(blocks[target].next != -1)
+ blocks[blocks[target].next].prev = new_block;
+ } else {
+ // the block is exactly the requested size
+ if(blocks[target].prev == -1) {
+ GBE_ASSERT(free == target);
+ free = blocks[target].next;
+ } else {
+ blocks[blocks[target].prev].next = blocks[target].next;
+ }
+
+ if(blocks[target].next != -1)
+ blocks[blocks[target].next].prev = blocks[target].prev;
+ }
+ blocks[target].prev = blocks[target].next = -1;
+
+ return target * fixedSize;
+ }
+ }
+ void ScratchAllocator::dumpAllBlocks() {
+ std::cout << "---All Scratch Blocks Dumped (with free = " << free << ")---" << std::endl;
+ for(int i = 0; i < blocks.size(); i++) {
+ std::cout << "[" << i << ", sz(" << (int)blocks[i].size << "), u(" << (int)blocks[i].used
+ << "), p("<< blocks[i].prev << "), n(" << blocks[i].next << ")]"<<std::endl;
+ }
+ }
+ void ScratchAllocator::deallocate(int offset) {
+ offset /= fixedSize;
+ int size = blocks[offset].size;
+ for(int i = 0; i < size; i++) {
+ blocks[offset+i].used = 0;
+ blocks[offset+i].next = -1;
+ }
+
+ int head = offset;
+ // check whether the left block can be merged
+ if(offset > 0 && blocks[offset-1].used == 0) {
+ int lblock_head = offset - 1;
+ if(blocks[offset-1].size == 0) // this is a block tail
+ lblock_head = blocks[offset-1].prev;
+ blocks[lblock_head].size += size;
+ blocks[offset+size-1].prev = lblock_head; // point to new block head
+ blocks[offset].size = 0; // not head now
+ GBE_ASSERT(blocks[offset+size-1].size == 0);
+ head = lblock_head;
+ }
+
+ int rblock_head = offset+size;
+ // check whether the right block can be merged
+ if(rblock_head < maxOffset && blocks[rblock_head].used == 0) {
+ int rblock_size = blocks[rblock_head].size;
+ blocks[head].size += rblock_size;
+ GBE_ASSERT(rblock_size != 0);
+ blocks[rblock_head].size = 0; // not head now
+
+ // let rblock's prev directly point to rblock's next
+ if(blocks[rblock_head].prev != -1)
+ blocks[blocks[rblock_head].prev].next = blocks[rblock_head].next;
+ else
+ free = blocks[rblock_head].next;
+
+ if(blocks[rblock_head].next != -1)
+ blocks[blocks[rblock_head].next].prev = blocks[rblock_head].prev;
+
+ blocks[rblock_head + rblock_size - 1].prev = head; // points to the new block head
+ blocks[rblock_head + rblock_size - 1].next = -1;
+ }
+
+ // insert the block at the front of free list
+ if(blocks[head].prev == -1) {
+ // 'head' may be the current 'free' or the newly deallocated block
+ int old_free = free;
+
+ if(old_free != -1 && old_free != head) {
+ blocks[old_free].prev = head;
+ blocks[head].next = old_free;
+ }
+ free = head;
+ }
+ }
+
///////////////////////////////////////////////////////////////////////////
// Generic Context (shared by the simulator and the HW context)
///////////////////////////////////////////////////////////////////////////
@@ -317,7 +456,6 @@ namespace gbe
this->simdWidth = nextHighestPowerOf2(OCL_SIMD_WIDTH);
else
this->simdWidth = fn.getSimdWidth();
- this->scratchOffset = 0;
}
Context::~Context(void) {
@@ -340,7 +478,7 @@ namespace gbe
this->kernel = NULL;
}
if(this->kernel != NULL) {
- this->kernel->scratchSize = alignScratchSize(this->scratchOffset);
+ this->kernel->scratchSize = alignScratchSize(scratchAllocator.getMaxSize());
this->kernel->ctx = this;
}
return this->kernel;
@@ -378,9 +516,10 @@ namespace gbe
}
uint32_t Context::allocateScratchMem(uint32_t size) {
- uint32_t offset = scratchOffset;
- scratchOffset += size;
- return offset;
+ return scratchAllocator.allocate(size);
+ }
+ void Context::deallocateScratchMem(uint32_t offset) {
+ scratchAllocator.deallocate(offset);
}
void Context::buildStack(void) {
diff --git a/backend/src/backend/context.hpp b/backend/src/backend/context.hpp
index 000612e..a63ddb3 100644
--- a/backend/src/backend/context.hpp
+++ b/backend/src/backend/context.hpp
@@ -43,6 +43,30 @@ namespace gbe
class Kernel; // context creates Kernel
class RegisterFilePartitioner; // Partition register file for reg allocation
+ struct ScratchBlock {
+ ScratchBlock(uint8_t _size, uint8_t _used, int _prev, int _next)
+ :size(_size), used(_used), prev(_prev), next(_next)
+ {}
+ uint8_t size; //!< size of the block if this is the head, otherwise 0
+ uint8_t used; //!< 1 if used, 0 if free
+ int prev; //!< for a block head, the previous free block; otherwise points to the block head
+ int next; //!< -1 when used; the next free block when free
+ };
+
+ class ScratchAllocator {
+ public:
+ ScratchAllocator(): free(-1), maxOffset(0) {}
+ ~ScratchAllocator() { blocks.clear(); }
+ int allocate(int size);
+ void deallocate(int offset);
+ int getMaxSize() { return maxOffset * fixedSize; }
+ void dumpAllBlocks();
+ private:
+ std::vector<ScratchBlock> blocks;
+ int free;
+ int maxOffset;
+ static const int fixedSize = 32;
+ };
/*! Context is the helper structure to build the Gen ISA or simulation code
* from GenIR
*/
@@ -95,6 +119,8 @@ namespace gbe
uint32_t getImageInfoCurbeOffset(ir::ImageInfoKey key, size_t size);
/*! allocate size scratch memory and return start address */
uint32_t allocateScratchMem(uint32_t size);
+ /*! deallocate scratch memory at offset */
+ void deallocateScratchMem(uint32_t offset);
/*! Preallocated curbe register set including special registers. */
map<ir::Register, uint32_t> curbeRegs;
protected:
@@ -133,7 +159,7 @@ namespace gbe
set<ir::LabelIndex> usedLabels; //!< Set of all used labels
JIPMap JIPs; //!< Where to jump all labels/branches
uint32_t simdWidth; //!< Number of lanes per HW threads
- uint32_t scratchOffset; //!< scratch slot for next scratch memory request
+ ScratchAllocator scratchAllocator; //!< scratch memory allocator
GBE_CLASS(Context); //!< Use custom allocators
};
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index 726b78c..2bab785 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -177,7 +177,7 @@ namespace gbe
INLINE bool spillReg(GenRegInterval interval, bool isAllocated = false);
INLINE bool spillReg(ir::Register reg, bool isAllocated = false);
INLINE bool vectorCanSpill(SelectionVector *vector);
-
+ INLINE void allocateScratchForSpilled();
/*! Use custom allocator */
GBE_CLASS(Opaque);
};
@@ -576,6 +576,8 @@ namespace gbe
}
if (!spilledRegs.empty()) {
GBE_ASSERT(reservedReg != 0);
+ allocateScratchForSpilled();
+
bool success = selection.spillRegs(spilledRegs, reservedReg);
if (!success) {
std::cerr << "Fail to spill registers." << std::endl;
@@ -585,6 +587,43 @@ namespace gbe
return true;
}
+ INLINE void GenRegAllocator::Opaque::allocateScratchForSpilled()
+ {
+ const uint32_t regNum = spilledRegs.size();
+ this->starting.resize(regNum);
+ this->ending.resize(regNum);
+ uint32_t regID = 0;
+ for(auto it = spilledRegs.begin(); it != spilledRegs.end(); ++it) {
+ this->starting[regID] = this->ending[regID] = &intervals[it->first];
+ regID++;
+ }
+ std::sort(this->starting.begin(), this->starting.end(), cmp<true>);
+ std::sort(this->ending.begin(), this->ending.end(), cmp<false>);
+ int toExpire = 0;
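+ // Linear scan in interval start order: before allocating scratch for
+ // the next spilled interval, retire the earliest-ending interval if it
+ // has already expired, so its scratch block can be reused right away.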
+ for(uint32_t i = 0; i < regNum; i++) {
+ const GenRegInterval * cur = starting[i];
+ const GenRegInterval * exp = ending[toExpire];
+ if(exp->maxID <= cur->minID) {
+ auto it = spilledRegs.find(exp->reg);
+ GBE_ASSERT(it != spilledRegs.end());
+ if(it->second.addr != -1) {
+ ctx.deallocateScratchMem(it->second.addr);
+ }
+ toExpire++;
+ }
+ auto it = spilledRegs.find(cur->reg);
+ GBE_ASSERT(it != spilledRegs.end());
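+ // An interval that opens and closes at the same instruction belongs
+ // to a short-lived temporary; mark it with addr == -1 so no scratch
+ // memory is reserved for it.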
+ if(cur->minID == cur->maxID) {
+ it->second.addr = -1;
+ continue;
+ }
+
+ ir::RegisterFamily family = ctx.sel->getRegisterFamily(cur->reg);
+ it->second.addr = ctx.allocateScratchMem(getFamilySize(family)
+ * ctx.getSimdWidth());
+ }
+ }
+
INLINE bool GenRegAllocator::Opaque::expireReg(ir::Register reg)
{
auto it = RA.find(reg);
@@ -636,14 +675,7 @@ namespace gbe
return false;
SpillRegTag spillTag;
spillTag.isTmpReg = interval.maxID == interval.minID;
- if (!spillTag.isTmpReg) {
- // FIXME, we can optimize scratch allocation according to
- // the interval information.
- ir::RegisterFamily family = ctx.sel->getRegisterFamily(interval.reg);
- spillTag.addr = ctx.allocateScratchMem(getFamilySize(family)
- * ctx.getSimdWidth());
- } else
- spillTag.addr = -1;
+
if (isAllocated) {
// If this register is allocated, we need to expire it and erase it
// from the RA map.
--
1.7.9.5