[Beignet] [PATCH] GBE: Optimize scratch memory usage using register interval

Ruiling Song ruiling.song at intel.com
Thu Feb 20 23:23:49 PST 2014


As scratch memory is a limited resource in HW, different
registers have the opportunity to share the same scratch memory. So I
implement a specially designed allocator to reduce scratch memory usage;
it is assumed that scratch memory is always allocated in multiples of 32 bytes.

Signed-off-by: Ruiling Song <ruiling.song at intel.com>
---
 backend/src/backend/context.cpp            |  149 +++++++++++++++++++++++++++-
 backend/src/backend/context.hpp            |   28 +++++-
 backend/src/backend/gen_reg_allocation.cpp |   50 ++++++++--
 3 files changed, 212 insertions(+), 15 deletions(-)

diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 2125bd1..24f4f6f 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -301,6 +301,145 @@ namespace gbe
 
     return i;
   }
+ /*
+  * As scratch memory is allocated in multiples of fixedSize(32), we design this simple allocator
+  * to make allocation and free efficient. A ScratchBlock is a multiple of fixedSize with one head,
+  * and possibly one tail and internal elements; each element is defined as [size, used, prev, next].
+  * The block header's size records the real block size; the other elements in the block have size 0.
+  * When an element is a block header, its [prev/next] are used for the free block list.
+  * The tail element's prev points to the block header. When a block is freed, it checks
+  * the left and right blocks for a possible merge.
+  */
+  // XXX TODO: since we optimize scratch memory usage using the register interval,
+  // we need to add some dependencies in the post_reg_alloc scheduler to keep
+  // accesses to reused scratch memory in order.
+int ScratchAllocator::allocate(int bytes) {
+    GBE_ASSERT(bytes % fixedSize == 0);
+    int size = bytes / fixedSize;
+    int target = free;
+    while(target != -1 && blocks[target].size < size) {
+      target = blocks[target].next;
+    }
+
+    if(target == -1) {
+      // no suitable block found, simply allocate enough
+      int ret = maxOffset*fixedSize;
+      blocks.push_back(ScratchBlock(size, 1, -1, -1));
+      for(int i = 0; i < size-1; i++)
+        blocks.push_back(ScratchBlock(0, 1, maxOffset, -1));
+      maxOffset = blocks.size();
+      return ret;
+    } else {
+      // found one block large enough
+      int old_size = blocks[target].size;
+      for(int i = 0; i < size; i++) {
+        blocks[target + i].used = 1;
+        blocks[target + i].size = 0;
+      }
+      blocks[target].size = size;
+
+      if(old_size > size) {
+        // split the larger block
+        if(size > 1)
+          blocks[target + size - 1].prev = target;
+
+        // setup new free block
+        int new_block = target + size;
+        int new_size = old_size - size;
+        blocks[new_block].size = new_size;
+        blocks[new_block].prev = blocks[target].prev;
+        blocks[new_block].next = blocks[target].next;
+        if(new_size > 1)
+          blocks[new_block + new_size - 1].prev = new_block;
+
+        // setup free list
+        if(blocks[target].prev != -1)
+          blocks[blocks[target].prev].next = new_block;
+        else {
+          GBE_ASSERT(free == target);
+          free = new_block;
+        }
+        if(blocks[target].next != -1)
+          blocks[blocks[target].next].prev = new_block;
+      } else {
+        // of same size
+        if(blocks[target].prev == -1) {
+          GBE_ASSERT(free == target);
+          free = blocks[target].next;
+        }else {
+          blocks[blocks[target].prev].next = blocks[target].next;
+        }
+
+        if(blocks[target].next != -1)
+          blocks[blocks[target].next].prev = blocks[target].prev;
+      }
+      blocks[target].prev = blocks[target].next = -1;
+
+      return target * fixedSize;
+    }
+  }
+  void ScratchAllocator::dumpAllBlocks() {
+    std::cout << "---All Scratch Blocks Dumped (with free = " << free << ")---" << std::endl;
+    for(int i = 0; i < blocks.size(); i++) {
+      std::cout << "[" << i << ", sz(" << (int)blocks[i].size << "), u(" << (int)blocks[i].used
+                << "), p("<< blocks[i].prev << "), n(" << blocks[i].next << ")]"<<std::endl;
+    }
+  }
+  void ScratchAllocator::deallocate(int offset) {
+    offset /= fixedSize;
+    int size = blocks[offset].size;
+    for(int i = 0; i < size; i++) {
+      blocks[offset+i].used = 0;
+      blocks[offset+i].next = -1;
+    }
+
+    int head = offset;
+    // check left can be merged?
+    if(offset > 0 && blocks[offset-1].used == 0) {
+      int lblock_head = offset -1;
+      if(blocks[offset -1].size == 0) // block tail
+        lblock_head = blocks[offset-1].prev;
+      blocks[lblock_head].size += size;
+      blocks[offset+size-1].prev = lblock_head; // point to new block head
+      blocks[offset].size = 0; // not head now
+      GBE_ASSERT(blocks[offset+size-1].size == 0);
+      head = lblock_head;
+    }
+
+    int rblock_head = offset+size;
+    // check right can be merged?
+    if(rblock_head < maxOffset && blocks[rblock_head].used == 0) {
+      int rblock_size = blocks[rblock_head].size;
+      blocks[head].size += rblock_size;
+      GBE_ASSERT(rblock_size != 0);
+      blocks[rblock_head].size = 0; // not head now
+
+      // let rblock's prev directly point to rblock's next
+      if(blocks[rblock_head].prev != -1)
+        blocks[blocks[rblock_head].prev].next = blocks[rblock_head].next;
+      else
+        free = blocks[rblock_head].next;
+
+      if(blocks[rblock_head].next != -1)
+        blocks[blocks[rblock_head].next].prev = blocks[rblock_head].prev;
+
+      blocks[rblock_head + rblock_size - 1].prev = head; //points to new block head
+      blocks[rblock_head + rblock_size - 1].next = -1;
+    }
+
+    // insert the block at the front of free list
+    if(blocks[head].prev == -1) {
+      // 'head' maybe 'free' or the newly deallocated
+      int old_free = free;
+
+      if(old_free != -1 && old_free != head) {
+        blocks[old_free].prev = head;
+        blocks[head].next = old_free;
+      }
+      free = head;
+    }
+  }
+
   ///////////////////////////////////////////////////////////////////////////
   // Generic Context (shared by the simulator and the HW context)
   ///////////////////////////////////////////////////////////////////////////
@@ -317,7 +456,6 @@ namespace gbe
       this->simdWidth = nextHighestPowerOf2(OCL_SIMD_WIDTH);
     else
       this->simdWidth = fn.getSimdWidth();
-    this->scratchOffset = 0;
   }
 
   Context::~Context(void) {
@@ -340,7 +478,7 @@ namespace gbe
       this->kernel = NULL;
     }
     if(this->kernel != NULL) {
-      this->kernel->scratchSize = alignScratchSize(this->scratchOffset);
+      this->kernel->scratchSize = alignScratchSize(scratchAllocator.getMaxSize());
       this->kernel->ctx = this;
     }
     return this->kernel;
@@ -378,9 +516,10 @@ namespace gbe
   }
 
   uint32_t Context::allocateScratchMem(uint32_t size) {
-    uint32_t offset = scratchOffset;
-    scratchOffset += size;
-    return offset;
+    return scratchAllocator.allocate(size);
+  }
+  void Context::deallocateScratchMem(uint32_t offset) {
+    scratchAllocator.deallocate(offset);
   }
 
   void Context::buildStack(void) {
diff --git a/backend/src/backend/context.hpp b/backend/src/backend/context.hpp
index 000612e..a63ddb3 100644
--- a/backend/src/backend/context.hpp
+++ b/backend/src/backend/context.hpp
@@ -43,6 +43,30 @@ namespace gbe
   class Kernel;                 // context creates Kernel
   class RegisterFilePartitioner; // Partition register file for reg allocation
 
+  struct ScratchBlock {
+    ScratchBlock(uint8_t _size, uint8_t _used, int _prev, int _next)
+      :size(_size), used(_used), prev(_prev), next(_next)
+    {}
+    uint8_t size;      //!< size of the block if this the head, otherwise should be 0
+    uint8_t used;      //!< used: 1 free: 0
+    int prev;          //!< if this is block head, it means previous free block, otherwise point to block head
+    int next;          //!< used: -1, free: next free block
+  };
+
+  class ScratchAllocator {
+    public:
+      ScratchAllocator(): free(-1), maxOffset(0) {}
+      ~ScratchAllocator() { blocks.clear(); }
+      int allocate(int size);
+      void deallocate(int offset);
+      int getMaxSize() { return maxOffset* fixedSize; }
+      void dumpAllBlocks();
+    private:
+      std::vector<ScratchBlock> blocks;
+      int free;
+      int maxOffset;
+      static const int fixedSize = 32;
+  };
   /*! Context is the helper structure to build the Gen ISA or simulation code
    *  from GenIR
    */
@@ -95,6 +119,8 @@ namespace gbe
     uint32_t getImageInfoCurbeOffset(ir::ImageInfoKey key, size_t size);
     /*! allocate size scratch memory and return start address */
     uint32_t allocateScratchMem(uint32_t size);
+    /*! deallocate scratch memory at offset */
+    void deallocateScratchMem(uint32_t offset);
     /*! Preallocated curbe register set including special registers. */
     map<ir::Register, uint32_t> curbeRegs;
   protected:
@@ -133,7 +159,7 @@ namespace gbe
     set<ir::LabelIndex> usedLabels;       //!< Set of all used labels
     JIPMap JIPs;                          //!< Where to jump all labels/branches
     uint32_t simdWidth;                   //!< Number of lanes per HW threads
-    uint32_t scratchOffset;               //!< scratch slot for next scratch memory request
+    ScratchAllocator scratchAllocator;    //!< scratch memory allocator
     GBE_CLASS(Context);                   //!< Use custom allocators
   };
 
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index 726b78c..2bab785 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -177,7 +177,7 @@ namespace gbe
     INLINE bool spillReg(GenRegInterval interval, bool isAllocated = false);
     INLINE bool spillReg(ir::Register reg, bool isAllocated = false);
     INLINE bool vectorCanSpill(SelectionVector *vector);
-
+    INLINE void allocateScratchForSpilled();
     /*! Use custom allocator */
     GBE_CLASS(Opaque);
   };
@@ -576,6 +576,8 @@ namespace gbe
     }
     if (!spilledRegs.empty()) {
       GBE_ASSERT(reservedReg != 0);
+      allocateScratchForSpilled();
+
       bool success = selection.spillRegs(spilledRegs, reservedReg);
       if (!success) {
         std::cerr << "Fail to spill registers." << std::endl;
@@ -585,6 +587,43 @@ namespace gbe
     return true;
   }
 
+  INLINE void GenRegAllocator::Opaque::allocateScratchForSpilled()
+  {
+    const uint32_t regNum = spilledRegs.size();
+    this->starting.resize(regNum);
+    this->ending.resize(regNum);
+    uint32_t regID = 0;
+    for(auto it = spilledRegs.begin(); it != spilledRegs.end(); ++it) {
+      this->starting[regID] = this->ending[regID] = &intervals[it->first];
+      regID++;
+    }
+    std::sort(this->starting.begin(), this->starting.end(), cmp<true>);
+    std::sort(this->ending.begin(), this->ending.end(), cmp<false>);
+    int toExpire = 0;
+    for(uint32_t i = 0; i < regNum; i++) {
+      const GenRegInterval * cur = starting[i];
+      const GenRegInterval * exp = ending[toExpire];
+      if(exp->maxID <= cur->minID) {
+        auto it = spilledRegs.find(exp->reg);
+        GBE_ASSERT(it != spilledRegs.end());
+        if(it->second.addr != -1) {
+          ctx.deallocateScratchMem(it->second.addr);
+        }
+        toExpire++;
+      }
+      auto it = spilledRegs.find(cur->reg);
+      GBE_ASSERT(it != spilledRegs.end());
+      if(cur->minID == cur->maxID) {
+        it->second.addr = -1;
+        continue;
+      }
+
+      ir::RegisterFamily family = ctx.sel->getRegisterFamily(cur->reg);
+      it->second.addr = ctx.allocateScratchMem(getFamilySize(family)
+                                             * ctx.getSimdWidth());
+      }
+  }
+
   INLINE bool GenRegAllocator::Opaque::expireReg(ir::Register reg)
   {
     auto it = RA.find(reg);
@@ -636,14 +675,7 @@ namespace gbe
       return false;
     SpillRegTag spillTag;
     spillTag.isTmpReg = interval.maxID == interval.minID;
-    if (!spillTag.isTmpReg) {
-      // FIXME, we can optimize scratch allocation according to
-      // the interval information.
-      ir::RegisterFamily family = ctx.sel->getRegisterFamily(interval.reg);
-      spillTag.addr = ctx.allocateScratchMem(getFamilySize(family)
-                                             * ctx.getSimdWidth());
-    } else
-      spillTag.addr = -1;
+
     if (isAllocated) {
       // If this register is allocated, we need to expire it and erase it
       // from the RA map.
-- 
1.7.9.5



More information about the Beignet mailing list