[Beignet] [PATCH V3] GBE: Optimize scratch memory usage using register interval

Ruiling Song ruiling.song at intel.com
Thu Feb 27 18:16:45 PST 2014


As scratch memory is a limited resource in HW. And different
register have the opptunity to share same scratch memory. So
I introduce an allocator for scratch memory management.

v2:
In order to reuse the registerFilePartitioner, I rename it as
SimpleAllocator, and derive ScratchAllocator & RegisterAllocator
from it.

v3:
fix a typo, scratch size is 12KB.

Signed-off-by: Ruiling Song <ruiling.song at intel.com>
---
 backend/src/backend/context.cpp            |  114 ++++++++++++++++++----------
 backend/src/backend/context.hpp            |   11 ++-
 backend/src/backend/gen_reg_allocation.cpp |   50 +++++++++---
 3 files changed, 120 insertions(+), 55 deletions(-)

diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 51a0628..b806586 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -35,24 +35,13 @@
 
 namespace gbe
 {
-  /*! Structure that keeps track of allocation in the register file. This is
-   *  actually needed by Context (and not only by GenContext) because both
-   *  simulator and hardware have to deal with constant pushing which uses the
-   *  register file
-   *
-   *  Since Gen is pretty flexible, we just maintain a free list for the
-   *  register file (as a classical allocator) and coalesce blocks when required
-   */
-  class RegisterFilePartitioner
+  class SimpleAllocator
   {
   public:
-    RegisterFilePartitioner(void);
-    ~RegisterFilePartitioner(void);
+    SimpleAllocator(int16_t startOffset, int16_t size, bool _assertFail);
+    ~SimpleAllocator(void);
 
-    /*! Allocate some memory in the register file. Return 0 if out-of-memory. By
-     *  the way, zero is not a valid offset since r0 is always preallocated by
-     *  the hardware. Note that we always use the left most block when
-     *  allocating, so it makes sense for constant pushing
+    /*! Allocate some memory from the pool.
      */
     int16_t allocate(int16_t size, int16_t alignment, bool bFwd=false);
 
@@ -62,10 +51,7 @@ namespace gbe
     /*! Spilt a block into 2 blocks */
     void splitBlock(int16_t offset, int16_t subOffset);
 
-  private:
-    /*! May need to make that run-time in the future */
-    static const int16_t RegisterFileSize = 4*KB;
-
+  protected:
     /*! Double chained list of free spaces */
     struct Block {
       Block(int16_t offset, int16_t size) :
@@ -79,6 +65,10 @@ namespace gbe
      *  If the colascing was done, the left block is deleted
      */
     void coalesce(Block *left, Block *right);
+    /*! the maximum offset */
+    int16_t maxOffset;
+    /*! whether trigger an assertion on allocation failure */
+    bool assertFail;
     /*! Head and tail of the free list */
     Block *head;
     Block *tail;
@@ -87,17 +77,46 @@ namespace gbe
     /*! Track allocated memory blocks <offset, size> */
     map<int16_t, int16_t> allocatedBlocks;
     /*! Use custom allocators */
-    GBE_CLASS(RegisterFilePartitioner);
+    GBE_CLASS(SimpleAllocator);
+  };
+
+  /*! Structure that keeps track of allocation in the register file. This is
+   *  actually needed by Context (and not only by GenContext) because both
+   *  simulator and hardware have to deal with constant pushing which uses the
+   *  register file
+   *
+   *  Since Gen is pretty flexible, we just reuse the Simpleallocator
+   */
+
+  class RegisterAllocator: public SimpleAllocator {
+  public:
+    RegisterAllocator(int16_t offset, int16_t size): SimpleAllocator(offset, size, false) {}
+
+    GBE_CLASS(RegisterAllocator);
+  };
+
+  /*!
+   * an allocator for scratch memory allocation. Scratch memory are used for register spilling.
+   * You can query how much scratch memory needed through getMaxScatchMemUsed().
+   */
+
+  class ScratchAllocator: public SimpleAllocator {
+  public:
+    ScratchAllocator(int16_t size): SimpleAllocator(0, size, true) {}
+    int16_t getMaxScatchMemUsed() { return maxOffset; }
+
+    GBE_CLASS(ScratchAllocator);
   };
 
-  RegisterFilePartitioner::RegisterFilePartitioner(void) {
-    // r0 is always set by the HW and used at the end by EOT
-    const int16_t offset = GEN_REG_SIZE;
-    const int16_t size = RegisterFileSize  - offset;
-    tail = head = this->newBlock(offset, size);
+  SimpleAllocator::SimpleAllocator(int16_t startOffset,
+                                   int16_t size,
+                                   bool _assertFail)
+                                  : maxOffset(0),
+                                  assertFail(_assertFail){
+    tail = head = this->newBlock(startOffset, size);
   }
 
-  RegisterFilePartitioner::~RegisterFilePartitioner(void) {
+  SimpleAllocator::~SimpleAllocator(void) {
     while (this->head) {
       Block *next = this->head->next;
       this->deleteBlock(this->head);
@@ -105,7 +124,7 @@ namespace gbe
     }
   }
 
-  int16_t RegisterFilePartitioner::allocate(int16_t size, int16_t alignment, bool bFwd)
+  int16_t SimpleAllocator::allocate(int16_t size, int16_t alignment, bool bFwd)
   {
     // Make it simple and just use the first block we find
     Block *list = bFwd ? head : tail;
@@ -205,13 +224,16 @@ namespace gbe
 
       // Track the allocation to retrieve the size later
       allocatedBlocks.insert(std::make_pair(aligned, size));
+      // update max offset
+      if(aligned + size > maxOffset) maxOffset = aligned + size;
       // We have a valid offset now
       return aligned;
     }
+    GBE_ASSERT( !assertFail );
     return 0;
   }
 
-  void RegisterFilePartitioner::deallocate(int16_t offset)
+  void SimpleAllocator::deallocate(int16_t offset)
   {
     // Retrieve the size in the allocation map
     auto it = allocatedBlocks.find(offset);
@@ -254,7 +276,7 @@ namespace gbe
     allocatedBlocks.erase(it);
   }
 
-  void RegisterFilePartitioner::coalesce(Block *left, Block *right) {
+  void SimpleAllocator::coalesce(Block *left, Block *right) {
     if (left == NULL || right == NULL) return;
     GBE_ASSERT(left->offset < right->offset);
     GBE_ASSERT(left->next == right);
@@ -270,7 +292,7 @@ namespace gbe
     }
   }
 
-  void RegisterFilePartitioner::splitBlock(int16_t offset, int16_t subOffset) {
+  void SimpleAllocator::splitBlock(int16_t offset, int16_t subOffset) {
     // Retrieve the size in the allocation map
     auto it = allocatedBlocks.find(offset);
     GBE_ASSERT(it != allocatedBlocks.end());
@@ -300,6 +322,7 @@ namespace gbe
 
     return i;
   }
+
   ///////////////////////////////////////////////////////////////////////////
   // Generic Context (shared by the simulator and the HW context)
   ///////////////////////////////////////////////////////////////////////////
@@ -311,16 +334,18 @@ namespace gbe
     GBE_ASSERT(unit.getPointerSize() == ir::POINTER_32_BITS);
     this->liveness = GBE_NEW(ir::Liveness, const_cast<ir::Function&>(fn));
     this->dag = GBE_NEW(ir::FunctionDAG, *this->liveness);
-    this->partitioner = GBE_NEW_NO_ARG(RegisterFilePartitioner);
+    // r0 (GEN_REG_SIZE) is always set by the HW and used at the end by EOT
+    this->registerAllocator = GBE_NEW(RegisterAllocator, GEN_REG_SIZE, 4*KB - GEN_REG_SIZE);
+    this->scratchAllocator = GBE_NEW(ScratchAllocator, 12*KB);
     if (fn.getSimdWidth() == 0 || OCL_SIMD_WIDTH != 15)
       this->simdWidth = nextHighestPowerOf2(OCL_SIMD_WIDTH);
     else
       this->simdWidth = fn.getSimdWidth();
-    this->scratchOffset = 0;
   }
 
   Context::~Context(void) {
-    GBE_SAFE_DELETE(this->partitioner);
+    GBE_SAFE_DELETE(this->registerAllocator);
+    GBE_SAFE_DELETE(this->scratchAllocator);
     GBE_SAFE_DELETE(this->dag);
     GBE_SAFE_DELETE(this->liveness);
   }
@@ -339,20 +364,20 @@ namespace gbe
       this->kernel = NULL;
     }
     if(this->kernel != NULL) {
-      this->kernel->scratchSize = alignScratchSize(this->scratchOffset);
+      this->kernel->scratchSize = alignScratchSize(scratchAllocator->getMaxScatchMemUsed());
       this->kernel->ctx = this;
     }
     return this->kernel;
   }
 
   int16_t Context::allocate(int16_t size, int16_t alignment) {
-    return partitioner->allocate(size, alignment);
+    return registerAllocator->allocate(size, alignment);
   }
 
-  void Context::deallocate(int16_t offset) { partitioner->deallocate(offset); }
+  void Context::deallocate(int16_t offset) { registerAllocator->deallocate(offset); }
 
   void Context::splitBlock(int16_t offset, int16_t subOffset) {
-    partitioner->splitBlock(offset, subOffset);
+    registerAllocator->splitBlock(offset, subOffset);
   }
 
   int32_t Context::allocConstBuf(uint32_t argID) {
@@ -376,10 +401,15 @@ namespace gbe
     return offset + GEN_REG_SIZE;
   }
 
-  uint32_t Context::allocateScratchMem(uint32_t size) {
-    uint32_t offset = scratchOffset;
-    scratchOffset += size;
-    return offset;
+  // FIXME TODO as we optimize scratch memory usage using the register interval.
+  // we need to add some dependency in post_reg_alloc scheduler, to keep scratch
+  // memory that are reused still keep the order
+
+  int32_t Context::allocateScratchMem(uint32_t size) {
+    return scratchAllocator->allocate(size, 32, true);
+  }
+  void Context::deallocateScratchMem(int32_t offset) {
+    scratchAllocator->deallocate(offset);
   }
 
   void Context::buildStack(void) {
@@ -402,7 +432,7 @@ namespace gbe
                               uint32_t alignment)
   {
     alignment = alignment == 0 ? size : alignment;
-    const uint32_t offset = partitioner->allocate(size, alignment, 1);
+    const uint32_t offset = registerAllocator->allocate(size, alignment, 1);
     GBE_ASSERT(offset >= GEN_REG_SIZE);
     kernel->patches.push_back(PatchInfo(value, subValue, offset - GEN_REG_SIZE));
     kernel->curbeSize = std::max(kernel->curbeSize, offset + size - GEN_REG_SIZE);
diff --git a/backend/src/backend/context.hpp b/backend/src/backend/context.hpp
index 000612e..ac940bd 100644
--- a/backend/src/backend/context.hpp
+++ b/backend/src/backend/context.hpp
@@ -41,7 +41,8 @@ namespace ir {
 namespace gbe
 {
   class Kernel;                 // context creates Kernel
-  class RegisterFilePartitioner; // Partition register file for reg allocation
+  class RegisterAllocator;      // allocator for physical register allocation
+  class ScratchAllocator;       // allocator for scratch memory allocation
 
   /*! Context is the helper structure to build the Gen ISA or simulation code
    *  from GenIR
@@ -94,7 +95,9 @@ namespace gbe
     /*! Get (search or allocate if fail to find one) image info curbeOffset.*/
     uint32_t getImageInfoCurbeOffset(ir::ImageInfoKey key, size_t size);
     /*! allocate size scratch memory and return start address */
-    uint32_t allocateScratchMem(uint32_t size);
+    int32_t allocateScratchMem(uint32_t size);
+    /*! deallocate scratch memory at offset */
+    void deallocateScratchMem(int32_t offset);
     /*! Preallocated curbe register set including special registers. */
     map<ir::Register, uint32_t> curbeRegs;
   protected:
@@ -129,11 +132,11 @@ namespace gbe
     Kernel *kernel;                       //!< Kernel we are building
     ir::Liveness *liveness;               //!< Liveness info for the variables
     ir::FunctionDAG *dag;                 //!< Graph of values on the function
-    RegisterFilePartitioner *partitioner; //!< Handle register file partionning
+    RegisterAllocator *registerAllocator; //!< physical register allocation
+    ScratchAllocator *scratchAllocator;   //!< scratch memory allocator
     set<ir::LabelIndex> usedLabels;       //!< Set of all used labels
     JIPMap JIPs;                          //!< Where to jump all labels/branches
     uint32_t simdWidth;                   //!< Number of lanes per HW threads
-    uint32_t scratchOffset;               //!< scratch slot for next scratch memory request
     GBE_CLASS(Context);                   //!< Use custom allocators
   };
 
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index c282b36..fdc1d2f 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -174,7 +174,7 @@ namespace gbe
     INLINE bool spillReg(GenRegInterval interval, bool isAllocated = false);
     INLINE bool spillReg(ir::Register reg, bool isAllocated = false);
     INLINE bool vectorCanSpill(SelectionVector *vector);
-
+    INLINE void allocateScratchForSpilled();
     /*! Use custom allocator */
     GBE_CLASS(Opaque);
   };
@@ -583,6 +583,8 @@ namespace gbe
     }
     if (!spilledRegs.empty()) {
       GBE_ASSERT(reservedReg != 0);
+      allocateScratchForSpilled();
+
       bool success = selection.spillRegs(spilledRegs, reservedReg);
       if (!success) {
         std::cerr << "Fail to spill registers." << std::endl;
@@ -592,6 +594,43 @@ namespace gbe
     return true;
   }
 
+  INLINE void GenRegAllocator::Opaque::allocateScratchForSpilled()
+  {
+    const uint32_t regNum = spilledRegs.size();
+    this->starting.resize(regNum);
+    this->ending.resize(regNum);
+    uint32_t regID = 0;
+    for(auto it = spilledRegs.begin(); it != spilledRegs.end(); ++it) {
+      this->starting[regID] = this->ending[regID] = &intervals[it->first];
+      regID++;
+    }
+    std::sort(this->starting.begin(), this->starting.end(), cmp<true>);
+    std::sort(this->ending.begin(), this->ending.end(), cmp<false>);
+    int toExpire = 0;
+    for(uint32_t i = 0; i < regNum; i++) {
+      const GenRegInterval * cur = starting[i];
+      const GenRegInterval * exp = ending[toExpire];
+      if(exp->maxID < cur->minID) {
+        auto it = spilledRegs.find(exp->reg);
+        GBE_ASSERT(it != spilledRegs.end());
+        if(it->second.addr != -1) {
+          ctx.deallocateScratchMem(it->second.addr);
+        }
+        toExpire++;
+      }
+      auto it = spilledRegs.find(cur->reg);
+      GBE_ASSERT(it != spilledRegs.end());
+      if(cur->minID == cur->maxID) {
+        it->second.addr = -1;
+        continue;
+      }
+
+      ir::RegisterFamily family = ctx.sel->getRegisterFamily(cur->reg);
+      it->second.addr = ctx.allocateScratchMem(getFamilySize(family)
+                                             * ctx.getSimdWidth());
+      }
+  }
+
   INLINE bool GenRegAllocator::Opaque::expireReg(ir::Register reg)
   {
     auto it = RA.find(reg);
@@ -643,14 +682,7 @@ namespace gbe
       return false;
     SpillRegTag spillTag;
     spillTag.isTmpReg = interval.maxID == interval.minID;
-    if (!spillTag.isTmpReg) {
-      // FIXME, we can optimize scratch allocation according to
-      // the interval information.
-      ir::RegisterFamily family = ctx.sel->getRegisterFamily(interval.reg);
-      spillTag.addr = ctx.allocateScratchMem(getFamilySize(family)
-                                             * ctx.getSimdWidth());
-    } else
-      spillTag.addr = -1;
+
     if (isAllocated) {
       // If this register is allocated, we need to expire it and erase it
       // from the RA map.
-- 
1.7.9.5



More information about the Beignet mailing list