[Beignet] [PATCH V2 3/3] Backend: Add hole reuse in reg alloction
Xiuli Pan
xiuli.pan at intel.com
Tue Oct 18 09:45:52 UTC 2016
From: Pan Xiuli <xiuli.pan at intel.com>
We first find regs that have pool in simple linear scale, and save them
in HoleRegPool, when allocte regs we first try to search fit candidate
in the pool and choose the most fit one to reuse.
V2: Refine hole reuse only in one block.
DEBUG version: You can export DUMPHOLE for easy review.
Signed-off-by: Pan Xiuli <xiuli.pan at intel.com>
---
backend/src/backend/gen_insn_selection.cpp | 31 ++++++
backend/src/backend/gen_insn_selection.hpp | 3 +
backend/src/backend/gen_reg_allocation.cpp | 167 ++++++++++++++++++++++++++---
backend/src/backend/gen_reg_allocation.hpp | 7 ++
4 files changed, 195 insertions(+), 13 deletions(-)
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 97ee17a..ba78858 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -225,6 +225,37 @@ namespace gbe
return this->opcode == SEL_OP_LABEL;
}
+ bool SelectionInstruction::isNative(void) const {
+ return this->opcode == SEL_OP_NOT || /* ALU1 */
+ this->opcode == SEL_OP_LZD ||
+ this->opcode == SEL_OP_RNDZ ||
+ this->opcode == SEL_OP_RNDE ||
+ this->opcode == SEL_OP_RNDD ||
+ this->opcode == SEL_OP_RNDU ||
+ this->opcode == SEL_OP_FRC ||
+ this->opcode == SEL_OP_F16TO32 ||
+ this->opcode == SEL_OP_F32TO16 ||
+ this->opcode == SEL_OP_CBIT ||
+ this->opcode == SEL_OP_SEL || /* ALU2 */
+ this->opcode == SEL_OP_AND ||
+ this->opcode == SEL_OP_OR ||
+ this->opcode == SEL_OP_XOR ||
+ this->opcode == SEL_OP_SHR ||
+ this->opcode == SEL_OP_SHL ||
+ this->opcode == SEL_OP_RSR ||
+ this->opcode == SEL_OP_RSL ||
+ this->opcode == SEL_OP_ASR ||
+ this->opcode == SEL_OP_SEL ||
+ this->opcode == SEL_OP_ADD ||
+ this->opcode == SEL_OP_MUL ||
+ this->opcode == SEL_OP_FBH ||
+ this->opcode == SEL_OP_FBL ||
+ this->opcode == SEL_OP_MACH ||
+ this->opcode == SEL_OP_MATH ||
+ this->opcode == SEL_OP_LRP || /* ALU3 */
+ this->opcode == SEL_OP_MAD;
+ }
+
///////////////////////////////////////////////////////////////////////////
// SelectionVector
///////////////////////////////////////////////////////////////////////////
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index 814b449..2719858 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -82,6 +82,8 @@ namespace gbe
bool isBranch(void) const;
/*! Is it a label instruction (i.e. change the implicit mask) */
bool isLabel(void) const;
+ /*! Is it a simple navtive label instruction (i.e. will be one simple ISA) */
+ bool isNative(void) const;
/*! Get the destination register */
GenRegister &dst(uint32_t dstID) { return regs[dstID]; }
/*! Get the source register */
@@ -259,6 +261,7 @@ namespace gbe
bool hasBarrier;
bool hasBranch;
bool removeSimpleIfEndif;
+ uint32_t ID;
};
enum SEL_IR_OPT_FEATURE {
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index 4451efb..08f087a 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -50,12 +50,14 @@ namespace gbe
struct GenRegInterval {
INLINE GenRegInterval(ir::Register reg) :
reg(reg), minID(INT_MAX), maxID(-INT_MAX), accessCount(0),
- conflictReg(0), b3OpAlign(0) {}
+ conflictReg(0), b3OpAlign(0) , isreused(0), ishole(0),
+ lastblockID(-1), allocID(-1), useID(-1){}
ir::Register reg; //!< (virtual) register of the interval
int32_t minID, maxID; //!< Starting and ending points
int32_t accessCount;
ir::Register conflictReg; // < has banck conflict with this register
- bool b3OpAlign;
+ bool b3OpAlign, isreused, ishole;
+ int32_t allocblockID, lastblockID, allocID, useID;
};
struct SpillInterval {
@@ -123,7 +125,7 @@ namespace gbe
/*! Allocate the vectors detected in the instruction selection pass */
void allocateVector(Selection &selection);
/*! Allocate the given interval. Return true if success */
- bool createGenReg(const Selection &selection, const GenRegInterval &interval);
+ bool createGenReg(const Selection &selection, GenRegInterval &interval);
/*! Indicate if the registers are already allocated in vectors */
bool isAllocated(const SelectionVector *vector) const;
/*! Reallocate registers if needed to make the registers in the vector
@@ -164,6 +166,8 @@ namespace gbe
uint32_t reservedReg;
/*! Current vector to expire */
uint32_t expiringID;
+ /*! PHI regs that can be reused */
+ map<uint32_t, vector<HoleRegTag>> HoleRegPool;
INLINE void insertNewReg(const Selection &selection, ir::Register reg, uint32_t grfOffset, bool isVector = false);
INLINE bool expireReg(ir::Register reg);
INLINE bool spillAtInterval(GenRegInterval interval, int size, uint32_t alignment);
@@ -278,14 +282,67 @@ namespace gbe
}
}
- bool GenRegAllocator::Opaque::createGenReg(const Selection &selection, const GenRegInterval &interval) {
+ INLINE float IDFitess(int a, int b) {
+ if (a == b) return 1.0f;
+ else if (a > b) return 1.0f/(a - b -1);
+ else return 2.0f;
+ }
+ INLINE float getPhiRegFitness(const GenRegInterval &interval, HoleRegTag &holeRegTag) {
+ int regstID = interval.allocID;
+ int regendID = interval.useID;
+ int holeregstID = holeRegTag.startID;
+ int holeregendID = holeRegTag.endID;
+ return IDFitess(regstID, holeregstID) + IDFitess(holeregendID, regendID);
+ }
+
+
+ bool GenRegAllocator::Opaque::createGenReg(const Selection &selection, GenRegInterval &interval) {
using namespace ir;
const ir::Register reg = interval.reg;
- if (RA.contains(reg) == true)
- return true; // already allocated
uint32_t regSize;
ir::RegisterFamily family;
getRegAttrib(reg, regSize, &family);
+ // Reg reuse need very strict condition check, otherwise may have errors
+ if (!interval.ishole && interval.allocID == interval.minID && interval.useID == interval.maxID)
+ {
+ uint32_t blockID = interval.lastblockID;
+ uint32_t useID = interval.useID;
+ auto holepool = this->HoleRegPool.find(blockID);
+ // Use block ID as index to get candidate hole reg to reuse
+ if (holepool != this->HoleRegPool.end()) {
+ auto &holepoolvec = holepool->second;
+ HoleRegTag* holeregbest = NULL;
+ float lastfitness = 0;
+ for (auto itr = holepoolvec.begin() ; itr != holepoolvec.end(); ++itr) {
+ if (regSize != itr->regSize)
+ continue;
+ float fitness = getPhiRegFitness(interval, *itr);
+ // reg out of range of holepool reg
+ if (fitness > 2.0f || fitness < 0.5f) continue;
+ if (fitness > lastfitness) {
+ lastfitness = fitness;
+ holeregbest = &*itr;
+ }
+ // reg has one prefect fit use this
+ if (fitness > 1 ) break;
+ }
+ // Reuse the hole and update the holeregpool
+ if (holeregbest) {
+ auto holereg = holeregbest->reg;
+ holeregbest->startID = useID + 1;
+ if (getenv("DUMPHOLE"))
+ printf("%d (%d:%d->%d)has %zu candidate best fitness is %d with %f\n", (int)reg, blockID, interval.allocID, useID, holepoolvec.size(), (int)holereg, lastfitness);
+ if (RA.contains(holereg)) {
+ uint32_t grfOffset = RA.find(holereg)->second;
+ RA.insert(std::make_pair(reg, grfOffset));
+ offsetReg.insert(std::make_pair(grfOffset, reg));
+ interval.isreused = true;
+ }
+ }
+ }
+ }
+ if (RA.contains(reg) == true)
+ return true; // already allocated
uint32_t grfOffset = allocateReg(interval, regSize, regSize);
if (grfOffset == 0) {
return false;
@@ -712,7 +769,7 @@ namespace gbe
ctx.errCode = REGISTER_ALLOCATION_FAIL;
const uint32_t regNum = ctx.sel->getRegNum();
for (uint32_t startID = 0; startID < regNum; ++startID) {
- const GenRegInterval &interval = *this->starting[startID];
+ GenRegInterval &interval = *this->starting[startID];
const ir::Register reg = interval.reg;
if (interval.maxID == -INT_MAX)
@@ -830,6 +887,8 @@ namespace gbe
INLINE bool GenRegAllocator::Opaque::expireReg(ir::Register reg)
{
+ if (this->intervals[reg].isreused)
+ return true;
auto it = RA.find(reg);
if (flagBooleans.contains(reg))
return false;
@@ -1208,10 +1267,13 @@ namespace gbe
}
// Compute the intervals
- int32_t insnID = 0;
+ int insnID = 0;
+ int blockID = 0;
for (auto &block : *selection.blockList) {
+ blockID++;
int32_t lastID = insnID;
- int32_t firstID = insnID;
+ int32_t firstID = insnID ? insnID + 2 : insnID;
+ block.ID = blockID;
// Update the intervals of each used register. Note that we do not
// register allocate R0, so we skip all sub-registers in r0
RegIntervalMap *boolsMap = new RegIntervalMap;
@@ -1219,7 +1281,7 @@ namespace gbe
flag0ReservedBlocks.insert(&block);
for (auto &insn : block.insnList) {
const uint32_t srcNum = insn.srcNum, dstNum = insn.dstNum;
- insn.ID = insnID;
+ insnID = (int)insn.ID;
bool is3SrcOp = insn.opcode == SEL_OP_MAD;
for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
const GenRegister &selReg = insn.src(srcID);
@@ -1244,8 +1306,14 @@ namespace gbe
if (this->intervals[reg].conflictReg == 0 ||
this->intervals[reg].conflictReg > conflictReg)
this->intervals[reg].conflictReg = conflictReg;
- this->intervals[reg].minID = std::min(this->intervals[reg].minID, insnID);
- this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, insnID);
+ int insnsrcID = insnID;
+ // If instruction is simple, src and dst can resed and they will have different IDs
+ if (insn.isNative())
+ insnsrcID -= 1;
+ this->intervals[reg].minID = std::min(this->intervals[reg].minID, insnsrcID);
+ this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, insnsrcID);
+ this->intervals[reg].useID = insnsrcID;
+ this->intervals[reg].lastblockID = blockID;
}
for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
const GenRegister &selReg = insn.dst(dstID);
@@ -1261,6 +1329,30 @@ namespace gbe
}
this->intervals[reg].minID = std::min(this->intervals[reg].minID, insnID);
this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, insnID);
+ // Only find hole reg that not in ocl registers
+ if ( this->intervals[reg].lastblockID == blockID && reg >= ir::ocl::regNum) {
+ int useID = this->intervals[reg].useID;
+ // Check if the reg will leave a hole and add it to HoleRegPool
+ if (useID >= 0 && useID < insnID - 1 && reg >= ir::ocl::regNum) {
+ if (getenv("DUMPHOLE"))
+ printf("reg %d is hole in %d: %d => %d)\n",(int)reg, blockID, useID, insnID);
+ uint32_t regSize;
+ ir::RegisterFamily family;
+ getRegAttrib(reg, regSize, &family);
+ HoleRegTag holeregtag;
+ holeregtag.reg = reg;
+ holeregtag.startID = useID + 1;
+ holeregtag.endID = insnID - 1;
+ holeregtag.regSize = regSize;
+ this->HoleRegPool[blockID].push_back(holeregtag);
+ this->intervals[reg].ishole = true;
+ }
+ }
+ if (this->intervals[reg].allocID == -1)
+ this->intervals[reg].allocID = insnID;
+ if (this->intervals[reg].allocblockID == -1)
+ this->intervals[reg].allocblockID = blockID;
+ this->intervals[reg].lastblockID= blockID;
}
// OK, a flag is used as a predicate or a conditional modifier
@@ -1303,7 +1395,6 @@ namespace gbe
(block.removeSimpleIfEndif && insn.state.flag == 0 && insn.state.subFlag == 0) ));
}
lastID = insnID;
- insnID++;
}
// All registers alive at the begining of the block must update their intervals.
@@ -1422,6 +1513,7 @@ do { \
INLINE void GenRegAllocator::Opaque::outputAllocation(void) {
using namespace std;
cout << "## register allocation ##" << endl;
+ int allmaxID = 0;
for(auto &i : RA) {
ir::Register vReg = (ir::Register)i.first;
ir::RegisterFamily family;
@@ -1437,6 +1529,9 @@ do { \
<< "[ " << setw(8) << this->intervals[(uint)vReg].minID
<< " -> " << setw(8) << this->intervals[(uint)vReg].maxID
<< "]" << setw(8) << "use count: " << this->intervals[(uint)vReg].accessCount << endl;
+ int thisMaxID = this->intervals[(uint)vReg].maxID;
+ if( thisMaxID > 0)
+ allmaxID = thisMaxID > allmaxID ? thisMaxID : allmaxID;
}
if (!spilledRegs.empty())
cout << "## spilled registers: " << spilledRegs.size() << endl;
@@ -1454,6 +1549,52 @@ do { \
<< "]" << setw(8) << "use count: " << this->intervals[(uint)vReg].accessCount << endl;
}
cout << endl;
+#ifndef NDEBUG
+ int *lregsize = new int[allmaxID + 4];
+ for(int i = 0; i < allmaxID + 4; i++)
+ lregsize[i] = 0;
+ for(auto &i : RA) {
+ ir::Register vReg = (ir::Register)i.first;
+ if (this->intervals[vReg].isreused)
+ continue;
+ ir::RegisterFamily family;
+ uint32_t regSize;
+ getRegAttrib(vReg, regSize, &family);
+ int regminID = this->intervals[(uint)vReg].minID;
+ int regmaxID = this->intervals[(uint)vReg].maxID;
+ regminID = (regminID > allmaxID ? 0 : regminID);
+ regmaxID = (regmaxID < 0 ? allmaxID - 1: regmaxID);
+ for(int j = regminID; j <= regmaxID; ++j) {
+ lregsize[j] += regSize;
+ }
+ }
+ for(auto it = spilledRegs.begin(); it != spilledRegs.end(); it++) {
+ ir::Register vReg = it->first;
+ ir::RegisterFamily family;
+ uint32_t regSize;
+ getRegAttrib(vReg, regSize, &family);
+ int regminID = this->intervals[(uint)vReg].minID;
+ int regmaxID = this->intervals[(uint)vReg].maxID;
+ regminID = (regminID > allmaxID ? 0 : regminID);
+ regmaxID = (regmaxID < 0 ? allmaxID - 1: regmaxID);
+ for(int j = regminID; j <= regmaxID; ++j) {
+ lregsize[j] += regSize;
+ }
+ }
+ cout<<"We have " << allmaxID << " lines" << endl;
+
+ int themaxID = 0, themaxSize = 0;
+ for (int i = 0; i < allmaxID; i+=2) {
+ if (themaxSize < lregsize[i]) {
+ themaxID = i;
+ themaxSize = lregsize[i];
+ }
+ //cout << "[" << i << "]" << "with " << lregsize[i] << "B (" << lregsize[i]/32 <<") regs!" << endl;
+ }
+
+ cout << "[" << themaxID << "]" << "with " << themaxSize << "B (" << themaxSize/32 <<") regs!" << endl;
+ delete [] lregsize;
+#endif
}
INLINE GenRegister setGenReg(const GenRegister &src, uint32_t grfOffset) {
diff --git a/backend/src/backend/gen_reg_allocation.hpp b/backend/src/backend/gen_reg_allocation.hpp
index 8d5e797..dbdb056 100644
--- a/backend/src/backend/gen_reg_allocation.hpp
+++ b/backend/src/backend/gen_reg_allocation.hpp
@@ -40,6 +40,13 @@ namespace gbe
int32_t addr;
} SpillRegTag;
+ typedef struct HoleRegTag {
+ uint32_t startID;
+ uint32_t endID;
+ uint32_t regSize;
+ ir::Register reg;
+ }HoleRegTag;
+
typedef map<ir::Register, SpillRegTag> SpilledRegs;
/*! Register allocate (i.e. virtual to physical register mapping) */
--
2.7.4
More information about the Beignet
mailing list