Mesa (master): nv50/ir/ra: enforce max register requirement, and change spill order

Sat Nov 17 07:49:52 UTC 2018

Module: Mesa
Branch: master
Commit: beb66d374724583e56430992a8458188f07802b8
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=beb66d374724583e56430992a8458188f07802b8

Author: Ilia Mirkin <imirkin at alum.mit.edu>
Date:   Sun Nov 11 15:52:33 2018 -0500

nv50/ir/ra: enforce max register requirement, and change spill order

On nv50, certain operations must happen on regs below 64, due to
encoding requirements. First of all, we add infrastructure to enforce
this. Secondly we change the spill order to first spill RIG nodes that
are unconstrained, followed by ones that are.

This makes the gamecube logo shadertoy compile properly. Curiously, if
we adjust the spill order so that we first spill the constrained RIG
nodes instead, the RA also succeeds. However it seems more logical to
first spill the unconstrained ones.

While we're at it, drop the nv50 max register to reserve r127 as the
zero register of last resort (r63 is preferred).

Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
Acked-by: Karol Herbst <kherbst at redhat.com>

---

 src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp | 25 ++++++++++++++--------
 .../nouveau/codegen/nv50_ir_target_nv50.cpp        |  2 +-
 .../drivers/nouveau/codegen/nv50_ir_util.cpp       |  8 +++----
 src/gallium/drivers/nouveau/codegen/nv50_ir_util.h |  7 ++++--
 4 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
index 8639c149b5..1e08f2176c 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
@@ -55,7 +55,7 @@ public:
    void periodicMask(DataFile f, uint32_t lock, uint32_t unlock);
    void intersect(DataFile f, const RegisterSet *);
 
-   bool assign(int32_t& reg, DataFile f, unsigned int size);
+   bool assign(int32_t& reg, DataFile f, unsigned int size, unsigned int maxReg);
    void release(DataFile f, int32_t reg, unsigned int size);
    void occupy(DataFile f, int32_t reg, unsigned int size);
    void occupy(const Value *);
@@ -160,9 +160,9 @@ RegisterSet::print(DataFile f) const
 }
 
 bool
-RegisterSet::assign(int32_t& reg, DataFile f, unsigned int size)
+RegisterSet::assign(int32_t& reg, DataFile f, unsigned int size, unsigned int maxReg)
 {
-   reg = bits[f].findFreeRange(size);
+   reg = bits[f].findFreeRange(size, maxReg);
    if (reg < 0)
       return false;
    fill[f] = MAX2(fill[f], (int32_t)(reg + size - 1));
@@ -747,6 +747,7 @@ private:
    public:
       uint32_t degree;
       uint16_t degreeLimit; // if deg < degLimit, node is trivially colourable
+      uint16_t maxReg;
       uint16_t colors;
 
       DataFile f;
@@ -884,12 +885,12 @@ GCRA::RIG_Node::init(const RegisterSet& regs, LValue *lval)
 
    weight = std::numeric_limits<float>::infinity();
    degree = 0;
-   int size = regs.getFileSize(f);
+   maxReg = regs.getFileSize(f);
    // On nv50, we lose a bit of gpr encoding when there's an embedded
    // immediate.
    if (regs.restrictedGPR16Range && f == FILE_GPR && (lval->reg.size == 2 || isShortRegVal(lval)))
-      size /= 2;
-   degreeLimit = size;
+      maxReg /= 2;
+   degreeLimit = maxReg;
    degreeLimit -= relDegree[1][colors] - 1;
 
    livei.insert(lval->livei);
@@ -949,6 +950,8 @@ GCRA::coalesceValues(Value *dst, Value *src, bool force)
    // add val's definitions to rep and extend the live interval of its RIG node
    rep->defs.insert(rep->defs.end(), val->defs.begin(), val->defs.end());
    nRep->livei.unify(nVal->livei);
+   nRep->degreeLimit = MIN2(nRep->degreeLimit, nVal->degreeLimit);
+   nRep->maxReg = MIN2(nRep->maxReg, nVal->maxReg);
    return true;
 }
 
@@ -1322,13 +1325,17 @@ GCRA::simplify()
       } else
       if (!DLLIST_EMPTY(&hi)) {
          RIG_Node *best = hi.next;
+         unsigned bestMaxReg = best->maxReg;
          float bestScore = best->weight / (float)best->degree;
-         // spill candidate
+         // Spill candidate. First go through the ones with the highest max
+         // register, then the ones with lower. That way the ones with the
+         // lowest requirement will be allocated first, since it's a stack.
          for (RIG_Node *it = best->next; it != &hi; it = it->next) {
             float score = it->weight / (float)it->degree;
-            if (score < bestScore) {
+            if (score < bestScore || it->maxReg > bestMaxReg) {
                best = it;
                bestScore = score;
+               bestMaxReg = it->maxReg;
             }
          }
          if (isinf(bestScore)) {
@@ -1429,7 +1436,7 @@ GCRA::selectRegisters()
       LValue *lval = node->getValue();
       if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC)
          regs.print(node->f);
-      bool ret = regs.assign(node->reg, node->f, node->colors);
+      bool ret = regs.assign(node->reg, node->f, node->colors, node->maxReg);
       if (ret) {
          INFO_DBG(prog->dbgFlags, REG_ALLOC, "assigned reg %i\n", node->reg);
          lval->compMask = node->getCompMask();
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
index 2981497340..ec94590a3f 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
@@ -203,7 +203,7 @@ TargetNV50::getFileSize(DataFile file) const
 {
    switch (file) {
    case FILE_NULL:          return 0;
-   case FILE_GPR:           return 256; // in 16-bit units **
+   case FILE_GPR:           return 254; // in 16-bit units **
    case FILE_PREDICATE:     return 0;
    case FILE_FLAGS:         return 4;
    case FILE_ADDRESS:       return 4;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_util.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_util.cpp
index 1daf778e93..dc4ebd51ac 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_util.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_util.cpp
@@ -311,12 +311,12 @@ void BitSet::setOr(BitSet *pA, BitSet *pB)
    }
 }
 
-int BitSet::findFreeRange(unsigned int count) const
+int BitSet::findFreeRange(unsigned int count, unsigned int max) const
 {
    const uint32_t m = (1 << count) - 1;
-   int pos = size;
+   int pos = max;
    unsigned int i;
-   const unsigned int end = (size + 31) / 32;
+   const unsigned int end = (max + 31) / 32;
 
    if (count == 1) {
       for (i = 0; i < end; ++i) {
@@ -373,7 +373,7 @@ int BitSet::findFreeRange(unsigned int count) const
 
    pos += i * 32;
 
-   return ((pos + count) <= size) ? pos : -1;
+   return ((pos + count) <= max) ? pos : -1;
 }
 
 void BitSet::print() const
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_util.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_util.h
index c619499046..affe04a2dd 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_util.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_util.h
@@ -539,8 +539,11 @@ public:
       return data[i / 32] & (((1 << n) - 1) << (i % 32));
    }
 
-   // Find a range of size (<= 32) clear bits aligned to roundup_pow2(size).
-   int findFreeRange(unsigned int size) const;
+   // Find a range of count (<= 32) clear bits aligned to roundup_pow2(count).
+   int findFreeRange(unsigned int count, unsigned int max) const;
+   inline int findFreeRange(unsigned int count) const {
+      return findFreeRange(count, size);
+   }
 
    BitSet& operator|=(const BitSet&);