[Mesa-dev] [PATCH] nvc0,gm107/ir: add cycle count estimation

Sun Jul 8 14:40:22 UTC 2018

To keep the estimate simple and useful, branches are costed as if both paths
were taken, and basic blocks inside loops are assumed to execute 10 times.

The average latencies assumed for variable-latency instructions are rather
crude: only the IMUL/IMAD value is reasonably accurate. It should still be
better than nothing, though.

Since information is lacking and this may miss some details, the estimates
should probably be taken with caution, at least until we get better
numbers for variable latency instructions.

Estimation can be enabled or disabled through NV50_PROG_CYCLE_ESTIMATE,
which defaults to enabled on debug builds.

Signed-off-by: Rhys Perry <pendingchaos02 at gmail.com>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir.cpp    |   3 +
 src/gallium/drivers/nouveau/codegen/nv50_ir.h      |   6 +
 .../drivers/nouveau/codegen/nv50_ir_driver.h       |   1 +
 .../drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp | 132 ++++++++++++++++++++-
 .../drivers/nouveau/codegen/nv50_ir_graph.cpp      |   7 +-
 .../drivers/nouveau/codegen/nv50_ir_graph.h        |   2 +-
 .../nouveau/codegen/nv50_ir_target_gm107.cpp       |  24 ++++
 .../drivers/nouveau/codegen/nv50_ir_target_gm107.h |   1 +
 src/gallium/drivers/nouveau/nvc0/nvc0_program.c    |   4 +-
 9 files changed, 169 insertions(+), 11 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
index 49425b98b9..a0c6057dd1 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
@@ -1119,6 +1119,7 @@ Program::Program(Type type, Target *arch)
    binSize = 0;
 
    maxGPR = -1;
+   cycleEstimate = 0;
 
    main = new Function(this, "MAIN", ~0);
    calls.insert(&main->call);
@@ -1279,6 +1280,8 @@ nv50_ir_generate_code(struct nv50_ir_prog_info *info)
       goto out;
    }
 
+   info->bin.cycleEstimate = prog->cycleEstimate;
+
 out:
    INFO_DBG(prog->dbgFlags, VERBOSE, "nv50_ir_generate_code: ret = %i\n", ret);
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
index f4f3c70888..79e4c7cccf 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
@@ -1136,6 +1136,11 @@ public:
 
    bool explicitCont; // loop headers: true if loop contains continue stmts
 
+   // used for cycle count estimation on GM107+
+   Instruction *unresolvedBarriers[6];
+   bool unresolvedBarriersAreRead[6];
+   uint32_t cycleEstimate;
+
 private:
    int id;
    DLList df;
@@ -1282,6 +1287,7 @@ public:
    uint32_t tlsSize; // size required for FILE_MEMORY_LOCAL
 
    int maxGPR;
+   uint32_t cycleEstimate;
 
    MemoryPool mem_Instruction;
    MemoryPool mem_CmpInstruction;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
index 7c835ceab8..1c7e7f4b5a 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
@@ -99,6 +99,7 @@ struct nv50_ir_prog_info
       void *fixupData;
       struct nv50_ir_prog_symbol *syms;
       uint16_t numSyms;
+      uint32_t cycleEstimate;
    } bin;
 
    struct nv50_ir_varying sv[PIPE_MAX_SHADER_INPUTS];
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index 26826d6360..c84b9e59d5 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -3479,7 +3479,12 @@ CodeEmitterGM107::getMinEncodingSize(const Instruction *i) const
 class SchedDataCalculatorGM107 : public Pass
 {
 public:
-   SchedDataCalculatorGM107(const TargetGM107 *targ) : targ(targ) {}
+   SchedDataCalculatorGM107(const TargetGM107 *targ);
+
+   inline uint32_t getCycleEstimate() const
+   {
+      return cycleEstimate;
+   }
 
 private:
    struct RegScores
@@ -3573,9 +3578,20 @@ private:
       }
    };
 
+   struct InsnCostInfo {
+      std::vector<Instruction *> waitSrcs;
+      std::vector<bool> waitsAreRead;
+      uint32_t bbCycleEstimate;
+   };
+
    RegScores *score; // for current BB
    std::vector<RegScores> scoreBoards;
 
+   bool estimateCycleCount;
+   int cycleEstimate;
+   // used for cycle count estimation
+   std::vector<InsnCostInfo> insnCostInfo;
+
    const TargetGM107 *targ;
    bool visit(Function *);
    bool visit(BasicBlock *);
@@ -3591,7 +3607,7 @@ private:
    inline void emitReuse(Instruction *, uint8_t);
    inline void emitWrDepBar(Instruction *, uint8_t);
    inline void emitRdDepBar(Instruction *, uint8_t);
-   inline void emitWtDepBar(Instruction *, uint8_t);
+   inline void emitWtDepBar(Instruction *, uint8_t, Instruction *, bool);
 
    inline int getStall(const Instruction *) const;
    inline int getWrDepBar(const Instruction *) const;
@@ -3624,8 +3640,20 @@ private:
 
    bool needRdDepBar(const Instruction *) const;
    bool needWrDepBar(const Instruction *) const;
+
+   void doEstimateCycleCount(BasicBlock *);
 };
 
+SchedDataCalculatorGM107::SchedDataCalculatorGM107(const TargetGM107 *targ)
+   : estimateCycleCount(false), cycleEstimate(0), targ(targ)
+{
+   #ifdef DEBUG
+   estimateCycleCount = true;
+   #endif
+   estimateCycleCount = debug_get_bool_option("NV50_PROG_CYCLE_ESTIMATE",
+                                              estimateCycleCount);
+}
+
 inline void
 SchedDataCalculatorGM107::emitStall(Instruction *insn, uint8_t cnt)
 {
@@ -3658,10 +3686,15 @@ SchedDataCalculatorGM107::emitRdDepBar(Instruction *insn, uint8_t id)
 }
 
 inline void
-SchedDataCalculatorGM107::emitWtDepBar(Instruction *insn, uint8_t id)
+SchedDataCalculatorGM107::emitWtDepBar(Instruction *insn, uint8_t id,
+                                       Instruction *source, bool read)
 {
    assert(id < 6);
    insn->sched |= 1 << (11 + id);
+   if (source && estimateCycleCount) {
+      insnCostInfo[insn->serial].waitSrcs.push_back(source);
+      insnCostInfo[insn->serial].waitsAreRead.push_back(read);
+   }
 }
 
 inline void
@@ -4048,6 +4081,9 @@ SchedDataCalculatorGM107::insertBarriers(BasicBlock *bb)
    BitSet bars(6, 1);
    int bar_id;
 
+   if (estimateCycleCount)
+      memset(bb->unresolvedBarriers, 0, sizeof(bb->unresolvedBarriers));
+
    for (insn = bb->getEntry(); insn != NULL; insn = next) {
       Instruction *usei = NULL, *defi = NULL;
       bool need_wr_bar, need_rd_bar;
@@ -4059,7 +4095,7 @@ SchedDataCalculatorGM107::insertBarriers(BasicBlock *bb)
            it != live_uses.end();) {
          if (insn->serial >= it->usei->serial) {
             int wr = getWrDepBar(it->insn);
-            emitWtDepBar(insn, wr);
+            emitWtDepBar(insn, wr, it->insn, false);
             bars.clr(wr); // free barrier
             it = live_uses.erase(it);
             continue;
@@ -4072,7 +4108,7 @@ SchedDataCalculatorGM107::insertBarriers(BasicBlock *bb)
            it != live_defs.end();) {
          if (insn->serial >= it->defi->serial) {
             int rd = getRdDepBar(it->insn);
-            emitWtDepBar(insn, rd);
+            emitWtDepBar(insn, rd, it->insn, true);
             bars.clr(rd); // free barrier
             it = live_defs.erase(it);
             continue;
@@ -4098,6 +4134,11 @@ SchedDataCalculatorGM107::insertBarriers(BasicBlock *bb)
          emitWrDepBar(insn, bar_id);
          if (usei)
             live_uses.push_back(LiveBarUse(insn, usei));
+         else
+         if (estimateCycleCount) {
+            bb->unresolvedBarriers[bar_id] = insn;
+            bb->unresolvedBarriersAreRead[bar_id] = false;
+         }
       }
 
       if (need_rd_bar) {
@@ -4117,6 +4158,11 @@ SchedDataCalculatorGM107::insertBarriers(BasicBlock *bb)
          emitRdDepBar(insn, bar_id);
          if (defi)
             live_defs.push_back(LiveBarDef(insn, defi));
+         else
+         if (estimateCycleCount) {
+            bb->unresolvedBarriers[bar_id] = insn;
+            bb->unresolvedBarriersAreRead[bar_id] = true;
+         }
       }
    }
 
@@ -4166,6 +4212,17 @@ SchedDataCalculatorGM107::visit(Function *func)
 bool
 SchedDataCalculatorGM107::visit(BasicBlock *bb)
 {
+   if (estimateCycleCount) {
+      // cycle count estimation requires sequential serials
+      int serial = 0;
+      for (Instruction *insn = bb->getFirst(); insn; insn = insn->next)
+         insn->serial = serial++;
+      // rebuild the table: resize() would keep earlier BBs' waitSrcs entries
+      InsnCostInfo blank;
+      blank.bbCycleEstimate = -1; // placeholder until doEstimateCycleCount()
+      insnCostInfo.assign(bb->getInsnCount(), blank);
+   }
+
    Instruction *insn, *next = NULL;
    int cycle = 0;
 
@@ -4205,7 +4262,7 @@ SchedDataCalculatorGM107::visit(BasicBlock *bb)
    Instruction *start = bb->getEntry();
    if (start && bb->cfg.incidentCount() > 0) {
       for (int b = 0; b < 6; b++)
-         emitWtDepBar(start, b);
+         emitWtDepBar(start, b, NULL, false);
    }
 
    for (insn = bb->getEntry(); insn && insn->next; insn = insn->next) {
@@ -4271,10 +4328,70 @@ SchedDataCalculatorGM107::visit(BasicBlock *bb)
    setDelay(insn, bbDelay, next);
    cycle += getStall(insn);
 
+   if (estimateCycleCount)
+      doEstimateCycleCount(bb);
+
    score->rebase(cycle); // common base for initializing out blocks' scores
    return true;
 }
 
+void
+SchedDataCalculatorGM107::doEstimateCycleCount(BasicBlock *bb)
+{
+   // the first instruction of a basic block is set to wait for all barriers
+   // so we have to try to figure out what barriers it could be waiting for
+   // and where they were set
+   Instruction *start = bb->getEntry();
+   if (start) {
+      for (Graph::EdgeIterator ei = bb->cfg.incident();
+           !ei.end(); ei.next()) {
+         if (ei.getType() == Graph::Edge::BACK)
+            continue;
+         BasicBlock *in = BasicBlock::get(ei.getNode());
+
+         for (int b = 0; b < 6; b++) {
+            if (!in->unresolvedBarriers[b])
+               continue;
+            InsnCostInfo& info = insnCostInfo[start->serial];
+            info.waitSrcs.push_back(in->unresolvedBarriers[b]);
+            info.waitsAreRead.push_back(in->unresolvedBarriersAreRead[b]);
+         }
+      }
+   }
+
+   bb->cycleEstimate = 0;
+   for (Instruction *insn = bb->getEntry(); insn; insn = insn->next) {
+      InsnCostInfo& costInfo = insnCostInfo[insn->serial];
+      costInfo.bbCycleEstimate = bb->cycleEstimate;
+
+      int cost = getStall(insn);
+      for (size_t i = 0; i < costInfo.waitSrcs.size(); i++) {
+         Instruction *waitFor = costInfo.waitSrcs[i];
+         InsnCostInfo& waitForInfo = insnCostInfo[waitFor->serial];
+
+         int dist = bb->cycleEstimate;
+         if (waitFor->bb == bb)
+            dist -= waitForInfo.bbCycleEstimate;
+         else
+            dist += waitFor->bb->cycleEstimate - waitForInfo.bbCycleEstimate;
+
+         int latency = targ->getAvgLatency(waitFor, costInfo.waitsAreRead[i]);
+         cost = MAX2(cost, MAX2(latency - dist, 0));
+      }
+      bb->cycleEstimate += cost;
+   }
+
+   for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
+      if (bb->cfg.reachableBy(ei.getNode(), NULL, false)) {
+         // assume loops execute 10 times
+         bb->cycleEstimate *= 10;
+         break;
+      }
+   }
+
+   cycleEstimate += bb->cycleEstimate;
+}
+
 /*******************************************************************************
  * main
  ******************************************************************************/
@@ -4285,6 +4402,7 @@ CodeEmitterGM107::prepareEmission(Function *func)
    SchedDataCalculatorGM107 sched(targGM107);
    CodeEmitter::prepareEmission(func);
    sched.run(func, true, true);
+   func->getProgram()->cycleEstimate += sched.getCycleEstimate();
 }
 
 static inline uint32_t sizeToBundlesGM107(uint32_t size)
@@ -4295,6 +4413,8 @@ static inline uint32_t sizeToBundlesGM107(uint32_t size)
 void
 CodeEmitterGM107::prepareEmission(Program *prog)
 {
+   prog->cycleEstimate = 0;
+
    for (ArrayList::Iterator fi = prog->allFuncs.iterator();
         !fi.end(); fi.next()) {
       Function *func = reinterpret_cast<Function *>(fi.get());
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp
index b1076cf412..94f9bf35c5 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp
@@ -166,7 +166,8 @@ Graph::Edge::Edge(Node *org, Node *tgt, Type kind)
 }
 
 bool
-Graph::Node::reachableBy(const Node *node, const Node *term) const
+Graph::Node::reachableBy(const Node *node, const Node *term,
+                         bool ignoreBack) const
 {
    std::stack<const Node *> stack;
    const Node *pos = NULL;
@@ -184,7 +185,9 @@ Graph::Node::reachableBy(const Node *node, const Node *term) const
          continue;
 
       for (EdgeIterator ei = pos->outgoing(); !ei.end(); ei.next()) {
-         if (ei.getType() == Edge::BACK || ei.getType() == Edge::DUMMY)
+         if (ignoreBack && ei.getType() == Edge::BACK)
+            continue;
+         if (ei.getType() == Edge::DUMMY)
             continue;
          if (ei.getNode()->visit(seq))
             stack.push(ei.getNode());
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.h
index 115f20e5e9..ca9efabe5e 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.h
@@ -117,7 +117,7 @@ public:
 
       inline Node *parent() const; // returns NULL if count(incident edges) != 1
 
-      bool reachableBy(const Node *node, const Node *term) const;
+      bool reachableBy(const Node *node, const Node *term, bool ignoreBack = true) const;
 
       inline bool visit(int);
       inline int  getSequence() const;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
index 04cbd402a1..ce45b9b954 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
@@ -261,6 +261,30 @@ TargetGM107::getLatency(const Instruction *insn) const
    return 15;
 }
 
+int
+TargetGM107::getAvgLatency(const Instruction *insn, bool read) const
+{
+   if (!isBarrierRequired(insn))
+      return getLatency(insn);
+
+   // assume that the read latency is high but lower than the write latency
+   int unknown = read ? getAvgLatency(insn, false) / 2 : 50;
+
+   // TODO: use better values
+   switch (insn->op) {
+   case OP_MUL:
+   case OP_MAD:
+      assert(!isFloatType(insn->dType));
+      if (read)
+         return unknown;
+      // Table 4.1 from "Dissecting the NVIDIA Volta GPU Architecture via
+      // Microbenchmarking" (https://arxiv.org/pdf/1804.06826.pdf)
+      return 86;
+   default:
+      return unknown;
+   }
+}
+
 // Return the operand read latency which is the number of stall counts before
 // an instruction can read its sources. For memory operations like ATOM, LOAD
 // and STORE, the memory access has to be indirect.
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.h
index dd4aa6a54d..132bb84f0b 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.h
@@ -22,6 +22,7 @@ public:
 
    virtual bool canDualIssue(const Instruction *, const Instruction *) const;
    virtual int getLatency(const Instruction *) const;
+   virtual int getAvgLatency(const Instruction *, bool) const;
    virtual int getReadLatency(const Instruction *) const;
 };
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index 57d98753f4..a72b2cfe27 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -699,10 +699,10 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
                                                 &prog->pipe.stream_output);
 
    pipe_debug_message(debug, SHADER_INFO,
-                      "type: %d, local: %d, shared: %d, gpr: %d, inst: %d, bytes: %d",
+                      "type: %d, local: %d, shared: %d, gpr: %d, inst: %d, bytes: %d, cycles: %d",
                       prog->type, info->bin.tlsSpace, info->bin.smemSize,
                       prog->num_gprs, info->bin.instructions,
-                      info->bin.codeSize);
+                      info->bin.codeSize, info->bin.cycleEstimate);
 
 #ifdef DEBUG
    if (debug_get_option("NV50_PROG_CHIPSET", NULL) && info->dbgFlags)
-- 
2.14.4