[Beignet] [PATCH 3/5] GBE: optimize post reg allocation instruction scheduling.

Zhigang Gong zhigang.gong at intel.com
Wed May 21 18:41:50 PDT 2014


To make the post scheduling working better, I relax the frequency of
the calling of expireGRF when doing register allocation. Thus we can
reduce the physical register conflict and doing the post scheduling.

Another optimization is to insert a pre retire for the instruction
to release those WRITE_AFTER_READ dependency. Write after read will
not bring any hazard, so we can release those register as soon as
the instruction scheduled.

The pre register allocation scheduling is quite different than post
schedlulig, for now, just disable it.

The whole patch could get about 10% perfromance gain with luxmark.

Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
 .../src/backend/gen_insn_gen7_schedule_info.hxx    |  48 ++--
 backend/src/backend/gen_insn_scheduling.cpp        | 242 ++++++++++++++-------
 backend/src/backend/gen_insn_selection.cpp         |  10 +-
 backend/src/backend/gen_reg_allocation.cpp         |  12 +-
 4 files changed, 202 insertions(+), 110 deletions(-)

diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index 9eb04de..8535b4a 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -1,20 +1,20 @@
 //                 Family     Latency     SIMD16     SIMD8
 DECL_GEN7_SCHEDULE(Label,           0,         0,        0)
 DECL_GEN7_SCHEDULE(Unary,           20,        4,        2)
-DECL_GEN7_SCHEDULE(UnaryWithTemp,   20,        4,        2)
+DECL_GEN7_SCHEDULE(UnaryWithTemp,   20,        40,      20)
 DECL_GEN7_SCHEDULE(Binary,          20,        4,        2)
-DECL_GEN7_SCHEDULE(BinaryWithTemp,  20,        4,        2)
+DECL_GEN7_SCHEDULE(BinaryWithTemp,  20,        40,      20)
 DECL_GEN7_SCHEDULE(Ternary,         20,        4,        2)
-DECL_GEN7_SCHEDULE(I64Shift,        20,        4,        2)
-DECL_GEN7_SCHEDULE(I64HADD,         20,        4,        2)
-DECL_GEN7_SCHEDULE(I64RHADD,        20,        4,        2)
-DECL_GEN7_SCHEDULE(I64ToFloat,      20,        4,        2)
-DECL_GEN7_SCHEDULE(FloatToI64,      20,        4,        2)
-DECL_GEN7_SCHEDULE(I64MULHI,        20,        4,        2)
-DECL_GEN7_SCHEDULE(I64MADSAT,       20,        4,        2)
+DECL_GEN7_SCHEDULE(I64Shift,        20,        40,      20)
+DECL_GEN7_SCHEDULE(I64HADD,         20,        40,      20)
+DECL_GEN7_SCHEDULE(I64RHADD,        20,        40,      20)
+DECL_GEN7_SCHEDULE(I64ToFloat,      20,        40,      20)
+DECL_GEN7_SCHEDULE(FloatToI64,      20,        40,      20)
+DECL_GEN7_SCHEDULE(I64MULHI,        20,        40,      20)
+DECL_GEN7_SCHEDULE(I64MADSAT,       20,        40,      20)
 DECL_GEN7_SCHEDULE(Compare,         20,        4,        2)
-DECL_GEN7_SCHEDULE(I64Compare,      20,        4,        2)
-DECL_GEN7_SCHEDULE(I64DIVREM,       20,        4,        2)
+DECL_GEN7_SCHEDULE(I64Compare,      20,        80,      20)
+DECL_GEN7_SCHEDULE(I64DIVREM,       20,        80,      20)
 DECL_GEN7_SCHEDULE(Jump,            14,        1,        1)
 DECL_GEN7_SCHEDULE(IndirectMove,    20,        2,        2)
 DECL_GEN7_SCHEDULE(Eot,             20,        1,        1)
@@ -25,18 +25,18 @@ DECL_GEN7_SCHEDULE(Barrier,         80,        1,        1)
 DECL_GEN7_SCHEDULE(Fence,           80,        1,        1)
 DECL_GEN7_SCHEDULE(Read64,          80,        1,        1)
 DECL_GEN7_SCHEDULE(Write64,         80,        1,        1)
-DECL_GEN7_SCHEDULE(UntypedRead,     80,        1,        1)
-DECL_GEN7_SCHEDULE(UntypedWrite,    80,        1,        1)
-DECL_GEN7_SCHEDULE(ByteGather,      80,        1,        1)
-DECL_GEN7_SCHEDULE(ByteScatter,     80,        1,        1)
-DECL_GEN7_SCHEDULE(DWordGather,     80,        1,        1)
-DECL_GEN7_SCHEDULE(PackByte,        20,        1,        1)
-DECL_GEN7_SCHEDULE(UnpackByte,      20,        1,        1)
-DECL_GEN7_SCHEDULE(Sample,          80,        1,        1)
+DECL_GEN7_SCHEDULE(UntypedRead,     160,       1,        1)
+DECL_GEN7_SCHEDULE(UntypedWrite,    160,       1,        1)
+DECL_GEN7_SCHEDULE(ByteGather,      160,       1,        1)
+DECL_GEN7_SCHEDULE(ByteScatter,     160,       1,        1)
+DECL_GEN7_SCHEDULE(DWordGather,     160,       1,        1)
+DECL_GEN7_SCHEDULE(PackByte,        40,        1,        1)
+DECL_GEN7_SCHEDULE(UnpackByte,      40,        1,        1)
+DECL_GEN7_SCHEDULE(Sample,          160,       1,        1)
 DECL_GEN7_SCHEDULE(TypedWrite,      80,        1,        1)
-DECL_GEN7_SCHEDULE(SpillReg,        80,        1,        1)
-DECL_GEN7_SCHEDULE(UnSpillReg,      80,        1,        1)
+DECL_GEN7_SCHEDULE(SpillReg,        20,        1,        1)
+DECL_GEN7_SCHEDULE(UnSpillReg,      160,       1,        1)
 DECL_GEN7_SCHEDULE(Atomic,          80,        1,        1)
-DECL_GEN7_SCHEDULE(I64MUL,          20,        4,        2)
-DECL_GEN7_SCHEDULE(I64SATADD,       20,        4,        2)
-DECL_GEN7_SCHEDULE(I64SATSUB,       20,        4,        2)
+DECL_GEN7_SCHEDULE(I64MUL,          20,        40,      20)
+DECL_GEN7_SCHEDULE(I64SATADD,       20,        40,      20)
+DECL_GEN7_SCHEDULE(I64SATSUB,       20,        40,      20)
diff --git a/backend/src/backend/gen_insn_scheduling.cpp b/backend/src/backend/gen_insn_scheduling.cpp
index c3c4728..eada722 100644
--- a/backend/src/backend/gen_insn_scheduling.cpp
+++ b/backend/src/backend/gen_insn_scheduling.cpp
@@ -95,18 +95,26 @@ namespace gbe
   // Node for the schedule DAG
   struct ScheduleDAGNode;
 
+  typedef enum {
+    WRITE_AFTER_WRITE,
+    WRITE_AFTER_READ,
+    READ_AFTER_WRITE,
+    READ_AFTER_WRITE_MEMORY
+  } DepMode;
+
   /*! We need to chain together the node we point */
   struct ScheduleListNode : public intrusive_list_node
   {
-    INLINE ScheduleListNode(ScheduleDAGNode *node) : node(node) {}
+    INLINE ScheduleListNode(ScheduleDAGNode *node, DepMode m = READ_AFTER_WRITE) : node(node), depMode(m) {}
     ScheduleDAGNode *node;
+    DepMode depMode;
   };
 
   /*! Node of the DAG */
   struct ScheduleDAGNode
   {
     INLINE ScheduleDAGNode(SelectionInstruction &insn) :
-      insn(insn), refNum(0), retiredCycle(0) {}
+      insn(insn), refNum(0), retiredCycle(0), preRetired(false), readDistance(0x7fffffff) {}
     bool dependsOn(ScheduleDAGNode *node) const {
       GBE_ASSERT(node != NULL);
       for (auto child : node->children)
@@ -122,6 +130,8 @@ namespace gbe
     uint32_t refNum;
     /*! Cycle when the instruction is retired */
     uint32_t retiredCycle;
+    bool preRetired;
+    uint32_t readDistance;
   };
 
   /*! To track loads and stores */
@@ -144,17 +154,17 @@ namespace gbe
   {
     DependencyTracker(const Selection &selection, SelectionScheduler &scheduler);
     /*! Reset it before scheduling a new block */
-    void clear(void);
+    void clear(bool fullClear = false);
     /*! Get an index in the node array for the given register */
     uint32_t getIndex(GenRegister reg) const;
     /*! Get an index in the node array for the given memory system */
     uint32_t getIndex(uint32_t bti) const;
     /*! Add a new dependency "node0 depends on node1" */
-    void addDependency(ScheduleDAGNode *node0, ScheduleDAGNode *node1);
+    void addDependency(ScheduleDAGNode *node0, ScheduleDAGNode *node1, DepMode m);
     /*! Add a new dependency "node0 depends on node located at index" */
-    void addDependency(ScheduleDAGNode *node0, uint32_t index);
+    void addDependency(ScheduleDAGNode *node0, uint32_t index, DepMode m);
     /*! Add a new dependency "node located at index depends on node0" */
-    void addDependency(uint32_t index, ScheduleDAGNode *node0);
+    void addDependency(uint32_t index, ScheduleDAGNode *node0, DepMode m);
     /*! No dependency for null registers and immediate */
     INLINE bool ignoreDependency(GenRegister reg) const {
       if (reg.file == GEN_IMMEDIATE_VALUE)
@@ -168,23 +178,9 @@ namespace gbe
     /*! Owns the tracker */
     SelectionScheduler &scheduler;
     /*! Add a new dependency "node0 depends on node set for register reg" */
-    INLINE  void addDependency(ScheduleDAGNode *node0, GenRegister reg) {
-      if (this->ignoreDependency(reg) == false) {
-        const uint32_t index = this->getIndex(reg);
-        this->addDependency(node0, index);
-        if (reg.isdf() || reg.isint64())
-          this->addDependency(node0, index + 1);
-      }
-    }
+    void addDependency(ScheduleDAGNode *node0, GenRegister reg, DepMode m);
     /*! Add a new dependency "node set for register reg depends on node0" */
-    INLINE void addDependency(GenRegister reg, ScheduleDAGNode *node0) {
-      if (this->ignoreDependency(reg) == false) {
-        const uint32_t index = this->getIndex(reg);
-        this->addDependency(index, node0);
-        if (reg.isdf() || reg.isint64())
-          this->addDependency(index + 1, node0);
-      }
-    }
+    void addDependency(GenRegister reg, ScheduleDAGNode *node0, DepMode m);
     /*! Make the node located at insnID a barrier */
     void makeBarrier(int32_t insnID, int32_t insnNum);
     /*! Update all the writes (memory, predicates, registers) */
@@ -195,6 +191,8 @@ namespace gbe
     static const uint32_t MAX_ACC_REGISTER = 1u;
     /*! Stores the last node that wrote to a register / memory ... */
     vector<ScheduleDAGNode*> nodes;
+    /*! store nodes each node depends on */
+    map<ScheduleDAGNode *, vector<ScheduleDAGNode*>> deps;
     /*! Stores the nodes per instruction */
     vector<ScheduleDAGNode*> insnNodes;
     /*! Number of virtual register in the selection */
@@ -210,8 +208,11 @@ namespace gbe
     void clearLists(void);
     /*! Return the number of instructions to schedule in the DAG */
     int32_t buildDAG(SelectionBlock &bb);
-    /*! Schedule the DAG */
-    void scheduleDAG(SelectionBlock &bb, int32_t insnNum);
+    /*! traverse read node and update read distance for all the child. */
+    void traverseReadNode(ScheduleDAGNode *node, uint32_t degree = 0);
+    /*! Schedule the DAG, pre register allocation and post register allocation. */
+    void preScheduleDAG(SelectionBlock &bb, int32_t insnNum);
+    void postScheduleDAG(SelectionBlock &bb, int32_t insnNum);
     /*! To limit register pressure or limit insn latency problems */
     SchedulePolicy policy;
     /*! Make ScheduleListNode allocation faster */
@@ -245,22 +246,49 @@ namespace gbe
     insnNodes.resize(selection.getLargestBlockSize());
   }
 
-  void DependencyTracker::clear(void) { for (auto &x : nodes) x = NULL; }
+  void DependencyTracker::clear(bool fullClear) { for (auto &x : nodes) x = NULL; if (fullClear) deps.clear(); }
+  void DependencyTracker::addDependency(ScheduleDAGNode *node0, GenRegister reg, DepMode m) {
+    if (this->ignoreDependency(reg) == false) {
+      const uint32_t index = this->getIndex(reg);
+      this->addDependency(node0, index, m);
+      if (scheduler.policy == POST_ALLOC && (reg.isdf() || reg.isint64()))
+        this->addDependency(node0, index + 1, m);
+    }
+  }
+
+  void DependencyTracker::addDependency(GenRegister reg, ScheduleDAGNode *node0, DepMode m) {
+    if (this->ignoreDependency(reg) == false) {
+      const uint32_t index = this->getIndex(reg);
+      this->addDependency(index, node0, m);
+      if (scheduler.policy == POST_ALLOC && (reg.isdf() || reg.isint64()))
+        this->addDependency(index + 1, node0, m);
+    }
+  }
 
-  void DependencyTracker::addDependency(ScheduleDAGNode *node0, ScheduleDAGNode *node1) {
+  void DependencyTracker::addDependency(ScheduleDAGNode *node0, ScheduleDAGNode *node1, DepMode depMode) {
     if (node0 != NULL && node1 != NULL && node0 != node1 && node0->dependsOn(node1) == false) {
-      ScheduleListNode *dep = scheduler.newScheduleListNode(node0);
+      if (node1->insn.isRead())
+        depMode = depMode == READ_AFTER_WRITE ? READ_AFTER_WRITE_MEMORY : depMode;
+      ScheduleListNode *dep = scheduler.newScheduleListNode(node0, depMode);
       node0->refNum++;
       node1->children.push_back(dep);
+      auto it = deps.find(node0);
+      if (it != deps.end()) {
+        it->second.push_back(node1);
+      } else {
+        vector<ScheduleDAGNode*> vn;
+        vn.push_back(node1);
+        deps.insert(std::make_pair(node0, vn));
+      }
     }
   }
 
-  void DependencyTracker::addDependency(ScheduleDAGNode *node, uint32_t index) {
-    this->addDependency(node, this->nodes[index]);
+  void DependencyTracker::addDependency(ScheduleDAGNode *node, uint32_t index, DepMode m) {
+    this->addDependency(node, this->nodes[index], m);
   }
 
-  void DependencyTracker::addDependency(uint32_t index, ScheduleDAGNode *node) {
-    this->addDependency(this->nodes[index], node);
+  void DependencyTracker::addDependency(uint32_t index, ScheduleDAGNode *node, DepMode m) {
+    this->addDependency(this->nodes[index], node, m);
   }
 
   void DependencyTracker::makeBarrier(int32_t barrierID, int32_t insnNum) {
@@ -268,11 +296,11 @@ namespace gbe
 
     // The barrier depends on all nodes before it
     for (int32_t insnID = 0; insnID < barrierID; ++insnID)
-      this->addDependency(barrier, this->insnNodes[insnID]);
+      this->addDependency(barrier, this->insnNodes[insnID], WRITE_AFTER_WRITE);
 
     // All nodes after barriers depend on the barrier
     for (int32_t insnID = barrierID + 1; insnID < insnNum; ++insnID)
-      this->addDependency(this->insnNodes[insnID], barrier);
+      this->addDependency(this->insnNodes[insnID], barrier, WRITE_AFTER_WRITE);
   }
 
   static GenRegister getFlag(const SelectionInstruction &insn) {
@@ -332,7 +360,7 @@ namespace gbe
       if (this->ignoreDependency(dst) == false) {
         const uint32_t index = this->getIndex(dst);
         this->nodes[index] = node;
-        if (dst.isdf() || dst.isint64())
+        if (scheduler.policy == POST_ALLOC && (dst.isdf() || dst.isint64()))
           this->nodes[index + 1] = node;
       }
     }
@@ -413,10 +441,27 @@ namespace gbe
     this->active.fast_clear();
   }
 
+  void SelectionScheduler::traverseReadNode(ScheduleDAGNode *node, uint32_t degree) {
+    GBE_ASSERT(degree != 0 || node->insn.isRead());
+    if (node->readDistance != 0x7FFFFFFF)
+      return;
+    node->readDistance = degree;
+    if (degree > 5)
+      return;
+    //printf("node id %d op %d degree %d \n", node->insn.ID, node->insn.opcode, degree);
+    auto it = tracker.deps.find(node);
+    if (it != tracker.deps.end()) {
+      for (auto &depNode : it->second) {
+        if (depNode && !depNode->insn.isRead())
+          traverseReadNode(depNode, degree + 1);
+      }
+    }
+  }
+
   int32_t SelectionScheduler::buildDAG(SelectionBlock &bb) {
     nodePool.rewind();
     listPool.rewind();
-    tracker.clear();
+    tracker.clear(true);
     this->clearLists();
 
     // Track write-after-write and read-after-write dependencies
@@ -428,21 +473,21 @@ namespace gbe
 
       // read-after-write in registers
       for (uint32_t srcID = 0; srcID < insn.srcNum; ++srcID)
-        tracker.addDependency(node, insn.src(srcID));
+        tracker.addDependency(node, insn.src(srcID), READ_AFTER_WRITE);
 
       // read-after-write for predicate
       if (insn.state.predicate != GEN_PREDICATE_NONE)
-        tracker.addDependency(node, getFlag(insn));
+        tracker.addDependency(node, getFlag(insn), READ_AFTER_WRITE);
 
       // read-after-write in memory
       if (insn.isRead()) {
         const uint32_t index = tracker.getIndex(insn.extra.function);
-        tracker.addDependency(node, index);
+        tracker.addDependency(node, index, READ_AFTER_WRITE);
       }
       //read-after-write of scratch memory
       if (insn.opcode == SEL_OP_UNSPILL_REG) {
         const uint32_t index = tracker.getIndex(0xff);
-        tracker.addDependency(node, index);
+        tracker.addDependency(node, index, READ_AFTER_WRITE);
       }
 
       // Consider barriers and wait are reading memory (local and global)
@@ -451,42 +496,32 @@ namespace gbe
         insn.opcode == SEL_OP_WAIT) {
         const uint32_t local = tracker.getIndex(0xfe);
         const uint32_t global = tracker.getIndex(0x00);
-        tracker.addDependency(node, local);
-        tracker.addDependency(node, global);
+        tracker.addDependency(node, local, READ_AFTER_WRITE);
+        tracker.addDependency(node, global, READ_AFTER_WRITE);
       }
 
       // write-after-write in registers
       for (uint32_t dstID = 0; dstID < insn.dstNum; ++dstID)
-        tracker.addDependency(node, insn.dst(dstID));
+        tracker.addDependency(node, insn.dst(dstID), WRITE_AFTER_WRITE);
 
       // write-after-write for predicate
       if (insn.opcode == SEL_OP_CMP || insn.opcode == SEL_OP_I64CMP || insn.state.modFlag)
-        tracker.addDependency(node, getFlag(insn));
+        tracker.addDependency(node, getFlag(insn), WRITE_AFTER_WRITE);
 
       // write-after-write for accumulators
       if (insn.state.accWrEnable)
-        tracker.addDependency(node, GenRegister::acc());
+        tracker.addDependency(node, GenRegister::acc(), WRITE_AFTER_WRITE);
 
       // write-after-write in memory
       if (insn.isWrite()) {
         const uint32_t index = tracker.getIndex(insn.extra.function);
-        tracker.addDependency(node, index);
+        tracker.addDependency(node, index, WRITE_AFTER_WRITE);
       }
 
       // write-after-write in scratch memory
       if (insn.opcode == SEL_OP_SPILL_REG) {
         const uint32_t index = tracker.getIndex(0xff);
-        tracker.addDependency(node, index);
-      }
-
-      // Consider barriers and wait are writing memory (local and global)
-    if (insn.opcode == SEL_OP_BARRIER ||
-        insn.opcode == SEL_OP_FENCE ||
-        insn.opcode == SEL_OP_WAIT) {
-        const uint32_t local = tracker.getIndex(0xfe);
-        const uint32_t global = tracker.getIndex(0x00);
-        tracker.addDependency(node, local);
-        tracker.addDependency(node, global);
+        tracker.addDependency(node, index, WRITE_AFTER_WRITE);
       }
 
       // Track all writes done by the instruction
@@ -501,16 +536,16 @@ namespace gbe
 
       // write-after-read in registers
       for (uint32_t srcID = 0; srcID < insn.srcNum; ++srcID)
-        tracker.addDependency(insn.src(srcID), node);
+        tracker.addDependency(insn.src(srcID), node, WRITE_AFTER_READ);
 
       // write-after-read for predicate
       if (insn.state.predicate != GEN_PREDICATE_NONE)
-        tracker.addDependency(getFlag(insn), node);
+        tracker.addDependency(getFlag(insn), node, WRITE_AFTER_READ);
 
       // write-after-read in memory
       if (insn.isRead()) {
         const uint32_t index = tracker.getIndex(insn.extra.function);
-        tracker.addDependency(index, node);
+        tracker.addDependency(index, node, WRITE_AFTER_READ);
       }
 
       // Consider barriers and wait are reading memory (local and global)
@@ -519,14 +554,22 @@ namespace gbe
           insn.opcode == SEL_OP_WAIT) {
         const uint32_t local = tracker.getIndex(0xfe);
         const uint32_t global = tracker.getIndex(0x00);
-        tracker.addDependency(local, node);
-        tracker.addDependency(global, node);
+        tracker.addDependency(local, node, WRITE_AFTER_READ);
+        tracker.addDependency(global, node, WRITE_AFTER_READ);
       }
 
       // Track all writes done by the instruction
       tracker.updateWrites(node);
     }
 
+    // Update distance to read for each read node.
+    for (int32_t insnID = 0; insnID < insnNum; ++insnID) {
+      ScheduleDAGNode *node = tracker.insnNodes[insnID];
+      const SelectionInstruction &insn = node->insn;
+      if (insn.isRead())
+        traverseReadNode(node);
+    }
+
     // Make labels and branches non-schedulable (i.e. they act as barriers)
     for (int32_t insnID = 0; insnID < insnNum; ++insnID) {
       ScheduleDAGNode *node = tracker.insnNodes[insnID];
@@ -546,56 +589,93 @@ namespace gbe
     return insnNum;
   }
 
-  void SelectionScheduler::scheduleDAG(SelectionBlock &bb, int32_t insnNum) {
+  void SelectionScheduler::preScheduleDAG(SelectionBlock &bb, int32_t insnNum) {
+    printf("Not implemented yet. \n");
+  }
+
+  void SelectionScheduler::postScheduleDAG(SelectionBlock &bb, int32_t insnNum) {
     uint32_t cycle = 0;
     const bool isSIMD8 = this->ctx.getSimdWidth() == 8;
+    vector <ScheduleDAGNode *> scheduledNodes;
     while (insnNum) {
 
       // Retire all the instructions that finished
+      //printf("cycle = %d \n", cycle);
       for (auto toRetireIt = active.begin(); toRetireIt != active.end();) {
         ScheduleDAGNode *toRetireNode = toRetireIt.node()->node;
+        // Firstly, put all write after read children to ready.
+        if (toRetireNode->preRetired == false) {
+          auto &children = toRetireNode->children;
+          toRetireNode->preRetired = true;
+          //printf("id %d pre retired \n", toRetireNode->insn.ID);
+          for (auto it = children.begin(); it != children.end();) {
+            ScheduleListNode *listNode = it.node();
+            if (listNode->depMode != WRITE_AFTER_READ) {
+              ++it;
+              continue;
+            }
+            if (--it->node->refNum == 0) {
+              //printf("pre push id %d to ready list. \n", listNode->node->insn.ID);
+              it = children.erase(it);
+              this->ready.push_back(listNode);
+            } else
+              ++it;
+          }
+          if (children.size() == 0) {
+            toRetireIt = this->active.erase(toRetireIt);
+            continue;
+          }
+        }
         // Instruction is now complete
         if (toRetireNode->retiredCycle <= cycle) {
           toRetireIt = this->active.erase(toRetireIt);
+          //printf("id %d retired \n", toRetireNode->insn.ID);
           // Traverse all children and make them ready if no more dependency
           auto &children = toRetireNode->children;
           for (auto it = children.begin(); it != children.end();) {
+            ScheduleListNode *listNode = it.node();
+            if (listNode->depMode == WRITE_AFTER_READ) {
+              ++it;
+              continue;
+            }
             if (--it->node->refNum == 0) {
-              ScheduleListNode *listNode = it.node();
               it = children.erase(it);
-              this->ready.push_back(listNode);
+              if (listNode->depMode != WRITE_AFTER_READ)
+                this->ready.push_back(listNode);
+              //printf("push id %d to ready list. \n", listNode->node->insn.ID);
             } else
               ++it;
           }
-        }
-        // Get the next one
-        else
+        } else
           ++toRetireIt;
       }
 
       // Try to schedule something from the ready list
       intrusive_list<ScheduleListNode>::iterator toSchedule;
-      if (policy == POST_ALLOC) // FIFO scheduling
-        toSchedule = this->ready.begin();
-      else                      // LIFO scheduling
-        toSchedule = this->ready.rbegin();
-        // toSchedule = this->ready.begin();
+      toSchedule = this->ready.begin();
+      float minCost = 1000;
+      for(auto it = this->ready.begin(); it != this->ready.end(); ++it) {
+        float cost = (it->depMode != WRITE_AFTER_READ) * 10.0
+                     - 10.0 / (it->node->readDistance == 0 ? 0.1 : it->node->readDistance);
+        if (cost < minCost) {
+          toSchedule = it;
+          minCost = cost;
+        }
+      }
 
       if (toSchedule != this->ready.end()) {
+        //printf("get id %d  op %d to schedule \n", toSchedule->node->insn.ID, toSchedule->node->insn.opcode);
         // The instruction is instantaneously issued to simulate zero cycle
         // scheduling
-        if (policy == POST_ALLOC)
-          cycle += getThroughputGen7(toSchedule->node->insn, isSIMD8);
+        cycle += getThroughputGen7(toSchedule->node->insn, isSIMD8);
 
         this->ready.erase(toSchedule);
         this->active.push_back(toSchedule.node());
         // When we schedule before allocation, instruction is instantaneously
         // ready. This allows to have a real LIFO strategy
-        if (policy == POST_ALLOC)
-          toSchedule->node->retiredCycle = cycle + getLatencyGen7(toSchedule->node->insn);
-        else
-          toSchedule->node->retiredCycle = cycle;
+        toSchedule->node->retiredCycle = cycle + getLatencyGen7(toSchedule->node->insn);
         bb.append(&toSchedule->node->insn);
+        scheduledNodes.push_back(toSchedule->node);
         insnNum--;
       } else
         cycle++;
@@ -611,7 +691,7 @@ namespace gbe
       for (auto &bb : *selection.blockList) {
         const int32_t insnNum = scheduler.buildDAG(bb);
         bb.insnList.clear();
-        scheduler.scheduleDAG(bb, insnNum);
+        scheduler.postScheduleDAG(bb, insnNum);
       }
     }
   }
@@ -619,10 +699,12 @@ namespace gbe
   void schedulePreRegAllocation(GenContext &ctx, Selection &selection) {
     if (OCL_PRE_ALLOC_INSN_SCHEDULE) {
       SelectionScheduler scheduler(ctx, selection, PRE_ALLOC);
+      // FIXME, need to implement proper pre reg allocation scheduling algorithm.
+      return;
       for (auto &bb : *selection.blockList) {
         const int32_t insnNum = scheduler.buildDAG(bb);
         bb.insnList.clear();
-        scheduler.scheduleDAG(bb, insnNum);
+        scheduler.preScheduleDAG(bb, insnNum);
       }
     }
   }
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index e99dd4c..a253c07 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -164,16 +164,18 @@ namespace gbe
 
   bool SelectionInstruction::isRead(void) const {
     return this->opcode == SEL_OP_UNTYPED_READ ||
-           this->opcode == SEL_OP_READ64 ||
+           this->opcode == SEL_OP_READ64       ||
            this->opcode == SEL_OP_ATOMIC       ||
-           this->opcode == SEL_OP_BYTE_GATHER;
+           this->opcode == SEL_OP_BYTE_GATHER  ||
+           this->opcode == SEL_OP_SAMPLE;
   }
 
   bool SelectionInstruction::isWrite(void) const {
     return this->opcode == SEL_OP_UNTYPED_WRITE ||
-           this->opcode == SEL_OP_WRITE64 ||
+           this->opcode == SEL_OP_WRITE64       ||
            this->opcode == SEL_OP_ATOMIC        ||
-           this->opcode == SEL_OP_BYTE_SCATTER;
+           this->opcode == SEL_OP_BYTE_SCATTER  ||
+           this->opcode == SEL_OP_TYPED_WRITE;
   }
 
   bool SelectionInstruction::isBranch(void) const {
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index 8349e9a..b23ab92 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -931,8 +931,16 @@ namespace gbe
                                                        uint32_t size,
                                                        uint32_t alignment) {
     uint32_t grfOffset;
-
-    this->expireGRF(interval);
+    static uint32_t tick = 0;
+    // Doing expireGRF too freqently will cause the post register allocation
+    // scheduling very hard. As it will cause a very high register conflict rate.
+    // The tradeoff here is to reduce the freqency here. And if we are under spilling
+    // then no need to reduce that freqency as the register pressure is the most
+    // important factor.
+    if (tick % (ctx.getSimdWidth() == 8 ? 12 : 4) == 0
+        || ctx.reservedSpillRegs != 0)
+      this->expireGRF(interval);
+    tick++;
     while ((grfOffset = ctx.allocate(size, alignment)) == 0) {
       const bool success = this->expireGRF(interval);
       if (success == false) {
-- 
1.8.3.2



More information about the Beignet mailing list