[Mesa-dev] [PATCH 6/7] i965/sched: use Sethi-Ullman numbering
Connor Abbott
cwabbott0 at gmail.com
Fri Oct 30 18:02:57 PDT 2015
Sethi-Ullman numbering is a heuristic that lets us look a little further
ahead when making decisions based on register pressure, instead of only
greedily trying to decrease register pressure based on a threshold. It
provides a lower bound on the number of registers required to schedule a
node and all of its children, which we can use to avoid scheduling nodes
that would definitely require more registers than we can afford.
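As a rough illustration (not part of the patch, and using made-up names),
the sketch below mirrors the recurrence used by compute_regs_required():
each child is described only by how many registers it needs and how large
its result is, and interactions between children are ignored, as in an
expression tree with no barriers.

// Standalone sketch of generalized Sethi-Ullman numbering; the
// (regs_required, dst_size) pair layout and function names are
// illustrative only, not i965 API.
#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

static int
sethi_ullman(std::vector<std::pair<int, int>> children, int own_dst_size)
{
   if (children.empty())
      return own_dst_size;

   /* Sort by increasing regs_required; under bottom-up scheduling the
    * cheapest child ends up last in the final program order.
    */
   std::sort(children.begin(), children.end());

   int regs_required = children[0].first;
   for (size_t i = 1; i < children.size(); i++) {
      /* Either keeping the new child's result live raises the existing
       * high water mark by its dst_size, or the new child's own
       * requirement becomes the new high water mark.
       */
      regs_required = std::max(regs_required + children[i].second,
                               children[i].first);
   }
   return std::max(regs_required, own_dst_size);
}

int main()
{
   /* Two equal subtrees, each needing 2 registers and producing a
    * 1-register result: the parent needs 3, the classic Sethi-Ullman
    * answer for equal operands.
    */
   std::vector<std::pair<int, int>> children = {{2, 1}, {2, 1}};
   printf("%d\n", sethi_ullman(children, 1));
   return 0;
}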
total instructions in shared programs: 7564666 -> 7564666 (0.00%)
instructions in affected programs: 0 -> 0
helped: 0
HURT: 0
total cycles in shared programs: 51126212 -> 50217894 (-1.78%)
cycles in affected programs: 8758434 -> 7850116 (-10.37%)
helped: 1754
HURT: 1462
LOST: 36
GAINED: 72
Signed-off-by: Connor Abbott <cwabbott0 at gmail.com>
---
.../drivers/dri/i965/brw_schedule_instructions.cpp | 197 ++++++++++++++++++---
1 file changed, 169 insertions(+), 28 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index 85590df..af992a4 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -82,6 +82,12 @@ public:
* its children, or just the issue_time if it's a leaf node.
*/
int delay;
+
+ /*
+ * The Sethi-Ullman number for this node, which is a lower bound on the
+ * number of registers required to schedule it.
+ */
+ int regs_required;
};
void
@@ -456,6 +462,7 @@ public:
void run(cfg_t *cfg);
void add_insts_from_block(bblock_t *block);
void compute_delay(schedule_node *node);
+ void compute_regs_required(schedule_node *node);
virtual void calculate_deps() = 0;
virtual schedule_node *choose_instruction_to_schedule() = 0;
@@ -472,6 +479,7 @@ public:
virtual void setup_liveness(cfg_t *cfg) = 0;
virtual void update_register_pressure(backend_instruction *inst) = 0;
virtual int get_register_pressure_benefit(backend_instruction *inst) = 0;
+ virtual int dst_size(schedule_node *n) = 0;
void schedule_instructions(bblock_t *block);
@@ -553,7 +561,10 @@ public:
void count_writes_remaining(backend_instruction *inst);
void setup_liveness(cfg_t *cfg);
void update_register_pressure(backend_instruction *inst);
+ void update_unblocked(backend_instruction *inst);
+ int get_register_write_benefit(fs_inst *inst);
int get_register_pressure_benefit(backend_instruction *inst);
+ int dst_size(schedule_node *n);
};
fs_instruction_scheduler::fs_instruction_scheduler(fs_visitor *v,
@@ -676,17 +687,23 @@ fs_instruction_scheduler::update_register_pressure(backend_instruction *be)
}
int
-fs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
+fs_instruction_scheduler::get_register_write_benefit(fs_inst *inst)
{
- fs_inst *inst = (fs_inst *)be;
- int benefit = 0;
-
if (inst->dst.file == GRF) {
if (!BITSET_TEST(livein[block_idx], inst->dst.reg) &&
writes_remaining[inst->dst.reg] == 1)
- benefit += v->alloc.sizes[inst->dst.reg];
+ return v->alloc.sizes[inst->dst.reg];
}
+ return 0;
+}
+
+int
+fs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
+{
+ fs_inst *inst = (fs_inst *)be;
+ int benefit = get_register_write_benefit(inst);
+
for (int i = 0; i < inst->sources; i++) {
if (is_src_duplicate(inst, i))
continue;
@@ -725,6 +742,7 @@ public:
void setup_liveness(cfg_t *cfg);
void update_register_pressure(backend_instruction *inst);
int get_register_pressure_benefit(backend_instruction *inst);
+ int dst_size(schedule_node *n);
};
vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v,
@@ -768,6 +786,7 @@ schedule_node::schedule_node(backend_instruction *inst,
this->parent_count = 0;
this->unblocked_time = 0;
this->delay = 0;
+ this->regs_required = 0;
/* We can't measure Gen6 timings directly but expect them to be much
* closer to Gen7 than Gen4.
@@ -818,6 +837,95 @@ instruction_scheduler::compute_delay(schedule_node *n)
}
}
+int
+fs_instruction_scheduler::dst_size(schedule_node *n)
+{
+ fs_inst *inst = (fs_inst *) n->inst;
+ if (inst->dst.file != GRF)
+ return 0;
+
+ return v->alloc.sizes[inst->dst.reg];
+}
+
+int
+vec4_instruction_scheduler::dst_size(schedule_node *n)
+{
+ return 0;
+}
+
+static int compare(const void *data1, const void *data2)
+{
+ const schedule_node *n1 = *(schedule_node **)data1;
+ const schedule_node *n2 = *(schedule_node **)data2;
+ return n1->regs_required - n2->regs_required;
+}
+
+/**
+ * Compute the Sethi-Ullman numbering for a node.
+ *
+ * The Sethi-Ullman numbering is a lower bound for the number of registers
+ * required to schedule a given node and all of its children in the DAG. Say
+ * that we've scheduled all of a node N's children optimally. We'll ignore any
+ * interactions between the children, and assume that each child can be
+ * scheduled independently and one after another; this is an approximation,
+ * only correct for expression trees without any barriers. We schedule the
+ * child that takes the least registers first (although in the final order,
+ * it'll be last because we're doing bottom-up scheduling), and order the rest
+ * by the number of registers they require. We have a list of children C_1,
+ * C_2, ... of node N that are in increasing order of registers required.
+ * Now, say that C_n+1 requires at least dst_size(C_n+1) more registers than
+ * C_n does. Then the result of C_n+1 can be passed through (kept live
+ * across) C_n without increasing the number of registers required beyond
+ * what C_n+1 already requires, so the number of registers required is just
+ * the number that C_n+1 requires.
+ * Otherwise, the number of registers required will be regs_required(C_n) +
+ * dst_size(C_n+1). Furthermore, this number of registers required defines a
+ * new "high water mark" against which to compare the next instruction's
+ * registers required. Any later children have to pass their result through
+ * the high water mark, and doing so will increase the number of registers
+ * required, unless the new source C_m requires even more registers; then the
+ * high water mark resets to C_m. We don't need to store where the high water
+ * mark actually is, though; all we need to know is its current height (i.e.
+ * the current number of registers required) as we compare it to each child in
+ * turn.
+ */
+void
+instruction_scheduler::compute_regs_required(schedule_node *n)
+{
+ int num_children = 0;
+ for (int i = 0; i < n->child_count; i++) {
+ /* Assume dependencies with a latency of 0 are fake and ignore them */
+ if (n->child_latency[i] == 0)
+ continue;
+
+ num_children++;
+ }
+
+ if (num_children == 0) {
+ n->regs_required = dst_size(n);
+ return;
+ }
+
+ schedule_node *children[num_children];
+ int j = 0;
+ for (int i = 0; i < n->child_count; i++) {
+ if (n->child_latency[i] == 0)
+ continue;
+
+ children[j++] = n->children[i];
+ }
+
+ /* sort the children in increasing order of registers required */
+ qsort(children, num_children, sizeof(schedule_node *), compare);
+
+ n->regs_required = children[0]->regs_required;
+ for (int i = 1; i < num_children; i++) {
+ n->regs_required = MAX2(n->regs_required + dst_size(children[i]),
+ children[i]->regs_required);
+ }
+
+ n->regs_required = MAX2(n->regs_required, dst_size(n));
+}
+
/**
* Add a dependency between two instruction nodes.
*
@@ -1391,27 +1499,56 @@ fs_instruction_scheduler::choose_instruction_to_schedule()
schedule_node *chosen = NULL;
if (post_reg_alloc || reg_pressure < pressure_threshold) {
- int chosen_unblocked_time = 0, chosen_delay = 0;
-
- /* First, find the earliest instruction we can possibly schedule. Then,
- * if there are multiple instructions that we can schedule at the same
- * time, choose the thing with the longest critical path. The idea here
- * is to try not to lengthen already larger critical path lengths, but
- * if we can, we should first schedule instructions that we can fit in
- * before the instruction with the longest critical path. There might be
- * some cases where we still bump the critical path instruction, but
- * this seems unlikely, given that most instructions have the same issue
- * time and most latencies are a multiple of the issue time.
- */
foreach_in_list(schedule_node, n, &instructions) {
+ if (!chosen) {
+ chosen = n;
+ continue;
+ }
+
+ if (!post_reg_alloc) {
+ /* Penalize choices which require more registers than the
+ * threshold. Note that if a choice benefits from ending a live
+ * interval, we should take that into account; for example,
+ * imagine we have an add instruction that takes two inputs. The
+ * instruction requires two registers to schedule, but if it ends the
+ * live range of the register it writes to, then it only requires one
+ * additional register beyond the current register pressure.
+ */
+ int write_benefit = get_register_write_benefit((fs_inst *)n->inst);
+ int chosen_write_benefit =
+ get_register_write_benefit((fs_inst *)chosen->inst);
+ int excess = MAX2(0, reg_pressure + n->regs_required
+ - write_benefit - pressure_threshold);
+ int chosen_excess = MAX2(0, reg_pressure + chosen->regs_required
+ - chosen_write_benefit
+ - pressure_threshold);
+ if (excess < chosen_excess) {
+ chosen = n;
+ continue;
+ } else if (excess > chosen_excess) {
+ continue;
+ }
+ }
+
+ /* First, find the earliest instruction we can possibly schedule.
+ * Then, if there are multiple instructions that we can schedule at
+ * the same time, choose the thing with the longest critical path.
+ * The idea here is to try not to lengthen already larger critical
+ * path lengths, but if we can, we should first schedule instructions
+ * that we can fit in before the instruction with the longest
+ * critical path. There might be some cases where we still bump the
+ * critical path instruction, but this seems unlikely, given that
+ * most instructions have the same issue time and most latencies are
+ * a multiple of the issue time.
+ */
int unblocked_time = MAX2(n->unblocked_time, time);
- if (!chosen ||
- unblocked_time < chosen_unblocked_time ||
+ int chosen_unblocked_time = MAX2(chosen->unblocked_time, time);
+
+ if (unblocked_time < chosen_unblocked_time ||
(unblocked_time == chosen_unblocked_time &&
- n->delay > chosen_delay)) {
+ n->delay > chosen->delay)) {
chosen = n;
- chosen_unblocked_time = unblocked_time;
- chosen_delay = n->delay;
+ continue;
}
}
} else {
@@ -1445,15 +1582,13 @@ fs_instruction_scheduler::choose_instruction_to_schedule()
}
/* For instructions with the same benefit, prefer the one with the
- * highest delay to the end of the program. This is most likely to
- * have its values able to be consumed first (such as for a large
- * tree of lowered ubo loads, which appear reversed in the
- * instruction stream with respect to when they can be consumed).
+ * lowest number of registers required. This will reduce the amount by
+ * which we go over the threshold.
*/
- if (n->delay < chosen->delay) {
+ if (n->regs_required < chosen->regs_required) {
chosen = n;
continue;
- } else if (n->delay > chosen->delay) {
+ } else if (n->regs_required > chosen->regs_required) {
continue;
}
@@ -1651,6 +1786,12 @@ instruction_scheduler::run(cfg_t *cfg)
compute_delay(n);
}
+ if (!post_reg_alloc) {
+ foreach_in_list(schedule_node, n, &instructions) {
+ compute_regs_required(n);
+ }
+ }
+
schedule_instructions(block);
}
--
2.4.3