[Mesa-dev] [PATCH 3/7] i965/sched: get rid of the LIFO heuristic

Fri Oct 30 18:02:54 PDT 2015

In the future, we're going to rewrite the scheduler to switch to a more
conservative heuristic if it goes over a register pressure limit, and
turn it into a bottom-up scheduler. The LIFO heuristic is more difficult
with a limit-based scheduler, since it's not clear when to switch to
LIFO and when to simply try to reduce register pressure, and bottom-up
scheduling makes a lot of the problems this was trying to solve go away.
I previously tried adding a replacement heuristic that was supposed to
solve the same problem as the LIFO heuristic, but by the end of the
series, removing that replacement made no difference.

total instructions in shared programs: 7384899 -> 7398972 (0.19%)
instructions in affected programs: 72694 -> 86767 (19.36%)
helped: 25
HURT: 30

total cycles in shared programs: 49150106 -> 49112080 (-0.08%)
cycles in affected programs: 524582 -> 486556 (-7.25%)
helped: 45
HURT: 10

LOST:   223
GAINED: 0

Signed-off-by: Connor Abbott <cwabbott0 at gmail.com>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp               |  1 -
 .../drivers/dri/i965/brw_schedule_instructions.cpp | 61 ++--------------------
 src/mesa/drivers/dri/i965/brw_shader.h             |  1 -
 3 files changed, 5 insertions(+), 58 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 2d0acb9..72b3677 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -4930,7 +4930,6 @@ fs_visitor::allocate_registers()
    static const enum instruction_scheduler_mode pre_modes[] = {
       SCHEDULE_PRE,
       SCHEDULE_PRE_NON_LIFO,
-      SCHEDULE_PRE_LIFO,
    };
 
    /* Try each scheduling heuristic to see if it can successfully register
diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index 3ae8c3f..56d91ee 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -78,12 +78,6 @@ public:
    int latency;
 
    /**
-    * Which iteration of pushing groups of children onto the candidates list
-    * this node was a part of.
-    */
-   unsigned cand_generation;
-
-   /**
     * This is the sum of the instruction's latency plus the maximum delay of
     * its children, or just the issue_time if it's a leaf node.
     */
@@ -771,7 +765,6 @@ schedule_node::schedule_node(backend_instruction *inst,
    this->child_count = 0;
    this->parent_count = 0;
    this->unblocked_time = 0;
-   this->cand_generation = 0;
    this->delay = 0;
 
    /* We can't measure Gen6 timings directly but expect them to be much
@@ -1427,8 +1420,6 @@ fs_instruction_scheduler::choose_instruction_to_schedule()
        * latency.
        */
       foreach_in_list(schedule_node, n, &instructions) {
-         fs_inst *inst = (fs_inst *)n->inst;
-
          if (!chosen) {
             chosen = n;
             continue;
@@ -1451,50 +1442,11 @@ fs_instruction_scheduler::choose_instruction_to_schedule()
             continue;
          }
 
-         if (mode == SCHEDULE_PRE_LIFO) {
-            /* Prefer instructions that recently became available for
-             * scheduling.  These are the things that are most likely to
-             * (eventually) make a variable dead and reduce register pressure.
-             * Typical register pressure estimates don't work for us because
-             * most of our pressure comes from texturing, where no single
-             * instruction to schedule will make a vec4 value dead.
-             */
-            if (n->cand_generation > chosen->cand_generation) {
-               chosen = n;
-               continue;
-            } else if (n->cand_generation < chosen->cand_generation) {
-               continue;
-            }
-
-            /* On MRF-using chips, prefer non-SEND instructions.  If we don't
-             * do this, then because we prefer instructions that just became
-             * candidates, we'll end up in a pattern of scheduling a SEND,
-             * then the MRFs for the next SEND, then the next SEND, then the
-             * MRFs, etc., without ever consuming the results of a send.
-             */
-            if (v->devinfo->gen < 7) {
-               fs_inst *chosen_inst = (fs_inst *)chosen->inst;
-
-               /* We use regs_written > 1 as our test for the kind of send
-                * instruction to avoid -- only sends generate many regs, and a
-                * single-result send is probably actually reducing register
-                * pressure.
-                */
-               if (inst->regs_written <= inst->exec_size / 8 &&
-                   chosen_inst->regs_written > chosen_inst->exec_size / 8) {
-                  chosen = n;
-                  continue;
-               } else if (inst->regs_written > chosen_inst->regs_written) {
-                  continue;
-               }
-            }
-         }
-
-         /* For instructions pushed on the cands list at the same time, prefer
-          * the one with the highest delay to the end of the program.  This is
-          * most likely to have its values able to be consumed first (such as
-          * for a large tree of lowered ubo loads, which appear reversed in
-          * the instruction stream with respect to when they can be consumed).
+         /* For instructions with the same benefit, prefer the one with the
+          * highest delay to the end of the program.  This is most likely to
+          * have its values able to be consumed first (such as for a large
+          * tree of lowered ubo loads, which appear reversed in the
+          * instruction stream with respect to when they can be consumed).
           */
          if (n->delay > chosen->delay) {
             chosen = n;
@@ -1563,7 +1515,6 @@ instruction_scheduler::schedule_instructions(bblock_t *block)
          n->remove();
    }
 
-   unsigned cand_generation = 1;
    while (!instructions.is_empty()) {
       schedule_node *chosen = choose_instruction_to_schedule();
 
@@ -1614,7 +1565,6 @@ instruction_scheduler::schedule_instructions(bblock_t *block)
             bs->dump_instruction(child->inst);
          }
 
-         child->cand_generation = cand_generation;
          child->parent_count--;
          if (child->parent_count == 0) {
             if (debug) {
@@ -1623,7 +1573,6 @@ instruction_scheduler::schedule_instructions(bblock_t *block)
             instructions.push_head(child);
          }
       }
-      cand_generation++;
 
       /* Shared resource: the mathbox.  There's one mathbox per EU on Gen6+
        * but it's more limited pre-gen6, so if we send something off to it then
diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h
index 6a2dfc9..5966478 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.h
+++ b/src/mesa/drivers/dri/i965/brw_shader.h
@@ -163,7 +163,6 @@ struct backend_instruction {
 enum instruction_scheduler_mode {
    SCHEDULE_PRE,
    SCHEDULE_PRE_NON_LIFO,
-   SCHEDULE_PRE_LIFO,
    SCHEDULE_POST,
 };
 
-- 
2.4.3