[Mesa-dev] [PATCH 5/7] i965/sched: switch to bottom-up scheduling

Connor Abbott cwabbott0 at gmail.com
Fri Oct 30 18:02:56 PDT 2015


Previously, our scheduler was doing top-down scheduling, scheduling from
the beginning to the end. But top-down scheduling isn't good for
scheduling things with many nodes with no input dependencies (i.e. many
reads) and few nodes with no output dependencies (i.e. few writes),
which is generally true of shaders, because it isn't able to determine
when to schedule the inputs. With bottom-up scheduling, we're better
able to schedule inputs when we need to in order to reduce register
pressure, while in other cases still doing aggressive scheduling. After
this commit, there are no more shaders in my copy of shader-db that
spill.

total instructions in shared programs: 7403011 -> 7374802 (-0.38%)
instructions in affected programs: 86700 -> 58491 (-32.54%)
helped: 55
HURT: 0

total cycles in shared programs: 48350374 -> 48781484 (0.89%)
cycles in affected programs: 36812664 -> 37243774 (1.17%)
helped: 22829
HURT: 19197

LOST:   6
GAINED: 528
Signed-off-by: Connor Abbott <cwabbott0 at gmail.com>
---
 .../drivers/dri/i965/brw_schedule_instructions.cpp | 168 ++++++++++-----------
 1 file changed, 79 insertions(+), 89 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index 55c28f6..85590df 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -410,7 +410,7 @@ public:
       this->mode = mode;
       this->time = 0;
       if (!post_reg_alloc) {
-         this->reg_pressure_in = rzalloc_array(mem_ctx, int, block_count);
+         this->reg_pressure_out = rzalloc_array(mem_ctx, int, block_count);
 
          this->livein = ralloc_array(mem_ctx, BITSET_WORD *, block_count);
          for (int i = 0; i < block_count; i++)
@@ -427,21 +427,21 @@ public:
             this->hw_liveout[i] = rzalloc_array(mem_ctx, BITSET_WORD,
                                                 BITSET_WORDS(hw_reg_count));
 
-         this->written = rzalloc_array(mem_ctx, bool, grf_count);
+         this->read = rzalloc_array(mem_ctx, bool, grf_count);
 
-         this->reads_remaining = rzalloc_array(mem_ctx, int, grf_count);
+         this->writes_remaining = rzalloc_array(mem_ctx, int, grf_count);
 
-         this->hw_reads_remaining = rzalloc_array(mem_ctx, int, hw_reg_count);
+         this->hw_read = rzalloc_array(mem_ctx, bool, hw_reg_count);
 
          this->max_reg_pressure = 0;
       } else {
-         this->reg_pressure_in = NULL;
+         this->reg_pressure_out = NULL;
          this->livein = NULL;
          this->liveout = NULL;
          this->hw_liveout = NULL;
-         this->written = NULL;
-         this->reads_remaining = NULL;
-         this->hw_reads_remaining = NULL;
+         this->read = NULL;
+         this->writes_remaining = NULL;
+         this->hw_read = NULL;
       }
    }
 
@@ -468,7 +468,7 @@ public:
     */
    virtual int issue_time(backend_instruction *inst) = 0;
 
-   virtual void count_reads_remaining(backend_instruction *inst) = 0;
+   virtual void count_writes_remaining(backend_instruction *inst) = 0;
    virtual void setup_liveness(cfg_t *cfg) = 0;
    virtual void update_register_pressure(backend_instruction *inst) = 0;
    virtual int get_register_pressure_benefit(backend_instruction *inst) = 0;
@@ -496,10 +496,10 @@ public:
    int max_reg_pressure;
 
    /*
-    * The register pressure at the beginning of each basic block.
+    * The register pressure at the end of each basic block.
     */
 
-   int *reg_pressure_in;
+   int *reg_pressure_out;
 
    /*
     * The virtual GRF's whose range overlaps the beginning of each basic block.
@@ -520,22 +520,22 @@ public:
    BITSET_WORD **hw_liveout;
 
    /*
-    * Whether we've scheduled a write for this virtual GRF yet.
+    * Whether we've scheduled a read for this virtual GRF yet.
     */
-
-   bool *written;
+   
+   bool *read;
 
    /*
-    * How many reads we haven't scheduled for this virtual GRF yet.
+    * How many writes we haven't scheduled for this virtual GRF yet.
     */
 
-   int *reads_remaining;
+   int *writes_remaining;
 
    /*
-    * How many reads we haven't scheduled for this hardware GRF yet.
+    * Whether we've scheduled a read for this hardware GRF yet.
     */
 
-   int *hw_reads_remaining;
+   bool *hw_read;
 };
 
 class fs_instruction_scheduler : public instruction_scheduler
@@ -550,7 +550,7 @@ public:
    int issue_time(backend_instruction *inst);
    fs_visitor *v;
 
-   void count_reads_remaining(backend_instruction *inst);
+   void count_writes_remaining(backend_instruction *inst);
    void setup_liveness(cfg_t *cfg);
    void update_register_pressure(backend_instruction *inst);
    int get_register_pressure_benefit(backend_instruction *inst);
@@ -578,28 +578,15 @@ is_src_duplicate(fs_inst *inst, int src)
 }
 
 void
-fs_instruction_scheduler::count_reads_remaining(backend_instruction *be)
+fs_instruction_scheduler::count_writes_remaining(backend_instruction *be)
 {
    fs_inst *inst = (fs_inst *)be;
 
-   if (!reads_remaining)
+   if (!writes_remaining)
       return;
 
-   for (int i = 0; i < inst->sources; i++) {
-      if (is_src_duplicate(inst, i))
-         continue;
-
-      if (inst->src[i].file == GRF) {
-         reads_remaining[inst->src[i].reg]++;
-      } else if (inst->src[i].file == HW_REG &&
-               inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
-         if (inst->src[i].fixed_hw_reg.nr >= hw_reg_count)
-            continue;
-
-         for (int j = 0; j < inst->regs_read(i); j++)
-            hw_reads_remaining[inst->src[i].fixed_hw_reg.nr + j]++;
-      }
-   }
+   if (inst->dst.file == GRF)
+      writes_remaining[inst->dst.reg]++;
 }
 
 void
@@ -612,14 +599,16 @@ fs_instruction_scheduler::setup_liveness(cfg_t *cfg)
       for (int i = 0; i < v->live_intervals->num_vars; i++) {
          if (BITSET_TEST(v->live_intervals->block_data[block].livein, i)) {
             int vgrf = v->live_intervals->vgrf_from_var[i];
-            if (!BITSET_TEST(livein[block], vgrf)) {
-               reg_pressure_in[block] += v->alloc.sizes[vgrf];
-               BITSET_SET(livein[block], vgrf);
-            }
+            BITSET_SET(livein[block], vgrf);
          }
 
-         if (BITSET_TEST(v->live_intervals->block_data[block].liveout, i))
-            BITSET_SET(liveout[block], v->live_intervals->vgrf_from_var[i]);
+         if (BITSET_TEST(v->live_intervals->block_data[block].liveout, i)) {
+            int vgrf = v->live_intervals->vgrf_from_var[i];
+            if (!BITSET_TEST(liveout[block], vgrf)) {
+               reg_pressure_out[block] += v->alloc.sizes[vgrf];
+               BITSET_SET(liveout[block], vgrf);
+            }
+         }
       }
    }
 
@@ -631,12 +620,12 @@ fs_instruction_scheduler::setup_liveness(cfg_t *cfg)
       for (int i = 0; i < grf_count; i++) {
          if (v->virtual_grf_start[i] <= cfg->blocks[block]->end_ip &&
              v->virtual_grf_end[i] >= cfg->blocks[block + 1]->start_ip) {
-            if (!BITSET_TEST(livein[block + 1], i)) {
-                reg_pressure_in[block + 1] += v->alloc.sizes[i];
-                BITSET_SET(livein[block + 1], i);
+            if (!BITSET_TEST(liveout[block], i)) {
+                reg_pressure_out[block] += v->alloc.sizes[i];
+                BITSET_SET(liveout[block], i);
             }
 
-            BITSET_SET(liveout[block], i);
+            BITSET_SET(livein[block + 1], i);
          }
       }
    }
@@ -650,10 +639,11 @@ fs_instruction_scheduler::setup_liveness(cfg_t *cfg)
 
       for (int block = 0; block < cfg->num_blocks; block++) {
          if (cfg->blocks[block]->start_ip <= payload_last_use_ip[i])
-            reg_pressure_in[block]++;
 
-         if (cfg->blocks[block]->end_ip <= payload_last_use_ip[i])
+         if (cfg->blocks[block]->end_ip <= payload_last_use_ip[i]) {
+            reg_pressure_out[block]++;
             BITSET_SET(hw_liveout[block], i);
+         }
       }
    }
 }
@@ -663,11 +653,11 @@ fs_instruction_scheduler::update_register_pressure(backend_instruction *be)
 {
    fs_inst *inst = (fs_inst *)be;
 
-   if (!reads_remaining)
+   if (!writes_remaining)
       return;
 
    if (inst->dst.file == GRF) {
-      written[inst->dst.reg] = true;
+      writes_remaining[inst->dst.reg]--;
    }
 
    for (int i = 0; i < inst->sources; i++) {
@@ -675,12 +665,12 @@ fs_instruction_scheduler::update_register_pressure(backend_instruction *be)
           continue;
 
       if (inst->src[i].file == GRF) {
-         reads_remaining[inst->src[i].reg]--;
+         read[inst->src[i].reg] = true;
       } else if (inst->src[i].file == HW_REG &&
                  inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE &&
                  inst->src[i].fixed_hw_reg.nr < hw_reg_count) {
          for (int off = 0; off < inst->regs_read(i); off++)
-            hw_reads_remaining[inst->src[i].fixed_hw_reg.nr + off]--;
+            hw_read[inst->src[i].fixed_hw_reg.nr + off] = true;
       }
    }
 }
@@ -693,8 +683,8 @@ fs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
 
    if (inst->dst.file == GRF) {
       if (!BITSET_TEST(livein[block_idx], inst->dst.reg) &&
-          !written[inst->dst.reg])
-         benefit -= v->alloc.sizes[inst->dst.reg];
+          writes_remaining[inst->dst.reg] == 1)
+         benefit += v->alloc.sizes[inst->dst.reg];
    }
 
    for (int i = 0; i < inst->sources; i++) {
@@ -703,8 +693,8 @@ fs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
 
       if (inst->src[i].file == GRF &&
           !BITSET_TEST(liveout[block_idx], inst->src[i].reg) &&
-          reads_remaining[inst->src[i].reg] == 1)
-         benefit += v->alloc.sizes[inst->src[i].reg];
+          !read[inst->src[i].reg])
+         benefit -= v->alloc.sizes[inst->src[i].reg];
 
       if (inst->src[i].file == HW_REG &&
           inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE &&
@@ -712,8 +702,8 @@ fs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
          for (int off = 0; off < inst->regs_read(i); off++) {
             int reg = inst->src[i].fixed_hw_reg.nr + off;
             if (!BITSET_TEST(hw_liveout[block_idx], reg) &&
-                hw_reads_remaining[reg] == 1) {
-               benefit++;
+                !hw_read[reg]) {
+               benefit--;
             }
          }
       }
@@ -731,7 +721,7 @@ public:
    int issue_time(backend_instruction *inst);
    vec4_visitor *v;
 
-   void count_reads_remaining(backend_instruction *inst);
+   void count_writes_remaining(backend_instruction *inst);
    void setup_liveness(cfg_t *cfg);
    void update_register_pressure(backend_instruction *inst);
    int get_register_pressure_benefit(backend_instruction *inst);
@@ -745,7 +735,7 @@ vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v,
 }
 
 void
-vec4_instruction_scheduler::count_reads_remaining(backend_instruction *be)
+vec4_instruction_scheduler::count_writes_remaining(backend_instruction *be)
 {
 }
 
@@ -843,30 +833,30 @@ instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
 
    assert(before != after);
 
-   for (int i = 0; i < before->child_count; i++) {
-      if (before->children[i] == after) {
-         before->child_latency[i] = MAX2(before->child_latency[i], latency);
+   for (int i = 0; i < after->child_count; i++) {
+      if (after->children[i] == before) {
+         after->child_latency[i] = MAX2(after->child_latency[i], latency);
          return;
       }
    }
 
-   if (before->child_array_size <= before->child_count) {
-      if (before->child_array_size < 16)
-         before->child_array_size = 16;
+   if (after->child_array_size <= after->child_count) {
+      if (after->child_array_size < 16)
+         after->child_array_size = 16;
       else
-         before->child_array_size *= 2;
+         after->child_array_size *= 2;
 
-      before->children = reralloc(mem_ctx, before->children,
-                                  schedule_node *,
-                                  before->child_array_size);
-      before->child_latency = reralloc(mem_ctx, before->child_latency,
-                                       int, before->child_array_size);
+      after->children = reralloc(mem_ctx, after->children,
+                                 schedule_node *,
+                                 after->child_array_size);
+      after->child_latency = reralloc(mem_ctx, after->child_latency,
+                                      int, after->child_array_size);
    }
 
-   before->children[before->child_count] = after;
-   before->child_latency[before->child_count] = latency;
-   before->child_count++;
-   after->parent_count++;
+   after->children[after->child_count] = before;
+   after->child_latency[after->child_count] = latency;
+   after->child_count++;
+   before->parent_count++;
 }
 
 void
@@ -1460,10 +1450,10 @@ fs_instruction_scheduler::choose_instruction_to_schedule()
           * tree of lowered ubo loads, which appear reversed in the
           * instruction stream with respect to when they can be consumed).
           */
-         if (n->delay > chosen->delay) {
+         if (n->delay < chosen->delay) {
             chosen = n;
             continue;
-         } else if (n->delay < chosen->delay) {
+         } else if (n->delay > chosen->delay) {
             continue;
          }
 
@@ -1518,7 +1508,7 @@ instruction_scheduler::schedule_instructions(bblock_t *block)
    backend_instruction *inst = block->end();
    time = 0;
    if (!post_reg_alloc) {
-      reg_pressure = reg_pressure_in[block->num];
+      reg_pressure = reg_pressure_out[block->num];
       max_reg_pressure = MAX2(max_reg_pressure, reg_pressure);
    }
    block_idx = block->num;
@@ -1536,6 +1526,7 @@ instruction_scheduler::schedule_instructions(bblock_t *block)
       assert(chosen);
       chosen->remove();
       inst->insert_before(block, chosen->inst);
+      inst = chosen->inst;
       instructions_to_schedule--;
 
       if (!post_reg_alloc) {
@@ -1547,13 +1538,12 @@ instruction_scheduler::schedule_instructions(bblock_t *block)
       /* If we expected a delay for scheduling, then bump the clock to reflect
        * that.  In reality, the hardware will switch to another hyperthread
        * and may not return to dispatching our thread for a while even after
-       * we're unblocked.  After this, we have the time when the chosen
+       * we're unblocked.  Before this, we have the time when the chosen
        * instruction will start executing.
        */
       time = MAX2(time, chosen->unblocked_time);
 
-      /* Update the clock for how soon an instruction could start after the
-       * chosen one.
+      /* Update the clock for the actual execution of the instruction.
        */
       time += issue_time(chosen->inst);
 
@@ -1642,15 +1632,15 @@ instruction_scheduler::run(cfg_t *cfg)
       if (block->end_ip - block->start_ip <= 1)
          continue;
 
-      if (reads_remaining) {
-         memset(reads_remaining, 0,
-                grf_count * sizeof(*reads_remaining));
-         memset(hw_reads_remaining, 0,
-                hw_reg_count * sizeof(*hw_reads_remaining));
-         memset(written, 0, grf_count * sizeof(*written));
+      if (writes_remaining) {
+         memset(writes_remaining, 0,
+                grf_count * sizeof(*writes_remaining));
+         memset(hw_read, 0,
+                hw_reg_count * sizeof(*hw_read));
+         memset(read, 0, grf_count * sizeof(*read));
 
          foreach_inst_in_block(fs_inst, inst, block)
-            count_reads_remaining(inst);
+            count_writes_remaining(inst);
       }
 
       add_insts_from_block(block);
-- 
2.4.3



More information about the mesa-dev mailing list