[Mesa-dev] [PATCH] i965/fs: In the pre-regalloc schedule, try harder at reducing reg pressure.

Tue Oct 15 01:14:45 CEST 2013

Previously, the best thing we had was to schedule the things unblocked by
the current instruction, in the hope that it would be consuming two values
at the end of their live intervals while only producing one new value.
Sometimes that wasn't the case.

Now, when an instruction is the first user of a GRF we schedule (i.e. it
will probably be the virtual_grf_def[] instruction after computing live
intervals again), penalize it by how many regs it would take up.  When an
instruction is the last user of a GRF we have to schedule (when it will
probably be the virtual_grf_end[] instruction), give it a boost by how
many regs it would free.

The new functions are made virtual (only 1 of 2 really needs to be
virtual) because I expect we'll soon lift the pre-regalloc scheduling
heuristic over to the vec4 backend.

shader-db:
total instructions in shared programs: 1512756 -> 1511604 (-0.08%)
instructions in affected programs:     10292 -> 9140 (-11.19%)
GAINED:                                121
LOST:                                  38

Improves tropics performance at my current settings by 4.50602% +/-
2.60694% (n=5).  No difference on Lightsmark (n=5).  No difference on
GLB2.7 (n=11).

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=70445
---
 .../drivers/dri/i965/brw_schedule_instructions.cpp | 125 ++++++++++++++++++---
 1 file changed, 111 insertions(+), 14 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index b24c38c..7cb0265 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -353,6 +353,13 @@ public:
       this->instructions_to_schedule = 0;
       this->post_reg_alloc = post_reg_alloc;
       this->time = 0;
+      if (!post_reg_alloc) {
+         this->remaining_grf_uses = rzalloc_array(mem_ctx, int, grf_count);
+         this->grf_active = rzalloc_array(mem_ctx, bool, grf_count);
+      } else {
+         this->remaining_grf_uses = NULL;
+         this->grf_active = NULL;
+      }
    }
 
    ~instruction_scheduler()
@@ -377,6 +384,9 @@ public:
     */
    virtual int issue_time(backend_instruction *inst) = 0;
 
+   virtual void mod_remaining_grf_uses(backend_instruction *inst, int mod) = 0;
+   virtual int get_grf_pressure_benefit(backend_instruction *inst) = 0;
+
    void schedule_instructions(backend_instruction *next_block_header);
 
    void *mem_ctx;
@@ -387,6 +397,17 @@ public:
    int time;
    exec_list instructions;
    backend_visitor *bv;
+
+   /** Number of instructions left to schedule that reference each vgrf. */
+   int *remaining_grf_uses;
+
+   /**
+    * Tracks whether each VGRF has had an instruction scheduled that uses it.
+    *
+    * This is used to estimate whether scheduling a new instruction will
+    * increase register pressure.
+    */
+   bool *grf_active;
 };
 
 class fs_instruction_scheduler : public instruction_scheduler
@@ -398,6 +419,9 @@ public:
    schedule_node *choose_instruction_to_schedule();
    int issue_time(backend_instruction *inst);
    fs_visitor *v;
+
+   void mod_remaining_grf_uses(backend_instruction *inst, int mod);
+   int get_grf_pressure_benefit(backend_instruction *inst);
 };
 
 fs_instruction_scheduler::fs_instruction_scheduler(fs_visitor *v,
@@ -408,6 +432,57 @@ fs_instruction_scheduler::fs_instruction_scheduler(fs_visitor *v,
 {
 }
 
+void
+fs_instruction_scheduler::mod_remaining_grf_uses(backend_instruction *be,
+                                                 int mod)
+{  /* Adds mod to remaining_grf_uses[] for every GRF operand of be. */
+   fs_inst *inst = (fs_inst *)be;
+   /* NULL when post_reg_alloc was set in the constructor: nothing to track. */
+   if (!remaining_grf_uses)
+      return;
+   /* Destination: adjust its count; a negative mod also marks it active. */
+   if (inst->dst.file == GRF) {
+      remaining_grf_uses[inst->dst.reg] += mod;
+      if (mod < 0 && !grf_active[inst->dst.reg])
+         grf_active[inst->dst.reg] = true;
+   }
+   /* Sources: same per operand, so a repeated register is counted again. */
+   for (int i = 0; i < 3; i++) {
+      if (inst->src[i].file != GRF)
+         continue;
+
+      remaining_grf_uses[inst->src[i].reg] += mod;
+      if (mod < 0 && !grf_active[inst->src[i].reg])
+         grf_active[inst->src[i].reg] = true;
+   }
+}
+
+int
+fs_instruction_scheduler::get_grf_pressure_benefit(backend_instruction *be)
+{  /* Score: sizes of GRF operands with one use left minus sizes not yet active. */
+   fs_inst *inst = (fs_inst *)be;
+   int benefit = 0;
+   /* NOTE(review): arrays are NULL post-regalloc and are not checked here. */
+   if (inst->dst.file == GRF) {
+      if (remaining_grf_uses[inst->dst.reg] == 1) /* a single use remains */
+         benefit += v->virtual_grf_sizes[inst->dst.reg];
+      if (!grf_active[inst->dst.reg]) /* no scheduled instruction uses it yet */
+         benefit -= v->virtual_grf_sizes[inst->dst.reg];
+   }
+   /* Each GRF source is scored the same way, once per operand slot. */
+   for (int i = 0; i < 3; i++) {
+      if (inst->src[i].file != GRF)
+         continue;
+
+      if (remaining_grf_uses[inst->src[i].reg] == 1)
+         benefit += v->virtual_grf_sizes[inst->src[i].reg];
+      if (!grf_active[inst->src[i].reg])
+         benefit -= v->virtual_grf_sizes[inst->src[i].reg];
+   }
+
+   return benefit;
+}
+
 class vec4_instruction_scheduler : public instruction_scheduler
 {
 public:
@@ -416,6 +491,9 @@ public:
    schedule_node *choose_instruction_to_schedule();
    int issue_time(backend_instruction *inst);
    vec4_visitor *v;
+
+   void mod_remaining_grf_uses(backend_instruction *inst, int mod);
+   int get_grf_pressure_benefit(backend_instruction *inst);
 };
 
 vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v,
@@ -426,6 +504,18 @@ vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v,
 }
 
 void
+vec4_instruction_scheduler::mod_remaining_grf_uses(backend_instruction *be,
+                                                   int mod)
+{  /* Empty: the vec4 scheduler does no GRF use tracking here. */
+}
+
+int
+vec4_instruction_scheduler::get_grf_pressure_benefit(backend_instruction *be)
+{
+   return 0; /* Constant score: be is ignored by the vec4 scheduler. */
+}
+
+void
 instruction_scheduler::add_inst(backend_instruction *inst)
 {
    schedule_node *n = new(mem_ctx) schedule_node(inst, bv->brw);
@@ -946,23 +1036,18 @@ fs_instruction_scheduler::choose_instruction_to_schedule()
          }
       }
    } else {
+      int chosen_score = -1000000; /* Any instruction is better than nothing */
+
       /* Before register allocation, we don't care about the latencies of
        * instructions.  All we care about is reducing live intervals of
        * variables so that we can avoid register spilling, or get 16-wide
        * shaders which naturally do a better job of hiding instruction
        * latency.
        *
-       * To do so, schedule our instructions in a roughly LIFO/depth-first
-       * order: when new instructions become available as a result of
-       * scheduling something, choose those first so that our result
-       * hopefully is consumed quickly.
-       *
-       * The exception is messages that generate more than one result
-       * register (AKA texturing).  In those cases, the LIFO search would
-       * normally tend to choose them quickly (because scheduling the
-       * previous message not only unblocked the children using its result,
-       * but also the MRF setup for the next sampler message, which in turn
-       * unblocks the next sampler message).
+       * If this instruction would be the last use of any GRFs, we bump up its
+       * score since it means it should be reducing register pressure.  If
+       * it's the first use of a GRF, reduce its score since it means it
+       * should be increasing register pressure.
        */
       for (schedule_node *node = (schedule_node *)instructions.get_tail();
            node != instructions.get_head()->prev;
@@ -970,9 +1055,12 @@ fs_instruction_scheduler::choose_instruction_to_schedule()
          schedule_node *n = (schedule_node *)node;
          fs_inst *inst = (fs_inst *)n->inst;
 
-         chosen = n;
-         if (inst->regs_written <= 1)
-            break;
+         int this_score = get_grf_pressure_benefit(inst);
+
+         if (this_score > chosen_score) {
+            chosen = n;
+            chosen_score = this_score;
+         }
       }
    }
 
@@ -1036,6 +1124,7 @@ instruction_scheduler::schedule_instructions(backend_instruction *next_block_hea
       chosen->remove();
       next_block_header->insert_before(chosen->inst);
       instructions_to_schedule--;
+      mod_remaining_grf_uses(chosen->inst, -1);
 
       /* Update the clock for how soon an instruction could start after the
        * chosen one.
@@ -1105,6 +1194,14 @@ instruction_scheduler::run(exec_list *all_instructions)
       bv->dump_instructions();
    }
 
+   /* Populate the remaining GRF uses array to improve the pre-regalloc
+    * scheduling.
+    */
+   if (remaining_grf_uses) {
+      foreach_list(node, &instructions)
+         mod_remaining_grf_uses((backend_instruction *)node, 1);
+   }
+
    while (!next_block_header->is_tail_sentinel()) {
       /* Add things to be scheduled until we get to a new BB. */
       while (!next_block_header->is_tail_sentinel()) {
-- 
1.8.4.rc3