[Mesa-dev] [PATCH] i965/fs: Make the first pre-allocation heuristic be the post heuristic.

Tue Nov 19 23:34:46 PST 2013

I recently made us try two different things that tried to reduce register
pressure so that we would be more likely to allocate successfully.  But
now that we have the logic for trying two, we can make the first thing we
try be the normal, not-prioritizing-register-pressure heuristic.

This means one less scheduling pass in the common case of that heuristic
not producing spills, plus the best schedule we know how to produce, if
that one happens to succeed.  This is important, because our register
allocation produces a lot of possibly avoidable dependencies for the
post-register-allocation schedule, despite ra_set_allocate_round_robin().

GLB2.7: 1.04127% +/- 0.732461% fps improvement (n=31)
nexuiz: No difference (n=5)
lightsmark: 0.838512% +/- 0.300147% fps improvement (n=86)
minecraft apitrace: No difference (n=15)
---
 src/mesa/drivers/dri/i965/brw_fs.cpp               | 62 ++++++++++++++--------
 .../drivers/dri/i965/brw_schedule_instructions.cpp |  4 +-
 src/mesa/drivers/dri/i965/brw_shader.h             |  1 +
 3 files changed, 44 insertions(+), 23 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 8b3f8df..dbe7503 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3190,6 +3190,7 @@ fs_visitor::run()
 {
    sanity_param_count = fp->Base.Parameters->NumParameters;
    uint32_t orig_nr_params = c->prog_data.nr_params;
+   bool allocated_without_spills;
 
    assign_binding_table_offsets();
 
@@ -3273,27 +3274,45 @@ fs_visitor::run()
       assign_curb_setup();
       assign_urb_setup();
 
-      schedule_instructions(SCHEDULE_PRE_NON_LIFO);
+      static enum instruction_scheduler_mode pre_modes[] = {
+         SCHEDULE_PRE,
+         SCHEDULE_PRE_NON_LIFO,
+         SCHEDULE_PRE_LIFO,
+      };
 
-      if (0)
-	 assign_regs_trivial();
-      else {
-         if (!assign_regs(false)) {
-            /* Try a non-spilling register allocation again with a different
-             * scheduling heuristic.
-             */
-            schedule_instructions(SCHEDULE_PRE_LIFO);
-            if (!assign_regs(false)) {
-               if (dispatch_width == 16) {
-                  fail("Failure to register allocate.  Reduce number of "
-                       "live scalar values to avoid this.");
-               } else {
-                  while (!assign_regs(true)) {
-                     if (failed)
-                        break;
-                  }
-               }
-            }
+      /* Try each scheduling heuristic to see if it can successfully register
+       * allocate without spilling.  They should be ordered by decreasing
+       * performance but increasing likelihood of allocating.
+       */
+      for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
+         schedule_instructions(pre_modes[i]);
+
+         if (0) {
+            assign_regs_trivial();
+            allocated_without_spills = true;
+         } else {
+            allocated_without_spills = assign_regs(false);
+         }
+         if (allocated_without_spills)
+            break;
+      }
+
+      if (!allocated_without_spills) {
+         /* We assume that any spilling is worse than just dropping back to
+          * SIMD8.  There's probably actually some intermediate point where
+          * SIMD16 with a couple of spills is still better.
+          */
+         if (dispatch_width == 16) {
+            fail("Failure to register allocate.  Reduce number of "
+                 "live scalar values to avoid this.");
+         }
+
+         /* Since we're out of heuristics, just go spill registers until we
+          * get an allocation.
+          */
+         while (!assign_regs(true)) {
+            if (failed)
+               break;
          }
       }
    }
@@ -3308,7 +3327,8 @@ fs_visitor::run()
    if (failed)
       return false;
 
-   schedule_instructions(SCHEDULE_POST);
+   if (!allocated_without_spills)
+      schedule_instructions(SCHEDULE_POST);
 
    if (dispatch_width == 8) {
       c->prog_data.reg_blocks = brw_register_blocks(grf_used);
diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index a4fae0d..a40a46a 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -1140,10 +1140,10 @@ fs_instruction_scheduler::choose_instruction_to_schedule()
 {
    schedule_node *chosen = NULL;
 
-   if (post_reg_alloc) {
+   if (mode == SCHEDULE_PRE || mode == SCHEDULE_POST) {
       int chosen_time = 0;
 
-      /* Of the instructions closest ready to execute or the closest to
+      /* Of the instructions ready to execute or the closest to
        * being ready, choose the oldest one.
        */
       foreach_list(node, &instructions) {
diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h
index aba24c5..34779ff 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.h
+++ b/src/mesa/drivers/dri/i965/brw_shader.h
@@ -60,6 +60,7 @@ public:
 };
 
 enum instruction_scheduler_mode {
+   SCHEDULE_PRE,
    SCHEDULE_PRE_NON_LIFO,
    SCHEDULE_PRE_LIFO,
    SCHEDULE_POST,
-- 
1.8.4.rc3