[Mesa-dev] [PATCH] i965/fs: In the pre-regalloc schedule, try harder at reducing reg pressure.

Chia-I Wu olvaffe at gmail.com
Fri Oct 18 10:25:44 CEST 2013


On Tue, Oct 15, 2013 at 7:14 AM, Eric Anholt <eric at anholt.net> wrote:
> Previously, the best thing we had was to schedule the things unblocked by
> the current instruction, on the hope that it would be consuming two values
> at the end of their live intervals while only producing one new value.
> Sometimes that wasn't the case.
>
> Now, when an instruction is the first user of a GRF we schedule (i.e. it
> will probably be the virtual_grf_def[] instruction after computing live
> intervals again), penalize it by how many regs it would take up.  When an
> instruction is the last user of a GRF we have to schedule (when it will
> probably be the virtual_grf_end[] instruction), give it a boost by how
> many regs it would free.
A texture2D() takes up 4 regs and at best frees 2 regs, so it will always
be scheduled last.  When there are more than ~60 texture2D() calls (which
can happen because of loop unrolling), the message payloads could take up
all available registers.
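
To make the arithmetic concrete, here is a standalone sketch (not the
driver code; the register sizes and "last use" assumptions are made-up
illustrative values) of roughly how the patch's get_grf_pressure_benefit()
scoring works out for a texture send versus a plain ALU op:

  /* Standalone sketch, not the i965 scheduler: approximates the scoring
   * idea of get_grf_pressure_benefit().  Sizes and use counts here are
   * illustrative assumptions only.
   */
  #include <cstdio>

  /* benefit = regs freed (sources at their last use) minus regs newly
   * made live by a first-use destination.
   */
  static int pressure_benefit(int dst_size, bool dst_is_first_use,
                              int freed_src_regs)
  {
     int benefit = freed_src_regs;
     if (dst_is_first_use)
        benefit -= dst_size;
     return benefit;
  }

  int main()
  {
     /* texture2D(): 4-reg result, at best frees 2 coordinate regs -> -2 */
     printf("tex score: %d\n", pressure_benefit(4, true, 2));
     /* typical ALU op: 1-reg result, both 1-reg sources at last use -> +1 */
     printf("alu score: %d\n", pressure_benefit(1, true, 2));
     return 0;
  }

So a texture send can never score better than -2 and always loses to
ordinary ALU instructions.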

I wonder if it would help to also take into consideration how long an
instruction has been sitting in the available queue.  After a couple of
texture2D()s are scheduled, the instructions that use their results may
become available and free those registers up.
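
As a self-contained toy (not the i965 scheduler; the "age" field and its
weighting are arbitrary guesses), folding such an age bonus into the
pressure score could look roughly like this:

  /* Toy example: an "age in the available queue" bonus added to the
   * pressure score so that long-stalled texture sends eventually win.
   * The cap of 8 and 1 point per cycle are hypothetical values.
   */
  #include <cstdio>
  #include <vector>
  #include <algorithm>

  struct candidate {
     const char *name;
     int pressure_benefit;  /* regs freed minus regs newly made live */
     int cycles_available;  /* how long it has sat in the available queue */
  };

  static int score(const candidate &c)
  {
     return c.pressure_benefit + std::min(c.cycles_available, 8);
  }

  int main()
  {
     std::vector<candidate> avail = {
        { "tex", -2, 10 },  /* has been waiting a while */
        { "alu",  1,  0 },  /* just became available */
     };
     const candidate *chosen = &avail[0];
     for (const candidate &c : avail) {
        if (score(c) > score(*chosen))
           chosen = &c;
     }
     printf("chosen: %s\n", chosen->name);
     return 0;
  }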

>
> The new functions are made virtual (only 1 of 2 really needs to be
> virtual) because I expect we'll soon lift the pre-regalloc scheduling
> heuristic over to the vec4 backend.
>
> shader-db:
> total instructions in shared programs: 1512756 -> 1511604 (-0.08%)
> instructions in affected programs:     10292 -> 9140 (-11.19%)
> GAINED:                                121
> LOST:                                  38
>
> Improves tropics performance at my current settings by 4.50602% +/-
> 2.60694% (n=5).  No difference on Lightsmark (n=5).  No difference on
> GLB2.7 (n=11).
>
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=70445
> ---
>  .../drivers/dri/i965/brw_schedule_instructions.cpp | 125 ++++++++++++++++++---
>  1 file changed, 111 insertions(+), 14 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
> index b24c38c..7cb0265 100644
> --- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
> @@ -353,6 +353,13 @@ public:
>        this->instructions_to_schedule = 0;
>        this->post_reg_alloc = post_reg_alloc;
>        this->time = 0;
> +      if (!post_reg_alloc) {
> +         this->remaining_grf_uses = rzalloc_array(mem_ctx, int, grf_count);
> +         this->grf_active = rzalloc_array(mem_ctx, bool, grf_count);
> +      } else {
> +         this->remaining_grf_uses = NULL;
> +         this->grf_active = NULL;
> +      }
>     }
>
>     ~instruction_scheduler()
> @@ -377,6 +384,9 @@ public:
>      */
>     virtual int issue_time(backend_instruction *inst) = 0;
>
> +   virtual void mod_remaining_grf_uses(backend_instruction *inst, int mod) = 0;
> +   virtual int get_grf_pressure_benefit(backend_instruction *inst) = 0;
> +
>     void schedule_instructions(backend_instruction *next_block_header);
>
>     void *mem_ctx;
> @@ -387,6 +397,17 @@ public:
>     int time;
>     exec_list instructions;
>     backend_visitor *bv;
> +
> +   /** Number of instructions left to schedule that reference each vgrf. */
> +   int *remaining_grf_uses;
> +
> +   /**
> +    * Tracks whether each VGRF has had an instruction scheduled that uses it.
> +    *
> +    * This is used to estimate whether scheduling a new instruction will
> +    * increase register pressure.
> +    */
> +   bool *grf_active;
>  };
>
>  class fs_instruction_scheduler : public instruction_scheduler
> @@ -398,6 +419,9 @@ public:
>     schedule_node *choose_instruction_to_schedule();
>     int issue_time(backend_instruction *inst);
>     fs_visitor *v;
> +
> +   void mod_remaining_grf_uses(backend_instruction *inst, int mod);
> +   int get_grf_pressure_benefit(backend_instruction *inst);
>  };
>
>  fs_instruction_scheduler::fs_instruction_scheduler(fs_visitor *v,
> @@ -408,6 +432,57 @@ fs_instruction_scheduler::fs_instruction_scheduler(fs_visitor *v,
>  {
>  }
>
> +void
> +fs_instruction_scheduler::mod_remaining_grf_uses(backend_instruction *be,
> +                                                 int mod)
> +{
> +   fs_inst *inst = (fs_inst *)be;
> +
> +   if (!remaining_grf_uses)
> +      return;
> +
> +   if (inst->dst.file == GRF) {
> +      remaining_grf_uses[inst->dst.reg] += mod;
> +      if (mod < 0 && !grf_active[inst->dst.reg])
> +         grf_active[inst->dst.reg] = true;
> +   }
> +
> +   for (int i = 0; i < 3; i++) {
> +      if (inst->src[i].file != GRF)
> +         continue;
> +
> +      remaining_grf_uses[inst->src[i].reg] += mod;
> +      if (mod < 0 && !grf_active[inst->src[i].reg])
> +         grf_active[inst->src[i].reg] = true;
> +   }
> +}
> +
> +int
> +fs_instruction_scheduler::get_grf_pressure_benefit(backend_instruction *be)
> +{
> +   fs_inst *inst = (fs_inst *)be;
> +   int benefit = 0;
> +
> +   if (inst->dst.file == GRF) {
> +      if (remaining_grf_uses[inst->dst.reg] == 1)
> +         benefit += v->virtual_grf_sizes[inst->dst.reg];
> +      if (!grf_active[inst->dst.reg])
> +         benefit -= v->virtual_grf_sizes[inst->dst.reg];
> +   }
> +
> +   for (int i = 0; i < 3; i++) {
> +      if (inst->src[i].file != GRF)
> +         continue;
> +
> +      if (remaining_grf_uses[inst->src[i].reg] == 1)
> +         benefit += v->virtual_grf_sizes[inst->src[i].reg];
> +      if (!grf_active[inst->src[i].reg])
> +         benefit -= v->virtual_grf_sizes[inst->src[i].reg];
> +   }
> +
> +   return benefit;
> +}
> +
>  class vec4_instruction_scheduler : public instruction_scheduler
>  {
>  public:
> @@ -416,6 +491,9 @@ public:
>     schedule_node *choose_instruction_to_schedule();
>     int issue_time(backend_instruction *inst);
>     vec4_visitor *v;
> +
> +   void mod_remaining_grf_uses(backend_instruction *inst, int mod);
> +   int get_grf_pressure_benefit(backend_instruction *inst);
>  };
>
>  vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v,
> @@ -426,6 +504,18 @@ vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v,
>  }
>
>  void
> +vec4_instruction_scheduler::mod_remaining_grf_uses(backend_instruction *be,
> +                                                   int mod)
> +{
> +}
> +
> +int
> +vec4_instruction_scheduler::get_grf_pressure_benefit(backend_instruction *be)
> +{
> +   return 0;
> +}
> +
> +void
>  instruction_scheduler::add_inst(backend_instruction *inst)
>  {
>     schedule_node *n = new(mem_ctx) schedule_node(inst, bv->brw);
> @@ -946,23 +1036,18 @@ fs_instruction_scheduler::choose_instruction_to_schedule()
>           }
>        }
>     } else {
> +      int chosen_score = -1000000; /* Any instruction is better than nothing */
> +
>        /* Before register allocation, we don't care about the latencies of
>         * instructions.  All we care about is reducing live intervals of
>         * variables so that we can avoid register spilling, or get 16-wide
>         * shaders which naturally do a better job of hiding instruction
>         * latency.
>         *
> -       * To do so, schedule our instructions in a roughly LIFO/depth-first
> -       * order: when new instructions become available as a result of
> -       * scheduling something, choose those first so that our result
> -       * hopefully is consumed quickly.
> -       *
> -       * The exception is messages that generate more than one result
> -       * register (AKA texturing).  In those cases, the LIFO search would
> -       * normally tend to choose them quickly (because scheduling the
> -       * previous message not only unblocked the children using its result,
> -       * but also the MRF setup for the next sampler message, which in turn
> -       * unblocks the next sampler message).
> +       * If this instruction would be the last use of any GRFs, we bump up its
> +       * score since it means it should be reducing register pressure.  If
> +       * it's the first use of a GRF, reduce its score since it means it
> +       * should be increasing register pressure.
>         */
>        for (schedule_node *node = (schedule_node *)instructions.get_tail();
>             node != instructions.get_head()->prev;
> @@ -970,9 +1055,12 @@ fs_instruction_scheduler::choose_instruction_to_schedule()
>           schedule_node *n = (schedule_node *)node;
>           fs_inst *inst = (fs_inst *)n->inst;
>
> -         chosen = n;
> -         if (inst->regs_written <= 1)
> -            break;
> +         int this_score = get_grf_pressure_benefit(inst);
> +
> +         if (this_score > chosen_score) {
> +            chosen = n;
> +            chosen_score = this_score;
> +         }
>        }
>     }
>
> @@ -1036,6 +1124,7 @@ instruction_scheduler::schedule_instructions(backend_instruction *next_block_hea
>        chosen->remove();
>        next_block_header->insert_before(chosen->inst);
>        instructions_to_schedule--;
> +      mod_remaining_grf_uses(chosen->inst, -1);
>
>        /* Update the clock for how soon an instruction could start after the
>         * chosen one.
> @@ -1105,6 +1194,14 @@ instruction_scheduler::run(exec_list *all_instructions)
>        bv->dump_instructions();
>     }
>
> +   /* Populate the remaining GRF uses array to improve the pre-regalloc
> +    * scheduling.
> +    */
> +   if (remaining_grf_uses) {
> +      foreach_list(node, &instructions)
> +         mod_remaining_grf_uses((backend_instruction *)node, 1);
> +   }
> +
>     while (!next_block_header->is_tail_sentinel()) {
>        /* Add things to be scheduled until we get to a new BB. */
>        while (!next_block_header->is_tail_sentinel()) {
> --
> 1.8.4.rc3
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev



-- 
olv at LunarG.com

