[Mesa-dev] [PATCH] i965: Ask the register allocator to round-robin through registers.

Wed Apr 3 23:38:36 PDT 2013

On 04/03/2013 11:59 AM, Matt Turner wrote:
> On Wed, Apr 3, 2013 at 10:25 AM, Eric Anholt <eric at anholt.net> wrote:
>> The way we were allocating registers before, packing into low register
>> numbers for Ironlake, resulted in an overly-constrained dependency graph
>> for instruction scheduling.  Improves GLBenchmark 2.1 performance by
>> 3.4% +/- 0.6% (n=26)
>> ---
>>   src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp |    2 ++
>>   src/mesa/program/register_allocate.c              |   31 +++++++++++++++++++--
>>   src/mesa/program/register_allocate.h              |    1 +
>>   3 files changed, 31 insertions(+), 3 deletions(-)
>>
>> diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
>> index 4ee7bbc..b9b0303 100644
>> --- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
>> @@ -108,6 +108,8 @@ brw_alloc_reg_set(struct brw_context *brw, int reg_width)
>>
>>      uint8_t *ra_reg_to_grf = ralloc_array(brw, uint8_t, ra_reg_count);
>>      struct ra_regs *regs = ra_alloc_reg_set(brw, ra_reg_count);
>> +   if (intel->gen >= 6)
>> +      ra_set_allocate_round_robin(regs);
>>      int *classes = ralloc_array(brw, int, class_count);
>>      int aligned_pairs_class = -1;
>>
>> diff --git a/src/mesa/program/register_allocate.c b/src/mesa/program/register_allocate.c
>> index a9064c3..5f45662 100644
>> --- a/src/mesa/program/register_allocate.c
>> +++ b/src/mesa/program/register_allocate.c
>> @@ -70,6 +70,7 @@
>>    * this during ra_set_finalize().
>>    */
>>
>> +#include <stdbool.h>
>>   #include <ralloc.h>
>>
>>   #include "main/imports.h"
>> @@ -93,6 +94,8 @@ struct ra_regs {
>>
>>      struct ra_class **classes;
>>      unsigned int class_count;
>> +
>> +   bool round_robin;
>>   };
>>
>>   struct ra_class {
>> @@ -185,6 +188,22 @@ ra_alloc_reg_set(void *mem_ctx, unsigned int count)
>>      return regs;
>>   }
>>
>> +/**
>> + * The register allocator by default prefers to allocate low register numbers,
>> + * since it was written for hardware (gen4/5 Intel) that is limited in its
>> + * multithreadedness by the number of registers used in a given shader.
>> + *
>> + * However, for hardware without that restriction, densely packed register
>> + * allocation can put serious constraints on instruction scheduling.  This
>> + * function tells the allocator to rotate around the registers if possible as
>> + * it allocates the nodes.
>> + */
>> +void
>> +ra_set_allocate_round_robin(struct ra_regs *regs)
>> +{
>> +   regs->round_robin = true;
>> +}
>> +
>>   static void
>>   ra_add_conflict_list(struct ra_regs *regs, unsigned int r1, unsigned int r2)
>>   {
>> @@ -436,16 +455,19 @@ GLboolean
>>   ra_select(struct ra_graph *g)
>>   {
>>      int i;
>> +   int start_search_reg = 0;
>>
>>      while (g->stack_count != 0) {
>> -      unsigned int r;
>> +      unsigned int ri;
>> +      unsigned int r = -1;
>>         int n = g->stack[g->stack_count - 1];
>>         struct ra_class *c = g->regs->classes[g->nodes[n].class];
>>
>>         /* Find the lowest-numbered reg which is not used by a member
>>          * of the graph adjacent to us.
>>          */
>> -      for (r = 0; r < g->regs->count; r++) {
>> +      for (ri = 0; ri < g->regs->count; ri++) {
>> +         r = (start_search_reg + ri) % g->regs->count;
>>           if (!c->regs[r])
>>              continue;
>>
>> @@ -461,12 +483,15 @@ ra_select(struct ra_graph *g)
>>           if (i == g->nodes[n].adjacency_count)
>>              break;
>>         }
>> -      if (r == g->regs->count)
>> +      if (ri == g->regs->count)
>>           return GL_FALSE;
>>
>>         g->nodes[n].reg = r;
>>         g->nodes[n].in_stack = GL_FALSE;
>>         g->stack_count--;
>> +
>> +      if (g->regs->round_robin)
>> +         start_search_reg = r;
>>      }
>
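> With the s/= r/= r + 1/ change mentioned on IRC (i.e. "start_search_reg = r + 1",
> so the next search begins just past the register that was just assigned) to make
> this work for 8-wide too, it gets my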
>
> Reviewed-by: Matt Turner <mattst88 at gmail.com>

With that fixed,
Reviewed-by: Kenneth Graunke <kenneth at whitecape.org>

3.4% is pretty awesome...nice work!