[Mesa-dev] [PATCH] i965/fs: Reduce the interference between payload regs and virtual GRFs.

Tue Oct 16 13:39:10 PDT 2012

On 10/15/2012 04:06 PM, Eric Anholt wrote:
> Improves performance of the Lightsmark penumbra shadows scene by 15.7% +/-
> 1.0% (n=15), by eliminating register spilling. (tested by smashing the list of
> scenes to have all other scenes have 0 duration -- includes additional
> rendering of scene description text that normally doesn't appear in that
> scene)
> ---
>   src/mesa/drivers/dri/i965/brw_fs.h                |    2 +
>   src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp |  164 ++++++++++++++++++---
>   2 files changed, 147 insertions(+), 19 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
> index a71783c..ad717c9 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.h
> +++ b/src/mesa/drivers/dri/i965/brw_fs.h
> @@ -235,6 +235,8 @@ public:
>      void assign_urb_setup();
>      bool assign_regs();
>      void assign_regs_trivial();
> +   void setup_payload_interference(struct ra_graph *g, int payload_reg_count,
> +                                   int first_payload_node);
>      int choose_spill_reg(struct ra_graph *g);
>      void spill_reg(int spill_reg);
>      void split_virtual_grfs();
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
> index 7b778d6..bd9789f 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
> @@ -163,27 +163,154 @@ brw_alloc_reg_set(struct brw_context *brw, int reg_width, int base_reg_count)
>   /**
>    * Sets up interference between thread payload registers and the virtual GRFs
>    * to be allocated for program temporaries.
> + *
> + * We want to be able to reallocate the payload for our virtual GRFs, notably
> + * because the setup coefficients for a full set of 16 FS inputs takes up 8 of
> + * our 128 registers.
> + *
> + * The layout of the payload registers is:
> + *
> + * 0..nr_payload_regs-1: fixed function setup (including bary coordinates).
> + * nr_payload_regs..nr_payload_regs+curb_read_lengh-1: uniform data
> + * nr_payload_regs+curb_read_lengh..first_non_payload_grf-1: setup coefficients.
> + *
> + * And we have payload_node_count nodes covering these registers in order
> + * (note that in 16-wide, a node is two registers).
>    */
> -static void
> -brw_setup_payload_interference(struct ra_graph *g,
> -                               int payload_reg_count,
> -                               int first_payload_node,
> -                               int reg_node_count)
> +void
> +fs_visitor::setup_payload_interference(struct ra_graph *g,
> +                                       int payload_node_count,
> +                                       int first_payload_node)
>   {
> -   for (int i = 0; i < payload_reg_count; i++) {
> -      /* Mark each payload reg node as being allocated to its physical register.
> +   int reg_width = c->dispatch_width / 8;
> +   int loop_depth = 0;
> +   int loop_end_ip = 0;
> +
> +   int payload_last_use_ip[payload_node_count];
> +   memset(payload_last_use_ip, 0, sizeof(payload_last_use_ip));
> +   int ip = 0;
> +   foreach_list(node, &this->instructions) {
> +      fs_inst *inst = (fs_inst *)node;
> +
> +      switch (inst->opcode) {
> +      case BRW_OPCODE_DO:
> +         loop_depth++;
> +
> +         /* Since payload regs are deffed only at the start of the shader
> +          * execution, any uses of the payload within a loop mean the live
> +          * interval extends to the end of the outermost loop.  Find the ip of
> +          * the end now.
> +          */
> +         if (loop_depth == 1) {
> +            int scan_depth = loop_depth;
> +            int scan_ip = ip;
> +            for (fs_inst *scan_inst = (fs_inst *)inst->next;
> +                 scan_depth > 0;
> +                 scan_inst = (fs_inst *)scan_inst->next) {
> +               switch (scan_inst->opcode) {
> +               case BRW_OPCODE_DO:
> +                  scan_depth++;
> +                  break;
> +               case BRW_OPCODE_WHILE:
> +                  scan_depth--;
> +                  break;
> +               default:
> +                  break;
> +               }
> +               scan_ip++;
> +            }
> +            loop_end_ip = scan_ip;
> +         }
> +         break;
> +      case BRW_OPCODE_WHILE:
> +         loop_depth--;
> +         break;
> +      default:
> +         break;
> +      }

Wow, it's unfortunate that you have to do this.  Essentially, for each 
instruction in a loop, you walk through all the instructions, to the end 
of the loop.  That's big O(fail). :(

If we had the jump offsets, we wouldn't have to do this.  But we don't 
until later.  Alternatively...you could probably structure the code like 
this:

1. See if the instruction uses this reg.
2. If it's a loop, fast-forward inst to the WHILE
    (doing both inst = inst->next and ip++)
3. Set payload_last_use_ip.
4. Proceed with the rest of your code.

Otherwise, this looks good.

> +
> +      int use_ip;
> +      if (loop_depth > 0)
> +         use_ip = loop_end_ip;
> +      else
> +         use_ip = ip;
> +
> +      /* Note that UNIFORM args have been turned into FIXED_HW_REG by
> +       * assign_curbe_setup(), and interpolation uses fixed hardware regs from
> +       * the start (see interp_reg()).
> +       */
> +      for (int i = 0; i < 3; i++) {
> +         if (inst->src[i].file == FIXED_HW_REG &&
> +             inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
> +            int node_nr = inst->src[i].fixed_hw_reg.nr / reg_width;
> +            if (node_nr >= payload_node_count)
> +               continue;
> +
> +            payload_last_use_ip[node_nr] = use_ip;
> +         }
> +      }
> +
> +      /* Special case instructions which have extra implied registers used. */
> +      switch (inst->opcode) {
> +      case FS_OPCODE_FB_WRITE:
> +         /* We could omit this for the !inst->header_present case, except that
> +          * the simulator apparently incorrectly reads from g0/g1 instead of
> +          * sideband.  It also really freaks out driver developers to see g0
> +          * used in unusual places, so just always reserve it.
> +          */
> +         payload_last_use_ip[0 / reg_width] = use_ip;
> +         payload_last_use_ip[1 / reg_width] = use_ip;
> +         break;
> +      case FS_OPCODE_DISCARD:
> +         payload_last_use_ip[1 / reg_width] = use_ip;
> +         break;
> +
> +      case FS_OPCODE_LINTERP:
> +         /* On gen6+ in 16-wide, there are 4 adjacent registers (so 2 nodes)
> +          * used by PLN's sourcing of the deltas, while we list only the first
> +          * two in the arguments (1 node).  Pre-gen6, the deltas are computed
> +          * in normal VGRFs.
> +          */
> +         if (intel->gen >= 6) {
> +            int delta_x_arg = 0;
> +            if (inst->src[delta_x_arg].file == FIXED_HW_REG &&
> +                inst->src[delta_x_arg].fixed_hw_reg.file ==
> +                BRW_GENERAL_REGISTER_FILE) {
> +               int sechalf_node = (inst->src[delta_x_arg].fixed_hw_reg.nr /
> +                                   reg_width) + 1;
> +               assert(sechalf_node < payload_node_count);
> +               payload_last_use_ip[sechalf_node] = use_ip;
> +            }
> +         }
> +         break;
> +
> +      default:
> +         break;
> +      }
> +
> +      ip++;
> +   }
> +
> +   for (int i = 0; i < payload_node_count; i++) {
> +      /* Mark the payload node as interfering with any virtual grf that is
> +       * live between the start of the program and our last use of the payload
> +       * node.
> +       */
> +      for (int j = 0; j < this->virtual_grf_count; j++) {
> +         if (this->virtual_grf_def[j] <= payload_last_use_ip[i] ||
> +             this->virtual_grf_use[j] <= payload_last_use_ip[i]) {
> +            ra_add_node_interference(g, first_payload_node + i, j);
> +         }
> +      }
> +   }
> +
> +   for (int i = 0; i < payload_node_count; i++) {
> +      /* Mark each payload node as being allocated to its physical register.
>          *
>          * The alternative would be to have per-physical-register classes, which
>          * would just be silly.
>          */
>         ra_set_node_reg(g, first_payload_node + i, i);
> -
> -      /* For now, just mark each payload node as interfering with every other
> -       * node to be allocated.
> -       */
> -      for (int j = 0; j < reg_node_count; j++) {
> -         ra_add_node_interference(g, first_payload_node + i, j);
> -      }
>      }
>   }
>
> @@ -198,7 +325,7 @@ fs_visitor::assign_regs()
>       */
>      int reg_width = c->dispatch_width / 8;
>      int hw_reg_mapping[this->virtual_grf_count];
> -   int payload_reg_count = (ALIGN(this->first_non_payload_grf, reg_width) /
> +   int payload_node_count = (ALIGN(this->first_non_payload_grf, reg_width) /
>                               reg_width);
>      int base_reg_count = max_grf / reg_width;
>
> @@ -208,7 +335,7 @@ fs_visitor::assign_regs()
>
>      int node_count = this->virtual_grf_count;
>      int first_payload_node = node_count;
> -   node_count += payload_reg_count;
> +   node_count += payload_node_count;
>      struct ra_graph *g = ra_alloc_interference_graph(brw->wm.regs, node_count);
>
>      for (int i = 0; i < this->virtual_grf_count; i++) {
> @@ -240,8 +367,7 @@ fs_visitor::assign_regs()
>         }
>      }
>
> -   brw_setup_payload_interference(g, payload_reg_count, first_payload_node,
> -                                  this->virtual_grf_count);
> +   setup_payload_interference(g, payload_node_count, first_payload_node);
>
>      if (!ra_allocate_no_spills(g)) {
>         /* Failed to allocate registers.  Spill a reg, and the caller will
> @@ -268,7 +394,7 @@ fs_visitor::assign_regs()
>       * regs in the register classes back down to real hardware reg
>       * numbers.
>       */
> -   this->grf_used = payload_reg_count * reg_width;
> +   this->grf_used = payload_node_count * reg_width;
>      for (int i = 0; i < this->virtual_grf_count; i++) {
>         int reg = ra_get_node_reg(g, i);
>
>