[Mesa-dev] [PATCH] i965/vec4: Reduce working set size of live variables computation.

Paul Berry stereotype441 at gmail.com
Sun Oct 27 19:02:19 CET 2013


On 21 October 2013 11:20, Eric Anholt <eric at anholt.net> wrote:

> Orbital Explorer was generating a 4000 instruction geometry shader, which
> was taking 275 trips through dead code elimination and register
> coalescing, each of which updated live variables to get its work done, and
> invalidated those live variables afterwards.
>
> By using bitfields instead of bools (reducing the working set size by a
> factor of 8) in live variables analysis, it drops from 88% of the profile
> to 57%, and reduces overall runtime from I-got-bored-and-killed-it (Paul
> says 3+ minutes) to 10.5 seconds.
>
> Compare to f179f419d1d0a03fad36c2b0a58e8b853bae6118 on the FS side.
> ---
>  .../drivers/dri/i965/brw_vec4_live_variables.cpp   | 41
> ++++++++++++----------
>  .../drivers/dri/i965/brw_vec4_live_variables.h     | 10 +++---
>  2 files changed, 28 insertions(+), 23 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp
> b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp
> index db3787b..f6675c8 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp
> @@ -83,8 +83,8 @@ vec4_live_variables::setup_def_use()
>
>                 for (int j = 0; j < 4; j++) {
>                    int c = BRW_GET_SWZ(inst->src[i].swizzle, j);
> -                  if (!bd[b].def[reg * 4 + c])
> -                     bd[b].use[reg * 4 + c] = true;
> +                  if (!BITSET_TEST(bd[b].def, reg * 4 + c))
> +                     BITSET_SET(bd[b].use, reg * 4 + c);
>                 }
>             }
>          }
> @@ -99,8 +99,8 @@ vec4_live_variables::setup_def_use()
>              for (int c = 0; c < 4; c++) {
>                 if (inst->dst.writemask & (1 << c)) {
>                    int reg = inst->dst.reg;
> -                  if (!bd[b].use[reg * 4 + c])
> -                     bd[b].def[reg * 4 + c] = true;
> +                  if (!BITSET_TEST(bd[b].use, reg * 4 + c))
> +                     BITSET_SET(bd[b].def, reg * 4 + c);
>                 }
>              }
>           }
> @@ -126,12 +126,12 @@ vec4_live_variables::compute_live_variables()
>
>        for (int b = 0; b < cfg->num_blocks; b++) {
>          /* Update livein */
> -        for (int i = 0; i < num_vars; i++) {
> -           if (bd[b].use[i] || (bd[b].liveout[i] && !bd[b].def[i])) {
> -              if (!bd[b].livein[i]) {
> -                 bd[b].livein[i] = true;
> -                 cont = true;
> -              }
> +        for (int i = 0; i < bitset_words; i++) {
> +            BITSET_WORD new_livein = (bd[b].use[i] |
> +                                      (bd[b].liveout[i] & ~bd[b].def[i]));
> +            if (new_livein & ~bd[b].livein[i]) {
> +               bd[b].livein[i] |= new_livein;
> +               cont = true;
>

Personally, I think this would be slightly clearer as:

BITSET_WORD new_livein = bd[b].livein[i] | bd[b].use[i] | (bd[b].liveout[i]
& ~bd[b].def[i]);
if (new_livein != bd[b].livein[i]) {
   bd[b].livein[i] = new_livein;
   cont = true;
}


>             }
>          }
>
> @@ -140,9 +140,11 @@ vec4_live_variables::compute_live_variables()
>             bblock_link *link = (bblock_link *)block_node;
>             bblock_t *block = link->block;
>
> -           for (int i = 0; i < num_vars; i++) {
> -              if (bd[block->block_num].livein[i] && !bd[b].liveout[i]) {
> -                 bd[b].liveout[i] = true;
> +           for (int i = 0; i < bitset_words; i++) {
> +               BITSET_WORD new_liveout = (bd[block->block_num].livein[i] &
> +                                          ~bd[b].liveout[i]);
> +               if (new_liveout) {
> +                  bd[b].liveout[i] |= new_liveout;
>                   cont = true;
>                }
>

Similarly, here I think it would be clearer to do:

BITSET_WORD new_liveout = bd[block->block_num].livein[i];
if (new_liveout != bd[b].liveout[i]) {
   bd[b].liveout[i] |= new_liveout;
   cont = true;
}

Either way, the patch is:

Reviewed-by: Paul Berry <stereotype441 at gmail.com>


>             }
> @@ -159,11 +161,12 @@
> vec4_live_variables::vec4_live_variables(vec4_visitor *v, cfg_t *cfg)
>     num_vars = v->virtual_grf_count * 4;
>     bd = rzalloc_array(mem_ctx, struct block_data, cfg->num_blocks);
>
> +   bitset_words = BITSET_WORDS(num_vars);
>     for (int i = 0; i < cfg->num_blocks; i++) {
> -      bd[i].def = rzalloc_array(mem_ctx, bool, num_vars);
> -      bd[i].use = rzalloc_array(mem_ctx, bool, num_vars);
> -      bd[i].livein = rzalloc_array(mem_ctx, bool, num_vars);
> -      bd[i].liveout = rzalloc_array(mem_ctx, bool, num_vars);
> +      bd[i].def = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
> +      bd[i].use = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
> +      bd[i].livein = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
> +      bd[i].liveout = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
>     }
>
>     setup_def_use();
> @@ -248,12 +251,12 @@ vec4_visitor::calculate_live_intervals()
>
>     for (int b = 0; b < cfg.num_blocks; b++) {
>        for (int i = 0; i < livevars.num_vars; i++) {
> -        if (livevars.bd[b].livein[i]) {
> +        if (BITSET_TEST(livevars.bd[b].livein, i)) {
>             start[i / 4] = MIN2(start[i / 4], cfg.blocks[b]->start_ip);
>             end[i / 4] = MAX2(end[i / 4], cfg.blocks[b]->start_ip);
>          }
>
> -        if (livevars.bd[b].liveout[i]) {
> +        if (BITSET_TEST(livevars.bd[b].liveout, i)) {
>             start[i / 4] = MIN2(start[i / 4], cfg.blocks[b]->end_ip);
>             end[i / 4] = MAX2(end[i / 4], cfg.blocks[b]->end_ip);
>          }
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.h
> b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.h
> index 296468a..b2d8b33 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.h
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.h
> @@ -25,6 +25,7 @@
>   *
>   */
>
> +#include "main/bitset.h"
>  #include "brw_vec4.h"
>
>  namespace brw {
> @@ -36,18 +37,18 @@ struct block_data {
>      * Note that for our purposes, "defined" means unconditionally,
> completely
>      * defined.
>      */
> -   bool *def;
> +   BITSET_WORD *def;
>
>     /**
>      * Which variables are used before being defined in the block.
>      */
> -   bool *use;
> +   BITSET_WORD *use;
>
>     /** Which defs reach the entry point of the block. */
> -   bool *livein;
> +   BITSET_WORD *livein;
>
>     /** Which defs reach the exit point of the block. */
> -   bool *liveout;
> +   BITSET_WORD *liveout;
>  };
>
>  class vec4_live_variables {
> @@ -65,6 +66,7 @@ public:
>     void *mem_ctx;
>
>     int num_vars;
> +   int bitset_words;
>
>     /** Per-basic-block information on live variables */
>     struct block_data *bd;
> --
> 1.8.4.rc3
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.freedesktop.org/archives/mesa-dev/attachments/20131027/e93a8926/attachment-0001.html>


More information about the mesa-dev mailing list