[Mesa-dev] [PATCH v2 2/2] intel/fs: Improve liveness range calculation for partial writes

Francisco Jerez currojerez at riseup.net
Tue Aug 21 20:58:48 UTC 2018


Jose Maria Casanova Crespo <jmcasanova at igalia.com> writes:

> We use the information of the registers read/write patterns
> to improve variable liveness analysis avoiding extending the
> liveness range of a variable to the beginning of the block so
> it always reaches the beginning of the shader.
>
> This optimization analyses inside each block if a partial write
> defines completely the bytes used by a following instruction
> in the block. So we are not in the case of the use of an undefined
> value in the block.
>
> This avoids almost all the spilling that happens with 8bit/16bit
> storage tests, without any compilation performance impact for shader-db
> execution that is compensated by spilling reductions.
>
> At this moment we don't extend the logic to intra-block calculations
> of livein/liveout to not hurt performance on the general case because of
> not taking advance of BITWORD operations.
>
> The execution time for running dEQP-VK.*8bit_storage.* tests is reduced
> from 7m27.966s to 13.015s.
>
> shader-bd on SKL shows improvements reducing spilling on
> deus-ex-mankind-divided and dophin without increasing execution time.
>
> total instructions in shared programs: 14867218 -> 14863959 (-0.02%)
> instructions in affected programs: 121570 -> 118311 (-2.68%)
> helped: 38
> HURT: 0
>
> total cycles in shared programs: 537923248 -> 537720965 (-0.04%)
> cycles in affected programs: 63154229 -> 62951946 (-0.32%)
> helped: 61
> HURT: 26
>
> total loops in shared programs: 4828 -> 4828 (0.00%)
> loops in affected programs: 0 -> 0
> helped: 0
> HURT: 0
>
> total spills in shared programs: 7790 -> 7375 (-5.33%)
> spills in affected programs: 2824 -> 2409 (-14.70%)
> helped: 35
> HURT: 0
>
> total fills in shared programs: 10557 -> 10024 (-5.05%)
> fills in affected programs: 3752 -> 3219 (-14.21%)
> helped: 38
> HURT: 0
>
> v2: - Use functions dst_write_pattern and src_read_pattern
>       introduced in previous patch at v2.
>     - Avoid calculating read_pattern if defpartial is 0
>
> Cc: Francisco Jerez <currojerez at riseup.net>
> ---
>  src/intel/compiler/brw_fs_live_variables.cpp | 61 ++++++++++++--------
>  src/intel/compiler/brw_fs_live_variables.h   | 13 ++++-
>  2 files changed, 47 insertions(+), 27 deletions(-)
>
> diff --git a/src/intel/compiler/brw_fs_live_variables.cpp b/src/intel/compiler/brw_fs_live_variables.cpp
> index 059f076fa51..d3559e3114f 100644
> --- a/src/intel/compiler/brw_fs_live_variables.cpp
> +++ b/src/intel/compiler/brw_fs_live_variables.cpp
> @@ -54,9 +54,9 @@ using namespace brw;
>  
>  void
>  fs_live_variables::setup_one_read(struct block_data *bd, fs_inst *inst,
> -                                  int ip, const fs_reg &reg)
> +                                  int ip, int src, int reg_offset)
>  {
> -   int var = var_from_reg(reg);
> +   int var = var_from_reg(inst->src[src]) + reg_offset;
>     assert(var < num_vars);
>  
>     start[var] = MIN2(start[var], ip);
> @@ -64,31 +64,48 @@ fs_live_variables::setup_one_read(struct block_data *bd, fs_inst *inst,
>  
>     /* The use[] bitset marks when the block makes use of a variable (VGRF
>      * channel) without having completely defined that variable within the
> -    * block.
> +    * block. We take into account that a partial write could have defined
> +    * completely the read bytes in the block.
>      */
> -   if (!BITSET_TEST(bd->def, var))
> -      BITSET_SET(bd->use, var);
> +   if (!BITSET_TEST(bd->def, var)) {
> +      if (!bd->defpartial[var]) {
> +         BITSET_SET(bd->use, var);

Is there any measurable benefit from not executing the functionally
equivalent else block in this case?

> +      } else {
> +         unsigned read_pattern = inst->src_read_pattern(src, reg_offset);
> +         if ((bd->defpartial[var] & read_pattern) != read_pattern)
> +            BITSET_SET(bd->use, var);
> +      }
> +   }
>  }
>  
>  void
>  fs_live_variables::setup_one_write(struct block_data *bd, fs_inst *inst,
> -                                   int ip, const fs_reg &reg)
> +                                   int ip, int reg_offset)
>  {
> -   int var = var_from_reg(reg);
> +   int var = var_from_reg(inst->dst) + reg_offset;
>     assert(var < num_vars);
>  
>     start[var] = MIN2(start[var], ip);
>     end[var] = MAX2(end[var], ip);
>  
>     /* The def[] bitset marks when an initialization in a block completely
> -    * screens off previous updates of that variable (VGRF channel).
> +    * screens off previous updates of that variable (VGRF channel). If
> +    * we have a partial write now we store the write pattern so next
> +    * reads in the block can check if what they read was completelly screened

"completely"

> +    * of by this partial write.

"off"

>      */
> -   if (inst->dst.file == VGRF) {
> -      if (!inst->is_partial_write() && !BITSET_TEST(bd->use, var))
> +   assert(inst->dst.file == VGRF);
> +   if(!BITSET_TEST(bd->use, var)) {
> +      if (!inst->is_partial_write()) {
>           BITSET_SET(bd->def, var);
> -
> -      BITSET_SET(bd->defout, var);
> +         bd->defpartial[var] = ~0u;

Same question as above.  Is there any measurable benefit from not
executing the functionally equivalent else block here when the if
condition evaluates to true?

> +      } else {
> +         bd->defpartial[var] |= inst->dst_write_pattern(reg_offset);
> +         if (bd->defpartial[var] == ~0u)
> +            BITSET_SET(bd->def, var);
> +      }
>     }
> +   BITSET_SET(bd->defout, var);
>  }
>  
>  /**
> @@ -115,14 +132,9 @@ fs_live_variables::setup_def_use()
>        foreach_inst_in_block(fs_inst, inst, block) {
>  	 /* Set use[] for this instruction */
>  	 for (unsigned int i = 0; i < inst->sources; i++) {
> -            fs_reg reg = inst->src[i];
> -
> -            if (reg.file != VGRF)
> -               continue;
> -
> -            for (unsigned j = 0; j < regs_read(inst, i); j++) {
> -               setup_one_read(bd, inst, ip, reg);
> -               reg.offset += REG_SIZE;
> +            if (inst->src[i].file == VGRF) {
> +               for (unsigned j = 0; j < regs_read(inst, i); j++)
> +                  setup_one_read(bd, inst, ip, i, j);
>              }
>  	 }
>  
> @@ -130,11 +142,8 @@ fs_live_variables::setup_def_use()
>  
>           /* Set def[] for this instruction */
>           if (inst->dst.file == VGRF) {
> -            fs_reg reg = inst->dst;
> -            for (unsigned j = 0; j < regs_written(inst); j++) {
> -               setup_one_write(bd, inst, ip, reg);
> -               reg.offset += REG_SIZE;
> -            }
> +            for (unsigned j = 0; j < regs_written(inst); j++)
> +               setup_one_write(bd, inst, ip, j);
>  	 }
>  
>           if (!inst->predicate && inst->exec_size >= 8)
> @@ -281,6 +290,7 @@ fs_live_variables::fs_live_variables(fs_visitor *v, const cfg_t *cfg)
>     bitset_words = BITSET_WORDS(num_vars);
>     for (int i = 0; i < cfg->num_blocks; i++) {
>        block_data[i].def = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
> +      block_data[i].defpartial = rzalloc_array(mem_ctx, unsigned, num_vars);
>        block_data[i].use = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
>        block_data[i].livein = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
>        block_data[i].liveout = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
> @@ -319,6 +329,7 @@ fs_visitor::invalidate_live_intervals()
>  void
>  fs_visitor::calculate_live_intervals()
>  {
> +

Stray whitespace.

>     if (this->live_intervals)
>        return;
>  
> diff --git a/src/intel/compiler/brw_fs_live_variables.h b/src/intel/compiler/brw_fs_live_variables.h
> index 9e95e443170..5842b24f3e5 100644
> --- a/src/intel/compiler/brw_fs_live_variables.h
> +++ b/src/intel/compiler/brw_fs_live_variables.h
> @@ -44,6 +44,15 @@ struct block_data {
>      */
>     BITSET_WORD *def;
>  
> +   /**
> +    * Which bytes of a variable are defined before being used in the block.
> +    * Each bit of the 32-bit integer represents one of the register 32 bytes.
> +    *
> +    * As above, "defined" means unconditionally defined but at byte level.
> +    */
> +
> +   unsigned *defpartial;
> +
>     /**
>      * Which variables are used before being defined in the block.
>      */
> @@ -115,9 +124,9 @@ public:
>  protected:
>     void setup_def_use();
>     void setup_one_read(struct block_data *bd, fs_inst *inst, int ip,
> -                       const fs_reg &reg);
> +                       int src, int reg_offset);
>     void setup_one_write(struct block_data *bd, fs_inst *inst, int ip,
> -                        const fs_reg &reg);
> +                        int reg_offset);
>     void compute_live_variables();
>     void compute_start_end();
>  
> -- 
> 2.17.1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 227 bytes
Desc: not available
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20180821/d77db42f/attachment.sig>


More information about the mesa-dev mailing list