[Mesa-dev] [PATCH 6/9] nir: Make dead_write_vars pass global

Jason Ekstrand jason at jlekstrand.net
Sat Aug 25 15:04:54 UTC 2018


On Sat, Aug 25, 2018 at 9:40 AM Jason Ekstrand <jason at jlekstrand.net> wrote:

> Sorry I haven't given you any in-line review yet.  I've been letting this
> pass simmer in my brain a bit and thinking about complexity and data
> structures.  Here are a few questions for which I do not expect you to have
> actual answers:
>
>  1) How many unique derefs should we expect to see in a shader?  Dozens?
> Hundreds?  Thousands?  My guess is that, after vars_to_ssa has picked off
> all the easy ones, it should be on the order of a couple hundred at most
> but I don't actually know what a reasonable upper limit to assume is.
>
>  2) Is every deref going to be eventually compared to every other deref or
> are we going to have separate little groups of derefs?
>
> I'm asking these questions because I have some concern about how expensive
> this pass (and copy prop) is going to be.  If you have a shader with a huge
> number of loops each of which has a very self-contained set of derefs, the
> current pass as-is probably has a run time that's around O(n) in the
> number of instructions.  However, in the case where most derefs are in play
> all the time, I suspect this pass as-is looks more like O(n^3) in the
> number of derefs or worse.
>
> If the total number of unique derefs is reasonable AND the graph of deref
> comparisons isn't too sparse, then I wonder if it wouldn't be better to
> have some intermediate data structure like this:
>
> struct deref_compare_map {
>    void *mem_ctx;
>
>    struct hash_table *id_map;
>
>    nir_deref_path *paths;
>    BITSET_WORD **alias;
>    BITSET_WORD **contains;
>    BITSET_WORD **contained_in;
> };
>
> void deref_compare_map_init(struct deref_compare_map *map, void *mem_ctx)
> {
>    memset(map, 0, sizeof(*map));
>    map->mem_ctx = ralloc_context(mem_ctx);
>
>    /* I'm pretty sure we have deref hashing and equality functions
> somewhere already, we just have to pull them out and use them.  It is,
> however, possible that they were in nir_instr_set.c and got deleted as part
> of the move to deref instructions. */
>    map->id_map = _mesa_hash_table_create(map->mem_ctx, deref_hash, deref_equal);
> }
>
> void deref_compare_map_finish(struct deref_compare_map *map)
> {
>    ralloc_free(map->mem_ctx);
> }
>
> uint32_t deref_compare_map_get_id(struct deref_compare_map *map,
>                                   nir_deref_instr *deref)
> {
>    struct hash_entry *entry = _mesa_hash_table_search(map->id_map, deref);
>    if (entry)
>       return (uint32_t)(uintptr_t)entry->data;
>
>    /* We can't be adding anything after we've baked it */
>    assert(map->alias == NULL);
>
>    uint32_t id = map->id_map->entries;
>    _mesa_hash_table_insert(map->id_map, deref, (void *)(uintptr_t)id);
>    assert(map->id_map->entries == id + 1);
>    return id;
> }
>
> static BITSET_WORD **
> alloc_set_of_sets(void *mem_ctx, unsigned entries)
> {
>    BITSET_WORD **sets = ralloc_array(mem_ctx, BITSET_WORD *, entries);
>    BITSET_WORD *data = rzalloc_array(mem_ctx, BITSET_WORD,
>                                      entries * BITSET_WORDS(entries));
>    for (unsigned i = 0; i < entries; i++)
>       sets[i] = data + i * BITSET_WORDS(entries);
>    return sets;
> }
>
> void deref_compare_map_bake(struct deref_compare_map *map)
> {
>    const unsigned num_derefs = map->id_map->entries;
>
>    map->paths = ralloc_array(map->mem_ctx, nir_deref_path, num_derefs);
>    struct hash_entry *entry;
>    hash_table_foreach(map->id_map, entry) {
>       uint32_t id = (uintptr_t)entry->data;
>       nir_deref_path_init(&map->paths[id], (void *)entry->key, map->mem_ctx);
>    }
>
>    map->alias = alloc_set_of_sets(map->mem_ctx, num_derefs);
>    map->contains = alloc_set_of_sets(map->mem_ctx, num_derefs);
>    map->contained_in = alloc_set_of_sets(map->mem_ctx, num_derefs);
>
>    for (unsigned a = 0; a < num_derefs; a++) {
>       /* Handle comparing a with itself */
>       BITSET_SET(map->alias[a], a);
>       BITSET_SET(map->contains[a], a);
>       BITSET_SET(map->contained_in[a], a);
>
>       for (unsigned b = 0; b < a; b++) {
>          nir_deref_compare_result comp =
>             nir_compare_deref_paths(&map->paths[a], &map->paths[b]);
>          if (comp & nir_derefs_may_alias_bit) {
>             BITSET_SET(map->alias[a], b);
>             BITSET_SET(map->alias[b], a);
>          }
>
>          if (comp & nir_derefs_a_contains_b_bit) {
>             BITSET_SET(map->contains[a], b);
>             BITSET_SET(map->contained_in[b], a);
>          }
>
>          if (comp & nir_derefs_b_contains_a_bit) {
>             BITSET_SET(map->contains[b], a);
>             BITSET_SET(map->contained_in[a], b);
>          }
>       }
>    }
> }
>
> The idea would be to make it a 5-step pass (rough driver sketch below):
>
>  1) Find all derefs
>  2) Bake the deref compare map
>  3) Local elimination and building kill etc. sets
>  4) Data-flow analysis
>  5) Global elimination
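>
> Something like this, very roughly (the deref_compare_map functions are the
> hypothetical API sketched above; the NIR iterators are real):
>
> static void
> build_compare_map(struct deref_compare_map *map, nir_function_impl *impl)
> {
>    /* Step 1: find all derefs and hand out dense IDs. */
>    nir_foreach_block(block, impl) {
>       nir_foreach_instr(instr, block) {
>          if (instr->type == nir_instr_type_deref)
>             deref_compare_map_get_id(map, nir_instr_as_deref(instr));
>       }
>    }
>
>    /* Step 2: build the paths and do all pairwise comparisons exactly once. */
>    deref_compare_map_bake(map);
> }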
>
> Doing that would give us some better run-time guarantees:
>
>  1) Set operations on bitsets are O(n) in the number of derefs (really
> DIV_ROUND_UP(n, 32) words, thanks to bitsets)
>  2) We call nir_deref_path_init exactly num_derefs times for the entire
> pass
>  3) We call nir_compare_deref_paths exactly (num_derefs * (num_derefs -
> 1)) / 2 times for the entire pass.  For instance, with 256 unique derefs
> that's 32,640 comparisons, done once, and each bitset row is just 8 words.
>
> As it is, deref_map_add_map does O(n^2) comparisons and we do that operation
> inside our fixed-point loop which has an unknown number of iterations.
> This data structure could easily be put in nir_derefs.c and shared between
> this pass and copy-prop.
>
> Just a thought.  Of course, you've spent far more time playing with this
> than I have so I may be completely off-base.  Before you go off and
> re-write it, let's have a little e-mail discussion to decide if we think
> the rewrite is worth it.  :-)
>

After giving it more thought, I realized I hadn't done anything with
components like you did.  While local variables don't currently support
write-masks, that should probably change in the not-so-distant future so we
should handle them.  One idea would be to replace our many arrays with one
array of structs such as

struct deref_comp_info {
   /** Offset into other info's bitsets */
   uint32_t offset;

   /** Number of components for vector derefs, zero for others
    *
    * If num_components == 0, the deref is represented by the single bit
    * at offset in the bitsets.  If num_components > 0, the deref is
    * represented by num_components bits starting at offset.
    */
   unsigned num_components;

   nir_deref_path path;

   BITSET_WORD *alias;
   BITSET_WORD *contains;
   BITSET_WORD *contained_in;
};
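
To make that layout concrete, an alias query would test b's component range
inside a's bitset, roughly like this (a sketch; comp_info_may_alias is a
made-up helper name):

/* Does deref a alias any component of deref b?  Assumes the bitsets were
 * baked with the offset/num_components layout described above.
 */
static bool
comp_info_may_alias(const struct deref_comp_info *a,
                    const struct deref_comp_info *b)
{
   unsigned b_bits = b->num_components ? b->num_components : 1;
   for (unsigned c = 0; c < b_bits; c++) {
      if (BITSET_TEST(a->alias, b->offset + c))
         return true;
   }
   return false;
}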

Also, if we're going to share it between passes, it might be good to have
nir_deref_compare_map_init take a shader and do the walk to find all derefs
plus the bake in a single step, so we can avoid exposing quite as many
details to the user.  Roughly (hypothetical signature):
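
/* Hypothetical all-in-one entry point: walk the shader, collect every
 * deref, then bake.  deref_hash/deref_equal are the same assumed hashing
 * helpers as in the earlier sketch.
 */
void
nir_deref_compare_map_init(struct deref_compare_map *map,
                           nir_shader *shader, void *mem_ctx)
{
   memset(map, 0, sizeof(*map));
   map->mem_ctx = ralloc_context(mem_ctx);
   map->id_map = _mesa_hash_table_create(map->mem_ctx, deref_hash, deref_equal);

   nir_foreach_function(function, shader) {
      if (!function->impl)
         continue;
      nir_foreach_block(block, function->impl) {
         nir_foreach_instr(instr, block) {
            if (instr->type == nir_instr_type_deref)
               deref_compare_map_get_id(map, nir_instr_as_deref(instr));
         }
      }
   }

   deref_compare_map_bake(map);
}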


> --Jason
>
> On Wed, Aug 15, 2018 at 4:57 PM Caio Marcelo de Oliveira Filho <
> caio.oliveira at intel.com> wrote:
>
>> The pass will remove not only writes that are proven to be dead within
>> a single block, but also writes that can only be considered dead by
>> looking at multiple blocks.  Such global analysis is necessary to remove
>> dead writes such as the one marked below:
>>
>>     int total = gl_VertexIndex * 10;
>>     float r = gl_VertexIndex;
>>
>>     for (int i = 0; i < total; i++) {
>>         arr[i] = i;                     /* DEAD WRITE. */
>>         if ((i % 2) == 0) {
>>             arr[i] = i * 3.0;
>>         } else {
>>             arr[i] = i * 5.0;
>>         }
>>         r = arr[i];
>>     }
>>
>>     out_color = vec4(r,r,r,r);
>>
>> Without knowing that both successor blocks will overwrite the value,
>> it was not possible to remove the "arr[i] = i" write.
>>
>> The local pass was extended with some extra data collected to allow an
>> iterative (backward) data-flow analysis to run.  The analysis is used
>> to find out, for each block, which derefs are used after the block.
>> This information is then used to decide which unused writes at the end
>> of the block can be removed.  With a local-only analysis we couldn't
>> remove any of them, because they could be used later.
>> ---
>>  src/compiler/nir/nir_opt_dead_write_vars.c | 546 ++++++++++++++++++++-
>>  1 file changed, 532 insertions(+), 14 deletions(-)
>>
>> diff --git a/src/compiler/nir/nir_opt_dead_write_vars.c b/src/compiler/nir/nir_opt_dead_write_vars.c
>> index 822bfa5595d..fe72ab784a8 100644
>> --- a/src/compiler/nir/nir_opt_dead_write_vars.c
>> +++ b/src/compiler/nir/nir_opt_dead_write_vars.c
>> @@ -27,6 +27,85 @@
>>
>>  #include "util/u_dynarray.h"
>>
>> +/**
>> + * Elimination of dead writes based on derefs.
>> + *
>> + * Dead writes are stores and copies that write to a deref, which then
>> + * gets another write before it is used (read or sourced for a copy).
>> + * Those writes can be removed since they don't affect anything.
>> + *
>> + * For derefs that refer to a memory area that can be read after the
>> + * program, the last write is considered used.  The presence of certain
>> + * instructions may also cause writes to be considered used, e.g. a memory
>> + * barrier (in this case the value must be written as another thread might
>> + * use it).
>> + *
>> + * The write mask for store instructions is considered, so it is possible
>> + * that a store is removed because the combination of other stores
>> + * overwrote its value.
>> + *
>> + * The pass works in three steps:
>> + *
>> + * (1) For each block, perform a local dead write removal.  Remove only
>> + *     writes we are sure are dead, keeping around the last unused writes
>> + *     at the end of the block.  These last unused writes will be
>> + *     revisited in (3).
>> + *
>> + * (2) Perform a data-flow analysis to identify for each block the "uses
>> + *     before write" of successor blocks.  E.g. given the blocks below
>> + *     (block0 precedes block1, which precedes block2)
>> + *
>> + *       block0:            block1:            block2:
>> + *         write A            load B             load D
>> + *         write B            write C            load E
>> + *         write C            load C
>> + *         write D            write D
>> + *         write E
>> + *         load A
>> + *
>> + *    the goal is to know that B and E are "used before write" after
>> + *    block0, and D and E are "used before write" after block1.  Note
>> + *    that the usage of E doesn't come from an immediate successor of
>> + *    block0.  The "used before written" information will later decide
>> + *    which unused writes are removed.
>> + *
>> + *    The data-flow analysis is implemented by starting with per-block
>> + *    sets and iterating to fix the content of those sets.  The nature of
>> + *    the "fixing" ensures that the sets stabilize (i.e. no infinite
>> + *    loops).  Each block gets two sets representing its local
>> + *    information:
>> + *
>> + *    - GEN: the derefs used before write by the block.  E.g.
>> + *      GEN(block1) = {B}
>> + *    - KILL: the derefs overwritten by the block.  E.g.
>> + *      KILL(block1) = {C, D}
>> + *
>> + *    And two other sets representing its current state:
>> + *
>> + *    - IN: the derefs used before write for this block and all its
>> + *      successors.  E.g. at the end of the analysis, IN(block1) = {B, E}.
>> + *      Note how E is present but not D.
>> + *    - OUT: the derefs used before write for the successors of the block.
>> + *
>> + *    In this pass the iteration happens "backwards", so a block uses the
>> + *    successor information to improve its own information.  The IN and
>> + *    OUT sets are updated as follows:
>> + *
>> + *        OUT = union of IN of all successors
>> + *        IN  = GEN union (OUT minus KILL)
>> + *
>> + * (3) Remove the last unused writes for each block that are not used by
>> + *     successor blocks.  After the analysis in (2), this information is
>> + *     available in the OUT set of each block.
>> + *
>> + * One extra factor in play is that it is not trivial to decide whether
>> + * two derefs refer to the same storage or not.  Currently array indirects
>> + * are assumed to alias with each other, i.e. "write A[i]" is considered
>> + * used by a later "load A[j]".  This logic is mostly handled by the
>> + * nir_compare_deref_paths() function.  This prevents us from representing
>> + * the data-flow sets as bitsets.
>> + *
>> + * The data-flow analysis code here is inspired by Muchnick's "Advanced
>> + * Compiler Design and Implementation" and by the data-flow analysis
>> + * implementation in src/intel/compiler/brw_fs_copy_propagation.cpp.
>> + */
>> +
>> +static const bool debug = false;
>> +
>>  struct state {
>>     void *mem_ctx;
>>
>> @@ -34,6 +113,24 @@ struct state {
>>      * rebuilding the paths for the same deref. */
>>     struct hash_table *paths;
>>     void *path_lin_ctx;
>> +
>> +   struct block_data *block_data;
>> +};
>> +
>> +struct block_data {
>> +   struct deref_map *in;
>> +   struct deref_map *out;
>> +   struct deref_map *gen;
>> +   struct deref_map *kill;
>> +
>> +   struct deref_map *kill_before_memory_barrier;
>> +   struct deref_map *kill_before_emit_vertex;
>> +
>> +   /* All the writes not used by the end of the block.  Used after the
>> +    * data-flow analysis to identify globally unused writes, which can
>> +    * then be removed.
>> +    */
>> +   struct util_dynarray unused_writes;
>>  };
>>
>>  static nir_deref_path *
>> @@ -50,6 +147,23 @@ get_path(struct state *state, nir_deref_instr *deref)
>>     }
>>  }
>>
>> +static void
>> +print_instr(const char *prefix, const nir_instr *instr, const char *suffix)
>> +{
>> +   if (prefix) fputs(prefix, stdout);
>> +   nir_print_instr(instr, stdout);
>> +   if (suffix) fputs(suffix, stdout);
>> +}
>> +
>> +static void
>> +print_path(const char *prefix, nir_deref_path *path, const char *suffix)
>> +{
>> +   /* Advance to the last (innermost) deref in the path. */
>> +   int i;
>> +   for (i = 0; path->path[i + 1]; i++);
>> +   nir_deref_instr *deref = path->path[i];
>> +   print_instr(prefix, &deref->instr, suffix);
>> +}
>> +
>>  /* Entry for unused_writes arrays. */
>>  struct write_entry {
>>     /* If NULL indicates the entry is free to be reused. */
>> @@ -58,6 +172,161 @@ struct write_entry {
>>     nir_deref_path *path;
>>  };
>>
>> +struct deref_map_entry {
>> +   nir_deref_path *path;
>> +   uintptr_t mask;
>> +};
>> +
>> +/* Maps derefs to a (write) mask value.  Used by the various data-flow
>> + * analysis sets.
>> + *
>> + * TODO: Consider a tree-based data structure to represent this
>> + * instead of arrays of paths.
>> + */
>> +struct deref_map {
>> +   struct util_dynarray entries;
>> +   void *mem_ctx;
>> +};
>> +
>> +static struct deref_map *
>> +deref_map_create(void *mem_ctx)
>> +{
>> +   struct deref_map *map = rzalloc(mem_ctx, struct deref_map);
>> +   map->mem_ctx = mem_ctx;
>> +   util_dynarray_init(&map->entries, mem_ctx);
>> +   return map;
>> +}
>> +
>> +static bool
>> +deref_map_empty(struct deref_map *map)
>> +{
>> +   return map->entries.size == 0;
>> +}
>> +
>> +/* Add a deref (represented by its path) to a deref_map, with a given mask.
>> + *
>> + * Returns whether the map was changed or not.
>> + */
>> +static bool
>> +deref_map_add(struct deref_map *map, nir_deref_path *path, uintptr_t mask)
>> +{
>> +   util_dynarray_foreach (&map->entries, struct deref_map_entry, map_entry) {
>> +      nir_deref_compare_result comp = nir_compare_deref_paths(path, map_entry->path);
>> +      if (comp & nir_derefs_equal_bit) {
>> +         /* The deref is already present, no new entry needed. */
>> +         uintptr_t old_mask = map_entry->mask;
>> +         map_entry->mask |= mask;
>> +         return old_mask != map_entry->mask;
>> +      }
>> +   }
>> +
>> +   struct deref_map_entry map_entry = {
>> +      .path = path,
>> +      .mask = mask,
>> +   };
>> +   util_dynarray_append(&map->entries, struct deref_map_entry, map_entry);
>> +
>> +   return true;
>> +}
>> +
>> +static bool
>> +deref_map_contains(struct deref_map *map, nir_deref_path *path, uintptr_t mask)
>> +{
>> +   util_dynarray_foreach (&map->entries, struct deref_map_entry, map_entry) {
>> +      nir_deref_compare_result comp = nir_compare_deref_paths(path, map_entry->path);
>> +      /* All the bits in the mask must be present in the map. */
>> +      if ((comp & nir_derefs_b_contains_a_bit) && ((mask & map_entry->mask) == mask))
>> +         return true;
>> +         return true;
>> +   }
>> +   return false;
>> +}
>> +
>> +static bool
>> +deref_map_may_alias(struct deref_map *map, nir_deref_path *path, uintptr_t mask)
>> +{
>> +   util_dynarray_foreach (&map->entries, struct deref_map_entry, map_entry) {
>> +      nir_deref_compare_result comp = nir_compare_deref_paths(path, map_entry->path);
>> +      /* A single overlapping bit is enough to characterize an alias. */
>> +      if ((comp & nir_derefs_may_alias_bit) && (mask & map_entry->mask))
>> +         return true;
>> +   }
>> +   return false;
>> +}
>> +
>> +#define deref_map_foreach(map, elem) \
>> +   util_dynarray_foreach(&map->entries, struct deref_map_entry, elem)
>> +
>> +static bool
>> +deref_map_add_map(struct deref_map *map, struct deref_map *to_add)
>> +{
>> +   bool changed = false;
>> +   deref_map_foreach(to_add, map_entry)
>> +      changed |= deref_map_add(map, map_entry->path, map_entry->mask);
>> +   return changed;
>> +}
>> +
>> +
>> +static struct deref_map *
>> +deref_map_clone(struct deref_map *from, void *mem_ctx)
>> +{
>> +   struct deref_map *map = deref_map_create(mem_ctx);
>> +   util_dynarray_clone(&map->entries, mem_ctx, &from->entries);
>> +   return map;
>> +}
>> +
>> +/* Entries whose mask has dropped to zero are removed. */
>> +static void
>> +deref_map_remove_zero_masks(struct deref_map *map)
>> +{
>> +   deref_map_foreach(map, map_entry) {
>> +      if (map_entry->mask)
>> +         continue;
>> +
>> +      /* Replace the deleted entry with the last element, stepping back
>> +       * so the moved element gets re-checked on the next iteration.
>> +       */
>> +      struct deref_map_entry *last =
>> +         util_dynarray_pop_ptr(&map->entries, struct deref_map_entry);
>> +      if (map_entry != last) {
>> +         *map_entry = *last;
>> +         map_entry--;
>> +      }
>> +   }
>> +}
>> +
>> +/* Removes from map all derefs that are contained by any deref in to_sub. */
>> +static bool
>> +deref_map_sub_map(struct deref_map *map, struct deref_map *to_sub)
>> +{
>> +   bool changed = false;
>> +   deref_map_foreach(to_sub, sub_entry) {
>> +      nir_deref_path *path = sub_entry->path;
>> +      uintptr_t mask = sub_entry->mask;
>> +
>> +      deref_map_foreach(map, map_entry) {
>> +         nir_deref_compare_result comp = nir_compare_deref_paths(path, map_entry->path);
>> +         if (comp & nir_derefs_a_contains_b_bit) {
>> +            uintptr_t new_mask = map_entry->mask & ~mask;
>> +            if (new_mask != map_entry->mask)
>> +               changed = true;
>> +            map_entry->mask = new_mask;
>> +         }
>> +      }
>> +   }
>> +
>> +   deref_map_remove_zero_masks(map);
>> +
>> +   return changed;
>> +}
>> +
>> +/* Removes from map all derefs whose variable matches the given modes. */
>> +static void
>> +deref_map_sub_modes(struct deref_map *map, nir_variable_mode modes)
>> +{
>> +   deref_map_foreach(map, map_entry) {
>> +      nir_variable *var = map_entry->path->path[0]->var;
>> +      if (var->data.mode & modes)
>> +         map_entry->mask = 0;
>> +   }
>> +   deref_map_remove_zero_masks(map);
>> +}
>> +
>>  static void
>>  clear_unused_for_modes(struct util_dynarray *unused_writes, nir_variable_mode modes)
>>  {
>> @@ -96,10 +365,15 @@ update_unused_writes_with_dst(struct util_dynarray *unused_writes,
>>           empty_entry = entry;
>>           continue;
>>        }
>> +
>>        nir_deref_compare_result comp = nir_compare_deref_paths(dst_path, entry->path);
>>        if (comp & nir_derefs_a_contains_b_bit) {
>>           entry->mask &= ~mask;
>>           if (entry->mask == 0) {
>> +            if (debug) {
>> +               print_instr("REMOVE: ", &entry->intrin->instr,
>> +                           "\n  (value overwritten)\n");
>> +            }
>>              nir_instr_remove(&entry->intrin->instr);
>>              entry->intrin = NULL;
>>              empty_entry = entry;
>> @@ -123,13 +397,36 @@ update_unused_writes_with_dst(struct util_dynarray *unused_writes,
>>     return progress;
>>  }
>>
>> +static void
>> +tag_use_before_write(struct deref_map *gen, struct deref_map *kill,
>> +                     nir_deref_path *path)
>> +{
>> +   if (deref_map_contains(kill, path, ~(1 << NIR_MAX_VEC_COMPONENTS)))
>> +      return;
>> +   deref_map_add(gen, path, ~(1 << NIR_MAX_VEC_COMPONENTS));
>> +}
>> +
>> +static void
>> +tag_write_before_use(struct deref_map *gen, struct deref_map *kill,
>> +                     nir_deref_path *path, uintptr_t mask)
>> +{
>> +   deref_map_add(kill, path, mask);
>> +}
>> +
>> +static const nir_variable_mode memory_barrier_modes =
>> +   ~(nir_var_local | nir_var_global | nir_var_shader_in | nir_var_uniform);
>> +
>> +static const nir_variable_mode emit_vertex_modes = nir_var_shader_out;
>> +
>> +static const nir_variable_mode end_of_program_modes =
>> +   ~(nir_var_local | nir_var_shader_in | nir_var_uniform);
>> +
>>  static bool
>> -remove_dead_write_vars_local(struct state *state, nir_block *block)
>> +remove_dead_write_vars_local(struct state *state, nir_block *block, struct block_data *bd)
>>  {
>>     bool progress = false;
>>
>> -   struct util_dynarray unused_writes;
>> -   util_dynarray_init(&unused_writes, state->mem_ctx);
>> +   util_dynarray_init(&bd->unused_writes, state->mem_ctx);
>>
>>     nir_foreach_instr_safe(instr, block) {
>>        if (instr->type != nir_instr_type_intrinsic)
>> @@ -139,16 +436,21 @@ remove_dead_write_vars_local(struct state *state, nir_block *block)
>>        switch (intrin->intrinsic) {
>>        case nir_intrinsic_barrier:
>>        case nir_intrinsic_memory_barrier: {
>> -         nir_variable_mode modes = ~(nir_var_local | nir_var_global |
>> -                                     nir_var_shader_in | nir_var_uniform);
>> -         clear_unused_for_modes(&unused_writes, modes);
>> +         clear_unused_for_modes(&bd->unused_writes, memory_barrier_modes);
>> +         if (!bd->kill_before_memory_barrier) {
>> +            bd->kill_before_memory_barrier = deref_map_clone(bd->kill, state->mem_ctx);
>> +            deref_map_sub_modes(bd->kill_before_memory_barrier, memory_barrier_modes);
>> +         }
>>           break;
>>        }
>>
>>        case nir_intrinsic_emit_vertex:
>>        case nir_intrinsic_emit_vertex_with_counter: {
>> -         nir_variable_mode modes = nir_var_shader_out;
>> -         clear_unused_for_modes(&unused_writes, modes);
>> +         clear_unused_for_modes(&bd->unused_writes, emit_vertex_modes);
>> +         if (!bd->kill_before_emit_vertex) {
>> +            bd->kill_before_emit_vertex = deref_map_clone(bd->kill, state->mem_ctx);
>> +            deref_map_sub_modes(bd->kill_before_emit_vertex, emit_vertex_modes);
>> +         }
>>           break;
>>        }
>>
>> @@ -156,7 +458,8 @@ remove_dead_write_vars_local(struct state *state, nir_block *block)
>>           nir_deref_instr *src = nir_src_as_deref(intrin->src[0]);
>>           nir_deref_path *src_path = get_path(state, src);
>>
>> -         clear_unused_for_src(&unused_writes, src_path);
>> +         clear_unused_for_src(&bd->unused_writes, src_path);
>> +         tag_use_before_write(bd->gen, bd->kill, src_path);
>>           break;
>>        }
>>
>> @@ -165,7 +468,8 @@ remove_dead_write_vars_local(struct state *state, nir_block *block)
>>           nir_deref_path *dst_path = get_path(state, dst);
>>
>>           uintptr_t mask = nir_intrinsic_write_mask(intrin);
>> -         progress |= update_unused_writes_with_dst(&unused_writes, intrin, dst_path, mask);
>> +         progress |= update_unused_writes_with_dst(&bd->unused_writes, intrin, dst_path, mask);
>> +         tag_write_before_use(bd->gen, bd->kill, dst_path, mask);
>>           break;
>>        }
>>
>> @@ -178,14 +482,20 @@ remove_dead_write_vars_local(struct state *state, nir_block *block)
>>
>>           /* Self-copy is removed. */
>>           if (src == dst || (nir_compare_deref_paths(src_path, dst_path) & nir_derefs_equal_bit)) {
>> +            if (debug) {
>> +               print_instr("REMOVE: ", instr,
>> +                           "\n  (self-copy)\n");
>> +            }
>>              nir_instr_remove(instr);
>>              progress = true;
>>              break;
>>           }
>>
>>           uintptr_t mask = ~(1 << NIR_MAX_VEC_COMPONENTS);
>> -         clear_unused_for_src(&unused_writes, src_path);
>> -         progress |= update_unused_writes_with_dst(&unused_writes, intrin, dst_path, mask);
>> +         clear_unused_for_src(&bd->unused_writes, src_path);
>> +         progress |= update_unused_writes_with_dst(&bd->unused_writes, intrin, dst_path, mask);
>> +         tag_use_before_write(bd->gen, bd->kill, src_path);
>> +         tag_write_before_use(bd->gen, bd->kill, dst_path, mask);
>>           break;
>>        }
>>
>> @@ -201,6 +511,60 @@ remove_dead_write_vars_local(struct state *state, nir_block *block)
>>     return progress;
>>  }
>>
>> +enum {
>> +   DUMP_GEN           = 1 << 0,
>> +   DUMP_KILL          = 1 << 1,
>> +   DUMP_IN            = 1 << 2,
>> +   DUMP_OUT           = 1 << 3,
>> +   DUMP_UNUSED_WRITES = 1 << 4,
>> +};
>> +
>> +static void
>> +dump_block(struct state *state, unsigned block_index, unsigned flags)
>> +{
>> +   struct block_data *bd = &state->block_data[block_index];
>> +   printf("--- block%d\n", block_index);
>> +
>> +   if (bd->kill_before_memory_barrier)
>> +      printf("  has memory barrier\n");
>> +   if (bd->kill_before_emit_vertex)
>> +      printf("  has emit vertex\n");
>> +
>> +   if ((flags & DUMP_GEN) && !deref_map_empty(bd->gen)) {
>> +      printf("  gen\n");
>> +      deref_map_foreach(bd->gen, map_entry)
>> +         print_path("    ", map_entry->path, "\n");
>> +   }
>> +
>> +   if ((flags & DUMP_KILL) && !deref_map_empty(bd->kill)) {
>> +      printf("  kill\n");
>> +      deref_map_foreach(bd->kill, map_entry)
>> +         print_path("    ", map_entry->path, "\n");
>> +   }
>> +
>> +   if ((flags & DUMP_IN) && !deref_map_empty(bd->in)) {
>> +      printf("  in\n");
>> +      deref_map_foreach(bd->in, map_entry)
>> +         print_path("    ", map_entry->path, "\n");
>> +   }
>> +
>> +   if ((flags & DUMP_OUT) && !deref_map_empty(bd->out)) {
>> +      printf("  out\n");
>> +      deref_map_foreach(bd->out, map_entry)
>> +         print_path("    ", map_entry->path, "\n");
>> +   }
>> +
>> +   if ((flags & DUMP_UNUSED_WRITES) && bd->unused_writes.size != 0) {
>> +      printf("  unused writes\n");
>> +      util_dynarray_foreach(&bd->unused_writes, struct write_entry, entry) {
>> +         if (!entry->intrin)
>> +            continue;
>> +         print_instr("    ", &entry->intrin->instr, "");
>> +         print_path(" -- ", entry->path, "\n");
>> +      }
>> +   }
>> +}
>> +
>>  static bool
>>  remove_dead_write_vars_impl(void *mem_ctx, nir_function_impl *func)
>>  {
>> @@ -211,10 +575,161 @@ remove_dead_write_vars_impl(void *mem_ctx, nir_function_impl *func)
>>
>>        .path_lin_ctx = linear_alloc_parent(mem_ctx, 0),
>>        .paths = _mesa_hash_table_create(mem_ctx, _mesa_hash_pointer, _mesa_key_pointer_equal),
>> +
>> +      /* Count one extra data element for the end_block. */
>> +      .block_data = rzalloc_array(mem_ctx, struct block_data, func->num_blocks + 1),
>>     };
>>
>> -   nir_foreach_block(block, func)
>> -      progress |= remove_dead_write_vars_local(&state, block);
>> +   /* Step 1.  Perform local optimization and initialize GEN, KILL and
>> +    * UNUSED_WRITES sets.
>> +    */
>> +   bool all_gens_empty = true;
>> +   nir_foreach_block(block, func) {
>> +      struct block_data *bd = &state.block_data[block->index];
>> +      bd->gen  = deref_map_create(mem_ctx);
>> +      bd->kill = deref_map_create(mem_ctx);
>> +      bd->out  = deref_map_create(mem_ctx);
>> +
>> +      progress |= remove_dead_write_vars_local(&state, block, bd);
>> +
>> +      if (!deref_map_empty(bd->gen))
>> +         all_gens_empty = false;
>> +   }
>> +
>> +   if (func->num_blocks < 2 || all_gens_empty)
>> +      return progress;
>> +
>> +   /* Identify the derefs that are considered used by other intrinsics (or
>> +    * the end of the program), then include them as part of the GEN of the
>> +    * blocks that contain those intrinsics.
>> +    */
>> +   struct deref_map *used_by_memory_barrier = deref_map_create(mem_ctx);
>> +   struct deref_map *used_by_emit_vertex    = deref_map_create(mem_ctx);
>> +   struct deref_map *used_by_end_of_program = deref_map_create(mem_ctx);
>> +
>> +   for (int i = 0; i < func->num_blocks; i++) {
>> +      struct block_data *bd = &state.block_data[i];
>> +
>> +      util_dynarray_foreach(&bd->unused_writes, struct write_entry, entry) {
>> +         if (!entry->intrin)
>> +            continue;
>> +
>> +         nir_deref_path *dst_path = entry->path;
>> +         nir_variable *var = dst_path->path[0]->var;
>> +
>> +         if (var->data.mode & memory_barrier_modes)
>> +            deref_map_add(used_by_memory_barrier, dst_path, entry->mask);
>> +
>> +         if (var->data.mode & emit_vertex_modes)
>> +            deref_map_add(used_by_emit_vertex, dst_path, entry->mask);
>> +
>> +         if (var->data.mode & end_of_program_modes)
>> +            deref_map_add(used_by_end_of_program, dst_path, entry->mask);
>> +      }
>> +   }
>> +
>> +   /* Update GEN to include extra uses based on the presence of other
>> +    * intrinsics.  Make sure not to include uses for derefs we already
>> +    * overwrite (kill) before the intrinsic in the block.
>> +    */
>> +   for (int i = 0; i < func->num_blocks; i++) {
>> +      struct block_data *bd = &state.block_data[i];
>> +      if (bd->kill_before_memory_barrier) {
>> +         struct deref_map *temp = deref_map_clone(used_by_memory_barrier, mem_ctx);
>> +         deref_map_sub_map(temp, bd->kill_before_memory_barrier);
>> +         deref_map_add_map(bd->gen, temp);
>> +      }
>> +      if (bd->kill_before_emit_vertex) {
>> +         struct deref_map *temp = deref_map_clone(used_by_emit_vertex, mem_ctx);
>> +         deref_map_sub_map(temp, bd->kill_before_emit_vertex);
>> +         deref_map_add_map(bd->gen, temp);
>> +      }
>> +
>> +      /* Initialize the IN set with GEN, since we don't have OUT information yet. */
>> +      bd->in = deref_map_clone(bd->gen, mem_ctx);
>> +   }
>> +
>> +   /* Since end_block is used as the successor of the last blocks executed,
>> +    * we can conveniently fill its IN set to propagate the usage.
>> +    */
>> +   struct block_data *end_bd = &state.block_data[func->end_block->index];
>> +   end_bd->in = used_by_end_of_program;
>> +
>> +   if (debug) {
>> +      printf("\n=== Before data-flow analysis\n");
>> +      for (int i = 0; i < func->num_blocks; i++)
>> +         dump_block(&state, i, DUMP_IN | DUMP_KILL | DUMP_UNUSED_WRITES);
>> +      dump_block(&state, func->end_block->index, DUMP_IN);
>> +   }
>> +
>> +   /* Step 2.  Iterate backwards computing the OUT and IN sets.
>> +    *
>> +    *     OUT = union of IN of all successors
>> +    *     IN  = GEN union (OUT minus KILL)
>> +    *
>> +    * The sets only ever grow, so we can update IN in place instead of
>> +    * repeatedly recalculating it.  This also makes it easy to identify
>> +    * whether a set changed or not.
>> +    */
>> +   bool flow_progress;
>> +   int iterations = 0;
>> +   do {
>> +      flow_progress = false;
>> +
>> +      if (debug)
>> +         printf("\n=== Blocks with IN/OUT changed in iteration %d\n",
>> ++iterations);
>> +
>> +      nir_foreach_block_reverse(block, func) {
>> +         if (block == func->end_block)
>> +            continue;
>> +
>> +         struct block_data *bd = &state.block_data[block->index];
>> +
>> +         bool out_changed = false;
>> +         for (int i = 0; i < 2; i++) {
>> +            nir_block *successor = block->successors[i];
>> +            if (!successor)
>> +               continue;
>> +            struct block_data *successor_bd = &state.block_data[successor->index];
>> +            out_changed |= deref_map_add_map(bd->out, successor_bd->in);
>> +         }
>> +
>> +         if (!out_changed)
>> +            continue;
>> +
>> +         struct deref_map *temp = deref_map_clone(bd->out, mem_ctx);
>> +         deref_map_sub_map(temp, bd->kill);
>> +
>> +         bool in_changed = deref_map_add_map(bd->in, temp);
>> +         flow_progress |= in_changed;
>> +
>> +         if (debug && in_changed)
>> +            dump_block(&state, block->index, DUMP_IN | DUMP_OUT);
>> +      }
>> +   } while (flow_progress);
>> +
>> +   if (debug)
>> +      printf("\n=== After data flow analysis (%d iterations)\n",
>> iterations);
>> +
>> +   /* Step 3.  Remove unused writes based on the data-flow analysis. */
>> +   nir_foreach_block(block, func) {
>> +      struct block_data *bd = &state.block_data[block->index];
>> +
>> +      if (debug)
>> +         dump_block(&state, block->index, DUMP_OUT | DUMP_UNUSED_WRITES);
>> +
>> +      util_dynarray_foreach(&bd->unused_writes, struct write_entry, entry) {
>> +         if (!entry->intrin)
>> +            continue;
>> +         if (!deref_map_may_alias(bd->out, entry->path, entry->mask)) {
>> +            if (debug) {
>> +               print_instr("REMOVE: ", &entry->intrin->instr,
>> +                           "\n  (unused after data flow analysis)\n");
>> +            }
>> +            nir_instr_remove(&entry->intrin->instr);
>> +            progress = true;
>> +         }
>> +      }
>> +   }
>>
>>     return progress;
>>  }
>> @@ -225,6 +740,9 @@ nir_opt_dead_write_vars(nir_shader *shader)
>>     void *mem_ctx = ralloc_context(NULL);
>>     bool progress = false;
>>
>> +   if (debug)
>> +      nir_print_shader(shader, stdout);
>> +
>>     nir_foreach_function(function, shader) {
>>        if (!function->impl)
>>           continue;
>> --
>> 2.18.0
>>
>

