[Mesa-dev] [PATCH 6/9] nir: Make dead_write_vars pass global

Jason Ekstrand jason at jlekstrand.net
Sat Aug 25 14:40:01 UTC 2018


Sorry I haven't given you any in-line review yet.  I've been letting this
pass simmer in my brain a bit and thinking about complexity and data
structures.  Here are a few questions for which I do not expect you to have
actual answers:

 1) How many unique derefs should we expect to see in a shader?  Dozens?
Hundreds?  Thousands?  My guess is that, after vars_to_ssa has picked off
all the easy ones, it should be on the order of a couple hundred at most
but I don't actually know what a reasonable upper limit to assume is.

 2) Is every deref going to be eventually compared to every other deref or
are we going to have separate little groups of derefs?

I'm asking these questions because I have some concern about how expensive
this pass (and copy prop) is going to be.  If you have a shader with a huge
number of loops, each of which has a very self-contained set of derefs, the
current pass as-is probably has a run time that's around O(n) in the number
of instructions.  However, in the case where most derefs are in play all
the time, I suspect this pass as-is looks more like O(n^3) in the number of
derefs or worse.

If the total number of unique derefs is reasonable AND the graph of deref
comparisons isn't too sparse, then I wonder if it wouldn't be better to
have some intermediate data structure like this:

struct deref_compare_map {
   void *mem_ctx;

   struct hash_table *id_map;

   nir_deref_path *paths;
   BITSET_WORD **alias;
   BITSET_WORD **contains;
   BITSET_WORD **contained_in;
};

void deref_compare_map_init(struct deref_compare_map *map, void *mem_ctx)
{
   memset(map, 0, sizeof(*map));
   map->mem_ctx = ralloc_context(mem_ctx);

   /* I'm pretty sure we have deref hashing and equality functions
    * somewhere already, we just have to pull them out and use them.  It
    * is, however, possible that they were in nir_instr_set.c and got
    * deleted as part of the move to deref instructions.
    */
   map->id_map = _mesa_hash_table_create(map->mem_ctx, deref_hash,
                                         deref_equal);
}
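
In case they did get deleted, here's a rough, untested sketch of what I'd
expect those helpers to look like (deref_hash and deref_equal are just
placeholder names; the hash deliberately skips array indices, trading a
few collisions for simplicity):

static uint32_t deref_hash(const void *void_deref)
{
   uint32_t hash = _mesa_fnv32_1a_offset_bias;
   for (nir_deref_instr *d = (nir_deref_instr *)void_deref; d;
        d = d->deref_type == nir_deref_type_var ?
               NULL : nir_deref_instr_parent(d)) {
      hash = _mesa_fnv32_1a_accumulate(hash, d->deref_type);
      if (d->deref_type == nir_deref_type_var)
         hash = _mesa_fnv32_1a_accumulate(hash, d->var);
   }
   return hash;
}

static bool deref_equal(const void *a_, const void *b_)
{
   nir_deref_instr *a = (nir_deref_instr *)a_;
   nir_deref_instr *b = (nir_deref_instr *)b_;

   /* Walk both chains leaf-to-root in lock-step. */
   while (a && b) {
      if (a->deref_type != b->deref_type)
         return false;

      switch (a->deref_type) {
      case nir_deref_type_var:
         return a->var == b->var;
      case nir_deref_type_array:
         if (!nir_srcs_equal(a->arr.index, b->arr.index))
            return false;
         break;
      case nir_deref_type_struct:
         if (a->strct.index != b->strct.index)
            return false;
         break;
      default:
         /* Wildcards etc.: be conservative. */
         return false;
      }

      a = nir_deref_instr_parent(a);
      b = nir_deref_instr_parent(b);
   }

   return a == b;
}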

void deref_compare_map_finish(struct deref_compare_map *map)
{
   ralloc_free(map->mem_ctx);
}

uint32_t deref_compare_map_get_id(struct deref_compare_map *map,
                                  nir_deref_instr *deref)
{
   struct hash_entry *entry = _mesa_hash_table_search(map->id_map, deref);
   if (entry)
      return (uint32_t)(uintptr_t)entry->data;

   /* We can't be adding anything after we've baked it */
   assert(map->alias == NULL);

   uint32_t id = map->id_map->entries;
   _mesa_hash_table_insert(map->id_map, deref, (void *)(uintptr_t)id);
   assert(map->id_map->entries == id + 1);
   return id;
}

static BITSET_WORD **
alloc_set_of_sets(void *mem_ctx, unsigned entries)
{
   BITSET_WORD **sets = ralloc_array(mem_ctx, BITSET_WORD *, entries);
   BITSET_WORD *data = rzalloc_array(mem_ctx, BITSET_WORD,
                                     entries * BITSET_WORDS(entries));
   for (unsigned i = 0; i < entries; i++)
      sets[i] = data + i * BITSET_WORDS(entries);
   return sets;
}

void deref_compare_map_bake(struct deref_compare_map *map)
{
   const unsigned num_derefs = map->id_map->entries;

   map->paths = ralloc_array(map->mem_ctx, nir_deref_path, num_derefs);
   struct hash_entry *entry;
   hash_table_foreach(map->id_map, entry) {
      uint32_t id = (uintptr_t)entry->data;
      nir_deref_path_init(&map->paths[id], (void *)entry->key, map->mem_ctx);
   }

   map->alias = alloc_set_of_sets(map->mem_ctx, num_derefs);
   map->contains = alloc_set_of_sets(map->mem_ctx, num_derefs);
   map->contained_in = alloc_set_of_sets(map->mem_ctx, num_derefs);

   for (unsigned a = 0; a < num_derefs; a++) {
      /* Handle comparing a with itself */
      BITSET_SET(map->alias[a], a);
      BITSET_SET(map->contains[a], a);
      BITSET_SET(map->contained_in[a], a);

      for (unsigned b = 0; b < a; b++) {
         nir_deref_compare_result comp =
            nir_compare_deref_paths(&map->paths[a], &map->paths[b]);
         if (comp & nir_derefs_may_alias_bit) {
            BITSET_SET(map->alias[a], b);
            BITSET_SET(map->alias[b], a);
         }

         if (comp & nir_derefs_a_contains_b_bit) {
            BITSET_SET(map->contains[a], b);
            BITSET_SET(map->contained_in[b], a);
         }

         if (comp & nir_derefs_b_contains_a_bit) {
            BITSET_SET(map->contains[b], a);
            BITSET_SET(map->contained_in[a], b);
         }
      }
   }
}
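
With the map baked, every aliasing question during the actual pass becomes
a trivial bitset test, something like (again, names are hypothetical and
the IDs come from deref_compare_map_get_id() before baking):

static inline bool
deref_compare_map_may_alias(const struct deref_compare_map *map,
                            uint32_t a, uint32_t b)
{
   /* Only valid after deref_compare_map_bake(). */
   assert(map->alias != NULL);
   return BITSET_TEST(map->alias[a], b);
}

static inline bool
deref_compare_map_contains(const struct deref_compare_map *map,
                           uint32_t a, uint32_t b)
{
   assert(map->contains != NULL);
   return BITSET_TEST(map->contains[a], b);
}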

The idea would be to make it a 5-step pass:

 1) Find all derefs
 2) Bake the deref compare map
 3) Local elimination and building GEN, KILL, etc. sets
 4) Data-flow analysis
 5) Global elimination

Doing that would give us some better run-time guarantees:

 1) Set operations on bitsets are O(n), and the bitset representation cuts
the actual work down to DIV_ROUND_UP(n, 32) word operations (see the
sketch after this list)
 2) We call nir_deref_path_init exactly num_derefs times for the entire pass
 3) We call nir_compare_deref_paths exactly
(num_derefs * (num_derefs - 1)) / 2 times for the entire pass.
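
To make (1) concrete: with the sets as bitsets indexed by deref ID, the
fixed-point update in step 4 boils down to word-wise operations, roughly
like this (just a sketch; block_data and num_derefs are hypothetical, and
this ignores the per-component write masks the pass currently tracks):

bool flow_progress;
do {
   flow_progress = false;

   nir_foreach_block_reverse(block, impl) {
      if (block == impl->end_block)
         continue;

      struct block_data *bd = &block_data[block->index];

      /* OUT = union of the IN of all successors */
      for (unsigned s = 0; s < 2; s++) {
         nir_block *succ = block->successors[s];
         if (!succ)
            continue;
         struct block_data *sbd = &block_data[succ->index];
         for (unsigned w = 0; w < BITSET_WORDS(num_derefs); w++)
            bd->out[w] |= sbd->in[w];
      }

      /* IN = GEN union (OUT minus KILL) */
      for (unsigned w = 0; w < BITSET_WORDS(num_derefs); w++) {
         BITSET_WORD new_in = bd->gen[w] | (bd->out[w] & ~bd->kill[w]);
         if (new_in != bd->in[w]) {
            bd->in[w] = new_in;
            flow_progress = true;
         }
      }
   }
} while (flow_progress);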

As it is, deref_map_add_map requires O(n^2) comparisons and we do that
operation inside our fixed-point loop, which runs an unknown number of
iterations.
This data structure could easily be put in nir_derefs.c and shared between
this pass and copy-prop.
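
The deref_map operations in the pass itself would then collapse to trivial
array updates, e.g. (hypothetical, with a per-ID mask array replacing the
array of paths):

static bool
deref_map_add(struct deref_map *map, uint32_t deref_id, uintptr_t mask)
{
   uintptr_t old_mask = map->masks[deref_id];
   map->masks[deref_id] = old_mask | mask;
   return old_mask != map->masks[deref_id];
}

No nir_compare_deref_paths() calls inside the fixed-point loop at all.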

Just a thought.  Of course, you've spent far more time playing with this
than I have so I may be completely off-base.  Before you go off and
re-write it, let's have a little e-mail discussion to decide if we think
the rewrite is worth it.  :-)

--Jason

On Wed, Aug 15, 2018 at 4:57 PM Caio Marcelo de Oliveira Filho <
caio.oliveira at intel.com> wrote:

> The pass will remove not only writes that are proven to be dead within
> a block, but also writes that can only be considered dead by looking at
> multiple blocks.  Such global analysis is necessary to remove dead
> writes such as the one marked below:
>
>     int total = gl_VertexIndex * 10;
>     float r = gl_VertexIndex;
>
>     for (int i = 0; i < total; i++) {
>         arr[i] = i;                     /* DEAD WRITE. */
>         if ((i % 2) == 0) {
>             arr[i] = i * 3.0;
>         } else {
>             arr[i] = i * 5.0;
>         }
>         r = arr[i];
>     }
>
>     out_color = vec4(r,r,r,r);
>
> Without knowing that both successor blocks will overwrite the value,
> it was not possible to remove the "arr[i] = i" write.
>
> The local pass was augmented with some extra data collected to allow
> an iterative (backward) data-flow analysis to run.  The analysis is
> used to find out, for each block, what derefs are used after the
> block.  This information is then used to decide which unused writes at
> the end of the block can be removed.  With a local-only analysis we
> couldn't remove any of them, because they could be used later.
> ---
>  src/compiler/nir/nir_opt_dead_write_vars.c | 546 ++++++++++++++++++++-
>  1 file changed, 532 insertions(+), 14 deletions(-)
>
> diff --git a/src/compiler/nir/nir_opt_dead_write_vars.c
> b/src/compiler/nir/nir_opt_dead_write_vars.c
> index 822bfa5595d..fe72ab784a8 100644
> --- a/src/compiler/nir/nir_opt_dead_write_vars.c
> +++ b/src/compiler/nir/nir_opt_dead_write_vars.c
> @@ -27,6 +27,85 @@
>
>  #include "util/u_dynarray.h"
>
> +/**
> + * Elimination of dead writes based on derefs.
> + *
> + * Dead writes are stores and copies that write to a deref which is then
> + * written again before being used (read or sourced for a copy).  Those
> + * writes can be removed since they don't affect anything.
> + *
> + * For derefs that refer to a memory area that can be read after the
> + * program ends, the last write is considered used.  The presence of
> + * certain instructions may also cause writes to be considered used,
> + * e.g. a memory barrier (in which case the value must be written, as
> + * another thread might use it).
> + *
> + * The write mask for store instructions is considered, so it is possible
> + * that a store is removed because a combination of other stores
> + * overwrites its value.
> + *
> + * The pass works in three steps:
> + *
> + * (1) For each block, perform a local dead write removal, removing only
> + *     the writes we are sure are dead and keeping around the last unused
> + *     writes at the end of the block.  These last unused writes will be
> + *     revisited in (3).
> + *
> + * (2) Perform a data-flow analysis to identify for each block the "uses
> + *     before write" of successor blocks.  E.g. given the blocks below
> + *     (block0 precedes block1, which precedes block2)
> + *
> + *       block0:            block1:            block2:
> + *         write A            load B             load D
> + *         write B            write C            load E
> + *         write C            load C
> + *         write D            write D
> + *         write E
> + *         load A
> + *
> + *    the goal is to know that B and E are "used before write" after
> + *    block0; D and E are "used before write" after block1.  Note that
> + *    the usage of E doesn't come from an immediate successor of block0.
> + *    The "used before write" information will later decide which unused
> + *    writes are removed.
> + *
> + *    The data-flow analysis is implemented by starting with per-block
> + *    sets and iterating to fix up their contents.  The nature of the
> + *    "fixing" ensures that the sets reach a fixed point (i.e. no
> + *    infinite loops).  Each block gets two sets representing its local
> + *    information:
> + *
> + *    - GEN: the derefs used before write by the block.
> + *      E.g. GEN(block1) = {B}
> + *    - KILL: the derefs overwritten by the block.
> + *      E.g. KILL(block1) = {C, D}
> + *
> + *    And two other sets representing its current state:
> + *
> + *    - IN: the derefs used before write for this block and all its
> + *      successors.  E.g. at the end of the analysis, IN(block1) = {B, E}.
> + *      Note how E is present but not D.
> + *    - OUT: the derefs used before write for the successors of the block.
> + *
> + *    In this pass the iteration happens "backwards", so a block uses
> + *    its successors' information to improve its own information.  The
> + *    IN and OUT sets are updated as follows:
> + *
> + *        OUT = union of the IN of all successors
> + *        IN  = GEN union (OUT minus KILL)
> + *
> + * (3) Remove the last unused writes of each block that are not used by
> + *     successor blocks.  After the analysis in (2), this information is
> + *     available in the OUT set of each block.
> + *
> + * One extra factor in play is that it is not trivial to decide whether
> + * two derefs refer to the same storage or not.  Currently array
> + * indirects are assumed to alias each other, i.e. "write A[i]" is
> + * considered used by a later "load A[j]".  This logic is mostly handled
> + * by the nir_compare_deref_paths() function.  This prevents us from
> + * representing the data-flow sets as bitsets.
> + *
> + * The data-flow analysis code here is inspired by Muchnick's "Advanced
> + * Compiler Design and Implementation" and by the data-flow analysis
> + * implementation in src/intel/compiler/brw_fs_copy_propagation.cpp.
> + */
> +
> +static const bool debug = false;
> +
>  struct state {
>     void *mem_ctx;
>
> @@ -34,6 +113,24 @@ struct state {
>      * rebuilding the paths for the same deref. */
>     struct hash_table *paths;
>     void *path_lin_ctx;
> +
> +   struct block_data *block_data;
> +};
> +
> +struct block_data {
> +   struct deref_map *in;
> +   struct deref_map *out;
> +   struct deref_map *gen;
> +   struct deref_map *kill;
> +
> +   struct deref_map *kill_before_memory_barrier;
> +   struct deref_map *kill_before_emit_vertex;
> +
> +   /* All the writes not used by the end of the block.  Used after the
> +    * data-flow analysis, to identify globally unused writes, that can be
> +    * removed.
> +    */
> +   struct util_dynarray unused_writes;
>  };
>
>  static nir_deref_path *
> @@ -50,6 +147,23 @@ get_path(struct state *state, nir_deref_instr *deref)
>     }
>  }
>
> +static void
> +print_instr(const char *prefix, const nir_instr *instr, const char *suffix)
> +{
> +   if (prefix) fputs(prefix, stdout);
> +   nir_print_instr(instr, stdout);
> +   if (suffix) fputs(suffix, stdout);
> +}
> +
> +static void
> +print_path(const char *prefix, nir_deref_path *path, const char *suffix)
> +{
> +   int i;
> +   for (i = 0; path->path[i + 1]; i++);  /* Find the last element. */
> +   nir_deref_instr *deref = path->path[i];
> +   print_instr(prefix, &deref->instr, suffix);
> +}
> +
>  /* Entry for unused_writes arrays. */
>  struct write_entry {
>     /* If NULL indicates the entry is free to be reused. */
> @@ -58,6 +172,161 @@ struct write_entry {
>     nir_deref_path *path;
>  };
>
> +struct deref_map_entry {
> +   nir_deref_path *path;
> +   uintptr_t mask;
> +};
> +
> +/* Maps derefs to a (write) mask value.  Used by the various data-flow
> + * analysis sets.
> + *
> + * TODO: Consider a tree-based data structure to represent this instead of
> + * arrays of paths.
> + */
> +struct deref_map {
> +   struct util_dynarray entries;
> +   void *mem_ctx;
> +};
> +
> +static struct deref_map *
> +deref_map_create(void *mem_ctx)
> +{
> +   struct deref_map *map = rzalloc(mem_ctx, struct deref_map);
> +   map->mem_ctx = mem_ctx;
> +   util_dynarray_init(&map->entries, mem_ctx);
> +   return map;
> +}
> +
> +static bool
> +deref_map_empty(struct deref_map *map)
> +{
> +   return map->entries.size == 0;
> +}
> +
> +/* Add a deref (represented by its path) to a deref_map, with a given
> + * mask.
> + *
> + * Returns whether the map was changed or not.
> + */
> +static bool
> +deref_map_add(struct deref_map *map, nir_deref_path *path, uintptr_t mask)
> +{
> +   util_dynarray_foreach (&map->entries, struct deref_map_entry,
> map_entry) {
> +      nir_deref_compare_result comp = nir_compare_deref_paths(path,
> map_entry->path);
> +      if (comp & nir_derefs_equal_bit) {
> +         /* The deref is already present, no new entry needed. */
> +         uintptr_t old_mask = map_entry->mask;
> +         map_entry->mask |= mask;
> +         return old_mask != map_entry->mask;
> +      }
> +   }
> +
> +   struct deref_map_entry map_entry = {
> +      .path = path,
> +      .mask = mask,
> +   };
> +   util_dynarray_append(&map->entries, struct deref_map_entry, map_entry);
> +
> +   return true;
> +}
> +
> +static bool
> +deref_map_contains(struct deref_map *map, nir_deref_path *path, uintptr_t
> mask)
> +{
> +   util_dynarray_foreach (&map->entries, struct deref_map_entry,
> map_entry) {
> +      nir_deref_compare_result comp = nir_compare_deref_paths(path,
> map_entry->path);
> +      /* All the bits in the mask must be present in the map. */
> +      if ((comp & nir_derefs_b_contains_a_bit) && ((mask &
> map_entry->mask) == mask))
> +         return true;
> +   }
> +   return false;
> +}
> +
> +static bool
> +deref_map_may_alias(struct deref_map *map, nir_deref_path *path,
> uintptr_t mask)
> +{
> +   util_dynarray_foreach (&map->entries, struct deref_map_entry,
> map_entry) {
> +      nir_deref_compare_result comp = nir_compare_deref_paths(path,
> map_entry->path);
> +      /* A single overlapping bit is enough to characterize an alias. */
> +      if ((comp & nir_derefs_may_alias_bit) && (mask & map_entry->mask))
> +         return true;
> +   }
> +   return false;
> +}
> +
> +#define deref_map_foreach(map, elem) \
> +   util_dynarray_foreach(&map->entries, struct deref_map_entry, elem)
> +
> +static bool
> +deref_map_add_map(struct deref_map *map, struct deref_map *to_add)
> +{
> +   bool changed = false;
> +   deref_map_foreach(to_add, map_entry)
> +      changed |= deref_map_add(map, map_entry->path, map_entry->mask);
> +   return changed;
> +}
> +
> +
> +static struct deref_map *
> +deref_map_clone(struct deref_map *from, void *mem_ctx)
> +{
> +   struct deref_map *map = deref_map_create(mem_ctx);
> +   util_dynarray_clone(&map->entries, mem_ctx, &from->entries);
> +   return map;
> +}
> +
> +/* A mask of zero indicates the entry should be removed. */
> +static void
> +deref_map_remove_zero_masks(struct deref_map *map)
> +{
> +   deref_map_foreach(map, map_entry) {
> +      if (map_entry->mask)
> +         continue;
> +
> +      /* Replace the deleted entry with the last element. */
> +      struct deref_map_entry *last =
> +         util_dynarray_pop_ptr(&map->entries, struct deref_map_entry);
> +      if (map_entry != last)
> +         *map_entry = *last;
> +   }
> +}
> +
> +/* Removes from map all derefs that are contained by any deref in
> + * to_sub. */
> +static bool
> +deref_map_sub_map(struct deref_map *map, struct deref_map *to_sub)
> +{
> +   bool changed = false;
> +   deref_map_foreach(to_sub, sub_entry) {
> +      nir_deref_path *path = sub_entry->path;
> +      uintptr_t mask = sub_entry->mask;
> +
> +      deref_map_foreach(map, map_entry) {
> +         nir_deref_compare_result comp = nir_compare_deref_paths(path,
> map_entry->path);
> +         if (comp & nir_derefs_a_contains_b_bit) {
> +            uintptr_t new_mask = map_entry->mask & ~mask;
> +            if (new_mask != map_entry->mask)
> +               changed = true;
> +            map_entry->mask = new_mask;
> +         }
> +      }
> +   }
> +
> +   deref_map_remove_zero_masks(map);
> +
> +   return changed;
> +}
> +
> +/* Removes from map all derefs whose variable matches the given modes. */
> +static void
> +deref_map_sub_modes(struct deref_map *map, nir_variable_mode modes)
> +{
> +   deref_map_foreach(map, map_entry) {
> +      nir_variable *var = map_entry->path->path[0]->var;
> +      if (var->data.mode & modes)
> +         map_entry->mask = 0;
> +   }
> +   deref_map_remove_zero_masks(map);
> +}
> +
>  static void
>  clear_unused_for_modes(struct util_dynarray *unused_writes,
> nir_variable_mode modes)
>  {
> @@ -96,10 +365,15 @@ update_unused_writes_with_dst(struct util_dynarray
> *unused_writes,
>           empty_entry = entry;
>           continue;
>        }
> +
>        nir_deref_compare_result comp = nir_compare_deref_paths(dst_path,
> entry->path);
>        if (comp & nir_derefs_a_contains_b_bit) {
>           entry->mask &= ~mask;
>           if (entry->mask == 0) {
> +            if (debug) {
> +               print_instr("REMOVE: ", &entry->intrin->instr,
> +                           "\n  (value overwritten)\n");
> +            }
>              nir_instr_remove(&entry->intrin->instr);
>              entry->intrin = NULL;
>              empty_entry = entry;
> @@ -123,13 +397,36 @@ update_unused_writes_with_dst(struct util_dynarray
> *unused_writes,
>     return progress;
>  }
>
> +static void
> +tag_use_before_write(struct deref_map *gen, struct deref_map *kill,
> +                     nir_deref_path *path)
> +{
> +   if (deref_map_contains(kill, path, ~(1 << NIR_MAX_VEC_COMPONENTS)))
> +      return;
> +   deref_map_add(gen, path, ~(1 << NIR_MAX_VEC_COMPONENTS));
> +}
> +
> +static void
> +tag_write_before_use(struct deref_map *gen, struct deref_map *kill,
> +                     nir_deref_path *path, uintptr_t mask)
> +{
> +   deref_map_add(kill, path, mask);
> +}
> +
> +static const nir_variable_mode memory_barrier_modes =
> +   ~(nir_var_local | nir_var_global | nir_var_shader_in |
> nir_var_uniform);
> +
> +static const nir_variable_mode emit_vertex_modes = nir_var_shader_out;
> +
> +static const nir_variable_mode end_of_program_modes =
> +   ~(nir_var_local | nir_var_shader_in | nir_var_uniform);
> +
>  static bool
> -remove_dead_write_vars_local(struct state *state, nir_block *block)
> +remove_dead_write_vars_local(struct state *state, nir_block *block,
> struct block_data *bd)
>  {
>     bool progress = false;
>
> -   struct util_dynarray unused_writes;
> -   util_dynarray_init(&unused_writes, state->mem_ctx);
> +   util_dynarray_init(&bd->unused_writes, state->mem_ctx);
>
>     nir_foreach_instr_safe(instr, block) {
>        if (instr->type != nir_instr_type_intrinsic)
> @@ -139,16 +436,21 @@ remove_dead_write_vars_local(struct state *state,
> nir_block *block)
>        switch (intrin->intrinsic) {
>        case nir_intrinsic_barrier:
>        case nir_intrinsic_memory_barrier: {
> -         nir_variable_mode modes = ~(nir_var_local | nir_var_global |
> -                                     nir_var_shader_in | nir_var_uniform);
> -         clear_unused_for_modes(&unused_writes, modes);
> +         clear_unused_for_modes(&bd->unused_writes, memory_barrier_modes);
> +         if (!bd->kill_before_memory_barrier) {
> +            bd->kill_before_memory_barrier = deref_map_clone(bd->kill,
> state->mem_ctx);
> +            deref_map_sub_modes(bd->kill_before_memory_barrier,
> memory_barrier_modes);
> +         }
>           break;
>        }
>
>        case nir_intrinsic_emit_vertex:
>        case nir_intrinsic_emit_vertex_with_counter: {
> -         nir_variable_mode modes = nir_var_shader_out;
> -         clear_unused_for_modes(&unused_writes, modes);
> +         clear_unused_for_modes(&bd->unused_writes, emit_vertex_modes);
> +         if (!bd->kill_before_emit_vertex) {
> +            bd->kill_before_emit_vertex = deref_map_clone(bd->kill,
> state->mem_ctx);
> +            deref_map_sub_modes(bd->kill_before_emit_vertex,
> emit_vertex_modes);
> +         }
>           break;
>        }
>
> @@ -156,7 +458,8 @@ remove_dead_write_vars_local(struct state *state,
> nir_block *block)
>           nir_deref_instr *src = nir_src_as_deref(intrin->src[0]);
>           nir_deref_path *src_path = get_path(state, src);
>
> -         clear_unused_for_src(&unused_writes, src_path);
> +         clear_unused_for_src(&bd->unused_writes, src_path);
> +         tag_use_before_write(bd->gen, bd->kill, src_path);
>           break;
>        }
>
> @@ -165,7 +468,8 @@ remove_dead_write_vars_local(struct state *state,
> nir_block *block)
>           nir_deref_path *dst_path = get_path(state, dst);
>
>           uintptr_t mask = nir_intrinsic_write_mask(intrin);
> -         progress |= update_unused_writes_with_dst(&unused_writes,
> intrin, dst_path, mask);
> +         progress |= update_unused_writes_with_dst(&bd->unused_writes,
> intrin, dst_path, mask);
> +         tag_write_before_use(bd->gen, bd->kill, dst_path, mask);
>           break;
>        }
>
> @@ -178,14 +482,20 @@ remove_dead_write_vars_local(struct state *state,
> nir_block *block)
>
>           /* Self-copy is removed. */
>           if (src == dst || (nir_compare_deref_paths(src_path, dst_path) &
> nir_derefs_equal_bit)) {
> +            if (debug) {
> +               print_instr("REMOVE: ", instr,
> +                           "\n  (self-copy)\n");
> +            }
>              nir_instr_remove(instr);
>              progress = true;
>              break;
>           }
>
>           uintptr_t mask = ~(1 << NIR_MAX_VEC_COMPONENTS);
> -         clear_unused_for_src(&unused_writes, src_path);
> -         progress |= update_unused_writes_with_dst(&unused_writes,
> intrin, dst_path, mask);
> +         clear_unused_for_src(&bd->unused_writes, src_path);
> +         progress |= update_unused_writes_with_dst(&bd->unused_writes,
> intrin, dst_path, mask);
> +         tag_use_before_write(bd->gen, bd->kill, src_path);
> +         tag_write_before_use(bd->gen, bd->kill, dst_path, mask);
>           break;
>        }
>
> @@ -201,6 +511,60 @@ remove_dead_write_vars_local(struct state *state,
> nir_block *block)
>     return progress;
>  }
>
> +enum {
> +   DUMP_GEN           = 1 << 0,
> +   DUMP_KILL          = 1 << 1,
> +   DUMP_IN            = 1 << 2,
> +   DUMP_OUT           = 1 << 3,
> +   DUMP_UNUSED_WRITES = 1 << 4,
> +};
> +
> +static void
> +dump_block(struct state *state, unsigned block_index, unsigned flags)
> +{
> +   struct block_data *bd = &state->block_data[block_index];
> +   printf("--- block%d\n", block_index);
> +
> +   if (bd->kill_before_memory_barrier)
> +      printf("  has memory barrier\n");
> +   if (bd->kill_before_emit_vertex)
> +      printf("  has emit vertex\n");
> +
> +   if ((flags & DUMP_GEN) && !deref_map_empty(bd->gen)) {
> +      printf("  gen\n");
> +      deref_map_foreach(bd->gen, map_entry)
> +         print_path("    ", map_entry->path, "\n");
> +   }
> +
> +   if ((flags & DUMP_KILL) && !deref_map_empty(bd->kill)) {
> +      printf("  kill\n");
> +      deref_map_foreach(bd->kill, map_entry)
> +         print_path("    ", map_entry->path, "\n");
> +   }
> +
> +   if ((flags & DUMP_IN) && !deref_map_empty(bd->in)) {
> +      printf("  in\n");
> +      deref_map_foreach(bd->in, map_entry)
> +         print_path("    ", map_entry->path, "\n");
> +   }
> +
> +   if ((flags & DUMP_OUT) && !deref_map_empty(bd->out)) {
> +      printf("  out\n");
> +      deref_map_foreach(bd->out, map_entry)
> +         print_path("    ", map_entry->path, "\n");
> +   }
> +
> +   if ((flags & DUMP_UNUSED_WRITES) && bd->unused_writes.size != 0) {
> +      printf("  unused writes\n");
> +      util_dynarray_foreach(&bd->unused_writes, struct write_entry,
> entry) {
> +         if (!entry->intrin)
> +            continue;
> +         print_instr("    ", &entry->intrin->instr, "");
> +         print_path(" -- ", entry->path, "\n");
> +      }
> +   }
> +}
> +
>  static bool
>  remove_dead_write_vars_impl(void *mem_ctx, nir_function_impl *func)
>  {
> @@ -211,10 +575,161 @@ remove_dead_write_vars_impl(void *mem_ctx,
> nir_function_impl *func)
>
>        .path_lin_ctx = linear_alloc_parent(mem_ctx, 0),
>        .paths = _mesa_hash_table_create(mem_ctx, _mesa_hash_pointer,
> _mesa_key_pointer_equal),
> +
> +      /* Count one extra data element for the end_block. */
> +      .block_data = rzalloc_array(mem_ctx, struct block_data,
> func->num_blocks + 1),
>     };
>
> -   nir_foreach_block(block, func)
> -      progress |= remove_dead_write_vars_local(&state, block);
> +   /* Step 1.  Perform local optimization and initialize GEN, KILL and
> +    * UNUSED_WRITES sets.
> +    */
> +   bool all_gens_empty = true;
> +   nir_foreach_block(block, func) {
> +      struct block_data *bd = &state.block_data[block->index];
> +      bd->gen  = deref_map_create(mem_ctx);
> +      bd->kill = deref_map_create(mem_ctx);
> +      bd->out  = deref_map_create(mem_ctx);
> +
> +      progress |= remove_dead_write_vars_local(&state, block, bd);
> +
> +      if (!deref_map_empty(bd->gen))
> +         all_gens_empty = false;
> +   }
> +
> +   if (func->num_blocks < 2 || all_gens_empty)
> +      return progress;
> +
> +   /* Identify the derefs that are considered used by other intrinsics
> +    * (or the end of the program), then include them as part of the GEN
> +    * of the blocks that contain those intrinsics.
> +    */
> +   struct deref_map *used_by_memory_barrier = deref_map_create(mem_ctx);
> +   struct deref_map *used_by_emit_vertex    = deref_map_create(mem_ctx);
> +   struct deref_map *used_by_end_of_program = deref_map_create(mem_ctx);
> +
> +   for (int i = 0; i < func->num_blocks; i++) {
> +      struct block_data *bd = &state.block_data[i];
> +
> +      util_dynarray_foreach(&bd->unused_writes, struct write_entry,
> entry) {
> +         if (!entry->intrin)
> +            continue;
> +
> +         nir_deref_path *dst_path = entry->path;
> +         nir_variable *var = dst_path->path[0]->var;
> +
> +         if (var->data.mode & memory_barrier_modes)
> +            deref_map_add(used_by_memory_barrier, dst_path, entry->mask);
> +
> +         if (var->data.mode & emit_vertex_modes)
> +            deref_map_add(used_by_emit_vertex, dst_path, entry->mask);
> +
> +         if (var->data.mode & end_of_program_modes)
> +            deref_map_add(used_by_end_of_program, dst_path, entry->mask);
> +      }
> +   }
> +
> +   /* Update GEN to include extra uses based on the presence of other
> +    * intrinsics.  Make sure not to include uses for derefs we already
> +    * overwrote (killed) before the intrinsic in the block.
> +    */
> +   for (int i = 0; i < func->num_blocks; i++) {
> +      struct block_data *bd = &state.block_data[i];
> +      if (bd->kill_before_memory_barrier) {
> +         struct deref_map *temp = deref_map_clone(used_by_memory_barrier,
> mem_ctx);
> +         deref_map_sub_map(temp, bd->kill_before_memory_barrier);
> +         deref_map_add_map(bd->gen, temp);
> +      }
> +      if (bd->kill_before_emit_vertex) {
> +         struct deref_map *temp = deref_map_clone(used_by_emit_vertex,
> mem_ctx);
> +         deref_map_sub_map(temp, bd->kill_before_emit_vertex);
> +         deref_map_add_map(bd->gen, temp);
> +      }
> +
> +      /* Initialize the IN set with GEN, since we don't have OUT
> +       * information yet. */
> +      bd->in = deref_map_clone(bd->gen, mem_ctx);
> +   }
> +
> +   /* Since end_block is used as the successor of the last executed
> +    * blocks, we can conveniently fill its IN set to propagate the usage.
> +    */
> +   struct block_data *end_bd = &state.block_data[func->end_block->index];
> +   end_bd->in = used_by_end_of_program;
> +
> +   if (debug) {
> +      printf("\n=== Before data-flow analysis\n");
> +      for (int i = 0; i < func->num_blocks; i++)
> +         dump_block(&state, i, DUMP_IN | DUMP_KILL | DUMP_UNUSED_WRITES);
> +      dump_block(&state, func->end_block->index, DUMP_IN);
> +   }
> +
> +   /* Step 2.  Iterate backwards computing the OUT and IN sets.
> +    *
> +    *     OUT = union of the IN of all successors
> +    *     IN  = GEN union (OUT minus KILL)
> +    *
> +    * The sets only ever grow, so we can update IN in place instead of
> +    * repeatedly recalculating it.  This also helps identify whether the
> +    * set changed or not.
> +    */
> +   bool flow_progress;
> +   int iterations = 0;
> +   do {
> +      flow_progress = false;
> +
> +      if (debug)
> +         printf("\n=== Blocks with IN/OUT changed in iteration %d\n",
> ++iterations);
> +
> +      nir_foreach_block_reverse(block, func) {
> +         if (block == func->end_block)
> +            continue;
> +
> +         struct block_data *bd = &state.block_data[block->index];
> +
> +         bool out_changed = false;
> +         for (int i = 0; i < 2; i++) {
> +            nir_block *successor = block->successors[i];
> +            if (!successor)
> +               continue;
> +            struct block_data *successor_bd =
> &state.block_data[successor->index];
> +            out_changed |= deref_map_add_map(bd->out, successor_bd->in);
> +         }
> +
> +         if (!out_changed)
> +            continue;
> +
> +         struct deref_map *temp = deref_map_clone(bd->out, mem_ctx);
> +         deref_map_sub_map(temp, bd->kill);
> +
> +         bool in_changed = deref_map_add_map(bd->in, temp);
> +         flow_progress |= in_changed;
> +
> +         if (debug && in_changed)
> +            dump_block(&state, block->index, DUMP_IN | DUMP_OUT);
> +      }
> +   } while (flow_progress);
> +
> +   if (debug)
> +      printf("\n=== After data flow analysis (%d iterations)\n",
> iterations);
> +
> +   /* Step 3.  Remove unused writes based on the data-flow analysis. */
> +   nir_foreach_block(block, func) {
> +      struct block_data *bd = &state.block_data[block->index];
> +
> +      if (debug)
> +         dump_block(&state, block->index, DUMP_OUT | DUMP_UNUSED_WRITES);
> +
> +      util_dynarray_foreach(&bd->unused_writes, struct write_entry,
> entry) {
> +         if (!entry->intrin)
> +            continue;
> +         if (!deref_map_may_alias(bd->out, entry->path, entry->mask)) {
> +            if (debug) {
> +               print_instr("REMOVE: ", &entry->intrin->instr,
> +                           "\n  (unused after data flow analysis)\n");
> +            }
> +            nir_instr_remove(&entry->intrin->instr);
> +            progress = true;
> +         }
> +      }
> +   }
>
>     return progress;
>  }
> @@ -225,6 +740,9 @@ nir_opt_dead_write_vars(nir_shader *shader)
>     void *mem_ctx = ralloc_context(NULL);
>     bool progress = false;
>
> +   if (debug)
> +      nir_print_shader(shader, stdout);
> +
>     nir_foreach_function(function, shader) {
>        if (!function->impl)
>           continue;
> --
> 2.18.0
>