[Mesa-dev] [PATCH 6/9] nir: Make dead_write_vars pass global
Jason Ekstrand
jason at jlekstrand.net
Sat Aug 25 15:04:54 UTC 2018
On Sat, Aug 25, 2018 at 9:40 AM Jason Ekstrand <jason at jlekstrand.net> wrote:
> Sorry I haven't given you any in-line review yet. I've been letting this
> pass simmer in my brain a bit and thinking about complexity and data
> structures. Here are a few questions for which I do not expect you to have
> actual answers:
>
> 1) How many unique derefs should we expect to see in a shader? Dozens?
> Hundreds? Thousands? My guess is that, after vars_to_ssa has picked off
> all the easy ones, it should be on the order of a couple hundred at most
> but I don't actually know what a reasonable upper limit to assume is.
>
> 2) Is every deref going to be eventually compared to every other deref or
> are we going to have separate little groups of derefs?
>
> I'm asking these questions because I have some concern about how expensive
> this pass (and copy prop) is going to be.  If you have a shader with a huge
> number of loops, each of which has a very self-contained set of derefs, the
> current pass as-is probably runs in roughly O(n) in the number of
> instructions.  However, in the case where most derefs are in play all the
> time, I suspect this pass as-is looks more like O(n^3) in the number of
> derefs or worse.
>
> If the total number of unique derefs is reasonable AND the graph of deref
> comparisons isn't too sparse, then I wonder if it wouldn't be better to
> have some intermediate data structure like this:
>
> struct deref_compare_map {
>    void *mem_ctx;
>
>    struct hash_table *id_map;
>
>    nir_deref_path *paths;
>    BITSET_WORD **alias;
>    BITSET_WORD **contains;
>    BITSET_WORD **contained_in;
> };
>
> void deref_compare_map_init(struct deref_compare_map *map, void *mem_ctx)
> {
>    memset(map, 0, sizeof(*map));
>    map->mem_ctx = ralloc_context(mem_ctx);
>
>    /* I'm pretty sure we have deref hashing and equality functions
>     * somewhere already, we just have to pull them out and use them.
>     * It is, however, possible that they were in nir_instr_set.c and
>     * got deleted as part of the move to deref instructions.
>     */
>    map->id_map = _mesa_hash_table_create(map->mem_ctx, deref_hash,
>                                          deref_equal);
> }
>
> void deref_compare_map_finish(struct deref_compare_map *map)
> {
>    ralloc_free(map->mem_ctx);
> }
>
> uint32_t deref_compare_map_get_id(struct deref_compare_map *map,
>                                   nir_deref_instr *deref)
> {
>    struct hash_entry *entry = _mesa_hash_table_search(map->id_map, deref);
>    if (entry)
>       return (uint32_t)(uintptr_t)entry->data;
>
>    /* We can't be adding anything after we've baked it */
>    assert(map->alias == NULL);
>
>    uint32_t id = map->id_map->entries;
>    _mesa_hash_table_insert(map->id_map, deref, (void *)(uintptr_t)id);
>    assert(map->id_map->entries == id + 1);
>    return id;
> }
>
> static BITSET_WORD **
> alloc_set_of_sets(void *mem_ctx, unsigned entries)
> {
>    BITSET_WORD **sets = ralloc_array(mem_ctx, BITSET_WORD *, entries);
>    BITSET_WORD *data = rzalloc_array(mem_ctx, BITSET_WORD,
>                                      entries * BITSET_WORDS(entries));
>    for (unsigned i = 0; i < entries; i++)
>       sets[i] = data + i * BITSET_WORDS(entries);
>    return sets;
> }
>
> void deref_compare_map_bake(struct deref_compare_map *map)
> {
>    const unsigned num_derefs = map->id_map->entries;
>
>    map->paths = ralloc_array(map->mem_ctx, nir_deref_path, num_derefs);
>    struct hash_entry *entry;
>    hash_table_foreach(map->id_map, entry) {
>       uint32_t id = (uintptr_t)entry->data;
>       nir_deref_path_init(&map->paths[id], (void *)entry->key,
>                           map->mem_ctx);
>    }
>
>    map->alias = alloc_set_of_sets(map->mem_ctx, num_derefs);
>    map->contains = alloc_set_of_sets(map->mem_ctx, num_derefs);
>    map->contained_in = alloc_set_of_sets(map->mem_ctx, num_derefs);
>
>    for (unsigned a = 0; a < num_derefs; a++) {
>       /* Handle comparing a with itself */
>       BITSET_SET(map->alias[a], a);
>       BITSET_SET(map->contains[a], a);
>       BITSET_SET(map->contained_in[a], a);
>
>       for (unsigned b = 0; b < a; b++) {
>          nir_deref_compare_result comp =
>             nir_compare_deref_paths(&map->paths[a], &map->paths[b]);
>          if (comp & nir_derefs_may_alias_bit) {
>             BITSET_SET(map->alias[a], b);
>             BITSET_SET(map->alias[b], a);
>          }
>
>          if (comp & nir_derefs_a_contains_b_bit) {
>             BITSET_SET(map->contains[a], b);
>             BITSET_SET(map->contained_in[b], a);
>          }
>
>          if (comp & nir_derefs_b_contains_a_bit) {
>             BITSET_SET(map->contains[b], a);
>             BITSET_SET(map->contained_in[a], b);
>          }
>       }
>    }
> }
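>
> With the map baked, the queries the pass needs become single bit tests.
> Hypothetical helpers (nothing like this exists yet, just sketching the
> usage):
>
> static inline bool
> deref_compare_map_may_alias(const struct deref_compare_map *map,
>                             uint32_t a, uint32_t b)
> {
>    /* Bit b of alias[a] was set during baking iff a and b may alias. */
>    return BITSET_TEST(map->alias[a], b);
> }
>
> static inline bool
> deref_compare_map_a_contains_b(const struct deref_compare_map *map,
>                                uint32_t a, uint32_t b)
> {
>    return BITSET_TEST(map->contains[a], b);
> }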
>
> The idea would be to make it a 5-step pass (rough skeleton below):
>
> 1) Find all derefs
> 2) Bake the deref compare map
> 3) Local elimination and building kill etc. sets
> 4) Data-flow
> 5) Global elimination
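>
> In code, the skeleton might look something like this (a very rough sketch;
> the pass-specific steps are elided):
>
> static bool
> opt_pass_impl(void *mem_ctx, nir_function_impl *impl)
> {
>    struct deref_compare_map map;
>    deref_compare_map_init(&map, mem_ctx);
>
>    /* 1) Find all derefs and assign them dense IDs. */
>    nir_foreach_block(block, impl) {
>       nir_foreach_instr(instr, block) {
>          if (instr->type == nir_instr_type_deref)
>             deref_compare_map_get_id(&map, nir_instr_as_deref(instr));
>       }
>    }
>
>    /* 2) Bake the pairwise comparison bitsets once. */
>    deref_compare_map_bake(&map);
>
>    /* 3-5) Local elimination, data-flow, and global elimination would be
>     * phrased in terms of deref IDs and the baked bitsets from here on.
>     */
>
>    deref_compare_map_finish(&map);
>    return false;
> }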
>
> Doing that would give us some better run-time guarantees:
>
> 1) Set operations on bitsets are O(n), where the n is really
> DIV_ROUND_UP(num_derefs, 32) words thanks to the bitset representation.
> 2) We call nir_deref_path_init exactly num_derefs times for the entire
> pass.
> 3) We call nir_compare_deref_paths exactly num_derefs * (num_derefs - 1) / 2
> times for the entire pass.
>
> As it is, deref_map_add_map does O(n^2) comparisons, and we do that
> operation inside our fixed-point loop, which runs an unknown number of
> iterations.  This data structure could easily be put in nir_deref.c and
> shared between this pass and copy-prop.
>
> Just a thought.  Of course, you've spent far more time playing with this
> than I have, so I may be completely off-base.  Before you go off and
> re-write it, let's have a little e-mail discussion to decide if we think
> the rewrite is worth it. :-)
>
After giving it more thought, I realized I hadn't done anything with
components like you did.  While local variables don't currently support
write-masks, that should probably change in the not-so-distant future, so we
should handle them.  One idea would be to replace our many arrays with one
array of structs such as:
struct deref_comp_info {
   /** Offset into the other infos' bitsets */
   uint32_t offset;

   /** Number of components for vector derefs, zero for others
    *
    * If num_components == 0, the deref is represented by the single bit
    * at offset in the bitsets.  If num_components > 0, the deref is
    * represented by num_components bits starting at offset.
    */
   unsigned num_components;

   nir_deref_path path;

   BITSET_WORD *alias;
   BITSET_WORD *contains;
   BITSET_WORD *contained_in;
};
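
A lookup would then just compute the right bit index.  A minimal sketch,
assuming infos is the array above indexed by deref ID (none of these names
exist anywhere yet):

static inline bool
deref_infos_may_alias(const struct deref_comp_info *infos,
                      uint32_t a, uint32_t b, unsigned comp)
{
   const struct deref_comp_info *info_b = &infos[b];
   /* Non-vector derefs get a single bit; vector derefs get one bit per
    * component, starting at their offset.
    */
   assert(info_b->num_components == 0 || comp < info_b->num_components);
   unsigned bit = info_b->num_components ? info_b->offset + comp
                                         : info_b->offset;
   return BITSET_TEST(infos[a].alias, bit);
}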
Also, if we're going to share it between passes, it might be good to have
nir_deref_compare_map_init take a shader, doing the walk to find all derefs
and the bake in a single step, so we avoid exposing quite as many details to
the user.
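
A strawman for that interface (nothing here exists yet; names are just a
suggestion):

/* nir.h */
typedef struct nir_deref_compare_map nir_deref_compare_map;

/* Walks the shader, collects every deref, and bakes the comparison
 * bitsets in one go.
 */
nir_deref_compare_map *
nir_deref_compare_map_create(nir_shader *shader, void *mem_ctx);

bool
nir_deref_compare_map_may_alias(const nir_deref_compare_map *map,
                                nir_deref_instr *a, nir_deref_instr *b);

bool
nir_deref_compare_map_a_contains_b(const nir_deref_compare_map *map,
                                   nir_deref_instr *a, nir_deref_instr *b);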
> --Jason
>
> On Wed, Aug 15, 2018 at 4:57 PM Caio Marcelo de Oliveira Filho
> <caio.oliveira at intel.com> wrote:
>
>> The pass will remove not only writes that are proven to be dead within a
>> single block, but also writes that can only be considered dead by looking
>> at multiple blocks.  Such global analysis is necessary to remove dead
>> writes such as the one marked below:
>>
>>     int total = gl_VertexIndex * 10;
>>     float r = gl_VertexIndex;
>>
>>     for (int i = 0; i < total; i++) {
>>        arr[i] = i;        /* DEAD WRITE. */
>>        if ((i % 2) == 0) {
>>           arr[i] = i * 3.0;
>>        } else {
>>           arr[i] = i * 5.0;
>>        }
>>        r = arr[i];
>>     }
>>
>>     out_color = vec4(r,r,r,r);
>>
>> Without knowing that both successor blocks will overwrite the value, it
>> was not possible to remove the "arr[i] = i" write.
>>
>> The local pass was extended to collect some extra data that allows an
>> iterative (backward) data-flow analysis to run.  The analysis is used to
>> find out, for each block, which derefs are used after the block.  This
>> information is then used to decide which unused writes at the end of the
>> block can be removed.  With a local-only analysis we couldn't remove any
>> of them, because they could be used later.
>> ---
>> src/compiler/nir/nir_opt_dead_write_vars.c | 546 ++++++++++++++++++++-
>> 1 file changed, 532 insertions(+), 14 deletions(-)
>>
>> diff --git a/src/compiler/nir/nir_opt_dead_write_vars.c b/src/compiler/nir/nir_opt_dead_write_vars.c
>> index 822bfa5595d..fe72ab784a8 100644
>> --- a/src/compiler/nir/nir_opt_dead_write_vars.c
>> +++ b/src/compiler/nir/nir_opt_dead_write_vars.c
>> @@ -27,6 +27,85 @@
>>
>>  #include "util/u_dynarray.h"
>>
>> +/**
>> + * Elimination of dead writes based on derefs.
>> + *
>> + * Dead writes are stores and copies that write to a deref which then gets
>> + * another write before it is used (read or sourced for a copy).  Such
>> + * writes can be removed since they don't affect anything.
>> + *
>> + * For derefs that refer to a memory area that can be read after the
>> + * program, the last write is considered used.  The presence of certain
>> + * instructions may also cause writes to be considered used, e.g. a memory
>> + * barrier (in this case the value must be written, as another thread
>> + * might use it).
>> + *
>> + * The write mask for store instructions is considered, so it is possible
>> + * that a store is removed because a combination of other stores
>> + * overwrites its value.
>> + *
>> + * The pass works in three steps:
>> + *
>> + * (1) For each block, perform a local dead write removal.  Remove only
>> + *     writes we are sure are dead, keeping around the last unused writes
>> + *     at the end of the block.  These last unused writes are revisited
>> + *     in (3).
>> + *
>> + * (2) Perform a data-flow analysis to identify, for each block, the "uses
>> + *     before write" of successor blocks.  E.g. given the blocks below
>> + *     (block0 precedes block1, which precedes block2)
>> + *
>> + *         block0:       block1:       block2:
>> + *           write A       load B        load D
>> + *           write B       write C       load E
>> + *           write C       load C
>> + *           write D       write D
>> + *           write E
>> + *           load A
>> + *
>> + *     the goal is to know that B and E are "used before write" after
>> + *     block0, and D and E are "used before write" after block1.  Note
>> + *     that the use of E doesn't come from an immediate successor of
>> + *     block0.  The "used before written" information will later decide
>> + *     which unused writes are removed.
>> + *
>> + *     The data-flow analysis is implemented by starting with per-block
>> + *     sets and iterating to fix the contents of those sets.  The nature
>> + *     of the "fixing" ensures that the sets become stable (i.e. no
>> + *     infinite loops).  Each block gets two sets representing its local
>> + *     information:
>> + *
>> + *     - GEN: the derefs used before write by the block.
>> + *       E.g. GEN(block1) = {B}
>> + *     - KILL: the derefs overwritten by the block.
>> + *       E.g. KILL(block1) = {C, D}
>> + *
>> + *     And two other sets representing its current state:
>> + *
>> + *     - IN: the derefs used before write for this block and all
>> + *       successors.  E.g. at the end of the analysis,
>> + *       IN(block1) = {B, E}.  Note how E is present but not D.
>> + *     - OUT: the derefs used before write for the successors of the
>> + *       block.
>> + *
>> + *     In this pass the iteration happens "backwards", so a block uses
>> + *     its successors' information to improve its own information.  The
>> + *     IN and OUT sets are updated as follows:
>> + *
>> + *         OUT = union of the IN of all successors
>> + *         IN  = GEN union (OUT minus KILL)
>> + *
>> + * (3) Remove the last unused writes for each block that are not used by
>> + *     successor blocks.  After the analysis in (2), this information is
>> + *     available in the OUT set of each block.
>> + *
>> + * One extra factor in play is that it is not trivial to decide whether
>> + * two derefs refer to the same storage or not.  Currently array indirects
>> + * are assumed to alias with each other, i.e. "write A[i]" is considered
>> + * used by a later "load A[j]".  This logic is mostly handled by the
>> + * nir_compare_deref_paths() function.  It also prevents us from
>> + * representing the data-flow sets as bitsets.
>> + *
>> + * The data-flow analysis code here is inspired by Muchnick's "Advanced
>> + * Compiler Design and Implementation" and by the data-flow analysis
>> + * implementation in src/intel/compiler/brw_fs_copy_propagation.cpp.
>> + */
>> +
>> +static const bool debug = false;
>> +
>>  struct state {
>>     void *mem_ctx;
>>
>> @@ -34,6 +113,24 @@ struct state {
>>      * rebuilding the paths for the same deref. */
>>     struct hash_table *paths;
>>     void *path_lin_ctx;
>> +
>> +   struct block_data *block_data;
>> +};
>> +
>> +struct block_data {
>> +   struct deref_map *in;
>> +   struct deref_map *out;
>> +   struct deref_map *gen;
>> +   struct deref_map *kill;
>> +
>> +   struct deref_map *kill_before_memory_barrier;
>> +   struct deref_map *kill_before_emit_vertex;
>> +
>> +   /* All the writes not used by the end of the block.  Used after the
>> +    * data-flow analysis to identify globally unused writes, which can be
>> +    * removed.
>> +    */
>> +   struct util_dynarray unused_writes;
>>  };
>>
>>  static nir_deref_path *
>> @@ -50,6 +147,23 @@ get_path(struct state *state, nir_deref_instr *deref)
>>     }
>>  }
>>
>> +static void
>> +print_instr(const char *prefix, const nir_instr *instr, const char *suffix)
>> +{
>> +   if (prefix) fputs(prefix, stdout);
>> +   nir_print_instr(instr, stdout);
>> +   if (suffix) fputs(suffix, stdout);
>> +}
>> +
>> +static void
>> +print_path(const char *prefix, nir_deref_path *path, const char *suffix)
>> +{
>> +   /* Walk to the last (innermost) deref in the path and print it. */
>> +   int i;
>> +   for (i = 0; path->path[i + 1]; i++);
>> +   nir_deref_instr *deref = path->path[i];
>> +   print_instr(prefix, &deref->instr, suffix);
>> +}
>> +
>>  /* Entry for unused_writes arrays. */
>>  struct write_entry {
>>     /* If NULL indicates the entry is free to be reused. */
>> @@ -58,6 +172,161 @@ struct write_entry {
>>     nir_deref_path *path;
>>  };
>>
>> +struct deref_map_entry {
>> +   nir_deref_path *path;
>> +   uintptr_t mask;
>> +};
>> +
>> +/* Maps derefs to a (write) mask value.  Used by the various data-flow
>> + * analysis sets.
>> + *
>> + * TODO: Consider a tree-based data structure to represent this instead
>> + * of arrays of paths.
>> + */
>> +struct deref_map {
>> +   struct util_dynarray entries;
>> +   void *mem_ctx;
>> +};
>> +
>> +static struct deref_map *
>> +deref_map_create(void *mem_ctx)
>> +{
>> +   struct deref_map *map = rzalloc(mem_ctx, struct deref_map);
>> +   map->mem_ctx = mem_ctx;
>> +   util_dynarray_init(&map->entries, mem_ctx);
>> +   return map;
>> +}
>> +
>> +static bool
>> +deref_map_empty(struct deref_map *map)
>> +{
>> +   return map->entries.size == 0;
>> +}
>> +
>> +/* Add a deref (represented by its path) to a deref_map, with a given
>> + * mask.
>> + *
>> + * Returns whether the map was changed or not.
>> + */
>> +static bool
>> +deref_map_add(struct deref_map *map, nir_deref_path *path, uintptr_t mask)
>> +{
>> +   util_dynarray_foreach(&map->entries, struct deref_map_entry, map_entry) {
>> +      nir_deref_compare_result comp =
>> +         nir_compare_deref_paths(path, map_entry->path);
>> +      if (comp & nir_derefs_equal_bit) {
>> +         /* The deref is already present, no new entry needed. */
>> +         uintptr_t old_mask = map_entry->mask;
>> +         map_entry->mask |= mask;
>> +         return old_mask != map_entry->mask;
>> +      }
>> +   }
>> +
>> +   struct deref_map_entry map_entry = {
>> +      .path = path,
>> +      .mask = mask,
>> +   };
>> +   util_dynarray_append(&map->entries, struct deref_map_entry, map_entry);
>> +
>> +   return true;
>> +}
>> +
>> +static bool
>> +deref_map_contains(struct deref_map *map, nir_deref_path *path,
>> +                   uintptr_t mask)
>> +{
>> +   util_dynarray_foreach(&map->entries, struct deref_map_entry, map_entry) {
>> +      nir_deref_compare_result comp =
>> +         nir_compare_deref_paths(path, map_entry->path);
>> +      /* All the bits in the mask must be present in the map. */
>> +      if ((comp & nir_derefs_b_contains_a_bit) &&
>> +          ((mask & map_entry->mask) == mask))
>> +         return true;
>> +   }
>> +   return false;
>> +}
>> +
>> +static bool
>> +deref_map_may_alias(struct deref_map *map, nir_deref_path *path,
>> +                    uintptr_t mask)
>> +{
>> +   util_dynarray_foreach(&map->entries, struct deref_map_entry, map_entry) {
>> +      nir_deref_compare_result comp =
>> +         nir_compare_deref_paths(path, map_entry->path);
>> +      /* A single overlapping bit is enough to characterize an alias. */
>> +      if ((comp & nir_derefs_may_alias_bit) && (mask & map_entry->mask))
>> +         return true;
>> +   }
>> +   return false;
>> +}
>> +
>> +#define deref_map_foreach(map, elem) \
>> +   util_dynarray_foreach(&map->entries, struct deref_map_entry, elem)
>> +
>> +static bool
>> +deref_map_add_map(struct deref_map *map, struct deref_map *to_add)
>> +{
>> +   bool changed = false;
>> +   deref_map_foreach(to_add, map_entry)
>> +      changed |= deref_map_add(map, map_entry->path, map_entry->mask);
>> +   return changed;
>> +}
>> +
>> +static struct deref_map *
>> +deref_map_clone(struct deref_map *from, void *mem_ctx)
>> +{
>> +   struct deref_map *map = deref_map_create(mem_ctx);
>> +   util_dynarray_clone(&map->entries, mem_ctx, &from->entries);
>> +   return map;
>> +}
>> +
>> +/* A zero mask indicates the entry should be removed. */
>> +static void
>> +deref_map_remove_zero_masks(struct deref_map *map)
>> +{
>> +   deref_map_foreach(map, map_entry) {
>> +      if (map_entry->mask)
>> +         continue;
>> +
>> +      /* Replace the deleted entry with the last element. */
>> +      struct deref_map_entry *last =
>> +         util_dynarray_pop_ptr(&map->entries, struct deref_map_entry);
>> +      if (map_entry != last)
>> +         *map_entry = *last;
>> +   }
>> +}
>> +
>> +/* Removes from map all derefs that are contained by any deref in
>> + * to_sub.
>> + */
>> +static bool
>> +deref_map_sub_map(struct deref_map *map, struct deref_map *to_sub)
>> +{
>> +   bool changed = false;
>> +   deref_map_foreach(to_sub, sub_entry) {
>> +      nir_deref_path *path = sub_entry->path;
>> +      uintptr_t mask = sub_entry->mask;
>> +
>> +      deref_map_foreach(map, map_entry) {
>> +         nir_deref_compare_result comp =
>> +            nir_compare_deref_paths(path, map_entry->path);
>> +         if (comp & nir_derefs_a_contains_b_bit) {
>> +            uintptr_t new_mask = map_entry->mask & ~mask;
>> +            if (new_mask != map_entry->mask)
>> +               changed = true;
>> +            map_entry->mask = new_mask;
>> +         }
>> +      }
>> +   }
>> +
>> +   deref_map_remove_zero_masks(map);
>> +
>> +   return changed;
>> +}
>> +
>> +/* Removes from map all derefs whose variable matches the given modes. */
>> +static void
>> +deref_map_sub_modes(struct deref_map *map, nir_variable_mode modes)
>> +{
>> +   deref_map_foreach(map, map_entry) {
>> +      nir_variable *var = map_entry->path->path[0]->var;
>> +      if (var->data.mode & modes)
>> +         map_entry->mask = 0;
>> +   }
>> +   deref_map_remove_zero_masks(map);
>> +}
>> +
>>  static void
>>  clear_unused_for_modes(struct util_dynarray *unused_writes,
>>                         nir_variable_mode modes)
>>  {
>> @@ -96,10 +365,15 @@ update_unused_writes_with_dst(struct util_dynarray *unused_writes,
>>           empty_entry = entry;
>>           continue;
>>        }
>> +
>>        nir_deref_compare_result comp = nir_compare_deref_paths(dst_path, entry->path);
>>        if (comp & nir_derefs_a_contains_b_bit) {
>>           entry->mask &= ~mask;
>>           if (entry->mask == 0) {
>> +            if (debug) {
>> +               print_instr("REMOVE: ", &entry->intrin->instr,
>> +                           "\n    (value overwritten)\n");
>> +            }
>>              nir_instr_remove(&entry->intrin->instr);
>>              entry->intrin = NULL;
>>              empty_entry = entry;
>> @@ -123,13 +397,36 @@ update_unused_writes_with_dst(struct util_dynarray *unused_writes,
>>     return progress;
>>  }
>>
>> +static void
>> +tag_use_before_write(struct deref_map *gen, struct deref_map *kill,
>> +                     nir_deref_path *path)
>> +{
>> +   if (deref_map_contains(kill, path, ~(1 << NIR_MAX_VEC_COMPONENTS)))
>> +      return;
>> +   deref_map_add(gen, path, ~(1 << NIR_MAX_VEC_COMPONENTS));
>> +}
>> +
>> +static void
>> +tag_write_before_use(struct deref_map *gen, struct deref_map *kill,
>> +                     nir_deref_path *path, uintptr_t mask)
>> +{
>> +   deref_map_add(kill, path, mask);
>> +}
>> +
>> +static const nir_variable_mode memory_barrier_modes =
>> +   ~(nir_var_local | nir_var_global | nir_var_shader_in | nir_var_uniform);
>> +
>> +static const nir_variable_mode emit_vertex_modes = nir_var_shader_out;
>> +
>> +static const nir_variable_mode end_of_program_modes =
>> +   ~(nir_var_local | nir_var_shader_in | nir_var_uniform);
>> +
>>  static bool
>> -remove_dead_write_vars_local(struct state *state, nir_block *block)
>> +remove_dead_write_vars_local(struct state *state, nir_block *block,
>> +                             struct block_data *bd)
>>  {
>>     bool progress = false;
>>
>> -   struct util_dynarray unused_writes;
>> -   util_dynarray_init(&unused_writes, state->mem_ctx);
>> +   util_dynarray_init(&bd->unused_writes, state->mem_ctx);
>>
>>     nir_foreach_instr_safe(instr, block) {
>>        if (instr->type != nir_instr_type_intrinsic)
>> @@ -139,16 +436,21 @@ remove_dead_write_vars_local(struct state *state, nir_block *block)
>>        switch (intrin->intrinsic) {
>>        case nir_intrinsic_barrier:
>>        case nir_intrinsic_memory_barrier: {
>> -         nir_variable_mode modes = ~(nir_var_local | nir_var_global |
>> -                                     nir_var_shader_in | nir_var_uniform);
>> -         clear_unused_for_modes(&unused_writes, modes);
>> +         clear_unused_for_modes(&bd->unused_writes, memory_barrier_modes);
>> +         if (!bd->kill_before_memory_barrier) {
>> +            bd->kill_before_memory_barrier = deref_map_clone(bd->kill, state->mem_ctx);
>> +            deref_map_sub_modes(bd->kill_before_memory_barrier, memory_barrier_modes);
>> +         }
>>           break;
>>        }
>>
>>        case nir_intrinsic_emit_vertex:
>>        case nir_intrinsic_emit_vertex_with_counter: {
>> -         nir_variable_mode modes = nir_var_shader_out;
>> -         clear_unused_for_modes(&unused_writes, modes);
>> +         clear_unused_for_modes(&bd->unused_writes, emit_vertex_modes);
>> +         if (!bd->kill_before_emit_vertex) {
>> +            bd->kill_before_emit_vertex = deref_map_clone(bd->kill, state->mem_ctx);
>> +            deref_map_sub_modes(bd->kill_before_emit_vertex, emit_vertex_modes);
>> +         }
>>           break;
>>        }
>>
>> @@ -156,7 +458,8 @@ remove_dead_write_vars_local(struct state *state, nir_block *block)
>>           nir_deref_instr *src = nir_src_as_deref(intrin->src[0]);
>>           nir_deref_path *src_path = get_path(state, src);
>>
>> -         clear_unused_for_src(&unused_writes, src_path);
>> +         clear_unused_for_src(&bd->unused_writes, src_path);
>> +         tag_use_before_write(bd->gen, bd->kill, src_path);
>>           break;
>>        }
>>
>> @@ -165,7 +468,8 @@ remove_dead_write_vars_local(struct state *state, nir_block *block)
>>           nir_deref_path *dst_path = get_path(state, dst);
>>
>>           uintptr_t mask = nir_intrinsic_write_mask(intrin);
>> -         progress |= update_unused_writes_with_dst(&unused_writes, intrin, dst_path, mask);
>> +         progress |= update_unused_writes_with_dst(&bd->unused_writes, intrin, dst_path, mask);
>> +         tag_write_before_use(bd->gen, bd->kill, dst_path, mask);
>>           break;
>>        }
>>
>> @@ -178,14 +482,20 @@ remove_dead_write_vars_local(struct state *state, nir_block *block)
>>
>>           /* Self-copy is removed. */
>>           if (src == dst || (nir_compare_deref_paths(src_path, dst_path) & nir_derefs_equal_bit)) {
>> +            if (debug) {
>> +               print_instr("REMOVE: ", instr,
>> +                           "\n    (self-copy)\n");
>> +            }
>>              nir_instr_remove(instr);
>>              progress = true;
>>              break;
>>           }
>>
>>           uintptr_t mask = ~(1 << NIR_MAX_VEC_COMPONENTS);
>> -         clear_unused_for_src(&unused_writes, src_path);
>> -         progress |= update_unused_writes_with_dst(&unused_writes, intrin, dst_path, mask);
>> +         clear_unused_for_src(&bd->unused_writes, src_path);
>> +         progress |= update_unused_writes_with_dst(&bd->unused_writes, intrin, dst_path, mask);
>> +         tag_use_before_write(bd->gen, bd->kill, src_path);
>> +         tag_write_before_use(bd->gen, bd->kill, dst_path, mask);
>>           break;
>>        }
>>
>> @@ -201,6 +511,60 @@ remove_dead_write_vars_local(struct state *state, nir_block *block)
>>     return progress;
>>  }
>>
>> +enum {
>> +   DUMP_GEN = 1 << 0,
>> +   DUMP_KILL = 1 << 1,
>> +   DUMP_IN = 1 << 2,
>> +   DUMP_OUT = 1 << 3,
>> +   DUMP_UNUSED_WRITES = 1 << 4,
>> +};
>> +
>> +static void
>> +dump_block(struct state *state, unsigned block_index, unsigned flags)
>> +{
>> +   struct block_data *bd = &state->block_data[block_index];
>> +   printf("--- block%d\n", block_index);
>> +
>> +   if (bd->kill_before_memory_barrier)
>> +      printf("  has memory barrier\n");
>> +   if (bd->kill_before_emit_vertex)
>> +      printf("  has emit vertex\n");
>> +
>> +   if ((flags & DUMP_GEN) && !deref_map_empty(bd->gen)) {
>> +      printf("  gen\n");
>> +      deref_map_foreach(bd->gen, map_entry)
>> +         print_path("    ", map_entry->path, "\n");
>> +   }
>> +
>> +   if ((flags & DUMP_KILL) && !deref_map_empty(bd->kill)) {
>> +      printf("  kill\n");
>> +      deref_map_foreach(bd->kill, map_entry)
>> +         print_path("    ", map_entry->path, "\n");
>> +   }
>> +
>> +   if ((flags & DUMP_IN) && !deref_map_empty(bd->in)) {
>> +      printf("  in\n");
>> +      deref_map_foreach(bd->in, map_entry)
>> +         print_path("    ", map_entry->path, "\n");
>> +   }
>> +
>> +   if ((flags & DUMP_OUT) && !deref_map_empty(bd->out)) {
>> +      printf("  out\n");
>> +      deref_map_foreach(bd->out, map_entry)
>> +         print_path("    ", map_entry->path, "\n");
>> +   }
>> +
>> +   if ((flags & DUMP_UNUSED_WRITES) && bd->unused_writes.size != 0) {
>> +      printf("  unused writes\n");
>> +      util_dynarray_foreach(&bd->unused_writes, struct write_entry, entry) {
>> +         if (!entry->intrin)
>> +            continue;
>> +         print_instr("    ", &entry->intrin->instr, "");
>> +         print_path(" -- ", entry->path, "\n");
>> +      }
>> +   }
>> +}
>> +
>>  static bool
>>  remove_dead_write_vars_impl(void *mem_ctx, nir_function_impl *func)
>>  {
>> @@ -211,10 +575,161 @@ remove_dead_write_vars_impl(void *mem_ctx, nir_function_impl *func)
>>
>>        .path_lin_ctx = linear_alloc_parent(mem_ctx, 0),
>>        .paths = _mesa_hash_table_create(mem_ctx, _mesa_hash_pointer,
>>                                         _mesa_key_pointer_equal),
>> +
>> +      /* Count one extra data element for the end_block. */
>> +      .block_data = rzalloc_array(mem_ctx, struct block_data, func->num_blocks + 1),
>>     };
>>
>> -   nir_foreach_block(block, func)
>> -      progress |= remove_dead_write_vars_local(&state, block);
>> +   /* Step 1. Perform local optimization and initialize the GEN, KILL and
>> +    * UNUSED_WRITES sets.
>> +    */
>> +   bool all_gens_empty = true;
>> +   nir_foreach_block(block, func) {
>> +      struct block_data *bd = &state.block_data[block->index];
>> +      bd->gen = deref_map_create(mem_ctx);
>> +      bd->kill = deref_map_create(mem_ctx);
>> +      bd->out = deref_map_create(mem_ctx);
>> +
>> +      progress |= remove_dead_write_vars_local(&state, block, bd);
>> +
>> +      if (!deref_map_empty(bd->gen))
>> +         all_gens_empty = false;
>> +   }
>> +
>> +   if (func->num_blocks < 2 || all_gens_empty)
>> +      return progress;
>> +
>> +   /* Identify the derefs that are considered used by other intrinsics (or
>> +    * by the end of the program), then include them as part of the GEN of
>> +    * the blocks that contain those intrinsics.
>> +    */
>> +   struct deref_map *used_by_memory_barrier = deref_map_create(mem_ctx);
>> +   struct deref_map *used_by_emit_vertex = deref_map_create(mem_ctx);
>> +   struct deref_map *used_by_end_of_program = deref_map_create(mem_ctx);
>> +
>> +   for (int i = 0; i < func->num_blocks; i++) {
>> +      struct block_data *bd = &state.block_data[i];
>> +
>> +      util_dynarray_foreach(&bd->unused_writes, struct write_entry, entry) {
>> +         if (!entry->intrin)
>> +            continue;
>> +
>> +         nir_deref_path *dst_path = entry->path;
>> +         nir_variable *var = dst_path->path[0]->var;
>> +
>> +         if (var->data.mode & memory_barrier_modes)
>> +            deref_map_add(used_by_memory_barrier, dst_path, entry->mask);
>> +
>> +         if (var->data.mode & emit_vertex_modes)
>> +            deref_map_add(used_by_emit_vertex, dst_path, entry->mask);
>> +
>> +         if (var->data.mode & end_of_program_modes)
>> +            deref_map_add(used_by_end_of_program, dst_path, entry->mask);
>> +      }
>> +   }
>> +
>> +   /* Update GEN to include extra uses based on the presence of other
>> +    * intrinsics.  Make sure not to include uses for derefs we already
>> +    * overwrite (kill) before the intrinsic in the block.
>> +    */
>> +   for (int i = 0; i < func->num_blocks; i++) {
>> +      struct block_data *bd = &state.block_data[i];
>> +      if (bd->kill_before_memory_barrier) {
>> +         struct deref_map *temp = deref_map_clone(used_by_memory_barrier, mem_ctx);
>> +         deref_map_sub_map(temp, bd->kill_before_memory_barrier);
>> +         deref_map_add_map(bd->gen, temp);
>> +      }
>> +      if (bd->kill_before_emit_vertex) {
>> +         struct deref_map *temp = deref_map_clone(used_by_emit_vertex, mem_ctx);
>> +         deref_map_sub_map(temp, bd->kill_before_emit_vertex);
>> +         deref_map_add_map(bd->gen, temp);
>> +      }
>> +
>> +      /* Initialize the IN set with GEN, since we don't have OUT
>> +       * information yet.
>> +       */
>> +      bd->in = deref_map_clone(bd->gen, mem_ctx);
>> +   }
>> +
>> +   /* Since end_block is used as the successor of the last blocks
>> +    * executed, we can conveniently fill its IN set to propagate the
>> +    * usage.
>> +    */
>> +   struct block_data *end_bd = &state.block_data[func->end_block->index];
>> +   end_bd->in = used_by_end_of_program;
>> +
>> +   if (debug) {
>> +      printf("\n=== Before data-flow analysis\n");
>> +      for (int i = 0; i < func->num_blocks; i++)
>> +         dump_block(&state, i, DUMP_IN | DUMP_KILL | DUMP_UNUSED_WRITES);
>> +      dump_block(&state, func->end_block->index, DUMP_IN);
>> +   }
>> +
>> +   /* Step 2. Iterate backwards computing the OUT and IN sets.
>> +    *
>> +    *     OUT = union of the IN of all successors
>> +    *     IN  = GEN union (OUT minus KILL)
>> +    *
>> +    * The sets only ever grow, so we can keep updating IN in place instead
>> +    * of repeatedly recalculating it.  This also makes it easy to identify
>> +    * whether a set changed or not.
>> +    */
>> +   bool flow_progress;
>> +   int iterations = 0;
>> +   do {
>> +      flow_progress = false;
>> +
>> +      if (debug)
>> +         printf("\n=== Blocks with IN/OUT changed in iteration %d\n", ++iterations);
>> +
>> +      nir_foreach_block_reverse(block, func) {
>> +         if (block == func->end_block)
>> +            continue;
>> +
>> +         struct block_data *bd = &state.block_data[block->index];
>> +
>> +         bool out_changed = false;
>> +         for (int i = 0; i < 2; i++) {
>> +            nir_block *successor = block->successors[i];
>> +            if (!successor)
>> +               continue;
>> +            struct block_data *successor_bd = &state.block_data[successor->index];
>> +            out_changed |= deref_map_add_map(bd->out, successor_bd->in);
>> +         }
>> +
>> +         if (!out_changed)
>> +            continue;
>> +
>> +         struct deref_map *temp = deref_map_clone(bd->out, mem_ctx);
>> +         deref_map_sub_map(temp, bd->kill);
>> +
>> +         bool in_changed = deref_map_add_map(bd->in, temp);
>> +         flow_progress |= in_changed;
>> +
>> +         if (debug && in_changed)
>> +            dump_block(&state, block->index, DUMP_IN | DUMP_OUT);
>> +      }
>> +   } while (flow_progress);
>> +
>> +   if (debug)
>> +      printf("\n=== After data-flow analysis (%d iterations)\n", iterations);
>> +
>> +   /* Step 3. Remove unused writes based on the data-flow analysis. */
>> +   nir_foreach_block(block, func) {
>> +      struct block_data *bd = &state.block_data[block->index];
>> +
>> +      if (debug)
>> +         dump_block(&state, block->index, DUMP_OUT | DUMP_UNUSED_WRITES);
>> +
>> +      util_dynarray_foreach(&bd->unused_writes, struct write_entry, entry) {
>> +         if (!entry->intrin)
>> +            continue;
>> +         if (!deref_map_may_alias(bd->out, entry->path, entry->mask)) {
>> +            if (debug) {
>> +               print_instr("REMOVE: ", &entry->intrin->instr,
>> +                           "\n    (unused after data-flow analysis)\n");
>> +            }
>> +            nir_instr_remove(&entry->intrin->instr);
>> +            progress = true;
>> +         }
>> +      }
>> +   }
>>
>>     return progress;
>>  }
>> @@ -225,6 +740,9 @@ nir_opt_dead_write_vars(nir_shader *shader)
>>     void *mem_ctx = ralloc_context(NULL);
>>     bool progress = false;
>>
>> +   if (debug)
>> +      nir_print_shader(shader, stdout);
>> +
>>     nir_foreach_function(function, shader) {
>>        if (!function->impl)
>>           continue;
>> --
>> 2.18.0
>>