[Mesa-dev] [RFC 1/2] nir: Add a new lowering pass nir_lower_vec_and_coalesce

Wed Sep 9 23:51:01 PDT 2015

On 09/09/2015 07:10 PM, Jason Ekstrand wrote:
> 
> On Sep 8, 2015 23:27, "Eduardo Lima Mitev" <elima at igalia.com
> <mailto:elima at igalia.com>> wrote:
>>
>> This pass will propagate the destination components of a vecN
> instructions,
>> as destination of the instructions that define its sources; if certain
>> conditions are met.
>>
>> If all the components of the destination register in the vecN instruction
>> can be propagated, the instruction is removed. Otherwise, a new, reduced
>> vecN instruction is emitted with the channels that remained.
>>
>> This effectively coalesces registers and reduces indirection.
>>
>> By now, this pass will only propagate to ALU instructions, but it could
>> be extended to include other instructions like load_input intrinsic.
>>
>> It also propagates to instructions within the same block as the vecN
>> instruction. But it could be made to work cross-block in the future,
>> though there are non-trivial issues with this like considering
>> registers that are written in different branches of a conditional.
>> More analysis is needed to correctly cover these cases.
>>
>> This pass works on a NIR shader in final form (after SSA), and is
>> expected to run before nir_lower_vec_to_movs().
>> ---
>>  src/glsl/Makefile.sources                 |   1 +
>>  src/glsl/nir/nir.h                        |   1 +
>>  src/glsl/nir/nir_lower_vec_and_coalesce.c | 301
> ++++++++++++++++++++++++++++++
>>  3 files changed, 303 insertions(+)
>>  create mode 100644 src/glsl/nir/nir_lower_vec_and_coalesce.c
>>
>> diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources
>> index c422303..015f242 100644
>> --- a/src/glsl/Makefile.sources
>> +++ b/src/glsl/Makefile.sources
>> @@ -48,6 +48,7 @@ NIR_FILES = \
>>         nir/nir_lower_vars_to_ssa.c \
>>         nir/nir_lower_var_copies.c \
>>         nir/nir_lower_vec_to_movs.c \
>> +       nir/nir_lower_vec_and_coalesce.c \
>>         nir/nir_metadata.c \
>>         nir/nir_normalize_cubemap_coords.c \
>>         nir/nir_opt_constant_folding.c \
>> diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
>> index 3c375f3..6a89f1d 100644
>> --- a/src/glsl/nir/nir.h
>> +++ b/src/glsl/nir/nir.h
>> @@ -1786,6 +1786,7 @@ void nir_lower_vars_to_ssa(nir_shader *shader);
>>  void nir_remove_dead_variables(nir_shader *shader);
>>
>>  void nir_lower_vec_to_movs(nir_shader *shader);
>> +void nir_lower_vec_and_coalesce(nir_shader *shader);
>>  void nir_lower_alu_to_scalar(nir_shader *shader);
>>  void nir_lower_load_const_to_scalar(nir_shader *shader);
>>
>> diff --git a/src/glsl/nir/nir_lower_vec_and_coalesce.c
> b/src/glsl/nir/nir_lower_vec_and_coalesce.c
>> new file mode 100644
>> index 0000000..2b21ec1
>> --- /dev/null
>> +++ b/src/glsl/nir/nir_lower_vec_and_coalesce.c
>> @@ -0,0 +1,301 @@
>> +/*
>> + * Copyright © 2015 Intel Corporation
>> + *
>> + * Permission is hereby granted, free of charge, to any person
> obtaining a
>> + * copy of this software and associated documentation files (the
> "Software"),
>> + * to deal in the Software without restriction, including without
> limitation
>> + * the rights to use, copy, modify, merge, publish, distribute,
> sublicense,
>> + * and/or sell copies of the Software, and to permit persons to whom the
>> + * Software is furnished to do so, subject to the following conditions:
>> + *
>> + * The above copyright notice and this permission notice (including
> the next
>> + * paragraph) shall be included in all copies or substantial portions
> of the
>> + * Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT
> SHALL
>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
> OR OTHER
>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> ARISING
>> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> OTHER DEALINGS
>> + * IN THE SOFTWARE.
>> + *
>> + * Authors:
>> + *    Eduardo Lima Mitev (elima at igalia.com <mailto:elima at igalia.com>)
>> + *
>> + */
>> +
>> +#include "nir.h"
>> +
>> +/*
>> + * Implements a pass that lowers vecN instructions by propagating the
>> + * components of their destinations, as the destination of the
>> + * instructions that defines the sources of the vecN instruction.
>> + *
>> + * This effectively coalesces registers and reduces indirection.
>> + *
>> + * If all the components of the destination register in the vecN
>> + * instruction can be propagated, the instruction is removed. Otherwise,
>> + * a new, reduced vecN instruction is emitted with the channels that
>> + * remained.
>> + *
>> + * By now, this pass will only propagate to ALU instructions, but it
> could
>> + * be extended to include load_const instructions or some intrinsics like
>> + * load_input.
>> + *
>> + * This pass works on a NIR shader in final form (after SSA), and is
>> + * expected to run before nir_lower_vec_to_movs().
>> + */
>> +
>> +/**
>> + * Clone an ALU instruction and override the destination with the one
> given by
>> + * new_dest. It copies sources from original ALU to the new one,
> adjusting
>> + * their swizzles.
>> + *
>> + * Returns the new ALU instruction.
>> + */
>> +static nir_alu_instr *
>> +clone_alu_instr_and_override_dest(nir_alu_instr *alu_instr,
>> +                                  nir_alu_dest *new_dest, unsigned index,
>> +                                  void *mem_ctx)
>> +{
>> +   nir_alu_instr *new_alu_instr = nir_alu_instr_create(mem_ctx,
> alu_instr->op);
>> +
>> +   for (unsigned i = 0; i < nir_op_infos[alu_instr->op].num_inputs;
> i++) {
>> +      nir_alu_src_copy(&new_alu_instr->src[i], &alu_instr->src[i],
> mem_ctx);
>> +
>> +      /* Dot-product operations should not be swizzle'd */
>> +      switch (alu_instr->op) {
>> +      case nir_op_fdot2:
>> +      case nir_op_fdot3:
>> +      case nir_op_fdot4:
>> +         continue;
>> +      default:
>> +         break;
>> +      }
> 
> You should use nir_op_infos[alu_instr->op].output_size == 0 to determine
> whether or not the instruction is per-channel.
> 
> You also need to check that nir_op_infos[op].input_sizes[i] == 0 to make
> sure that the components of the source actually map to components of of
> the destination.  If output_size == 0 but input_sizes[i] != 0 for some
> source, we should probably bail entirely.  The one exception to this
> that I can think of off-hand is bcsel where it has a single
> one-component source and you are free to reswizzle the others.
> 

Good to know! I will definitely consider this in the future.

>> +      new_alu_instr->src[i].swizzle[index] =
>> +         alu_instr->src[i].swizzle[0];
>> +   }
>> +
>> +   nir_alu_dest_copy(&new_alu_instr->dest, new_dest, mem_ctx);
>> +   new_alu_instr->dest.write_mask = 1 << index;
>> +
>> +   return new_alu_instr;
>> +}
>> +
>> +/**
>> + * Simply searches an entry in the list, and return true if found, false
>> + * otherwise.
>> + */
>> +static bool
>> +register_already_tracked(const nir_register *reg, nir_register *list[4],
>> +                         unsigned num_items)
>> +{
>> +   for (unsigned i = 0; i < num_items; i++) {
>> +      if (list[i] == reg)
>> +         return true;
>> +   }
>> +   return false;
>> +}
>> +
>> +/**
>> + * Tells whether an instruction comes before another one inside the same
>> + * block. Return true if 'target' appears after 'other', false otherwise.
>> + */
>> +static bool
>> +instr_is_previous_in_block(nir_instr *target, nir_instr *other)
>> +{
>> +   nir_instr *prev = target;
>> +   while ((prev = nir_instr_prev(prev)) != NULL) {
>> +      if (prev == other)
>> +         return true;
>> +   }
>> +   return false;
>> +}
>> +
>> +static bool
>> +lower_vec_and_coalesce_block(nir_block *block, void *mem_ctx)
>> +{
>> +   nir_foreach_instr_safe(block, instr) {
>> +      if (instr->type != nir_instr_type_alu)
>> +         continue;
>> +
>> +      nir_alu_instr *vec = nir_instr_as_alu(instr);
>> +
>> +      switch (vec->op) {
>> +      case nir_op_vec2:
>> +      case nir_op_vec3:
>> +      case nir_op_vec4:
>> +         break;
>> +      default:
>> +         continue; /* The loop */
>> +      }
>> +
>> +      /* Since we insert multiple MOVs, we have to be non-SSA. */
>> +      assert(!vec->dest.dest.is_ssa);
>> +
>> +      unsigned finished_write_mask = 0;
>> +      nir_register *tracked_registers[4] = {0};
>> +      unsigned num_tracked_registers = 0;
>> +
>> +      for (unsigned i = 0; i < 4; i++) {
>> +         if (!(vec->dest.write_mask & (1 << i)))
>> +            continue;
>> +
>> +         if (vec->src[i].src.is_ssa)
>> +            continue;
>> +
>> +         nir_register *reg = vec->src[i].src.reg.reg;
>> +
>> +         /* By now, lets ignore the sources whose register is the
> same as the
>> +          * destination register of the vecN instruction.
>> +          */
>> +         if (reg == vec->dest.dest.reg.reg)
>> +            continue;
>> +
>> +         nir_foreach_def_safe(reg, src) {
>> +            nir_instr *parent_instr = src->reg.parent_instr;
>> +
>> +            /* By now, we enforce that the parent instruction must be
> in the
>> +             * same block as the vecN instruction. This is because if
> it is in
>> +             * other block, like a branch of a conditional, we need
> to be sure
>> +             * that the register component is propagated in all branches.
>> +             * Otherwise we cannot reduce the current vecN instruction.
>> +             * @TODO: To extend this pass to work across blocks, more
> analysis
>> +             * is needed.
>> +             */
>> +            if (parent_instr->block != block)
>> +               continue;
>> +
>> +            /* The instruction to propagate to must come before the vecN
>> +             * instruction inside the block.
>> +             */
>> +            if (!instr_is_previous_in_block(instr, parent_instr))
>> +               continue;
>> +
>> +            /* We only coalesce registers written by ALU
> instructions, by now.
>> +             * @TODO: consider other type of instructions, like
> intrinsics, etc.
>> +             */
>> +            if (parent_instr->type != nir_instr_type_alu)
>> +               continue;
>> +
>> +            nir_alu_instr *parent_alu_instr =
> nir_instr_as_alu(parent_instr);
>> +            nir_register *parent_dest_reg =
> parent_alu_instr->dest.dest.reg.reg;
>> +
>> +            /* We only override dest registers that are only used
> once, and in
>> +             * this vecX instruction.
>> +             * @TODO: In the future we might consider registers used
> more than
>> +             * once as sources of the same vecX instruction.
>> +             */
> 
> Yeah, we definitely want to handle that case and we want to handle it by
> emitting a single instruction that does multiple channels.
> 

I started implementing that as part of of this patch but quickly
realized it was not trivial and basically postponed it until I got some
feedback for the basics.

>> +            if (list_length(&parent_dest_reg->uses) != 1)
>> +               continue;
>> +
>> +            nir_alu_instr *new_alu_instr =
>> +               clone_alu_instr_and_override_dest(parent_alu_instr,
> &vec->dest,
>> +                                                 i, mem_ctx);
> 
> This mess would be a lot easier if we were still in SSA form.  I've been
> thinking for a while that we should make lower_vec_to_movs work in SSA
> form (it would replace SSA destinations with register destinations). 
> Then use partial SSA lowering like we do in the FS backend.  That would
> make this pass much easier together correct.
>

I didn't know about the partial SSA pass in FS backend, I could have got
some inspiration there. But did tried hard to implement this while in
SSA form at the beginning, without much success because the combination
of nir_from_ssa and lower_vec_to_movs would screw things after any SSA pass.

> In general, I'm kind of thinking that this might be better done as an
> extension to the lower_vec_to_moves pass or as a new version of it.  You
> would loop through the sources of a vecN and, if you can rewrite a
> destination, you do and if you can't, you insert moves.  We may also
> want to  it handle SSA while we're at it.  Note: it will get run after
> nir_from_ssa so it will have to handle partial SSA where some
> destinations are SSA and some are registers.
> 

Actually, the first version of this patch, I wrote it against
lower_vec_to_movs, but then Matt suggested that likely it would be
easier for the opt_register_coalsece pass to avoid lower_vec_to_movs and
deal with vecN instructions in the backend. Then I rewrote this pass as
a separate thing that would re-emit reduced vecN instructions, to
essentially make it compatible with Matt's suggestion, thus future proof :).

> Does this seem reasonable?
>

Absolutely. The first thing I noticed is that lowering stuff in NIR out
of SSA form is not very elegant and also not very well supported (lack
of utility APIs).

I see you sent a series to the ML implementing some of the ideas in this
patch. The shader-db results are impressive!
I will use that as training material :) and follow the review process
closely. This also means I stop working on this pass.

Thanks a lot for the review!

Eduardo

>> +            /* For imov and fmov operations, we set the original
> swizzle from
>> +             * the vecN source, directly into the new instruction's only
>> +             * source.
>> +             */
>> +            if (parent_alu_instr->op == nir_op_imov ||
>> +                parent_alu_instr->op == nir_op_fmov) {
>> +               new_alu_instr->src[0].swizzle[0] = vec->src[i].swizzle[i];
>> +            }
>> +
>> +            finished_write_mask |= new_alu_instr->dest.write_mask;
>> +
>> +            /* Remove the old instruction */
>> +            nir_instr_remove(&parent_alu_instr->instr);
>> +            ralloc_free(parent_alu_instr);
>> +
>> +            /* Track the intermediate register, to remove it later if
> not used */
>> +            if (!register_already_tracked(parent_dest_reg,
> tracked_registers,
>> +                                          num_tracked_registers)) {
>> +               tracked_registers[num_tracked_registers] =
> parent_dest_reg;
>> +               num_tracked_registers++;
>> +            }
>> +
>> +            /* Insert the new instruction with the overwritten
> destination */
>> +            nir_instr_insert_before(&vec->instr, &new_alu_instr->instr);
>> +         }
>> +      }
>> +
>> +      if (finished_write_mask == 0)
>> +         continue;
>> +
>> +      nir_alu_instr *new_alu_instr = nir_alu_instr_create(mem_ctx,
> nir_op_vec4);
>> +      nir_alu_dest_copy(&new_alu_instr->dest, &vec->dest, mem_ctx);
>> +      new_alu_instr->dest.write_mask = 0;
>> +
>> +      unsigned c = 0;
>> +      for (unsigned i = 0; i < nir_op_infos[vec->op].num_inputs; i++) {
>> +         if (!(vec->dest.write_mask & (1 << i)))
>> +            continue;
>> +
>> +         if (finished_write_mask & (1 << i))
>> +            continue;
>> +
>> +         nir_alu_src_copy(&new_alu_instr->src[c], &vec->src[i], mem_ctx);
>> +         new_alu_instr->src[c].swizzle[i] = vec->src[i].swizzle[c];
>> +
>> +         new_alu_instr->dest.write_mask |= (1 << i);
>> +
>> +         c++;
>> +      }
>> +
>> +      switch (c) {
>> +      case 0:
>> +         /* we coalesced all register components, so no need to emit any
>> +          * reduced instruction.
>> +          */
>> +         ralloc_free(new_alu_instr);
>> +         break;
>> +      case 1:
>> +         new_alu_instr->op = nir_op_imov;
>> +         break;
>> +      case 2:
>> +         new_alu_instr->op = nir_op_vec2;
>> +         break;
>> +      case 3:
>> +         new_alu_instr->op = nir_op_vec3;
>> +         break;
>> +      default:
>> +         unreachable("Not reached");
>> +      }
>> +
>> +      if (c != 0)
>> +         nir_instr_insert_before(&vec->instr, &new_alu_instr->instr);
>> +
>> +      /* Remove the original vecN instruction */
>> +      nir_instr_remove(&vec->instr);
>> +      ralloc_free(vec);
>> +
>> +      /* Remove the tracked registers that are no longer used */
>> +      for (unsigned i = 0; i < num_tracked_registers; i++) {
>> +         if (list_length(&tracked_registers[i]->defs) == 0 &&
>> +             list_length(&tracked_registers[i]->uses) == 0 &&
>> +             list_length(&tracked_registers[i]->if_uses) == 0) {
>> +            nir_reg_remove(tracked_registers[i]);
>> +         }
>> +      }
>> +   }
>> +
>> +   return true;
>> +}
>> +
>> +static void
>> +nir_lower_vec_and_coalesce_impl(nir_function_impl *impl)
>> +{
>> +   nir_foreach_block(impl, lower_vec_and_coalesce_block,
> ralloc_parent(impl));
>> +}
>> +
>> +void
>> +nir_lower_vec_and_coalesce(nir_shader *shader)
>> +{
>> +   nir_foreach_overload(shader, overload) {
>> +      if (overload->impl)
>> +         nir_lower_vec_and_coalesce_impl(overload->impl);
>> +   }
>> +}
>> --
>> 2.4.6
>>
>