[Mesa-dev] [RFC 1/2] nir: Add a new lowering pass nir_lower_vec_and_coalesce

Jason Ekstrand jason at jlekstrand.net
Thu Sep 10 11:45:19 PDT 2015


On Wed, Sep 9, 2015 at 11:51 PM, Eduardo Lima Mitev <elima at igalia.com> wrote:
> On 09/09/2015 07:10 PM, Jason Ekstrand wrote:
>>
>> On Sep 8, 2015 23:27, "Eduardo Lima Mitev" <elima at igalia.com> wrote:
>>>
>>> This pass propagates the destination components of a vecN instruction
>>> as the destinations of the instructions that define its sources, if
>>> certain conditions are met.
>>>
>>> If all the components of the destination register in the vecN instruction
>>> can be propagated, the instruction is removed. Otherwise, a new, reduced
>>> vecN instruction is emitted with the channels that remain.
>>>
>>> This effectively coalesces registers and reduces indirection.
>>>
>>> For now, this pass only propagates to ALU instructions, but it could
>>> be extended to include other instructions, like the load_input intrinsic.
>>>
>>> It also only propagates to instructions within the same block as the
>>> vecN instruction. It could be made to work cross-block in the future,
>>> though there are non-trivial issues with this, like handling registers
>>> that are written in different branches of a conditional. More analysis
>>> is needed to cover these cases correctly.
>>>
>>> This pass works on a NIR shader in final form (after SSA), and is
>>> expected to run before nir_lower_vec_to_movs().
>>> ---
>>>  src/glsl/Makefile.sources                 |   1 +
>>>  src/glsl/nir/nir.h                        |   1 +
>>>  src/glsl/nir/nir_lower_vec_and_coalesce.c | 301 ++++++++++++++++++++++++++++++
>>>  3 files changed, 303 insertions(+)
>>>  create mode 100644 src/glsl/nir/nir_lower_vec_and_coalesce.c
>>>
>>> diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources
>>> index c422303..015f242 100644
>>> --- a/src/glsl/Makefile.sources
>>> +++ b/src/glsl/Makefile.sources
>>> @@ -48,6 +48,7 @@ NIR_FILES = \
>>>         nir/nir_lower_vars_to_ssa.c \
>>>         nir/nir_lower_var_copies.c \
>>>         nir/nir_lower_vec_to_movs.c \
>>> +       nir/nir_lower_vec_and_coalesce.c \
>>>         nir/nir_metadata.c \
>>>         nir/nir_normalize_cubemap_coords.c \
>>>         nir/nir_opt_constant_folding.c \
>>> diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
>>> index 3c375f3..6a89f1d 100644
>>> --- a/src/glsl/nir/nir.h
>>> +++ b/src/glsl/nir/nir.h
>>> @@ -1786,6 +1786,7 @@ void nir_lower_vars_to_ssa(nir_shader *shader);
>>>  void nir_remove_dead_variables(nir_shader *shader);
>>>
>>>  void nir_lower_vec_to_movs(nir_shader *shader);
>>> +void nir_lower_vec_and_coalesce(nir_shader *shader);
>>>  void nir_lower_alu_to_scalar(nir_shader *shader);
>>>  void nir_lower_load_const_to_scalar(nir_shader *shader);
>>>
>>> diff --git a/src/glsl/nir/nir_lower_vec_and_coalesce.c b/src/glsl/nir/nir_lower_vec_and_coalesce.c
>>> new file mode 100644
>>> index 0000000..2b21ec1
>>> --- /dev/null
>>> +++ b/src/glsl/nir/nir_lower_vec_and_coalesce.c
>>> @@ -0,0 +1,301 @@
>>> +/*
>>> + * Copyright © 2015 Intel Corporation
>>> + *
>>> + * Permission is hereby granted, free of charge, to any person obtaining a
>>> + * copy of this software and associated documentation files (the "Software"),
>>> + * to deal in the Software without restriction, including without limitation
>>> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
>>> + * and/or sell copies of the Software, and to permit persons to whom the
>>> + * Software is furnished to do so, subject to the following conditions:
>>> + *
>>> + * The above copyright notice and this permission notice (including the next
>>> + * paragraph) shall be included in all copies or substantial portions of the
>>> + * Software.
>>> + *
>>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
>>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
>>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
>>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
>>> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
>>> + * IN THE SOFTWARE.
>>> + *
>>> + * Authors:
>>> + *    Eduardo Lima Mitev (elima at igalia.com)
>>> + *
>>> + */
>>> +
>>> +#include "nir.h"
>>> +
>>> +/*
>>> + * Implements a pass that lowers vecN instructions by propagating the
>>> + * components of their destinations as the destinations of the
>>> + * instructions that define their sources.
>>> + *
>>> + * This effectively coalesces registers and reduces indirection.
>>> + *
>>> + * If all the components of the destination register in the vecN
>>> + * instruction can be propagated, the instruction is removed. Otherwise,
>>> + * a new, reduced vecN instruction is emitted with the channels that
>>> + * remain.
>>> + *
>>> + * For now, this pass only propagates to ALU instructions, but it could
>>> + * be extended to include load_const instructions or some intrinsics like
>>> + * load_input.
>>> + *
>>> + * This pass works on a NIR shader in final form (after SSA), and is
>>> + * expected to run before nir_lower_vec_to_movs().
>>> + */
>>> +
>>> +/**
>>> + * Clone an ALU instruction and override its destination with the one
>>> + * given by new_dest. It copies the sources from the original ALU to the
>>> + * new one, adjusting their swizzles.
>>> + *
>>> + * Returns the new ALU instruction.
>>> + */
>>> +static nir_alu_instr *
>>> +clone_alu_instr_and_override_dest(nir_alu_instr *alu_instr,
>>> +                                  nir_alu_dest *new_dest, unsigned index,
>>> +                                  void *mem_ctx)
>>> +{
>>> +   nir_alu_instr *new_alu_instr = nir_alu_instr_create(mem_ctx, alu_instr->op);
>>> +
>>> +   for (unsigned i = 0; i < nir_op_infos[alu_instr->op].num_inputs; i++) {
>>> +      nir_alu_src_copy(&new_alu_instr->src[i], &alu_instr->src[i], mem_ctx);
>>> +
>>> +      /* Dot-product operations should not be swizzled */
>>> +      switch (alu_instr->op) {
>>> +      case nir_op_fdot2:
>>> +      case nir_op_fdot3:
>>> +      case nir_op_fdot4:
>>> +         continue;
>>> +      default:
>>> +         break;
>>> +      }
>>
>> You should use nir_op_infos[alu_instr->op].output_size == 0 to determine
>> whether or not the instruction is per-channel.
>>
>> You also need to check that nir_op_infos[op].input_sizes[i] == 0 to make
>> sure that the components of the source actually map to components of
>> the destination.  If output_size == 0 but input_sizes[i] != 0 for some
>> source, we should probably bail entirely.  The one exception to this
>> that I can think of off-hand is bcsel where it has a single
>> one-component source and you are free to reswizzle the others.
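>>
>> Roughly (an untested sketch; the helper name is mine, not an existing
>> NIR function):
>>
>>    static bool
>>    instr_is_per_channel(const nir_alu_instr *alu)
>>    {
>>       const nir_op_info *info = &nir_op_infos[alu->op];
>>
>>       /* output_size == 0 means the instruction is per-channel: component
>>        * i of the destination depends only on component i of each
>>        * per-channel source.
>>        */
>>       if (info->output_size != 0)
>>          return false;
>>
>>       /* A source with a fixed size doesn't map 1:1 to destination
>>        * components, so bail (modulo exceptions like bcsel's
>>        * one-component source).
>>        */
>>       for (unsigned i = 0; i < info->num_inputs; i++) {
>>          if (info->input_sizes[i] != 0)
>>             return false;
>>       }
>>
>>       return true;
>>    }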
>>
>
> Good to know! I will definitely consider this in the future.
>
>>> +      new_alu_instr->src[i].swizzle[index] =
>>> +         alu_instr->src[i].swizzle[0];
>>> +   }
>>> +
>>> +   nir_alu_dest_copy(&new_alu_instr->dest, new_dest, mem_ctx);
>>> +   new_alu_instr->dest.write_mask = 1 << index;
>>> +
>>> +   return new_alu_instr;
>>> +}
>>> +
>>> +/**
>>> + * Searches for an entry in the list; returns true if found, false
>>> + * otherwise.
>>> + */
>>> +static bool
>>> +register_already_tracked(const nir_register *reg, nir_register *list[4],
>>> +                         unsigned num_items)
>>> +{
>>> +   for (unsigned i = 0; i < num_items; i++) {
>>> +      if (list[i] == reg)
>>> +         return true;
>>> +   }
>>> +   return false;
>>> +}
>>> +
>>> +/**
>>> + * Tells whether one instruction comes before another inside the same
>>> + * block. Returns true if 'target' appears after 'other', false otherwise.
>>> + */
>>> +static bool
>>> +instr_is_previous_in_block(nir_instr *target, nir_instr *other)
>>> +{
>>> +   nir_instr *prev = target;
>>> +   while ((prev = nir_instr_prev(prev)) != NULL) {
>>> +      if (prev == other)
>>> +         return true;
>>> +   }
>>> +   return false;
>>> +}
>>> +
>>> +static bool
>>> +lower_vec_and_coalesce_block(nir_block *block, void *mem_ctx)
>>> +{
>>> +   nir_foreach_instr_safe(block, instr) {
>>> +      if (instr->type != nir_instr_type_alu)
>>> +         continue;
>>> +
>>> +      nir_alu_instr *vec = nir_instr_as_alu(instr);
>>> +
>>> +      switch (vec->op) {
>>> +      case nir_op_vec2:
>>> +      case nir_op_vec3:
>>> +      case nir_op_vec4:
>>> +         break;
>>> +      default:
>>> +         continue; /* The loop */
>>> +      }
>>> +
>>> +      /* Since we insert multiple MOVs, we have to be non-SSA. */
>>> +      assert(!vec->dest.dest.is_ssa);
>>> +
>>> +      unsigned finished_write_mask = 0;
>>> +      nir_register *tracked_registers[4] = {0};
>>> +      unsigned num_tracked_registers = 0;
>>> +
>>> +      for (unsigned i = 0; i < 4; i++) {
>>> +         if (!(vec->dest.write_mask & (1 << i)))
>>> +            continue;
>>> +
>>> +         if (vec->src[i].src.is_ssa)
>>> +            continue;
>>> +
>>> +         nir_register *reg = vec->src[i].src.reg.reg;
>>> +
>>> +         /* For now, let's ignore sources whose register is the same as
>>> +          * the destination register of the vecN instruction.
>>> +          */
>>> +         if (reg == vec->dest.dest.reg.reg)
>>> +            continue;
>>> +
>>> +         nir_foreach_def_safe(reg, src) {
>>> +            nir_instr *parent_instr = src->reg.parent_instr;
>>> +
>>> +            /* For now, we enforce that the parent instruction must be in
>>> +             * the same block as the vecN instruction. If it is in another
>>> +             * block, like a branch of a conditional, we need to be sure
>>> +             * that the register component is propagated in all branches.
>>> +             * Otherwise we cannot reduce the current vecN instruction.
>>> +             * @TODO: To extend this pass to work across blocks, more
>>> +             * analysis is needed.
>>> +             */
>>> +            if (parent_instr->block != block)
>>> +               continue;
>>> +
>>> +            /* The instruction to propagate to must come before the vecN
>>> +             * instruction inside the block.
>>> +             */
>>> +            if (!instr_is_previous_in_block(instr, parent_instr))
>>> +               continue;
>>> +
>>> +            /* For now, we only coalesce registers written by ALU
>>> +             * instructions.
>>> +             * @TODO: consider other types of instructions, like
>>> +             * intrinsics, etc.
>>> +             */
>>> +            if (parent_instr->type != nir_instr_type_alu)
>>> +               continue;
>>> +
>>> +            nir_alu_instr *parent_alu_instr = nir_instr_as_alu(parent_instr);
>>> +            nir_register *parent_dest_reg = parent_alu_instr->dest.dest.reg.reg;
>>> +
>>> +            /* We only override dest registers that are used exactly once,
>>> +             * and in this vecN instruction.
>>> +             * @TODO: In the future we might consider registers used more
>>> +             * than once as sources of the same vecN instruction.
>>> +             */
>>
>> Yeah, we definitely want to handle that case and we want to handle it by
>> emitting a single instruction that does multiple channels.
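>>
>> Roughly (untested sketch, reusing the names from the patch): once you
>> find the first vecN channel fed by a given register, collect every
>> other channel fed by that same register into one write mask, then emit
>> a single clone for all of them, replicating each source's swizzle into
>> every channel that is set:
>>
>>    /* Gather all vecN channels whose source is parent_dest_reg */
>>    unsigned mask = 0;
>>    for (unsigned j = i; j < 4; j++) {
>>       if ((vec->dest.write_mask & (1 << j)) &&
>>           !vec->src[j].src.is_ssa &&
>>           vec->src[j].src.reg.reg == parent_dest_reg)
>>          mask |= 1 << j;
>>    }
>>
>>    /* One clone covers all of those channels at once */
>>    new_alu_instr->dest.write_mask = mask;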
>>
>
> I started implementing that as part of this patch but quickly
> realized it was not trivial and basically postponed it until I got some
> feedback for the basics.
>
>>> +            if (list_length(&parent_dest_reg->uses) != 1)
>>> +               continue;
>>> +
>>> +            nir_alu_instr *new_alu_instr =
>>> +               clone_alu_instr_and_override_dest(parent_alu_instr, &vec->dest,
>>> +                                                 i, mem_ctx);
>>
>> This mess would be a lot easier if we were still in SSA form.  I've been
>> thinking for a while that we should make lower_vec_to_movs work in SSA
>> form (it would replace SSA destinations with register destinations).
>> Then use partial SSA lowering like we do in the FS backend.  That would
>> make this pass much easier to get correct.
>>
>
> I didn't know about the partial SSA pass in the FS backend; I could have
> gotten some inspiration there. I did try hard to implement this while in
> SSA form at the beginning, without much success, because the combination
> of nir_from_ssa and lower_vec_to_movs would break things after any SSA pass.

Yeah, partial SSA was something that's actually fairly recent and
happened in parallel with the vec4 stuff.

>> In general, I'm kind of thinking that this might be better done as an
>> extension to the lower_vec_to_movs pass or as a new version of it.  You
>> would loop through the sources of a vecN and, if you can rewrite a
>> destination, you do; if you can't, you insert moves.  We may also want
>> to make it handle SSA while we're at it.  Note: it will get run after
>> nir_from_ssa, so it will have to handle partial SSA where some
>> destinations are SSA and some are registers.
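>>
>> In pseudocode, per vecN channel (try_coalesce and insert_mov are
>> placeholders here, not existing functions):
>>
>>    for (unsigned i = 0; i < 4; i++) {
>>       if (!(vec->dest.write_mask & (1 << i)))
>>          continue;
>>
>>       /* Try to rewrite the producer's destination so it writes this
>>        * channel of the vec's destination directly.
>>        */
>>       if (try_coalesce(vec, i))
>>          continue;
>>
>>       /* Otherwise, fall back to emitting a MOV for this channel. */
>>       insert_mov(vec, i, mem_ctx);
>>    }
>>    nir_instr_remove(&vec->instr);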
>>
>
> Actually, I wrote the first version of this patch against
> lower_vec_to_movs, but then Matt suggested it would likely be easier to
> avoid lower_vec_to_movs and have the backend's opt_register_coalesce
> pass deal with vecN instructions. So I rewrote this pass as a separate
> thing that re-emits reduced vecN instructions, to essentially make it
> compatible with Matt's suggestion, and thus future-proof :).

Hrm... I'm not sure why Matt suggested it be its own pass, but I can
kind of see it.  At least in my implementation, it seemed to work
fairly naturally to just put it in lower_vec_to_movs.  Is there some
place where I can see your original pass that did it there?  If you
don't have it handy, don't do a lot of work to dig it up; I'm mostly
curious.

>> Does this seem reasonable?
>>
>
> Absolutely. The first thing I noticed is that lowering stuff in NIR out
> of SSA form is not very elegant and also not very well supported (lack
> of utility APIs).
>
> I see you sent a series to the ML implementing some of the ideas in this
> patch. The shader-db results are impressive!
> I will use that as training material :) and follow the review process
> closely. This also means I'll stop working on this pass.

I'm sorry if I stole your thunder; I didn't mean to.  It was just one
of those things where it takes fewer lines of code to implement it
than it takes lines of prose to explain it to someone.  Most of the
work (once I saw what you were doing) was just in fixing core NIR bugs
to make it all work.
--Jason

> Thanks a lot for the review!
>
> Eduardo
>
>>> +            /* For imov and fmov operations, we set the original swizzle
>>> +             * from the vecN source directly into the new instruction's
>>> +             * only source.
>>> +             */
>>> +            if (parent_alu_instr->op == nir_op_imov ||
>>> +                parent_alu_instr->op == nir_op_fmov) {
>>> +               new_alu_instr->src[0].swizzle[0] = vec->src[i].swizzle[i];
>>> +            }
>>> +
>>> +            finished_write_mask |= new_alu_instr->dest.write_mask;
>>> +
>>> +            /* Remove the old instruction */
>>> +            nir_instr_remove(&parent_alu_instr->instr);
>>> +            ralloc_free(parent_alu_instr);
>>> +
>>> +            /* Track the intermediate register, to remove it later if not used */
>>> +            if (!register_already_tracked(parent_dest_reg, tracked_registers,
>>> +                                          num_tracked_registers)) {
>>> +               tracked_registers[num_tracked_registers] = parent_dest_reg;
>>> +               num_tracked_registers++;
>>> +            }
>>> +
>>> +            /* Insert the new instruction with the overwritten destination */
>>> +            nir_instr_insert_before(&vec->instr, &new_alu_instr->instr);
>>> +         }
>>> +      }
>>> +
>>> +      if (finished_write_mask == 0)
>>> +         continue;
>>> +
>>> +      nir_alu_instr *new_alu_instr = nir_alu_instr_create(mem_ctx, nir_op_vec4);
>>> +      nir_alu_dest_copy(&new_alu_instr->dest, &vec->dest, mem_ctx);
>>> +      new_alu_instr->dest.write_mask = 0;
>>> +
>>> +      unsigned c = 0;
>>> +      for (unsigned i = 0; i < nir_op_infos[vec->op].num_inputs; i++) {
>>> +         if (!(vec->dest.write_mask & (1 << i)))
>>> +            continue;
>>> +
>>> +         if (finished_write_mask & (1 << i))
>>> +            continue;
>>> +
>>> +         nir_alu_src_copy(&new_alu_instr->src[c], &vec->src[i], mem_ctx);
>>> +         new_alu_instr->src[c].swizzle[i] = vec->src[i].swizzle[c];
>>> +
>>> +         new_alu_instr->dest.write_mask |= (1 << i);
>>> +
>>> +         c++;
>>> +      }
>>> +
>>> +      switch (c) {
>>> +      case 0:
>>> +         /* We coalesced all register components, so no need to emit any
>>> +          * reduced instruction.
>>> +          */
>>> +         ralloc_free(new_alu_instr);
>>> +         break;
>>> +      case 1:
>>> +         new_alu_instr->op = nir_op_imov;
>>> +         break;
>>> +      case 2:
>>> +         new_alu_instr->op = nir_op_vec2;
>>> +         break;
>>> +      case 3:
>>> +         new_alu_instr->op = nir_op_vec3;
>>> +         break;
>>> +      default:
>>> +         unreachable("Not reached");
>>> +      }
>>> +
>>> +      if (c != 0)
>>> +         nir_instr_insert_before(&vec->instr, &new_alu_instr->instr);
>>> +
>>> +      /* Remove the original vecN instruction */
>>> +      nir_instr_remove(&vec->instr);
>>> +      ralloc_free(vec);
>>> +
>>> +      /* Remove the tracked registers that are no longer used */
>>> +      for (unsigned i = 0; i < num_tracked_registers; i++) {
>>> +         if (list_length(&tracked_registers[i]->defs) == 0 &&
>>> +             list_length(&tracked_registers[i]->uses) == 0 &&
>>> +             list_length(&tracked_registers[i]->if_uses) == 0) {
>>> +            nir_reg_remove(tracked_registers[i]);
>>> +         }
>>> +      }
>>> +   }
>>> +
>>> +   return true;
>>> +}
>>> +
>>> +static void
>>> +nir_lower_vec_and_coalesce_impl(nir_function_impl *impl)
>>> +{
>>> +   nir_foreach_block(impl, lower_vec_and_coalesce_block, ralloc_parent(impl));
>>> +}
>>> +
>>> +void
>>> +nir_lower_vec_and_coalesce(nir_shader *shader)
>>> +{
>>> +   nir_foreach_overload(shader, overload) {
>>> +      if (overload->impl)
>>> +         nir_lower_vec_and_coalesce_impl(overload->impl);
>>> +   }
>>> +}
>>> --
>>> 2.4.6
>>>
>>
>

