[Mesa-dev] [RFC 1/2] nir: Add a new lowering pass nir_lower_vec_and_coalesce

Jason Ekstrand jason at jlekstrand.net
Wed Sep 9 10:10:39 PDT 2015


On Sep 8, 2015 23:27, "Eduardo Lima Mitev" <elima at igalia.com> wrote:
>
> This pass will propagate the destination components of a vecN
instructions,
> as destination of the instructions that define its sources; if certain
> conditions are met.
>
> If all the components of the destination register in the vecN instruction
> can be propagated, the instruction is removed. Otherwise, a new, reduced
> vecN instruction is emitted with the channels that remained.
>
> This effectively coalesces registers and reduces indirection.
>
> By now, this pass will only propagate to ALU instructions, but it could
> be extended to include other instructions like load_input intrinsic.
>
> It also propagates to instructions within the same block as the vecN
> instruction. But it could be made to work cross-block in the future,
> though there are non-trivial issues with this like considering
> registers that are written in different branches of a conditional.
> More analysis is needed to correctly cover these cases.
>
> This pass works on a NIR shader in final form (after SSA), and is
> expected to run before nir_lower_vec_to_movs().
> ---
>  src/glsl/Makefile.sources                 |   1 +
>  src/glsl/nir/nir.h                        |   1 +
>  src/glsl/nir/nir_lower_vec_and_coalesce.c | 301
++++++++++++++++++++++++++++++
>  3 files changed, 303 insertions(+)
>  create mode 100644 src/glsl/nir/nir_lower_vec_and_coalesce.c
>
> diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources
> index c422303..015f242 100644
> --- a/src/glsl/Makefile.sources
> +++ b/src/glsl/Makefile.sources
> @@ -48,6 +48,7 @@ NIR_FILES = \
>         nir/nir_lower_vars_to_ssa.c \
>         nir/nir_lower_var_copies.c \
>         nir/nir_lower_vec_to_movs.c \
> +       nir/nir_lower_vec_and_coalesce.c \
>         nir/nir_metadata.c \
>         nir/nir_normalize_cubemap_coords.c \
>         nir/nir_opt_constant_folding.c \
> diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
> index 3c375f3..6a89f1d 100644
> --- a/src/glsl/nir/nir.h
> +++ b/src/glsl/nir/nir.h
> @@ -1786,6 +1786,7 @@ void nir_lower_vars_to_ssa(nir_shader *shader);
>  void nir_remove_dead_variables(nir_shader *shader);
>
>  void nir_lower_vec_to_movs(nir_shader *shader);
> +void nir_lower_vec_and_coalesce(nir_shader *shader);
>  void nir_lower_alu_to_scalar(nir_shader *shader);
>  void nir_lower_load_const_to_scalar(nir_shader *shader);
>
> diff --git a/src/glsl/nir/nir_lower_vec_and_coalesce.c
b/src/glsl/nir/nir_lower_vec_and_coalesce.c
> new file mode 100644
> index 0000000..2b21ec1
> --- /dev/null
> +++ b/src/glsl/nir/nir_lower_vec_and_coalesce.c
> @@ -0,0 +1,301 @@
> +/*
> + * Copyright © 2015 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining
a
> + * copy of this software and associated documentation files (the
"Software"),
> + * to deal in the Software without restriction, including without
limitation
> + * the rights to use, copy, modify, merge, publish, distribute,
sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the
next
> + * paragraph) shall be included in all copies or substantial portions of
the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT
SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS
> + * IN THE SOFTWARE.
> + *
> + * Authors:
> + *    Eduardo Lima Mitev (elima at igalia.com)
> + *
> + */
> +
> +#include "nir.h"
> +
> +/*
> + * Implements a pass that lowers vecN instructions by propagating the
> + * components of their destinations, as the destination of the
> + * instructions that defines the sources of the vecN instruction.
> + *
> + * This effectively coalesces registers and reduces indirection.
> + *
> + * If all the components of the destination register in the vecN
> + * instruction can be propagated, the instruction is removed. Otherwise,
> + * a new, reduced vecN instruction is emitted with the channels that
> + * remained.
> + *
> + * By now, this pass will only propagate to ALU instructions, but it
could
> + * be extended to include load_const instructions or some intrinsics like
> + * load_input.
> + *
> + * This pass works on a NIR shader in final form (after SSA), and is
> + * expected to run before nir_lower_vec_to_movs().
> + */
> +
> +/**
> + * Clone an ALU instruction and override the destination with the one
given by
> + * new_dest. It copies sources from original ALU to the new one,
adjusting
> + * their swizzles.
> + *
> + * Returns the new ALU instruction.
> + */
> +static nir_alu_instr *
> +clone_alu_instr_and_override_dest(nir_alu_instr *alu_instr,
> +                                  nir_alu_dest *new_dest, unsigned index,
> +                                  void *mem_ctx)
> +{
> +   nir_alu_instr *new_alu_instr = nir_alu_instr_create(mem_ctx,
alu_instr->op);
> +
> +   for (unsigned i = 0; i < nir_op_infos[alu_instr->op].num_inputs; i++)
{
> +      nir_alu_src_copy(&new_alu_instr->src[i], &alu_instr->src[i],
mem_ctx);
> +
> +      /* Dot-product operations should not be swizzle'd */
> +      switch (alu_instr->op) {
> +      case nir_op_fdot2:
> +      case nir_op_fdot3:
> +      case nir_op_fdot4:
> +         continue;
> +      default:
> +         break;
> +      }

You should use nir_op_infos[alu_instr->op].output_size == 0 to determine
whether or not the instruction is per-channel.

You also need to check that nir_op_infos[op].input_sizes[i] == 0 to make
sure that the components of the source actually map to components of of the
destination.  If output_size == 0 but input_sizes[i] != 0 for some source,
we should probably bail entirely.  The one exception to this that I can
think of off-hand is bcsel where it has a single one-component source and
you are free to reswizzle the others.

> +      new_alu_instr->src[i].swizzle[index] =
> +         alu_instr->src[i].swizzle[0];
> +   }
> +
> +   nir_alu_dest_copy(&new_alu_instr->dest, new_dest, mem_ctx);
> +   new_alu_instr->dest.write_mask = 1 << index;
> +
> +   return new_alu_instr;
> +}
> +
> +/**
> + * Simply searches an entry in the list, and return true if found, false
> + * otherwise.
> + */
> +static bool
> +register_already_tracked(const nir_register *reg, nir_register *list[4],
> +                         unsigned num_items)
> +{
> +   for (unsigned i = 0; i < num_items; i++) {
> +      if (list[i] == reg)
> +         return true;
> +   }
> +   return false;
> +}
> +
> +/**
> + * Tells whether an instruction comes before another one inside the same
> + * block. Return true if 'target' appears after 'other', false otherwise.
> + */
> +static bool
> +instr_is_previous_in_block(nir_instr *target, nir_instr *other)
> +{
> +   nir_instr *prev = target;
> +   while ((prev = nir_instr_prev(prev)) != NULL) {
> +      if (prev == other)
> +         return true;
> +   }
> +   return false;
> +}
> +
> +static bool
> +lower_vec_and_coalesce_block(nir_block *block, void *mem_ctx)
> +{
> +   nir_foreach_instr_safe(block, instr) {
> +      if (instr->type != nir_instr_type_alu)
> +         continue;
> +
> +      nir_alu_instr *vec = nir_instr_as_alu(instr);
> +
> +      switch (vec->op) {
> +      case nir_op_vec2:
> +      case nir_op_vec3:
> +      case nir_op_vec4:
> +         break;
> +      default:
> +         continue; /* The loop */
> +      }
> +
> +      /* Since we insert multiple MOVs, we have to be non-SSA. */
> +      assert(!vec->dest.dest.is_ssa);
> +
> +      unsigned finished_write_mask = 0;
> +      nir_register *tracked_registers[4] = {0};
> +      unsigned num_tracked_registers = 0;
> +
> +      for (unsigned i = 0; i < 4; i++) {
> +         if (!(vec->dest.write_mask & (1 << i)))
> +            continue;
> +
> +         if (vec->src[i].src.is_ssa)
> +            continue;
> +
> +         nir_register *reg = vec->src[i].src.reg.reg;
> +
> +         /* By now, lets ignore the sources whose register is the same
as the
> +          * destination register of the vecN instruction.
> +          */
> +         if (reg == vec->dest.dest.reg.reg)
> +            continue;
> +
> +         nir_foreach_def_safe(reg, src) {
> +            nir_instr *parent_instr = src->reg.parent_instr;
> +
> +            /* By now, we enforce that the parent instruction must be in
the
> +             * same block as the vecN instruction. This is because if it
is in
> +             * other block, like a branch of a conditional, we need to
be sure
> +             * that the register component is propagated in all branches.
> +             * Otherwise we cannot reduce the current vecN instruction.
> +             * @TODO: To extend this pass to work across blocks, more
analysis
> +             * is needed.
> +             */
> +            if (parent_instr->block != block)
> +               continue;
> +
> +            /* The instruction to propagate to must come before the vecN
> +             * instruction inside the block.
> +             */
> +            if (!instr_is_previous_in_block(instr, parent_instr))
> +               continue;
> +
> +            /* We only coalesce registers written by ALU instructions,
by now.
> +             * @TODO: consider other type of instructions, like
intrinsics, etc.
> +             */
> +            if (parent_instr->type != nir_instr_type_alu)
> +               continue;
> +
> +            nir_alu_instr *parent_alu_instr =
nir_instr_as_alu(parent_instr);
> +            nir_register *parent_dest_reg =
parent_alu_instr->dest.dest.reg.reg;
> +
> +            /* We only override dest registers that are only used once,
and in
> +             * this vecX instruction.
> +             * @TODO: In the future we might consider registers used
more than
> +             * once as sources of the same vecX instruction.
> +             */

Yeah, we definitely want to handle that case and we want to handle it by
emitting a single instruction that does multiple channels.

> +            if (list_length(&parent_dest_reg->uses) != 1)
> +               continue;
> +
> +            nir_alu_instr *new_alu_instr =
> +               clone_alu_instr_and_override_dest(parent_alu_instr,
&vec->dest,
> +                                                 i, mem_ctx);

This mess would be a lot easier if we were still in SSA form.  I've been
thinking for a while that we should make lower_vec_to_movs work in SSA form
(it would replace SSA destinations with register destinations).  Then use
partial SSA lowering like we do in the FS backend.  That would make this
pass much easier together correct.

In general, I'm kind of thinking that this might be better done as an
extension to the lower_vec_to_moves pass or as a new version of it.  You
would loop through the sources of a vecN and, if you can rewrite a
destination, you do and if you can't, you insert moves.  We may also want
to  it handle SSA while we're at it.  Note: it will get run after
nir_from_ssa so it will have to handle partial SSA where some destinations
are SSA and some are registers.

Does this seem reasonable?

> +            /* For imov and fmov operations, we set the original swizzle
from
> +             * the vecN source, directly into the new instruction's only
> +             * source.
> +             */
> +            if (parent_alu_instr->op == nir_op_imov ||
> +                parent_alu_instr->op == nir_op_fmov) {
> +               new_alu_instr->src[0].swizzle[0] = vec->src[i].swizzle[i];
> +            }
> +
> +            finished_write_mask |= new_alu_instr->dest.write_mask;
> +
> +            /* Remove the old instruction */
> +            nir_instr_remove(&parent_alu_instr->instr);
> +            ralloc_free(parent_alu_instr);
> +
> +            /* Track the intermediate register, to remove it later if
not used */
> +            if (!register_already_tracked(parent_dest_reg,
tracked_registers,
> +                                          num_tracked_registers)) {
> +               tracked_registers[num_tracked_registers] =
parent_dest_reg;
> +               num_tracked_registers++;
> +            }
> +
> +            /* Insert the new instruction with the overwritten
destination */
> +            nir_instr_insert_before(&vec->instr, &new_alu_instr->instr);
> +         }
> +      }
> +
> +      if (finished_write_mask == 0)
> +         continue;
> +
> +      nir_alu_instr *new_alu_instr = nir_alu_instr_create(mem_ctx,
nir_op_vec4);
> +      nir_alu_dest_copy(&new_alu_instr->dest, &vec->dest, mem_ctx);
> +      new_alu_instr->dest.write_mask = 0;
> +
> +      unsigned c = 0;
> +      for (unsigned i = 0; i < nir_op_infos[vec->op].num_inputs; i++) {
> +         if (!(vec->dest.write_mask & (1 << i)))
> +            continue;
> +
> +         if (finished_write_mask & (1 << i))
> +            continue;
> +
> +         nir_alu_src_copy(&new_alu_instr->src[c], &vec->src[i], mem_ctx);
> +         new_alu_instr->src[c].swizzle[i] = vec->src[i].swizzle[c];
> +
> +         new_alu_instr->dest.write_mask |= (1 << i);
> +
> +         c++;
> +      }
> +
> +      switch (c) {
> +      case 0:
> +         /* we coalesced all register components, so no need to emit any
> +          * reduced instruction.
> +          */
> +         ralloc_free(new_alu_instr);
> +         break;
> +      case 1:
> +         new_alu_instr->op = nir_op_imov;
> +         break;
> +      case 2:
> +         new_alu_instr->op = nir_op_vec2;
> +         break;
> +      case 3:
> +         new_alu_instr->op = nir_op_vec3;
> +         break;
> +      default:
> +         unreachable("Not reached");
> +      }
> +
> +      if (c != 0)
> +         nir_instr_insert_before(&vec->instr, &new_alu_instr->instr);
> +
> +      /* Remove the original vecN instruction */
> +      nir_instr_remove(&vec->instr);
> +      ralloc_free(vec);
> +
> +      /* Remove the tracked registers that are no longer used */
> +      for (unsigned i = 0; i < num_tracked_registers; i++) {
> +         if (list_length(&tracked_registers[i]->defs) == 0 &&
> +             list_length(&tracked_registers[i]->uses) == 0 &&
> +             list_length(&tracked_registers[i]->if_uses) == 0) {
> +            nir_reg_remove(tracked_registers[i]);
> +         }
> +      }
> +   }
> +
> +   return true;
> +}
> +
> +static void
> +nir_lower_vec_and_coalesce_impl(nir_function_impl *impl)
> +{
> +   nir_foreach_block(impl, lower_vec_and_coalesce_block,
ralloc_parent(impl));
> +}
> +
> +void
> +nir_lower_vec_and_coalesce(nir_shader *shader)
> +{
> +   nir_foreach_overload(shader, overload) {
> +      if (overload->impl)
> +         nir_lower_vec_and_coalesce_impl(overload->impl);
> +   }
> +}
> --
> 2.4.6
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.freedesktop.org/archives/mesa-dev/attachments/20150909/4216b122/attachment-0001.html>


More information about the mesa-dev mailing list