[Mesa-dev] [RFC 1/2] nir: Add a new lowering pass nir_lower_vec_and_coalesce
Eduardo Lima Mitev
elima at igalia.com
Wed Sep 9 23:51:01 PDT 2015
On 09/09/2015 07:10 PM, Jason Ekstrand wrote:
>
> On Sep 8, 2015 23:27, "Eduardo Lima Mitev" <elima at igalia.com
> <mailto:elima at igalia.com>> wrote:
>>
>> This pass will propagate the destination components of a vecN
> instructions,
>> as destination of the instructions that define its sources; if certain
>> conditions are met.
>>
>> If all the components of the destination register in the vecN instruction
>> can be propagated, the instruction is removed. Otherwise, a new, reduced
>> vecN instruction is emitted with the channels that remained.
>>
>> This effectively coalesces registers and reduces indirection.
>>
>> By now, this pass will only propagate to ALU instructions, but it could
>> be extended to include other instructions like load_input intrinsic.
>>
>> It also propagates to instructions within the same block as the vecN
>> instruction. But it could be made to work cross-block in the future,
>> though there are non-trivial issues with this like considering
>> registers that are written in different branches of a conditional.
>> More analysis is needed to correctly cover these cases.
>>
>> This pass works on a NIR shader in final form (after SSA), and is
>> expected to run before nir_lower_vec_to_movs().
>> ---
>> src/glsl/Makefile.sources | 1 +
>> src/glsl/nir/nir.h | 1 +
>> src/glsl/nir/nir_lower_vec_and_coalesce.c | 301
> ++++++++++++++++++++++++++++++
>> 3 files changed, 303 insertions(+)
>> create mode 100644 src/glsl/nir/nir_lower_vec_and_coalesce.c
>>
>> diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources
>> index c422303..015f242 100644
>> --- a/src/glsl/Makefile.sources
>> +++ b/src/glsl/Makefile.sources
>> @@ -48,6 +48,7 @@ NIR_FILES = \
>> nir/nir_lower_vars_to_ssa.c \
>> nir/nir_lower_var_copies.c \
>> nir/nir_lower_vec_to_movs.c \
>> + nir/nir_lower_vec_and_coalesce.c \
>> nir/nir_metadata.c \
>> nir/nir_normalize_cubemap_coords.c \
>> nir/nir_opt_constant_folding.c \
>> diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
>> index 3c375f3..6a89f1d 100644
>> --- a/src/glsl/nir/nir.h
>> +++ b/src/glsl/nir/nir.h
>> @@ -1786,6 +1786,7 @@ void nir_lower_vars_to_ssa(nir_shader *shader);
>> void nir_remove_dead_variables(nir_shader *shader);
>>
>> void nir_lower_vec_to_movs(nir_shader *shader);
>> +void nir_lower_vec_and_coalesce(nir_shader *shader);
>> void nir_lower_alu_to_scalar(nir_shader *shader);
>> void nir_lower_load_const_to_scalar(nir_shader *shader);
>>
>> diff --git a/src/glsl/nir/nir_lower_vec_and_coalesce.c
> b/src/glsl/nir/nir_lower_vec_and_coalesce.c
>> new file mode 100644
>> index 0000000..2b21ec1
>> --- /dev/null
>> +++ b/src/glsl/nir/nir_lower_vec_and_coalesce.c
>> @@ -0,0 +1,301 @@
>> +/*
>> + * Copyright © 2015 Intel Corporation
>> + *
>> + * Permission is hereby granted, free of charge, to any person
> obtaining a
>> + * copy of this software and associated documentation files (the
> "Software"),
>> + * to deal in the Software without restriction, including without
> limitation
>> + * the rights to use, copy, modify, merge, publish, distribute,
> sublicense,
>> + * and/or sell copies of the Software, and to permit persons to whom the
>> + * Software is furnished to do so, subject to the following conditions:
>> + *
>> + * The above copyright notice and this permission notice (including
> the next
>> + * paragraph) shall be included in all copies or substantial portions
> of the
>> + * Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
> SHALL
>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
> OR OTHER
>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> ARISING
>> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> OTHER DEALINGS
>> + * IN THE SOFTWARE.
>> + *
>> + * Authors:
>> + * Eduardo Lima Mitev (elima at igalia.com <mailto:elima at igalia.com>)
>> + *
>> + */
>> +
>> +#include "nir.h"
>> +
>> +/*
>> + * Implements a pass that lowers vecN instructions by propagating the
>> + * components of their destinations, as the destination of the
>> + * instructions that defines the sources of the vecN instruction.
>> + *
>> + * This effectively coalesces registers and reduces indirection.
>> + *
>> + * If all the components of the destination register in the vecN
>> + * instruction can be propagated, the instruction is removed. Otherwise,
>> + * a new, reduced vecN instruction is emitted with the channels that
>> + * remained.
>> + *
>> + * By now, this pass will only propagate to ALU instructions, but it
> could
>> + * be extended to include load_const instructions or some intrinsics like
>> + * load_input.
>> + *
>> + * This pass works on a NIR shader in final form (after SSA), and is
>> + * expected to run before nir_lower_vec_to_movs().
>> + */
>> +
>> +/**
>> + * Clone an ALU instruction and override the destination with the one
> given by
>> + * new_dest. It copies sources from original ALU to the new one,
> adjusting
>> + * their swizzles.
>> + *
>> + * Returns the new ALU instruction.
>> + */
>> +static nir_alu_instr *
>> +clone_alu_instr_and_override_dest(nir_alu_instr *alu_instr,
>> + nir_alu_dest *new_dest, unsigned index,
>> + void *mem_ctx)
>> +{
>> + nir_alu_instr *new_alu_instr = nir_alu_instr_create(mem_ctx,
> alu_instr->op);
>> +
>> + for (unsigned i = 0; i < nir_op_infos[alu_instr->op].num_inputs;
> i++) {
>> + nir_alu_src_copy(&new_alu_instr->src[i], &alu_instr->src[i],
> mem_ctx);
>> +
>> + /* Dot-product operations should not be swizzle'd */
>> + switch (alu_instr->op) {
>> + case nir_op_fdot2:
>> + case nir_op_fdot3:
>> + case nir_op_fdot4:
>> + continue;
>> + default:
>> + break;
>> + }
>
> You should use nir_op_infos[alu_instr->op].output_size == 0 to determine
> whether or not the instruction is per-channel.
>
> You also need to check that nir_op_infos[op].input_sizes[i] == 0 to make
> sure that the components of the source actually map to components of of
> the destination. If output_size == 0 but input_sizes[i] != 0 for some
> source, we should probably bail entirely. The one exception to this
> that I can think of off-hand is bcsel where it has a single
> one-component source and you are free to reswizzle the others.
>
Good to know! I will definitely consider this in the future.
>> + new_alu_instr->src[i].swizzle[index] =
>> + alu_instr->src[i].swizzle[0];
>> + }
>> +
>> + nir_alu_dest_copy(&new_alu_instr->dest, new_dest, mem_ctx);
>> + new_alu_instr->dest.write_mask = 1 << index;
>> +
>> + return new_alu_instr;
>> +}
>> +
>> +/**
>> + * Simply searches an entry in the list, and return true if found, false
>> + * otherwise.
>> + */
>> +static bool
>> +register_already_tracked(const nir_register *reg, nir_register *list[4],
>> + unsigned num_items)
>> +{
>> + for (unsigned i = 0; i < num_items; i++) {
>> + if (list[i] == reg)
>> + return true;
>> + }
>> + return false;
>> +}
>> +
>> +/**
>> + * Tells whether an instruction comes before another one inside the same
>> + * block. Return true if 'target' appears after 'other', false otherwise.
>> + */
>> +static bool
>> +instr_is_previous_in_block(nir_instr *target, nir_instr *other)
>> +{
>> + nir_instr *prev = target;
>> + while ((prev = nir_instr_prev(prev)) != NULL) {
>> + if (prev == other)
>> + return true;
>> + }
>> + return false;
>> +}
>> +
>> +static bool
>> +lower_vec_and_coalesce_block(nir_block *block, void *mem_ctx)
>> +{
>> + nir_foreach_instr_safe(block, instr) {
>> + if (instr->type != nir_instr_type_alu)
>> + continue;
>> +
>> + nir_alu_instr *vec = nir_instr_as_alu(instr);
>> +
>> + switch (vec->op) {
>> + case nir_op_vec2:
>> + case nir_op_vec3:
>> + case nir_op_vec4:
>> + break;
>> + default:
>> + continue; /* The loop */
>> + }
>> +
>> + /* Since we insert multiple MOVs, we have to be non-SSA. */
>> + assert(!vec->dest.dest.is_ssa);
>> +
>> + unsigned finished_write_mask = 0;
>> + nir_register *tracked_registers[4] = {0};
>> + unsigned num_tracked_registers = 0;
>> +
>> + for (unsigned i = 0; i < 4; i++) {
>> + if (!(vec->dest.write_mask & (1 << i)))
>> + continue;
>> +
>> + if (vec->src[i].src.is_ssa)
>> + continue;
>> +
>> + nir_register *reg = vec->src[i].src.reg.reg;
>> +
>> + /* By now, lets ignore the sources whose register is the
> same as the
>> + * destination register of the vecN instruction.
>> + */
>> + if (reg == vec->dest.dest.reg.reg)
>> + continue;
>> +
>> + nir_foreach_def_safe(reg, src) {
>> + nir_instr *parent_instr = src->reg.parent_instr;
>> +
>> + /* By now, we enforce that the parent instruction must be
> in the
>> + * same block as the vecN instruction. This is because if
> it is in
>> + * other block, like a branch of a conditional, we need
> to be sure
>> + * that the register component is propagated in all branches.
>> + * Otherwise we cannot reduce the current vecN instruction.
>> + * @TODO: To extend this pass to work across blocks, more
> analysis
>> + * is needed.
>> + */
>> + if (parent_instr->block != block)
>> + continue;
>> +
>> + /* The instruction to propagate to must come before the vecN
>> + * instruction inside the block.
>> + */
>> + if (!instr_is_previous_in_block(instr, parent_instr))
>> + continue;
>> +
>> + /* We only coalesce registers written by ALU
> instructions, by now.
>> + * @TODO: consider other type of instructions, like
> intrinsics, etc.
>> + */
>> + if (parent_instr->type != nir_instr_type_alu)
>> + continue;
>> +
>> + nir_alu_instr *parent_alu_instr =
> nir_instr_as_alu(parent_instr);
>> + nir_register *parent_dest_reg =
> parent_alu_instr->dest.dest.reg.reg;
>> +
>> + /* We only override dest registers that are only used
> once, and in
>> + * this vecX instruction.
>> + * @TODO: In the future we might consider registers used
> more than
>> + * once as sources of the same vecX instruction.
>> + */
>
> Yeah, we definitely want to handle that case and we want to handle it by
> emitting a single instruction that does multiple channels.
>
I started implementing that as part of of this patch but quickly
realized it was not trivial and basically postponed it until I got some
feedback for the basics.
>> + if (list_length(&parent_dest_reg->uses) != 1)
>> + continue;
>> +
>> + nir_alu_instr *new_alu_instr =
>> + clone_alu_instr_and_override_dest(parent_alu_instr,
> &vec->dest,
>> + i, mem_ctx);
>
> This mess would be a lot easier if we were still in SSA form. I've been
> thinking for a while that we should make lower_vec_to_movs work in SSA
> form (it would replace SSA destinations with register destinations).
> Then use partial SSA lowering like we do in the FS backend. That would
> make this pass much easier together correct.
>
I didn't know about the partial SSA pass in FS backend, I could have got
some inspiration there. But did tried hard to implement this while in
SSA form at the beginning, without much success because the combination
of nir_from_ssa and lower_vec_to_movs would screw things after any SSA pass.
> In general, I'm kind of thinking that this might be better done as an
> extension to the lower_vec_to_moves pass or as a new version of it. You
> would loop through the sources of a vecN and, if you can rewrite a
> destination, you do and if you can't, you insert moves. We may also
> want to it handle SSA while we're at it. Note: it will get run after
> nir_from_ssa so it will have to handle partial SSA where some
> destinations are SSA and some are registers.
>
Actually, the first version of this patch, I wrote it against
lower_vec_to_movs, but then Matt suggested that likely it would be
easier for the opt_register_coalsece pass to avoid lower_vec_to_movs and
deal with vecN instructions in the backend. Then I rewrote this pass as
a separate thing that would re-emit reduced vecN instructions, to
essentially make it compatible with Matt's suggestion, thus future proof :).
> Does this seem reasonable?
>
Absolutely. The first thing I noticed is that lowering stuff in NIR out
of SSA form is not very elegant and also not very well supported (lack
of utility APIs).
I see you sent a series to the ML implementing some of the ideas in this
patch. The shader-db results are impressive!
I will use that as training material :) and follow the review process
closely. This also means I stop working on this pass.
Thanks a lot for the review!
Eduardo
>> + /* For imov and fmov operations, we set the original
> swizzle from
>> + * the vecN source, directly into the new instruction's only
>> + * source.
>> + */
>> + if (parent_alu_instr->op == nir_op_imov ||
>> + parent_alu_instr->op == nir_op_fmov) {
>> + new_alu_instr->src[0].swizzle[0] = vec->src[i].swizzle[i];
>> + }
>> +
>> + finished_write_mask |= new_alu_instr->dest.write_mask;
>> +
>> + /* Remove the old instruction */
>> + nir_instr_remove(&parent_alu_instr->instr);
>> + ralloc_free(parent_alu_instr);
>> +
>> + /* Track the intermediate register, to remove it later if
> not used */
>> + if (!register_already_tracked(parent_dest_reg,
> tracked_registers,
>> + num_tracked_registers)) {
>> + tracked_registers[num_tracked_registers] =
> parent_dest_reg;
>> + num_tracked_registers++;
>> + }
>> +
>> + /* Insert the new instruction with the overwritten
> destination */
>> + nir_instr_insert_before(&vec->instr, &new_alu_instr->instr);
>> + }
>> + }
>> +
>> + if (finished_write_mask == 0)
>> + continue;
>> +
>> + nir_alu_instr *new_alu_instr = nir_alu_instr_create(mem_ctx,
> nir_op_vec4);
>> + nir_alu_dest_copy(&new_alu_instr->dest, &vec->dest, mem_ctx);
>> + new_alu_instr->dest.write_mask = 0;
>> +
>> + unsigned c = 0;
>> + for (unsigned i = 0; i < nir_op_infos[vec->op].num_inputs; i++) {
>> + if (!(vec->dest.write_mask & (1 << i)))
>> + continue;
>> +
>> + if (finished_write_mask & (1 << i))
>> + continue;
>> +
>> + nir_alu_src_copy(&new_alu_instr->src[c], &vec->src[i], mem_ctx);
>> + new_alu_instr->src[c].swizzle[i] = vec->src[i].swizzle[c];
>> +
>> + new_alu_instr->dest.write_mask |= (1 << i);
>> +
>> + c++;
>> + }
>> +
>> + switch (c) {
>> + case 0:
>> + /* we coalesced all register components, so no need to emit any
>> + * reduced instruction.
>> + */
>> + ralloc_free(new_alu_instr);
>> + break;
>> + case 1:
>> + new_alu_instr->op = nir_op_imov;
>> + break;
>> + case 2:
>> + new_alu_instr->op = nir_op_vec2;
>> + break;
>> + case 3:
>> + new_alu_instr->op = nir_op_vec3;
>> + break;
>> + default:
>> + unreachable("Not reached");
>> + }
>> +
>> + if (c != 0)
>> + nir_instr_insert_before(&vec->instr, &new_alu_instr->instr);
>> +
>> + /* Remove the original vecN instruction */
>> + nir_instr_remove(&vec->instr);
>> + ralloc_free(vec);
>> +
>> + /* Remove the tracked registers that are no longer used */
>> + for (unsigned i = 0; i < num_tracked_registers; i++) {
>> + if (list_length(&tracked_registers[i]->defs) == 0 &&
>> + list_length(&tracked_registers[i]->uses) == 0 &&
>> + list_length(&tracked_registers[i]->if_uses) == 0) {
>> + nir_reg_remove(tracked_registers[i]);
>> + }
>> + }
>> + }
>> +
>> + return true;
>> +}
>> +
>> +static void
>> +nir_lower_vec_and_coalesce_impl(nir_function_impl *impl)
>> +{
>> + nir_foreach_block(impl, lower_vec_and_coalesce_block,
> ralloc_parent(impl));
>> +}
>> +
>> +void
>> +nir_lower_vec_and_coalesce(nir_shader *shader)
>> +{
>> + nir_foreach_overload(shader, overload) {
>> + if (overload->impl)
>> + nir_lower_vec_and_coalesce_impl(overload->impl);
>> + }
>> +}
>> --
>> 2.4.6
>>
>
More information about the mesa-dev
mailing list