Mesa (main): ac: add ac_nir_optimize_outputs, a NIR version of ac_optimize_vs_outputs

Fri Apr 22 22:47:28 UTC 2022

Module: Mesa
Branch: main
Commit: 11c28d9798cc5b37b88e139517484d5810b6a2e9
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=11c28d9798cc5b37b88e139517484d5810b6a2e9

Author: Marek Olšák <marek.olsak at amd.com>
Date:   Sun Dec 12 21:20:09 2021 -0500

ac: add ac_nir_optimize_outputs, a NIR version of ac_optimize_vs_outputs

ac_optimize_vs_outputs is an LLVM IR pass, and it will be replaced by this.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer at amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14414>

---

 src/amd/common/ac_nir.h                  |   4 +
 src/amd/common/ac_nir_opt_outputs.c      | 328 +++++++++++++++++++++++++++++++
 src/amd/common/meson.build               |   1 +
 src/gallium/drivers/radeonsi/si_shader.c |   1 -
 4 files changed, 333 insertions(+), 1 deletion(-)

diff --git a/src/amd/common/ac_nir.h b/src/amd/common/ac_nir.h
index 22159506a38..78b22116fee 100644
--- a/src/amd/common/ac_nir.h
+++ b/src/amd/common/ac_nir.h
@@ -52,6 +52,10 @@ enum
 struct nir_builder;
 typedef struct nir_builder nir_builder;
 
+bool ac_nir_optimize_outputs(nir_shader *nir, bool sprite_tex_disallowed,
+                             int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS],
+                             uint8_t param_export_index[NUM_TOTAL_VARYING_SLOTS]);
+
 void
 ac_nir_lower_ls_outputs_to_mem(nir_shader *ls,
                                bool tcs_in_out_eq,
diff --git a/src/amd/common/ac_nir_opt_outputs.c b/src/amd/common/ac_nir_opt_outputs.c
new file mode 100644
index 00000000000..be0258850af
--- /dev/null
+++ b/src/amd/common/ac_nir_opt_outputs.c
@@ -0,0 +1,328 @@
+/*
+ * Copyright © 2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/* This helps separate shaders because the next shader doesn't have to be known.
+ *
+ * It optimizes VS and TES outputs before FS as follows:
+ * - Eliminate and merge equal outputs, and treat undef as equal to everything, e.g.
+ *   (x,y,undef,undef) == (undef,y,z,undef) --> (x,y,z,undef) regardless of the interpolation
+ *   qualifier (AMD can map 1 output to multiple PS inputs and interpolate each differently).
+ * - Remove constant outputs that match AMD DEFAULT_VAL options, e.g. (0,0,0,1),
+ *   treating undef channels as matching either 0 or 1.
+ *
+ * It requires that there is no indirect indexing and all output stores must be scalar.
+ */
+
+#include "ac_nir.h"
+#include "nir_builder.h"
+
+struct ac_chan_info { /* Bookkeeping for one scalar channel of an output slot. */
+   nir_instr *value; /* Parent instruction of the stored SSA value; NULL means undef. */
+   nir_intrinsic_instr *store_intr; /* The intrinsic writing the value. */
+};
+
+struct ac_out_info { /* Gathered state of one varying slot (all of its channels). */
+   unsigned base; /* nir_intrinsic_base */
+   nir_alu_type types; /* OR of nir_intrinsic_src_type over every store gathered for this slot. */
+   bool duplicated; /* Set when the slot was merged into an earlier equal slot. */
+   bool constant; /* Set when the slot was replaced by a DEFAULT_VAL constant. */
+
+   /* Channels 0-3 are 32-bit channels or low bits of 16-bit channels.
+    * Channels 4-7 are high bits of 16-bit channels.
+    */
+   struct ac_chan_info chan[8];
+};
+
+static void ac_remove_varying(struct ac_out_info *out)
+{
+   /* Call nir_remove_varying on every channel that has a store and reset that channel's record. */
+   for (unsigned i = 0; i < ARRAY_SIZE(out->chan); i++) {
+      if (out->chan[i].store_intr) {
+         nir_remove_varying(out->chan[i].store_intr);
+         out->chan[i].store_intr = NULL; /* The record no longer refers to the store. */
+         out->chan[i].value = NULL; /* NULL is what the comparison code treats as undef. */
+      }
+   }
+}
+
+/* Return true if the output matches DEFAULT_VAL and has been eliminated; false leaves it untouched. */
+static bool ac_eliminate_const_output(struct ac_out_info *out,
+                                      gl_varying_slot semantic, /* used as the index into param_export_index */
+                                      uint8_t *param_export_index) /* receives the DEFAULT_VAL code on success */
+{
+   if (!(out->types & 32)) /* Bail out unless a store to this slot had the 32 bit set in its source type. */
+      return false;
+
+   bool is_zero[4] = {0}, is_one[4] = {0};
+
+   for (unsigned i = 0; i < 4; i++) {
+      /* NULL means undef. An undef channel is allowed to match both 0 and 1. */
+      if (!out->chan[i].value) {
+         is_zero[i] = true;
+         is_one[i] = true;
+      } else if (out->chan[i].value->type == nir_instr_type_load_const) {
+         if (nir_instr_as_load_const(out->chan[i].value)->value[0].f32 == 0) /* float compare */
+            is_zero[i] = true;
+         else if (nir_instr_as_load_const(out->chan[i].value)->value[0].f32 == 1)
+            is_one[i] = true;
+         else
+            return false; /* other constant */
+      } else
+         return false; /* not a constant at all */
+   }
+
+   /* Only certain combinations of 0 and 1 are supported. */
+   unsigned default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */
+
+   if (is_zero[0] && is_zero[1] && is_zero[2]) {
+      if (is_zero[3])
+         default_val = AC_EXP_PARAM_DEFAULT_VAL_0000;
+      else if (is_one[3])
+         default_val = AC_EXP_PARAM_DEFAULT_VAL_0001;
+      else
+         return false; /* defensive: the loop above set is_zero or is_one for every channel */
+   } else if (is_one[0] && is_one[1] && is_one[2]) {
+      if (is_zero[3])
+         default_val = AC_EXP_PARAM_DEFAULT_VAL_1110;
+      else if (is_one[3])
+         default_val = AC_EXP_PARAM_DEFAULT_VAL_1111;
+      else
+         return false; /* defensive, same reasoning as above */
+   } else {
+      return false; /* xyz mixes 0 and 1, which no DEFAULT_VAL option tested here covers */
+   }
+
+   /* Change OFFSET to DEFAULT_VAL. */
+   param_export_index[semantic] = default_val;
+   out->constant = true; /* Later duplicate checks skip constant outputs. */
+   ac_remove_varying(out);
+   return true;
+}
+
+static bool ac_eliminate_duplicated_output(struct ac_out_info *outputs,
+                                           BITSET_DECLARE(outputs_optimized, NUM_TOTAL_VARYING_SLOTS),
+                                           gl_varying_slot current, struct nir_builder *b,
+                                           int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS])
+{ /* Returns true if 'current' matched an earlier output, was remapped to it and removed. */
+   struct ac_out_info *cur = &outputs[current];
+   unsigned p, copy_back_channels = 0; /* mask of channels defined in cur but undef in the candidate */
+
+   /* Check all outputs before current. */
+   BITSET_FOREACH_SET(p, outputs_optimized, current) {
+      struct ac_out_info *prev = &outputs[p];
+
+      /* Only compare with real outputs. */
+      if (prev->constant || prev->duplicated)
+         continue;
+
+      /* The types must match (only 16-bit and 32-bit types are allowed). */
+      if ((prev->types & 16) != (cur->types & 16))
+         continue;
+
+      bool different = false;
+
+      /* Iterate over all channels, including the high 16-bit halves in chan[4..7]. */
+      for (unsigned j = 0; j < 8; j++) {
+         nir_instr *prev_chan = prev->chan[j].value;
+         nir_instr *cur_chan = cur->chan[j].value;
+
+         /* Treat undef as a match. */
+         if (!cur_chan)
+            continue;
+
+         /* If prev is undef but cur isn't, we can merge the outputs
+          * and consider the output duplicated.
+          */
+         if (!prev_chan) {
+            copy_back_channels |= 1 << j;
+            continue;
+         }
+
+         /* Test whether the values are different. Two distinct load_const
+          * instructions still match if their first component has the same bits. */
+         if (prev_chan != cur_chan &&
+             (prev_chan->type != nir_instr_type_load_const ||
+              cur_chan->type != nir_instr_type_load_const ||
+              nir_instr_as_load_const(prev_chan)->value[0].u32 !=
+              nir_instr_as_load_const(cur_chan)->value[0].u32)) {
+            different = true;
+            break;
+         }
+      }
+      if (!different)
+         break; /* Found a match; p and copy_back_channels describe it. */
+
+      copy_back_channels = 0; /* This candidate differed; discard the mask built for it. */
+   }
+   if (p == current) /* NOTE(review): relies on BITSET_FOREACH_SET leaving p == size when exhausted - confirm */
+      return false;
+
+   /* An equal output already exists. Make FS use the existing one instead.
+    * This effectively disables the current output and the param export shouldn't
+    * be generated.
+    */
+   cur->duplicated = true;
+
+   /* p is gl_varying_slot in addition to being an index into outputs. */
+   slot_remap[current] = p;
+
+   /* If the matching preceding output has undef where the current one has a proper value,
+    * move the value to the preceding output.
+    */
+   struct ac_out_info *prev = &outputs[p];
+
+   while (copy_back_channels) {
+      unsigned i = u_bit_scan(&copy_back_channels);
+      struct ac_chan_info *prev_chan = &prev->chan[i];
+      struct ac_chan_info *cur_chan = &cur->chan[i];
+
+      b->cursor = nir_after_instr(&cur_chan->store_intr->instr);
+
+      /* The preceding output has no store for this channel. Create one from the current store's attributes. */
+      nir_alu_type src_type = nir_intrinsic_src_type(cur_chan->store_intr);
+      struct nir_io_semantics sem = nir_intrinsic_io_semantics(cur_chan->store_intr);
+      struct nir_io_xfb xfb = nir_intrinsic_io_xfb(cur_chan->store_intr);
+      struct nir_io_xfb xfb2 = nir_intrinsic_io_xfb2(cur_chan->store_intr);
+
+      /* p is gl_varying_slot in addition to being an index into outputs. */
+      sem.location = p;
+      assert(sem.high_16bits == i / 4); /* chan[4..7] hold the high 16-bit halves */
+
+      /* If it's a sysval output (such as CLIPDIST), we move the varying portion but keep
+       * the system value output. This is just the varying portion.
+       */
+      sem.no_sysval_output = 1;
+
+      /* Write just one component. */
+      prev_chan->store_intr = nir_store_output(b, nir_instr_ssa_def(cur_chan->value),
+                                               nir_imm_int(b, 0),
+                                               .base = prev->base,
+                                               .component = i % 4,
+                                               .io_semantics = sem,
+                                               .src_type = src_type,
+                                               .write_mask = 0x1,
+                                               .io_xfb = xfb,
+                                               .io_xfb2 = xfb2);
+
+      /* Update the undef channels in the output info. */
+      assert(!prev_chan->value);
+      prev_chan->value = cur_chan->value;
+
+      /* Remove transform feedback info from the current instruction because
+       * we moved it too. The instruction might not be removed if it's a system
+       * value output.
+       */
+      static struct nir_io_xfb zero_xfb; /* static storage, so zero-initialized */
+      nir_intrinsic_set_io_xfb(cur->chan[i].store_intr, zero_xfb);
+      nir_intrinsic_set_io_xfb2(cur->chan[i].store_intr, zero_xfb);
+   }
+
+   ac_remove_varying(cur); /* Done after the copy-back loop, which still reads cur's stores. */
+   return true;
+}
+
+bool ac_nir_optimize_outputs(nir_shader *nir, bool sprite_tex_disallowed,
+                             int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS],
+                             uint8_t param_export_index[NUM_TOTAL_VARYING_SLOTS])
+{ /* Gathers scalar output stores of a VS/TES, then removes constant and duplicated varyings. Returns true on any removal. */
+   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+   assert(impl);
+
+   if (nir->info.stage != MESA_SHADER_VERTEX &&
+       nir->info.stage != MESA_SHADER_TESS_EVAL) {
+      nir_metadata_preserve(impl, nir_metadata_all); /* Other stages are left unmodified. */
+      return false;
+   }
+
+   struct ac_out_info outputs[NUM_TOTAL_VARYING_SLOTS] = {};
+
+   BITSET_DECLARE(outputs_optimized, NUM_TOTAL_VARYING_SLOTS); /* slots that had at least one eligible store */
+   BITSET_ZERO(outputs_optimized);
+
+   /* Gather outputs. */
+   nir_foreach_block(block, impl) {
+      nir_foreach_instr_safe(instr, block) {
+         if (instr->type != nir_instr_type_intrinsic)
+            continue;
+
+         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+         if (intr->intrinsic != nir_intrinsic_store_output)
+            continue;
+
+         nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+
+         /* Only process varyings that appear as param exports. */
+         if (!nir_slot_is_varying(sem.location) || sem.no_varying)
+            continue;
+
+         /* We can't optimize texture coordinates if sprite_coord_enable can override them. */
+         if (sem.location >= VARYING_SLOT_TEX0 && sem.location <= VARYING_SLOT_TEX7 &&
+             !sprite_tex_disallowed)
+            continue;
+
+         BITSET_SET(outputs_optimized, sem.location);
+
+         /* No indirect indexing allowed. */
+         ASSERTED nir_src offset = *nir_get_io_offset_src(intr);
+         assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);
+
+         /* nir_lower_io_to_scalar is required before this */
+         assert(intr->src[0].ssa->num_components == 1);
+         /* No intrinsic should store undef. */
+         assert(intr->src[0].ssa->parent_instr->type != nir_instr_type_ssa_undef);
+
+         /* Gather the output. */
+         struct ac_out_info *out_info = &outputs[sem.location];
+         if (!out_info->types) /* first store seen for this slot: record its base */
+            out_info->base = nir_intrinsic_base(intr);
+         else
+            assert(out_info->base == nir_intrinsic_base(intr)); /* every store to a slot must share one base */
+
+         out_info->types |= nir_intrinsic_src_type(intr);
+
+         unsigned chan = sem.high_16bits * 4 + nir_intrinsic_component(intr); /* 0-3 low/32-bit, 4-7 high 16 bits */
+         out_info->chan[chan].store_intr = intr; /* a later store to the same channel overwrites this record */
+         out_info->chan[chan].value = intr->src[0].ssa->parent_instr;
+      }
+   }
+
+   unsigned i;
+   bool progress = false;
+
+   struct nir_builder b;
+   nir_builder_init(&b, impl);
+
+   /* Optimize outputs. Duplicate elimination is only tried when constant elimination fails (|| short-circuits). */
+   BITSET_FOREACH_SET(i, outputs_optimized, NUM_TOTAL_VARYING_SLOTS) {
+      progress |=
+         ac_eliminate_const_output(&outputs[i], i, param_export_index) ||
+         ac_eliminate_duplicated_output(outputs, outputs_optimized, i, &b, slot_remap);
+   }
+
+   if (progress) {
+      nir_metadata_preserve(impl, nir_metadata_dominance |
+                                  nir_metadata_block_index);
+   } else {
+      nir_metadata_preserve(impl, nir_metadata_all); /* nothing changed */
+   }
+   return progress;
+}
diff --git a/src/amd/common/meson.build b/src/amd/common/meson.build
index 2a39b03ec1c..887bc300b32 100644
--- a/src/amd/common/meson.build
+++ b/src/amd/common/meson.build
@@ -90,6 +90,7 @@ amd_common_files = files(
   'ac_rgp_elf_object_pack.c',
   'ac_nir.c',
   'ac_nir.h',
+  'ac_nir_opt_outputs.c',
   'ac_nir_cull.c',
   'ac_nir_lower_esgs_io_to_mem.c',
   'ac_nir_lower_global_access.c',
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 894d840a184..87935110c34 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1502,7 +1502,6 @@ struct nir_shader *si_get_nir_shader(struct si_shader_selector *sel,
        * - Eliminated PS system values are disabled by LLVM
        *   (FragCoord, FrontFace, barycentrics)
        * - VS/TES/GS outputs feeding PS are eliminated if outputs are undef.
-       *   (thanks to an LLVM pass in Mesa - TODO: move it to NIR)
        *   The storage for eliminated outputs is also not allocated.
        * - VS/TCS/TES/GS/PS input loads are eliminated (VS relies on DCE in LLVM)
        * - TCS output stores are eliminated