Mesa (main): radeonsi: assign param export indices before compilation

Fri Apr 22 22:47:28 UTC 2022

Module: Mesa
Branch: main
Commit: 3777a5d7157f679be4afecb89313ea0d9db47a8e
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=3777a5d7157f679be4afecb89313ea0d9db47a8e

Author: Marek Olšák <marek.olsak at amd.com>
Date:   Sun Dec 12 20:50:58 2021 -0500

radeonsi: assign param export indices before compilation

This moves the logic out of LLVM-specific codepaths.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer at amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14414>

---

 src/gallium/drivers/radeonsi/si_shader.c         | 85 +++++++++++++++++++++++-
 src/gallium/drivers/radeonsi/si_shader.h         |  3 +-
 src/gallium/drivers/radeonsi/si_shader_llvm.c    | 28 --------
 src/gallium/drivers/radeonsi/si_shader_llvm_gs.c | 20 ++++++
 src/gallium/drivers/radeonsi/si_shader_llvm_vs.c | 76 +++++----------------
 5 files changed, 123 insertions(+), 89 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 87935110c34..2a90a07f214 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1580,6 +1580,52 @@ void si_update_shader_binary_info(struct si_shader *shader, nir_shader *nir)
    shader->info.uses_vmem_sampler_or_bvh |= info.uses_vmem_sampler_or_bvh;
 }
 
+static void si_nir_assign_param_offsets(nir_shader *nir, const struct si_shader_info *info,
+                                        int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS],
+                                        uint8_t *num_param_exports, uint64_t *output_param_mask,
+                                        uint8_t vs_output_param_offset[NUM_TOTAL_VARYING_SLOTS])
+{
+   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+   assert(impl);
+
+   nir_foreach_block(block, impl) {
+      nir_foreach_instr_safe(instr, block) {
+         if (instr->type != nir_instr_type_intrinsic)
+            continue;
+
+         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+         if (intr->intrinsic != nir_intrinsic_store_output)
+            continue;
+
+         /* No indirect indexing allowed. */
+         ASSERTED nir_src offset = *nir_get_io_offset_src(intr);
+         assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);
+
+         assert(intr->num_components == 1); /* only scalar stores expected */
+         nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+
+         /* Assign the param index if it's unassigned. */
+         if (nir_slot_is_varying(sem.location) && !sem.no_varying &&
+             (sem.gs_streams & 0x3) == 0 &&
+             vs_output_param_offset[sem.location] == AC_EXP_PARAM_DEFAULT_VAL_0000) {
+            /* The semantic and the base should be the same as in si_shader_info. */
+            assert(sem.location == info->output_semantic[nir_intrinsic_base(intr)]);
+            /* It must not be remapped (duplicated). */
+            assert(slot_remap[sem.location] == -1);
+
+            vs_output_param_offset[sem.location] = (*num_param_exports)++;
+            *output_param_mask |= BITFIELD64_BIT(nir_intrinsic_base(intr));
+         }
+      }
+   }
+
+   /* Duplicated outputs are redirected here. */
+   for (unsigned i = 0; i < NUM_TOTAL_VARYING_SLOTS; i++) {
+      if (slot_remap[i] >= 0)
+         vs_output_param_offset[i] = vs_output_param_offset[slot_remap[i]];
+   }
+}
+
 bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                        struct si_shader *shader, struct util_debug_callback *debug)
 {
@@ -1587,6 +1633,42 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi
    bool free_nir;
    struct nir_shader *nir = si_get_nir_shader(sel, &shader->key, &free_nir);
 
+   /* Assign param export indices. */
+   if ((sel->stage == MESA_SHADER_VERTEX ||
+        sel->stage == MESA_SHADER_TESS_EVAL ||
+        (sel->stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg)) &&
+       !shader->key.ge.as_ls && !shader->key.ge.as_es) {
+      /* Initialize this first. */
+      shader->info.nr_param_exports = 0;
+      shader->info.vs_output_param_mask = 0;
+
+      STATIC_ASSERT(sizeof(shader->info.vs_output_param_offset[0]) == 1);
+      memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_DEFAULT_VAL_0000,
+             sizeof(shader->info.vs_output_param_offset));
+
+      /* A slot remapping table for duplicated outputs, so that 1 vertex shader output can be
+       * mapped to multiple fragment shader inputs.
+       */
+      int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS];
+      memset(slot_remap, -1, NUM_TOTAL_VARYING_SLOTS);
+
+      /* This sets DEFAULT_VAL for constant outputs in vs_output_param_offset. */
+      /* TODO: This doesn't affect GS. */
+      NIR_PASS_V(nir, ac_nir_optimize_outputs, false, slot_remap,
+                 shader->info.vs_output_param_offset);
+
+      /* Assign the non-constant outputs. */
+      /* TODO: Use this for the GS copy shader too. */
+      si_nir_assign_param_offsets(nir, &sel->info, slot_remap, &shader->info.nr_param_exports,
+                                  &shader->info.vs_output_param_mask,
+                                  shader->info.vs_output_param_offset);
+
+      if (shader->key.ge.mono.u.vs_export_prim_id) {
+         shader->info.vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = shader->info.nr_param_exports++;
+         shader->info.vs_output_param_mask |= BITFIELD64_BIT(sel->info.num_outputs);
+      }
+   }
+
    struct pipe_stream_output_info so = {};
    if (sel->info.enabled_streamout_buffer_mask)
       nir_gather_stream_output_info(nir, &so);
@@ -1635,13 +1717,14 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi
       if (sel->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg)
          vs_output_param_offset = shader->gs_copy_shader->info.vs_output_param_offset;
 
+      /* We must use the original shader info before the removal of duplicated shader outputs. */
       /* VS and TES should also set primitive ID output if it's used. */
       unsigned num_outputs_with_prim_id = sel->info.num_outputs +
                                           shader->key.ge.mono.u.vs_export_prim_id;
 
       for (unsigned i = 0; i < num_outputs_with_prim_id; i++) {
          unsigned semantic = sel->info.output_semantic[i];
-         unsigned offset = vs_output_param_offset[i];
+         unsigned offset = vs_output_param_offset[semantic];
          unsigned ps_input_cntl;
 
          if (offset <= AC_EXP_PARAM_OFFSET_31) {
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 5f9e59391b2..98408fb508a 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -739,7 +739,8 @@ union si_shader_key {
 
 /* GCN-specific shader info. */
 struct si_shader_binary_info {
-   ubyte vs_output_param_offset[SI_MAX_VS_OUTPUTS];
+   ubyte vs_output_param_offset[NUM_TOTAL_VARYING_SLOTS];
+   uint64_t vs_output_param_mask; /* which params to export, indexed by "base" */
    uint32_t vs_output_ps_input_cntl[NUM_TOTAL_VARYING_SLOTS];
    ubyte num_input_sgprs;
    ubyte num_input_vgprs;
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c
index 5a4c76793d8..1c26e82842d 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c
@@ -1061,31 +1061,6 @@ static bool si_should_optimize_less(struct ac_llvm_compiler *compiler,
    return sel->stage == MESA_SHADER_COMPUTE && sel->info.num_memory_stores > 1000;
 }
 
-static void si_optimize_vs_outputs(struct si_shader_context *ctx)
-{
-   struct si_shader *shader = ctx->shader;
-   struct si_shader_info *info = &shader->selector->info;
-   unsigned skip_vs_optim_mask = 0;
-
-   if ((ctx->stage != MESA_SHADER_VERTEX && ctx->stage != MESA_SHADER_TESS_EVAL) ||
-       shader->key.ge.as_ls || shader->key.ge.as_es)
-      return;
-
-   /* Optimizing these outputs is not possible, since they might be overriden
-    * at runtime with S_028644_PT_SPRITE_TEX. */
-   for (int i = 0; i < info->num_outputs; i++) {
-      if (info->output_semantic[i] == VARYING_SLOT_PNTC ||
-          (info->output_semantic[i] >= VARYING_SLOT_TEX0 &&
-           info->output_semantic[i] <= VARYING_SLOT_TEX7)) {
-         skip_vs_optim_mask |= 1u << shader->info.vs_output_param_offset[i];
-      }
-   }
-
-   ac_optimize_vs_outputs(&ctx->ac, ctx->main_fn, shader->info.vs_output_param_offset,
-                          info->num_outputs, skip_vs_optim_mask,
-                          &shader->info.nr_param_exports);
-}
-
 bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                             struct si_shader *shader, const struct pipe_stream_output_info *so,
                             struct util_debug_callback *debug, struct nir_shader *nir,
@@ -1295,9 +1270,6 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
 
    si_llvm_optimize_module(&ctx);
 
-   /* Post-optimization transformations and analysis. */
-   si_optimize_vs_outputs(&ctx);
-
    /* Make sure the input is a pointer and not integer followed by inttoptr. */
    assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) == LLVMPointerTypeKind);
 
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
index e20af7e1358..0bde0d99259 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
@@ -22,6 +22,7 @@
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
+#include "ac_nir.h"
 #include "si_pipe.h"
 #include "si_shader_internal.h"
 #include "sid.h"
@@ -444,6 +445,25 @@ struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
    shader->is_gs_copy_shader = true;
    shader->wave_size = si_determine_wave_size(sscreen, shader);
 
+   STATIC_ASSERT(sizeof(shader->info.vs_output_param_offset[0]) == 1);
+   memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_DEFAULT_VAL_0000,
+          sizeof(shader->info.vs_output_param_offset));
+
+   for (unsigned i = 0; i < gsinfo->num_outputs; i++) {
+      unsigned semantic = gsinfo->output_semantic[i];
+
+      /* Skip if no channel writes to stream 0. */
+      if (!nir_slot_is_varying(semantic) ||
+          (gsinfo->output_streams[i] & 0x03 &&
+           gsinfo->output_streams[i] & 0x0c &&
+           gsinfo->output_streams[i] & 0x30 &&
+           gsinfo->output_streams[i] & 0xc0))
+         continue;
+
+      shader->info.vs_output_param_offset[semantic] = shader->info.nr_param_exports++;
+      shader->info.vs_output_param_mask |= BITFIELD64_BIT(i);
+   }
+
    si_llvm_context_init(&ctx, sscreen, compiler, shader->wave_size);
    ctx.shader = shader;
    ctx.stage = MESA_SHADER_VERTEX;
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
index ab984f2f7fb..b54fc86ed2e 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
@@ -438,61 +438,6 @@ static void si_llvm_init_vs_export_args(struct si_shader_context *ctx, const LLV
    memcpy(&args->out[0], values, sizeof(values[0]) * 4);
 }
 
-static void si_prepare_param_exports(struct si_shader_context *ctx,
-                                     const struct si_shader_output_values *outputs, unsigned noutput,
-                                     struct ac_export_args exports[32])
-{
-   struct si_shader *shader = ctx->shader;
-   unsigned param_count = 0;
-
-   memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_DEFAULT_VAL_0000,
-          sizeof(shader->info.vs_output_param_offset));
-
-   for (unsigned i = 0; i < noutput; i++) {
-      unsigned semantic = outputs[i].semantic;
-
-      /* Skip if no channel writes to stream 0. */
-      if (outputs[i].vertex_streams & 0x03 &&
-          outputs[i].vertex_streams & 0x0c &&
-          outputs[i].vertex_streams & 0x30 &&
-          outputs[i].vertex_streams & 0xc0)
-         continue;
-
-      switch (semantic) {
-      case VARYING_SLOT_LAYER:
-      case VARYING_SLOT_VIEWPORT:
-      case VARYING_SLOT_CLIP_DIST0:
-      case VARYING_SLOT_CLIP_DIST1:
-      case VARYING_SLOT_COL0:
-      case VARYING_SLOT_COL1:
-      case VARYING_SLOT_BFC0:
-      case VARYING_SLOT_BFC1:
-      case VARYING_SLOT_PRIMITIVE_ID:
-      case VARYING_SLOT_FOGC:
-         break;
-      default:
-         if ((semantic >= VARYING_SLOT_TEX0 && semantic <= VARYING_SLOT_TEX7) ||
-             semantic >= VARYING_SLOT_VAR0)
-            break;
-         else
-            continue;
-      }
-
-      if ((semantic <= VARYING_SLOT_VAR31 || semantic >= VARYING_SLOT_VAR0_16BIT) &&
-          shader->key.ge.opt.kill_outputs &
-             (1ull << si_shader_io_get_unique_index(semantic, true)))
-         continue;
-
-      si_llvm_init_vs_export_args(ctx, outputs[i].values, V_008DFC_SQ_EXP_PARAM + param_count,
-                                  &exports[param_count]);
-
-      assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset));
-      shader->info.vs_output_param_offset[i] = param_count++;
-   }
-
-   shader->info.nr_param_exports = param_count;
-}
-
 /**
  * Vertex color clamping.
  *
@@ -576,9 +521,6 @@ void si_llvm_build_vs_exports(struct si_shader_context *ctx,
 
    si_vertex_color_clamping(ctx, outputs, noutput);
 
-   struct ac_export_args param_exports[32];
-   si_prepare_param_exports(ctx, outputs, noutput, param_exports);
-
    /* Build position exports. */
    for (i = 0; i < noutput; i++) {
       switch (outputs[i].semantic) {
@@ -747,7 +689,23 @@ void si_llvm_build_vs_exports(struct si_shader_context *ctx,
       ac_build_export(&ctx->ac, &pos_args[i]);
    }
 
-   /* Build parameter exports. */
+   /* Build parameter exports. Use 2 loops to export params in ascending order.
+    * 32 is the maximum number of parameter exports.
+    */
+   struct ac_export_args param_exports[32] = {};
+   uint64_t vs_output_param_mask = shader->info.vs_output_param_mask;
+
+   while (vs_output_param_mask) {
+      unsigned i = u_bit_scan64(&vs_output_param_mask);
+      unsigned offset = shader->info.vs_output_param_offset[outputs[i].semantic];
+
+      assert(offset <= AC_EXP_PARAM_OFFSET_31);
+      assert(!param_exports[offset].enabled_channels);
+
+      si_llvm_init_vs_export_args(ctx, outputs[i].values, V_008DFC_SQ_EXP_PARAM + offset,
+                                  &param_exports[offset]);
+   }
+
    for (unsigned i = 0; i < shader->info.nr_param_exports; i++)
       ac_build_export(&ctx->ac, &param_exports[i]);
 }