Mesa (main): radv, aco, ac/nir: Tweak position export scheduling for NGG culling.

Wed Jul 14 00:44:53 UTC 2021

Module: Mesa
Branch: main
Commit: 8341af5109542db0fcb6c379d91e6e9d5cdb3f2f
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=8341af5109542db0fcb6c379d91e6e9d5cdb3f2f

Author: Timur Kristóf <timur.kristof at gmail.com>
Date:   Mon Jul  5 15:26:18 2021 +0200

radv, aco, ac/nir: Tweak position export scheduling for NGG culling.

The result is a gain of roughly 5 fps in Doom Eternal.

It turns out that the location of position exports matters more
than we thought, and it's actually better to keep them at the bottom
for culling shaders rather than schedule them up to the top.

Signed-off-by: Timur Kristóf <timur.kristof at gmail.com>
Reviewed-by: Daniel Schürmann <daniel at schuermann.dev>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10525>

---

 src/amd/common/ac_nir.h            |  1 +
 src/amd/common/ac_nir_lower_ngg.c  |  1 +
 src/amd/compiler/aco_scheduler.cpp | 19 ++++++++++++++++---
 src/amd/vulkan/radv_shader.c       |  1 +
 src/amd/vulkan/radv_shader.h       |  1 +
 5 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/src/amd/common/ac_nir.h b/src/amd/common/ac_nir.h
index 4f4076cdd29..470749949d0 100644
--- a/src/amd/common/ac_nir.h
+++ b/src/amd/common/ac_nir.h
@@ -96,6 +96,7 @@ typedef struct
    unsigned lds_bytes_if_culling_off;
    bool can_cull;
    bool passthrough;
+   bool early_prim_export;
    uint64_t nggc_inputs_read_by_pos;
    uint64_t nggc_inputs_read_by_others;
 } ac_nir_ngg_config;
diff --git a/src/amd/common/ac_nir_lower_ngg.c b/src/amd/common/ac_nir_lower_ngg.c
index 34ff7e16bcc..2d35d65bbec 100644
--- a/src/amd/common/ac_nir_lower_ngg.c
+++ b/src/amd/common/ac_nir_lower_ngg.c
@@ -1290,6 +1290,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader,
       .lds_bytes_if_culling_off = lds_bytes_if_culling_off,
       .can_cull = can_cull,
       .passthrough = passthrough,
+      .early_prim_export = state.early_prim_export,
       .nggc_inputs_read_by_pos = state.inputs_needed_by_pos,
       .nggc_inputs_read_by_others = state.inputs_needed_by_others,
    };
diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp
index 9a17a816d89..9b4c9ffa48c 100644
--- a/src/amd/compiler/aco_scheduler.cpp
+++ b/src/amd/compiler/aco_scheduler.cpp
@@ -126,6 +126,8 @@ struct sched_ctx {
    int16_t last_SMEM_stall;
    int last_SMEM_dep_idx;
    MoveState mv;
+   bool schedule_pos_exports = true;
+   unsigned schedule_pos_export_div = 1;
 };
 
 /* This scheduler is a simple bottom-up pass based on ideas from
@@ -928,8 +930,8 @@ schedule_position_export(sched_ctx& ctx, Block* block, std::vector<RegisterDeman
                          Instruction* current, int idx)
 {
    assert(idx != 0);
-   int window_size = POS_EXP_WINDOW_SIZE;
-   int max_moves = POS_EXP_MAX_MOVES;
+   int window_size = POS_EXP_WINDOW_SIZE / ctx.schedule_pos_export_div;
+   int max_moves = POS_EXP_MAX_MOVES / ctx.schedule_pos_export_div;
    int16_t k = 0;
 
    DownwardsCursor cursor = ctx.mv.downwards_init(idx, true, false);
@@ -982,7 +984,7 @@ schedule_block(sched_ctx& ctx, Program* program, Block* block, live& live_vars)
    for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
       Instruction* current = block->instructions[idx].get();
 
-      if (block->kind & block_kind_export_end && current->isEXP()) {
+      if (block->kind & block_kind_export_end && current->isEXP() && ctx.schedule_pos_exports) {
          unsigned target = current->exp().dest;
          if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PRIM) {
             ctx.mv.current = current;
@@ -1048,6 +1050,17 @@ schedule_program(Program* program, live& live_vars)
    ctx.mv.max_registers = {int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves * wave_fac) - 2),
                            int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves * wave_fac))};
 
+   /* NGG culling shaders are very sensitive to position export scheduling.
+    * Schedule less aggressively when early primitive export is used, and
+    * keep the position export at the very bottom when late primitive export is used.
+    */
+   if (program->info->has_ngg_culling && program->stage.num_sw_stages() == 1) {
+      if (!program->info->has_ngg_early_prim_export)
+         ctx.schedule_pos_exports = false;
+      else
+         ctx.schedule_pos_export_div = 4;
+   }
+
    for (Block& block : program->blocks)
       schedule_block(ctx, program, &block, live_vars);
 
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index df8f47d85b1..2fdfa3f38ac 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -1017,6 +1017,7 @@ void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
             key->vs.provoking_vtx_last);
 
       info->has_ngg_culling = out_conf.can_cull;
+      info->has_ngg_early_prim_export = out_conf.early_prim_export;
       info->num_lds_blocks_when_not_culling = DIV_ROUND_UP(out_conf.lds_bytes_if_culling_off, device->physical_device->rad_info.lds_encode_granularity);
       info->is_ngg_passthrough = out_conf.passthrough;
       key->vs_common_out.as_ngg_passthrough = out_conf.passthrough;
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index ab3dcac35c6..cabf6845a87 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -264,6 +264,7 @@ struct radv_shader_info {
    bool is_ngg;
    bool is_ngg_passthrough;
    bool has_ngg_culling;
+   bool has_ngg_early_prim_export;
    uint32_t num_lds_blocks_when_not_culling;
    uint32_t num_tess_patches;
    struct {