Mesa (main): radv: move ngg culling determination earlier

Mon Oct 4 09:26:37 UTC 2021

Module: Mesa
Branch: main
Commit: 24501b54528278fd1a1d3cac35e4f9c7b0e0d07a
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=24501b54528278fd1a1d3cac35e4f9c7b0e0d07a

Author: Rhys Perry <pendingchaos02 at gmail.com>
Date:   Fri Sep 10 16:25:05 2021 +0100

radv: move ngg culling determination earlier

Co-Authored-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
Signed-off-by: Rhys Perry <pendingchaos02 at gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13134>

---

 src/amd/common/ac_nir.h           |  3 +--
 src/amd/common/ac_nir_lower_ngg.c | 26 +-------------------------
 src/amd/vulkan/radv_pipeline.c    | 32 +++++++++++++++++++++++++++-----
 src/amd/vulkan/radv_shader.c      | 33 ++++++++++++++++++++++++++-------
 src/amd/vulkan/radv_shader.h      |  5 ++---
 5 files changed, 57 insertions(+), 42 deletions(-)

diff --git a/src/amd/common/ac_nir.h b/src/amd/common/ac_nir.h
index f77104ecb61..dd95139a266 100644
--- a/src/amd/common/ac_nir.h
+++ b/src/amd/common/ac_nir.h
@@ -94,7 +94,6 @@ ac_nir_lower_indirect_derefs(nir_shader *shader,
 typedef struct
 {
    unsigned lds_bytes_if_culling_off;
-   bool can_cull;
    bool passthrough;
    bool early_prim_export;
    uint64_t nggc_inputs_read_by_pos;
@@ -107,7 +106,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader,
                       unsigned num_vertices_per_primitive,
                       unsigned max_workgroup_size,
                       unsigned wave_size,
-                      bool consider_culling,
+                      bool can_cull,
                       bool consider_passthrough,
                       bool export_prim_id,
                       bool provoking_vtx_last,
diff --git a/src/amd/common/ac_nir_lower_ngg.c b/src/amd/common/ac_nir_lower_ngg.c
index 88d6865198d..228ebba1cbc 100644
--- a/src/amd/common/ac_nir_lower_ngg.c
+++ b/src/amd/common/ac_nir_lower_ngg.c
@@ -1254,34 +1254,13 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c
       unreachable("Should be VS or TES.");
 }
 
-static bool
-can_use_deferred_attribute_culling(nir_shader *shader)
-{
-   /* When the shader writes memory, it is difficult to guarantee correctness.
-    * Future work:
-    * - if only write-only SSBOs are used
-    * - if we can prove that non-position outputs don't rely on memory stores
-    * then may be okay to keep the memory stores in the 1st shader part, and delete them from the 2nd.
-    */
-   if (shader->info.writes_memory)
-      return false;
-
-   /* When the shader relies on the subgroup invocation ID, we'd break it, because the ID changes after the culling.
-    * Future work: try to save this to LDS and reload, but it can still be broken in subtle ways.
-    */
-   if (BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SUBGROUP_INVOCATION))
-      return false;
-
-   return true;
-}
-
 ac_nir_ngg_config
 ac_nir_lower_ngg_nogs(nir_shader *shader,
                       unsigned max_num_es_vertices,
                       unsigned num_vertices_per_primitives,
                       unsigned max_workgroup_size,
                       unsigned wave_size,
-                      bool consider_culling,
+                      bool can_cull,
                       bool consider_passthrough,
                       bool export_prim_id,
                       bool provoking_vtx_last,
@@ -1292,8 +1271,6 @@ ac_nir_lower_ngg_nogs(nir_shader *shader,
    assert(impl);
    assert(max_num_es_vertices && max_workgroup_size && wave_size);
 
-   bool can_cull = consider_culling && (num_vertices_per_primitives == 3) &&
-                   can_use_deferred_attribute_culling(shader);
    bool passthrough = consider_passthrough && !can_cull &&
                       !(shader->info.stage == MESA_SHADER_VERTEX && export_prim_id);
 
@@ -1441,7 +1418,6 @@ ac_nir_lower_ngg_nogs(nir_shader *shader,
 
    ac_nir_ngg_config ret = {
       .lds_bytes_if_culling_off = lds_bytes_if_culling_off,
-      .can_cull = can_cull,
       .passthrough = passthrough,
       .early_prim_export = state.early_prim_export,
       .nggc_inputs_read_by_pos = state.inputs_needed_by_pos,
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 3eea082571c..d9e28280827 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -2763,6 +2763,29 @@ radv_get_ballot_bit_size(struct radv_device *device, const VkPipelineShaderStage
    return 64;
 }
 
+static void
+radv_determine_ngg_settings(struct radv_pipeline *pipeline,
+                            const struct radv_pipeline_key *pipeline_key,
+                            struct radv_shader_info *infos, nir_shader **nir)
+{
+   struct radv_device *device = pipeline->device;
+
+   if (!nir[MESA_SHADER_GEOMETRY] && pipeline->graphics.last_vgt_api_stage != MESA_SHADER_NONE) {
+      uint64_t ps_inputs_read =
+         nir[MESA_SHADER_FRAGMENT] ? nir[MESA_SHADER_FRAGMENT]->info.inputs_read : 0;
+      gl_shader_stage es_stage = pipeline->graphics.last_vgt_api_stage;
+
+      unsigned num_vertices_per_prim = si_conv_prim_to_gs_out(pipeline_key->vs.topology) + 1;
+      if (es_stage == MESA_SHADER_TESS_EVAL)
+         num_vertices_per_prim = nir[es_stage]->info.tess.point_mode                      ? 1
+                                 : nir[es_stage]->info.tess.primitive_mode == GL_ISOLINES ? 2
+                                                                                          : 3;
+
+      infos[es_stage].has_ngg_culling =
+         radv_consider_culling(device, nir[es_stage], ps_inputs_read, num_vertices_per_prim);
+   }
+}
+
 static void
 radv_fill_shader_info(struct radv_pipeline *pipeline,
                       const VkPipelineShaderStageCreateInfo **pStages,
@@ -3454,6 +3477,8 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device,
       infos[hw_vs_api_stage].workgroup_size = infos[hw_vs_api_stage].wave_size;
    }
 
+   radv_determine_ngg_settings(pipeline, pipeline_key, infos, nir);
+
    for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
       if (nir[i]) {
          radv_start_feedback(stage_feedbacks[i]);
@@ -3518,11 +3543,8 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device,
          bool io_to_mem = radv_lower_io_to_mem(device, nir[i], &infos[i], pipeline_key);
          bool lowered_ngg = pipeline_has_ngg && i == pipeline->graphics.last_vgt_api_stage &&
                             !radv_use_llvm_for_stage(device, i);
-         if (lowered_ngg) {
-            uint64_t ps_inputs_read = nir[MESA_SHADER_FRAGMENT] ? nir[MESA_SHADER_FRAGMENT]->info.inputs_read : 0;
-            bool consider_culling = radv_consider_culling(device, nir[i], ps_inputs_read);
-            radv_lower_ngg(device, nir[i], &infos[i], pipeline_key, consider_culling);
-         }
+         if (lowered_ngg)
+            radv_lower_ngg(device, nir[i], &infos[i], pipeline_key);
 
          radv_optimize_nir_algebraic(nir[i], io_to_mem || lowered_ngg || i == MESA_SHADER_COMPUTE);
 
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index 7319c4f59aa..1786370b7b8 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -886,7 +886,7 @@ radv_lower_io_to_mem(struct radv_device *device, struct nir_shader *nir,
 
 bool
 radv_consider_culling(struct radv_device *device, struct nir_shader *nir,
-                      uint64_t ps_inputs_read)
+                      uint64_t ps_inputs_read, unsigned num_vertices_per_primitive)
 {
    /* Culling doesn't make sense for meta shaders. */
    if (!!nir->info.name)
@@ -917,14 +917,34 @@ radv_consider_culling(struct radv_device *device, struct nir_shader *nir,
       max_ps_params = 4; /* Navi 1x. */
 
    /* TODO: consider other heuristics here, such as PS execution time */
+   if (util_bitcount64(ps_inputs_read & ~VARYING_BIT_POS) > max_ps_params)
+      return false;
+
+   /* Only triangle culling is supported. */
+   if (num_vertices_per_primitive != 3)
+      return false;
+
+   /* When the shader writes memory, it is difficult to guarantee correctness.
+    * Future work:
+    * - if only write-only SSBOs are used
+    * - if we can prove that non-position outputs don't rely on memory stores
+    * then may be okay to keep the memory stores in the 1st shader part, and delete them from the 2nd.
+    */
+   if (nir->info.writes_memory)
+      return false;
+
+   /* When the shader relies on the subgroup invocation ID, we'd break it, because the ID changes after the culling.
+    * Future work: try to save this to LDS and reload, but it can still be broken in subtle ways.
+    */
+   if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_SUBGROUP_INVOCATION))
+      return false;
 
-   return util_bitcount64(ps_inputs_read & ~VARYING_BIT_POS) <= max_ps_params;
+   return true;
 }
 
 void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
                     struct radv_shader_info *info,
-                    const struct radv_pipeline_key *pl_key,
-                    bool consider_culling)
+                    const struct radv_pipeline_key *pl_key)
 {
    /* TODO: support the LLVM backend with the NIR lowering */
    assert(!radv_use_llvm_for_stage(device, nir->info.stage));
@@ -971,7 +991,7 @@ void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
 
       assert(info->is_ngg);
 
-      if (consider_culling)
+      if (info->has_ngg_culling)
          radv_optimize_nir_algebraic(nir, false);
 
       if (nir->info.stage == MESA_SHADER_VERTEX) {
@@ -987,14 +1007,13 @@ void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
             num_vertices_per_prim,
             info->workgroup_size,
             info->wave_size,
-            consider_culling,
+            info->has_ngg_culling,
             info->is_ngg_passthrough,
             export_prim_id,
             pl_key->vs.provoking_vtx_last,
             false,
             pl_key->vs.instance_rate_inputs);
 
-      info->has_ngg_culling = out_conf.can_cull;
       info->has_ngg_early_prim_export = out_conf.early_prim_export;
       info->num_lds_blocks_when_not_culling = DIV_ROUND_UP(out_conf.lds_bytes_if_culling_off, device->physical_device->rad_info.lds_encode_granularity);
       info->is_ngg_passthrough = out_conf.passthrough;
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index a26b1c993dc..158aea71e83 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -530,11 +530,10 @@ bool radv_lower_io_to_mem(struct radv_device *device, struct nir_shader *nir,
 
 void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
                     struct radv_shader_info *info,
-                    const struct radv_pipeline_key *pl_key,
-                    bool consider_culling);
+                    const struct radv_pipeline_key *pl_key);
 
 bool radv_consider_culling(struct radv_device *device, struct nir_shader *nir,
-                           uint64_t ps_inputs_read);
+                           uint64_t ps_inputs_read, unsigned num_vertices_per_primitive);
 
 void radv_get_nir_options(struct radv_physical_device *device);