Mesa (main): radv: Use new NGG NIR lowering for VS/TES when ACO is used.

Wed May 12 14:12:17 UTC 2021

Module: Mesa
Branch: main
Commit: 9732881729bd0da8096e03efe77e511d4d6f815d
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=9732881729bd0da8096e03efe77e511d4d6f815d

Author: Timur Kristóf <timur.kristof at gmail.com>
Date:   Thu Apr 15 17:21:57 2021 +0200

radv: Use new NGG NIR lowering for VS/TES when ACO is used.

Signed-off-by: Timur Kristóf <timur.kristof at gmail.com>
Reviewed-by: Daniel Schürmann <daniel at schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10740>

---

 src/amd/compiler/aco_instruction_selection.cpp     | 146 ++-------------------
 src/amd/compiler/aco_instruction_selection.h       |   1 -
 .../compiler/aco_instruction_selection_setup.cpp   |  15 +--
 src/amd/vulkan/radv_pipeline.c                     |   3 +-
 src/amd/vulkan/radv_shader.c                       |  64 +++++++++
 src/amd/vulkan/radv_shader.h                       |   5 +
 6 files changed, 90 insertions(+), 144 deletions(-)

diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 6f7a97c7a69..089addd3313 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -4396,6 +4396,8 @@ bool load_input_from_temps(isel_context *ctx, nir_intrinsic_instr *instr, Temp d
    return true;
 }
 
+static void export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *next_pos);
+
 void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr)
 {
    if (ctx->stage == vertex_vs ||
@@ -4413,6 +4415,11 @@ void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr)
    } else {
       unreachable("Shader stage not implemented");
    }
+
+   /* For NGG VS and TES shaders the primitive ID is exported manually after the other exports so we have to emit an exp here manually */
+   if (ctx->stage.hw == HWStage::NGG && (ctx->stage.has(SWStage::VS) || ctx->stage.has(SWStage::TES)) &&
+       nir_intrinsic_io_semantics(instr).location == VARYING_SLOT_PRIMITIVE_ID)
+      export_vs_varying(ctx, VARYING_SLOT_PRIMITIVE_ID, false, NULL);
 }
 
 void emit_interp_instr(isel_context *ctx, unsigned idx, unsigned component, Temp src, Temp dst, Temp prim_mask)
@@ -8496,8 +8503,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
          break;
       default:
          if (ctx->stage.hw == HWStage::NGG && !ctx->stage.has(SWStage::GS)) {
-            /* This is actually the same as gs_prim_id, but we call it differently when there is no SW GS. */
-            bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.vs_prim_id));
+            /* In case of NGG, the GS threads always have the primitive ID even if there is no SW GS. */
+            bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id));
             break;
          }
          unreachable("Unimplemented shader stage for nir_intrinsic_load_primitive_id");
@@ -11330,126 +11337,6 @@ void ngg_emit_prim_export(isel_context *ctx, unsigned num_vertices_per_primitive
         false /* compressed */, true/* done */, false /* valid mask */);
 }
 
-void ngg_nogs_export_primitives(isel_context *ctx)
-{
-   /* Emit the things that NGG GS threads need to do, for shaders that don't have SW GS.
-    * These must always come before VS exports.
-    *
-    * It is recommended to do these as early as possible. They can be at the beginning when
-    * there is no SW GS and the shader doesn't write edge flags.
-    */
-
-   if_context ic;
-   Temp is_gs_thread = merged_wave_info_to_mask(ctx, 1);
-   begin_divergent_if_then(ctx, &ic, is_gs_thread);
-
-   Builder bld(ctx->program, ctx->block);
-   constexpr unsigned max_vertices_per_primitive = 3;
-   unsigned num_vertices_per_primitive = max_vertices_per_primitive;
-
-   assert(!ctx->stage.has(SWStage::GS));
-
-   if (ctx->stage == vertex_ngg) {
-      /* TODO: optimize for points & lines */
-   } else if (ctx->stage == tess_eval_ngg) {
-      if (ctx->shader->info.tess.point_mode)
-         num_vertices_per_primitive = 1;
-      else if (ctx->shader->info.tess.primitive_mode == GL_ISOLINES)
-         num_vertices_per_primitive = 2;
-   } else {
-      unreachable("Unsupported NGG non-GS shader stage");
-   }
-
-   Temp vtxindex[max_vertices_per_primitive];
-   if (!ctx->args->options->key.vs_common_out.as_ngg_passthrough) {
-      vtxindex[0] = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffffu),
-                           get_arg(ctx, ctx->args->ac.gs_vtx_offset[0]));
-      vtxindex[1] = num_vertices_per_primitive < 2 ? Temp(0, v1) :
-                  bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
-                           get_arg(ctx, ctx->args->ac.gs_vtx_offset[0]), Operand(16u), Operand(16u));
-      vtxindex[2] = num_vertices_per_primitive < 3 ? Temp(0, v1) :
-                  bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffffu),
-                           get_arg(ctx, ctx->args->ac.gs_vtx_offset[2]));
-   }
-
-   /* Export primitive data to the index buffer. */
-   ngg_emit_prim_export(ctx, num_vertices_per_primitive, vtxindex);
-
-   /* Export primitive ID. */
-   if (ctx->stage == vertex_ngg && ctx->args->options->key.vs_common_out.export_prim_id) {
-      /* Copy Primitive IDs from GS threads to the LDS address corresponding to the ES thread of the provoking vertex. */
-      Temp prim_id = get_arg(ctx, ctx->args->ac.gs_prim_id);
-      unsigned provoking_vtx_in_prim = 0;
-
-      /* For provoking vertex last mode, use num_vtx_in_prim - 1. */
-      if (ctx->args->options->key.vs.provoking_vtx_last)
-         provoking_vtx_in_prim = ctx->args->options->key.vs.outprim;
-
-      Temp provoking_vtx_index = vtxindex[provoking_vtx_in_prim];
-      Temp addr = bld.v_mul_imm(bld.def(v1), provoking_vtx_index, 4u);
-
-      store_lds(ctx, 4, prim_id, 0x1u, addr, 0u, 4u);
-   }
-
-   begin_divergent_if_else(ctx, &ic);
-   end_divergent_if(ctx, &ic);
-}
-
-void ngg_nogs_export_prim_id(isel_context *ctx)
-{
-   assert(ctx->args->options->key.vs_common_out.export_prim_id);
-   Temp prim_id;
-
-   if (ctx->stage == vertex_ngg) {
-      /* Wait for GS threads to store primitive ID in LDS. */
-      Builder bld(ctx->program, ctx->block);
-      create_workgroup_barrier(bld);
-
-      /* Calculate LDS address where the GS threads stored the primitive ID. */
-      Temp thread_id_in_tg = thread_id_in_threadgroup(ctx);
-      Temp addr = bld.v_mul24_imm(bld.def(v1), thread_id_in_tg, 4u);
-
-      /* Load primitive ID from LDS. */
-      prim_id = load_lds(ctx, 4, bld.tmp(v1), addr, 0u, 4u);
-   } else if (ctx->stage == tess_eval_ngg) {
-      /* TES: Just use the patch ID as the primitive ID. */
-      prim_id = get_arg(ctx, ctx->args->ac.tes_patch_id);
-   } else {
-      unreachable("unsupported NGG non-GS shader stage.");
-   }
-
-   ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
-   ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] = prim_id;
-
-   export_vs_varying(ctx, VARYING_SLOT_PRIMITIVE_ID, false, nullptr);
-}
-
-void ngg_nogs_prelude(isel_context *ctx)
-{
-   ngg_emit_wave0_sendmsg_gs_alloc_req(ctx);
-
-   if (ctx->ngg_nogs_early_prim_export)
-      ngg_nogs_export_primitives(ctx);
-}
-
-void ngg_nogs_late_export_finale(isel_context *ctx)
-{
-   assert(!ctx->ngg_nogs_early_prim_export);
-
-   /* Export VS/TES primitives. */
-   ngg_nogs_export_primitives(ctx);
-
-   /* Export the primitive ID for VS - needs to read LDS written by GS threads. */
-   if (ctx->args->options->key.vs_common_out.export_prim_id && ctx->stage.has(SWStage::VS)) {
-      if_context ic;
-      Temp is_es_thread = merged_wave_info_to_mask(ctx, 0);
-      begin_divergent_if_then(ctx, &ic, is_es_thread);
-      ngg_nogs_export_prim_id(ctx);
-      begin_divergent_if_else(ctx, &ic);
-      end_divergent_if(ctx, &ic);
-   }
-}
-
 std::pair<Temp, Temp> ngg_gs_workgroup_reduce_and_scan(isel_context *ctx, Temp src_mask)
 {
    /* Workgroup scan for NGG GS.
@@ -11866,7 +11753,6 @@ void select_program(Program *program,
 {
    isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args, false);
    if_context ic_merged_wave_info;
-   bool ngg_no_gs = ctx.stage.hw == HWStage::NGG && !ctx.stage.has(SWStage::GS);
    bool ngg_gs    = ctx.stage.hw == HWStage::NGG &&  ctx.stage.has(SWStage::GS);
 
    for (unsigned i = 0; i < shader_count; i++) {
@@ -11890,9 +11776,7 @@ void select_program(Program *program,
          }
       }
 
-      if (ngg_no_gs)
-         ngg_nogs_prelude(&ctx);
-      else if (!i && ngg_gs)
+      if (!i && ngg_gs)
          ngg_gs_prelude(&ctx);
 
       /* In a merged VS+TCS HS, the VS implementation can be completely empty. */
@@ -11903,7 +11787,7 @@ void select_program(Program *program,
                            (nir->info.stage == MESA_SHADER_TESS_EVAL &&
                             ctx.stage == tess_eval_geometry_gs));
 
-      bool check_merged_wave_info = ctx.tcs_in_out_eq ? i == 0 : ((shader_count >= 2 && !empty_shader) || ngg_no_gs);
+      bool check_merged_wave_info = ctx.tcs_in_out_eq ? i == 0 : (shader_count >= 2 && !empty_shader);
       bool endif_merged_wave_info = ctx.tcs_in_out_eq ? i == 1 : check_merged_wave_info;
 
       if (i && ngg_gs) {
@@ -11943,10 +11827,6 @@ void select_program(Program *program,
 
       if (ctx.stage.hw == HWStage::VS) {
          create_vs_exports(&ctx);
-      } else if (ngg_no_gs) {
-         create_vs_exports(&ctx);
-         if (ctx.args->options->key.vs_common_out.export_prim_id && (ctx.ngg_nogs_early_prim_export || ctx.stage.has(SWStage::TES)))
-            ngg_nogs_export_prim_id(&ctx);
       } else if (nir->info.stage == MESA_SHADER_GEOMETRY && !ngg_gs) {
          Builder bld(ctx.program, ctx.block);
          bld.barrier(aco_opcode::p_barrier,
@@ -11963,9 +11843,7 @@ void select_program(Program *program,
          end_divergent_if(&ctx, &ic_merged_wave_info);
       }
 
-      if (ngg_no_gs && !ctx.ngg_nogs_early_prim_export)
-         ngg_nogs_late_export_finale(&ctx);
-      else if (ngg_gs && nir->info.stage == MESA_SHADER_GEOMETRY)
+      if (ngg_gs && nir->info.stage == MESA_SHADER_GEOMETRY)
          ngg_gs_finale(&ctx);
 
       if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {
diff --git a/src/amd/compiler/aco_instruction_selection.h b/src/amd/compiler/aco_instruction_selection.h
index 93583ecc40c..0a5462c04c2 100644
--- a/src/amd/compiler/aco_instruction_selection.h
+++ b/src/amd/compiler/aco_instruction_selection.h
@@ -93,7 +93,6 @@ struct isel_context {
    Temp persp_centroid, linear_centroid;
 
    /* GS inputs */
-   bool ngg_nogs_early_prim_export = false;
    bool ngg_gs_early_alloc = false;
    bool ngg_gs_known_vtxcnt[4] = {false, false, false, false};
    Temp gs_wave_id;
diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp
index 292dd66ebff..2e44568f8eb 100644
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -390,15 +390,11 @@ setup_vs_variables(isel_context *ctx, nir_shader *nir)
       /* TODO: NGG streamout */
       if (ctx->stage.hw == HWStage::NGG)
          assert(!ctx->args->shader_info->so.num_outputs);
-
-      /* TODO: check if the shader writes edge flags (not in Vulkan) */
-      ctx->ngg_nogs_early_prim_export = exec_list_is_singular(&nir_shader_get_entrypoint(nir)->body);
    }
 
-   if (ctx->stage == vertex_ngg && ctx->args->options->key.vs_common_out.export_prim_id) {
-      /* We need to store the primitive IDs in LDS */
-      unsigned lds_size = ctx->program->info->ngg_info.esgs_ring_size;
-      ctx->program->config->lds_size = DIV_ROUND_UP(lds_size, ctx->program->dev.lds_encoding_granule);
+   if (ctx->stage == vertex_ngg) {
+      ctx->program->config->lds_size = DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule);
+      assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) < (32 * 1024));
    }
 }
 
@@ -463,8 +459,11 @@ setup_tes_variables(isel_context *ctx, nir_shader *nir)
       /* TODO: NGG streamout */
       if (ctx->stage.hw == HWStage::NGG)
          assert(!ctx->args->shader_info->so.num_outputs);
+   }
 
-      ctx->ngg_nogs_early_prim_export = exec_list_is_singular(&nir_shader_get_entrypoint(nir)->body);
+   if (ctx->stage == tess_eval_ngg) {
+      ctx->program->config->lds_size = DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule);
+      assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) < (32 * 1024));
    }
 }
 
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 288c232cbbf..6e75338e7f1 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -3434,6 +3434,7 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device,
 
          /* Lower I/O intrinsics to memory instructions. */
          bool io_to_mem = radv_lower_io_to_mem(device, nir[i], &infos[i], pipeline_key);
+         bool lowered_ngg = radv_lower_ngg(device, nir[i], !!nir[MESA_SHADER_GEOMETRY], &infos[i], pipeline_key, &keys[i]);
 
          /* optimize the lowered ALU operations */
          bool more_algebraic = true;
@@ -3446,7 +3447,7 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device,
             NIR_PASS(more_algebraic, nir[i], nir_opt_algebraic);
          }
 
-         if (io_to_mem || i == MESA_SHADER_COMPUTE)
+         if (io_to_mem || lowered_ngg || i == MESA_SHADER_COMPUTE)
             NIR_PASS_V(nir[i], nir_opt_offsets);
 
          /* Do late algebraic optimization to turn add(a,
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index 1ba5875e905..fd5780a55e8 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -810,6 +810,70 @@ radv_lower_io_to_mem(struct radv_device *device, struct nir_shader *nir,
    return false;
 }
 
+bool radv_lower_ngg(struct radv_device *device, struct nir_shader *nir, bool has_gs,
+                    struct radv_shader_info *info,
+                    const struct radv_pipeline_key *pl_key,
+                    struct radv_shader_variant_key *key)
+{
+   /* TODO: support the LLVM backend with the NIR lowering */
+   if (radv_use_llvm_for_stage(device, nir->info.stage))
+      return false;
+
+   ac_nir_ngg_config out_conf = {0};
+   const struct gfx10_ngg_info *ngg_info = &info->ngg_info;
+   unsigned num_gs_invocations = (nir->info.stage != MESA_SHADER_GEOMETRY || ngg_info->max_vert_out_per_gs_instance) ? 1 : info->gs.invocations;
+   unsigned max_workgroup_size = MAX4(ngg_info->hw_max_esverts, /* Invocations that process an input vertex */
+                                      ngg_info->max_out_verts, /* Invocations that export an output vertex */
+                                      ngg_info->max_gsprims * num_gs_invocations, /* Invocations that process an input primitive */
+                                      ngg_info->max_gsprims * num_gs_invocations * ngg_info->prim_amp_factor /* Invocations that produce an output primitive */);
+
+   /* Maximum HW limit for NGG workgroups */
+   assert(max_workgroup_size <= 256);
+
+   if (nir->info.stage == MESA_SHADER_VERTEX ||
+       nir->info.stage == MESA_SHADER_TESS_EVAL) {
+      if (has_gs || !key->vs_common_out.as_ngg)
+         return false;
+
+      unsigned num_vertices_per_prim = 3;
+
+      if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
+         if (nir->info.tess.point_mode)
+            num_vertices_per_prim = 1;
+         else if (nir->info.tess.primitive_mode == GL_ISOLINES)
+            num_vertices_per_prim = 2;
+      } else if (nir->info.stage == MESA_SHADER_VERTEX) {
+         /* Need to add 1, because: V_028A6C_POINTLIST=0, V_028A6C_LINESTRIP=1, V_028A6C_TRISTRIP=2, etc. */
+         num_vertices_per_prim = key->vs.outprim + 1;
+      }
+
+      out_conf =
+         ac_nir_lower_ngg_nogs(
+            nir,
+            ngg_info->hw_max_esverts,
+            num_vertices_per_prim,
+            max_workgroup_size,
+            info->wave_size,
+            false,
+            key->vs_common_out.as_ngg_passthrough,
+            key->vs_common_out.export_prim_id,
+            key->vs.provoking_vtx_last);
+
+      info->is_ngg_passthrough = out_conf.passthrough;
+      key->vs_common_out.as_ngg_passthrough = out_conf.passthrough;
+   } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
+      if (!key->vs_common_out.as_ngg)
+         return false;
+
+      /* TODO: lower NGG GS in NIR */
+      return false;
+   } else {
+      return false;
+   }
+
+   return true;
+}
+
 static void *
 radv_alloc_shader_memory(struct radv_device *device, struct radv_shader_variant *shader)
 {
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index ba360ffa88c..712d1ad6ffa 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -556,4 +556,9 @@ void radv_lower_io(struct radv_device *device, nir_shader *nir);
 bool radv_lower_io_to_mem(struct radv_device *device, struct nir_shader *nir,
                           struct radv_shader_info *info, const struct radv_pipeline_key *pl_key);
 
+bool radv_lower_ngg(struct radv_device *device, struct nir_shader *nir, bool has_gs,
+                    struct radv_shader_info *info,
+                    const struct radv_pipeline_key *pl_key,
+                    struct radv_shader_variant_key *key);
+
 #endif