Mesa (main): radv: Use new NIR lowering of NGG GS when ACO is used.

Wed May 12 14:12:17 UTC 2021

Module: Mesa
Branch: main
Commit: bb127c2130c550b222541aaaac3da377083e8e56
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=bb127c2130c550b222541aaaac3da377083e8e56

Author: Timur Kristóf <timur.kristof at gmail.com>
Date:   Thu Apr 22 14:44:28 2021 +0200

radv: Use new NIR lowering of NGG GS when ACO is used.

Signed-off-by: Timur Kristóf <timur.kristof at gmail.com>
Reviewed-by: Daniel Schürmann <daniel at schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10740>

---

 src/amd/compiler/aco_instruction_selection.cpp     | 679 +--------------------
 src/amd/compiler/aco_instruction_selection.h       |   8 -
 .../compiler/aco_instruction_selection_setup.cpp   |  23 +-
 src/amd/vulkan/radv_shader.c                       |  11 +-
 4 files changed, 16 insertions(+), 705 deletions(-)

diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 089addd3313..bb60ca99536 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -4290,44 +4290,6 @@ Temp thread_id_in_threadgroup(isel_context *ctx)
    return bld.vadd32(bld.def(v1), Operand(num_pre_threads), Operand(tid_in_wave));
 }
 
-Temp wave_count_in_threadgroup(isel_context *ctx)
-{
-   Builder bld(ctx->program, ctx->block);
-   return bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
-                   get_arg(ctx, ctx->args->ac.merged_wave_info), Operand(28u | (4u << 16)));
-}
-
-Temp ngg_gs_vertex_lds_addr(isel_context *ctx, Temp vertex_idx)
-{
-   Builder bld(ctx->program, ctx->block);
-   unsigned write_stride_2exp = ffs(MAX2(ctx->shader->info.gs.vertices_out, 1)) - 1;
-
-   /* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */
-   if (write_stride_2exp) {
-      Temp row = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(5u), vertex_idx);
-      Temp swizzle = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand((1u << write_stride_2exp) - 1), row);
-      vertex_idx = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), vertex_idx, swizzle);
-   }
-
-   Temp vertex_idx_bytes = bld.v_mul24_imm(bld.def(v1), vertex_idx, ctx->ngg_gs_emit_vtx_bytes);
-   return bld.vadd32(bld.def(v1), vertex_idx_bytes, Operand(ctx->ngg_gs_emit_addr));
-}
-
-Temp ngg_gs_emit_vertex_lds_addr(isel_context *ctx, Temp emit_vertex_idx)
-{
-   /* Should be used by GS threads only (not by the NGG GS epilogue).
-    * Returns the LDS address of the given vertex index as emitted by the current GS thread.
-    */
-
-   Builder bld(ctx->program, ctx->block);
-
-   Temp thread_id_in_tg = thread_id_in_threadgroup(ctx);
-   Temp thread_vertices_addr = bld.v_mul24_imm(bld.def(v1), thread_id_in_tg, ctx->shader->info.gs.vertices_out);
-   Temp vertex_idx = bld.vadd32(bld.def(v1), thread_vertices_addr, emit_vertex_idx);
-
-   return ngg_gs_vertex_lds_addr(ctx, vertex_idx);
-}
-
 Temp get_tess_rel_patch_id(isel_context *ctx)
 {
    Builder bld(ctx->program, ctx->block);
@@ -6988,115 +6950,6 @@ void visit_load_sample_mask_in(isel_context *ctx, nir_intrinsic_instr *instr) {
    }
 }
 
-unsigned gs_outprim_vertices(unsigned outprim)
-{
-   switch (outprim) {
-   case 0: /* GL_POINTS */
-      return 1;
-   case 3: /* GL_LINE_STRIP */
-      return 2;
-   case 5: /* GL_TRIANGLE_STRIP */
-      return 3;
-   default:
-      unreachable("Unsupported GS output primitive type.");
-   }
-}
-
-void ngg_visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *instr)
-{
-   Builder bld(ctx->program, ctx->block);
-   Temp emit_vertex_idx = get_ssa_temp(ctx, instr->src[0].ssa);
-   Temp emit_vertex_addr = ngg_gs_emit_vertex_lds_addr(ctx, emit_vertex_idx);
-   unsigned stream = nir_intrinsic_stream_id(instr);
-   unsigned out_idx = 0;
-
-   for (unsigned i = 0; i <= VARYING_SLOT_VAR31; i++) {
-      if (ctx->program->info->gs.output_streams[i] != stream) {
-         continue;
-      } else if (!ctx->outputs.mask[i] && ctx->program->info->gs.output_usage_mask[i]) {
-         /* The GS can write this output, but it's empty for the current vertex. */
-         out_idx++;
-         continue;
-      }
-
-      uint32_t wrmask = ctx->program->info->gs.output_usage_mask[i] &
-                        ctx->outputs.mask[i];
-
-      /* Clear output for the next vertex. */
-      ctx->outputs.mask[i] = 0;
-
-      if (!wrmask)
-         continue;
-
-      for (unsigned j = 0; j < 4; j++) {
-         if (wrmask & (1 << j)) {
-            Temp elem = ctx->outputs.temps[i * 4u + j];
-            store_lds(ctx, elem.bytes(), elem, 0x1u, emit_vertex_addr, out_idx * 4u, 4u);
-         }
-
-         out_idx++;
-      }
-   }
-
-   /* Calculate per-vertex primitive flags based on current and total vertex count per primitive:
-    *   bit 0: whether this vertex finishes a primitive
-    *   bit 1: whether the primitive is odd (if we are emitting triangle strips, otherwise always 0)
-    *   bit 2: always 1 (so that we can use it for determining vertex liveness)
-    */
-   unsigned total_vtx_per_prim = gs_outprim_vertices(ctx->shader->info.gs.output_primitive);
-   bool calc_odd = stream == 0 && total_vtx_per_prim == 3;
-   Temp prim_flag;
-
-   if (nir_src_is_const(instr->src[1])) {
-      uint8_t current_vtx_per_prim = nir_src_as_uint(instr->src[1]);
-      uint8_t completes_prim = (current_vtx_per_prim >= (total_vtx_per_prim - 1)) ? 1 : 0;
-      uint8_t odd = (uint8_t)calc_odd & current_vtx_per_prim;
-      uint8_t flag = completes_prim | (odd << 1) | (1 << 2);
-      prim_flag = bld.copy(bld.def(v1b), Operand(flag));
-   } else if (!instr->src[1].ssa->divergent) {
-      Temp current_vtx_per_prim = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
-      Temp completes_prim = bld.sopc(aco_opcode::s_cmp_le_u32, bld.def(s1, scc), Operand(total_vtx_per_prim - 1), current_vtx_per_prim);
-      prim_flag = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(0b101u), Operand(0b100u), bld.scc(completes_prim));
-      if (calc_odd) {
-         Temp odd = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), current_vtx_per_prim, Operand(0u));
-         prim_flag = bld.sop2(aco_opcode::s_lshl1_add_u32, bld.def(s1), bld.def(s1, scc), odd, prim_flag);
-      }
-   } else {
-      Temp current_vtx_per_prim = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
-      Temp completes_prim = bld.vopc(aco_opcode::v_cmp_le_u32, bld.hint_vcc(bld.def(bld.lm)), Operand(total_vtx_per_prim - 1), current_vtx_per_prim);
-      prim_flag = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0b100u), Operand(0b101u), Operand(completes_prim));
-      if (calc_odd) {
-         Temp odd = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), current_vtx_per_prim);
-         prim_flag = bld.vop3(aco_opcode::v_lshl_or_b32, bld.def(v1), odd, Operand(1u), prim_flag);
-      }
-   }
-
-   /* Store the per-vertex primitive flags at the end of the vertex data */
-   prim_flag = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), as_vgpr(ctx, prim_flag), Operand(0u));
-   unsigned primflag_offset = ctx->ngg_gs_primflags_offset + stream;
-   store_lds(ctx, 1, prim_flag, 1u, emit_vertex_addr, primflag_offset, 1);
-}
-
-void ngg_gs_clear_primflags(isel_context *ctx, Temp vtx_cnt, unsigned stream);
-void ngg_gs_write_shader_query(isel_context *ctx, nir_intrinsic_instr *instr);
-
-void ngg_visit_set_vertex_and_primitive_count(isel_context *ctx, nir_intrinsic_instr *instr)
-{
-   unsigned stream = nir_intrinsic_stream_id(instr);
-   if (stream > 0 && !ctx->args->shader_info->gs.num_stream_output_components[stream])
-      return;
-
-   ctx->ngg_gs_known_vtxcnt[stream] = true;
-
-   /* Clear the primitive flags of non-emitted GS vertices. */
-   if (!nir_src_is_const(instr->src[0]) || nir_src_as_uint(instr->src[0]) < ctx->shader->info.gs.vertices_out) {
-      Temp vtx_cnt = get_ssa_temp(ctx, instr->src[0].ssa);
-      ngg_gs_clear_primflags(ctx, vtx_cnt, stream);
-   }
-
-   ngg_gs_write_shader_query(ctx, instr);
-}
-
 void visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *instr)
 {
    Builder bld(ctx->program, ctx->block);
@@ -8521,10 +8374,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
       break;
    }
    case nir_intrinsic_emit_vertex_with_counter: {
-      if (ctx->stage.hw == HWStage::NGG)
-         ngg_visit_emit_vertex_with_counter(ctx, instr);
-      else
-         visit_emit_vertex_with_counter(ctx, instr);
+      assert(ctx->stage.hw == HWStage::GS);
+      visit_emit_vertex_with_counter(ctx, instr);
       break;
    }
    case nir_intrinsic_end_primitive_with_counter: {
@@ -8535,8 +8386,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
       break;
    }
    case nir_intrinsic_set_vertex_and_primitive_count: {
-      if (ctx->stage.hw == HWStage::NGG)
-         ngg_visit_set_vertex_and_primitive_count(ctx, instr);
+      assert(ctx->stage.hw == HWStage::GS);
       /* unused in the legacy pipeline, the HW keeps track of this for us */
       break;
    }
@@ -11193,20 +11043,6 @@ Temp merged_wave_info_to_mask(isel_context *ctx, unsigned i)
    return lanecount_to_mask(ctx, count);
 }
 
-Temp ngg_max_vertex_count(isel_context *ctx)
-{
-   Builder bld(ctx->program, ctx->block);
-   return bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
-                   get_arg(ctx, ctx->args->ac.gs_tg_info), Operand(12u | (9u << 16u)));
-}
-
-Temp ngg_max_primitive_count(isel_context *ctx)
-{
-   Builder bld(ctx->program, ctx->block);
-   return bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
-                   get_arg(ctx, ctx->args->ac.gs_tg_info), Operand(22u | (9u << 16u)));
-}
-
 void ngg_emit_sendmsg_gs_alloc_req(isel_context *ctx, Temp vtx_cnt, Temp prm_cnt)
 {
    assert(vtx_cnt.id() && prm_cnt.id());
@@ -11214,7 +11050,7 @@ void ngg_emit_sendmsg_gs_alloc_req(isel_context *ctx, Temp vtx_cnt, Temp prm_cnt
    Builder bld(ctx->program, ctx->block);
    Temp prm_cnt_0;
 
-   if (ctx->program->chip_class == GFX10 && ctx->stage.has(SWStage::GS) && ctx->ngg_gs_const_prmcnt[0] <= 0) {
+   if (ctx->program->chip_class == GFX10 && ctx->stage.has(SWStage::GS)) {
       /* Navi 1x workaround: make sure to always export at least 1 vertex and triangle */
       prm_cnt_0 = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), prm_cnt, Operand(0u));
       prm_cnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(1u), prm_cnt, bld.scc(prm_cnt_0));
@@ -11254,495 +11090,6 @@ void ngg_emit_sendmsg_gs_alloc_req(isel_context *ctx, Temp vtx_cnt, Temp prm_cnt
    }
 }
 
-void ngg_emit_wave0_sendmsg_gs_alloc_req(isel_context *ctx, Temp vtx_cnt = Temp(), Temp prm_cnt = Temp())
-{
-   Builder bld(ctx->program, ctx->block);
-
-   /* Get the id of the current wave within the threadgroup (workgroup) */
-   Builder::Result wave_id_in_tg = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
-                                            get_arg(ctx, ctx->args->ac.merged_wave_info), Operand(24u | (4u << 16)));
-
-   /* Execute the following code only on the first wave (wave id 0),
-    * use the SCC def to tell if the wave id is zero or not.
-    */
-   Temp waveid_as_cond = wave_id_in_tg.def(1).getTemp();
-   if_context ic;
-   begin_uniform_if_then(ctx, &ic, waveid_as_cond);
-   begin_uniform_if_else(ctx, &ic);
-   bld.reset(ctx->block);
-
-   /* VS/TES: we infer the vertex and primitive count from arguments
-    * GS: the caller needs to supply them
-    */
-   assert(ctx->stage.has(SWStage::GS)
-          ? (vtx_cnt.id() && prm_cnt.id())
-          : (!vtx_cnt.id() && !prm_cnt.id()));
-
-   /* Number of vertices output by VS/TES */
-   if (vtx_cnt.id() == 0)
-      vtx_cnt = ngg_max_vertex_count(ctx);
-
-   /* Number of primitives output by VS/TES */
-   if (prm_cnt.id() == 0)
-      prm_cnt = ngg_max_primitive_count(ctx);
-
-   ngg_emit_sendmsg_gs_alloc_req(ctx, vtx_cnt, prm_cnt);
-
-   end_uniform_if(ctx, &ic);
-}
-
-Temp ngg_pack_prim_exp_arg(isel_context *ctx, unsigned num_vertices, const Temp vtxindex[], const Temp is_null)
-{
-   Builder bld(ctx->program, ctx->block);
-
-   Temp tmp;
-   Temp gs_invocation_id;
-
-   if (ctx->stage == vertex_ngg)
-      gs_invocation_id = get_arg(ctx, ctx->args->ac.gs_invocation_id);
-
-   for (unsigned i = 0; i < num_vertices; ++i) {
-      assert(vtxindex[i].id());
-
-      if (i)
-         tmp = bld.vop3(aco_opcode::v_lshl_or_b32, bld.def(v1), vtxindex[i], Operand(10u * i), tmp);
-      else
-         tmp = vtxindex[i];
-
-      /* The initial edge flag is always false in tess eval shaders. */
-      if (ctx->stage == vertex_ngg) {
-         Temp edgeflag = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), gs_invocation_id, Operand(8u + i), Operand(1u));
-         tmp = bld.vop3(aco_opcode::v_lshl_or_b32, bld.def(v1), edgeflag, Operand(10u * i + 9u), tmp);
-      }
-   }
-
-   if (is_null.id())
-      tmp = bld.vop3(aco_opcode::v_lshl_or_b32, bld.def(v1), is_null, Operand(31u), tmp);
-
-   return tmp;
-}
-
-void ngg_emit_prim_export(isel_context *ctx, unsigned num_vertices_per_primitive, const Temp vtxindex[], const Temp is_null = Temp())
-{
-   Builder bld(ctx->program, ctx->block);
-   Temp prim_exp_arg;
-
-   if (!ctx->stage.has(SWStage::GS) && ctx->args->options->key.vs_common_out.as_ngg_passthrough)
-      prim_exp_arg = get_arg(ctx, ctx->args->ac.gs_vtx_offset[0]);
-   else
-      prim_exp_arg = ngg_pack_prim_exp_arg(ctx, num_vertices_per_primitive, vtxindex, is_null);
-
-   bld.exp(aco_opcode::exp, prim_exp_arg, Operand(v1), Operand(v1), Operand(v1),
-        1 /* enabled mask */, V_008DFC_SQ_EXP_PRIM /* dest */,
-        false /* compressed */, true/* done */, false /* valid mask */);
-}
-
-std::pair<Temp, Temp> ngg_gs_workgroup_reduce_and_scan(isel_context *ctx, Temp src_mask)
-{
-   /* Workgroup scan for NGG GS.
-    * This performs a reduction along with an exclusive scan addition accross the workgroup.
-    * Assumes that all lanes are enabled (exec = -1) where this is emitted.
-    *
-    * Input:  (1) per-lane bool
-    *             -- 1 if the lane has a live/valid vertex, 0 otherwise
-    * Output: (1) result of a reduction over the entire workgroup,
-    *             -- the total number of vertices emitted by the workgroup
-    *         (2) result of an exclusive scan over the entire workgroup
-    *             -- used for vertex compaction, in order to determine
-    *                which lane should export the current lane's vertex
-    */
-
-   Builder bld(ctx->program, ctx->block);
-   assert(src_mask.regClass() == bld.lm);
-
-   /* Subgroup reduction and exclusive scan on the per-lane boolean. */
-   Temp sg_reduction = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), src_mask);
-   Temp sg_excl = emit_mbcnt(ctx, bld.tmp(v1), Operand(src_mask));
-
-   if (ctx->program->workgroup_size <= ctx->program->wave_size)
-      return std::make_pair(sg_reduction, sg_excl);
-
-   if_context ic;
-
-   /* Determine if the current lane is the first. */
-   Temp is_first_lane = bld.copy(bld.def(bld.lm), Operand(1u, ctx->program->wave_size == 64));
-   Temp wave_id_in_tg = wave_id_in_threadgroup(ctx);
-   begin_divergent_if_then(ctx, &ic, is_first_lane);
-   bld.reset(ctx->block);
-
-   /* The first lane of each wave stores the result of its subgroup reduction to LDS (NGG scratch). */
-   Temp wave_id_in_tg_lds_addr = bld.vop2_e64(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), wave_id_in_tg);
-   store_lds(ctx, 4u, as_vgpr(ctx, sg_reduction), 0x1u, wave_id_in_tg_lds_addr, ctx->ngg_gs_scratch_addr, 4u);
-
-   /* Wait for all waves to write to LDS. */
-   create_workgroup_barrier(bld);
-
-   /* Number of LDS dwords written by all waves (if there is only 1, that is already handled above) */
-   unsigned num_lds_dwords = DIV_ROUND_UP(MIN2(ctx->program->workgroup_size, 256), ctx->program->wave_size);
-   assert(num_lds_dwords >= 2 && num_lds_dwords <= 8);
-
-   /* The first lane of each wave loads every wave's results from LDS, to avoid bank conflicts */
-   Temp reduction_per_wave_vector = load_lds(ctx, 4u * num_lds_dwords, bld.tmp(RegClass(RegType::vgpr, num_lds_dwords)),
-                                             bld.copy(bld.def(v1), Operand(0u)), ctx->ngg_gs_scratch_addr, 16u);
-
-   begin_divergent_if_else(ctx, &ic);
-   end_divergent_if(ctx, &ic);
-   bld.reset(ctx->block);
-
-   /* Create phis which get us the above reduction results, or undef. */
-   bld.reset(&ctx->block->instructions, ctx->block->instructions.begin());
-   reduction_per_wave_vector = bld.pseudo(aco_opcode::p_phi, bld.def(reduction_per_wave_vector.regClass()), reduction_per_wave_vector, Operand(reduction_per_wave_vector.regClass()));
-   bld.reset(ctx->block);
-
-   emit_split_vector(ctx, reduction_per_wave_vector, num_lds_dwords);
-   Temp reduction_per_wave[8];
-
-   for (unsigned i = 0; i < num_lds_dwords; ++i) {
-      Temp reduction_current_wave = emit_extract_vector(ctx, reduction_per_wave_vector, i, v1);
-      reduction_per_wave[i] = bld.readlane(bld.def(s1), reduction_current_wave, Operand(0u));
-   }
-
-   Temp wave_count = wave_count_in_threadgroup(ctx);
-   Temp reduction_result = reduction_per_wave[0];
-   Temp excl_base;
-
-   for (unsigned i = 0; i < num_lds_dwords; ++i) {
-      /* Workgroup reduction:
-       * Add the reduction results from all waves (up to and including wave_count).
-       */
-      if (i != 0) {
-         Temp should_add = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), wave_count, Operand(i + 1u));
-         Temp addition = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), reduction_per_wave[i], Operand(0u), bld.scc(should_add));
-         reduction_result = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), reduction_result, addition);
-      }
-
-      /* Base of workgroup exclusive scan:
-       * Add the reduction results from waves up to and excluding wave_id_in_tg.
-       */
-      if (i != (num_lds_dwords - 1)) {
-         Temp should_add = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), wave_id_in_tg, Operand(i + 1u));
-         Temp addition = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), reduction_per_wave[i], Operand(0u), bld.scc(should_add));
-         excl_base = !excl_base.id() ? addition : bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), excl_base, addition);
-      }
-   }
-
-   assert(excl_base.id());
-
-   /* WG exclusive scan result: base + subgroup exclusive result. */
-   Temp wg_excl = bld.vadd32(bld.def(v1), Operand(excl_base), Operand(sg_excl));
-
-   return std::make_pair(reduction_result, wg_excl);
-}
-
-void ngg_gs_clear_primflags(isel_context *ctx, Temp vtx_cnt, unsigned stream)
-{
-   loop_context lc;
-   if_context ic;
-   Builder bld(ctx->program, ctx->block);
-   Temp zero = bld.copy(bld.def(v1b), Operand(uint8_t(0)));
-   Temp counter_init = bld.copy(bld.def(v1), as_vgpr(ctx, vtx_cnt));
-
-   begin_loop(ctx, &lc);
-
-   Temp incremented_counter = bld.tmp(counter_init.regClass());
-   bld.reset(&ctx->block->instructions, ctx->block->instructions.begin());
-   Temp counter = bld.pseudo(aco_opcode::p_phi, bld.def(counter_init.regClass()), Operand(counter_init), incremented_counter);
-   bld.reset(ctx->block);
-   Temp break_cond = bld.vopc(aco_opcode::v_cmp_le_u32, bld.def(bld.lm), Operand(ctx->shader->info.gs.vertices_out), counter);
-
-   /* Break when vertices_out <= counter  */
-   begin_divergent_if_then(ctx, &ic, break_cond);
-   emit_loop_break(ctx);
-   begin_divergent_if_else(ctx, &ic);
-   end_divergent_if(ctx, &ic);
-   bld.reset(ctx->block);
-
-   /* Store zero to the primitive flag of the current vertex for the current stream */
-   Temp emit_vertex_addr = ngg_gs_emit_vertex_lds_addr(ctx, counter);
-   unsigned primflag_offset = ctx->ngg_gs_primflags_offset + stream;
-   store_lds(ctx, 1, zero, 0xf, emit_vertex_addr, primflag_offset, 1);
-
-   /* Increment counter */
-   bld.vadd32(Definition(incremented_counter), counter, Operand(1u));
-
-   end_loop(ctx, &lc);
-}
-
-void ngg_gs_write_shader_query(isel_context *ctx, nir_intrinsic_instr *instr)
-{
-   /* Each subgroup uses a single GDS atomic to collect the total number of primitives.
-    * TODO: Consider using primitive compaction at the end instead.
-    */
-
-   unsigned total_vtx_per_prim = gs_outprim_vertices(ctx->shader->info.gs.output_primitive);
-   if_context ic_shader_query;
-   Builder bld(ctx->program, ctx->block);
-
-   Temp shader_query = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), get_arg(ctx, ctx->args->ngg_gs_state), Operand(0u));
-   begin_uniform_if_then(ctx, &ic_shader_query, shader_query);
-   bld.reset(ctx->block);
-
-   Temp sg_prm_cnt;
-
-   /* Calculate the "real" number of emitted primitives from the emitted GS vertices and primitives.
-    * GS emits points, line strips or triangle strips.
-    * Real primitives are points, lines or triangles.
-    */
-   if (nir_src_is_const(instr->src[0]) && nir_src_is_const(instr->src[1])) {
-      unsigned gs_vtx_cnt = nir_src_as_uint(instr->src[0]);
-      unsigned gs_prm_cnt = nir_src_as_uint(instr->src[1]);
-      Temp prm_cnt = bld.copy(bld.def(s1), Operand(gs_vtx_cnt - gs_prm_cnt * (total_vtx_per_prim - 1u)));
-      Temp thread_cnt = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), Operand(exec, bld.lm));
-      sg_prm_cnt = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), prm_cnt, thread_cnt);
-   } else {
-      Temp gs_vtx_cnt = get_ssa_temp(ctx, instr->src[0].ssa);
-      Temp prm_cnt = get_ssa_temp(ctx, instr->src[1].ssa);
-      if (total_vtx_per_prim > 1)
-         prm_cnt = bld.vop3(aco_opcode::v_mad_i32_i24, bld.def(v1), prm_cnt, Operand(-1u * (total_vtx_per_prim - 1)), gs_vtx_cnt);
-      else
-         prm_cnt = as_vgpr(ctx, prm_cnt);
-
-      /* Reduction calculates the primitive count for the entire subgroup. */
-      sg_prm_cnt = emit_reduction_instr(ctx, aco_opcode::p_reduce, ReduceOp::iadd32,
-                                        ctx->program->wave_size, bld.def(s1), prm_cnt);
-   }
-
-   Temp first_lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
-   Temp is_first_lane = bld.sop2(Builder::s_lshl, bld.def(bld.lm), bld.def(s1, scc),
-                                 Operand(1u, ctx->program->wave_size == 64), first_lane);
-
-   if_context ic_last_lane;
-   begin_divergent_if_then(ctx, &ic_last_lane, is_first_lane);
-   bld.reset(ctx->block);
-
-   Temp gds_addr = bld.copy(bld.def(v1), Operand(0u));
-   Operand m = bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand(0x100u)));
-   bld.ds(aco_opcode::ds_add_u32, gds_addr, as_vgpr(ctx, sg_prm_cnt), m, 0u, 0u, true);
-
-   begin_divergent_if_else(ctx, &ic_last_lane);
-   end_divergent_if(ctx, &ic_last_lane);
-
-   begin_uniform_if_else(ctx, &ic_shader_query);
-   end_uniform_if(ctx, &ic_shader_query);
-}
-
-Temp ngg_gs_load_prim_flag_0(isel_context *ctx, Temp tid_in_tg, Temp max_vtxcnt, Temp vertex_lds_addr)
-{
-   if_context ic;
-   Builder bld(ctx->program, ctx->block);
-
-   Temp is_vertex_emit_thread = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.def(bld.lm), max_vtxcnt, tid_in_tg);
-   begin_divergent_if_then(ctx, &ic, is_vertex_emit_thread);
-   bld.reset(ctx->block);
-
-   Operand m = load_lds_size_m0(bld);
-   Temp prim_flag_0 = bld.ds(aco_opcode::ds_read_u8, bld.def(v1), vertex_lds_addr, m, ctx->ngg_gs_primflags_offset);
-
-   begin_divergent_if_else(ctx, &ic);
-   end_divergent_if(ctx, &ic);
-
-   bld.reset(&ctx->block->instructions, ctx->block->instructions.begin());
-   prim_flag_0 = bld.pseudo(aco_opcode::p_phi, bld.def(prim_flag_0.regClass()), Operand(prim_flag_0), Operand(0u));
-
-   return prim_flag_0;
-}
-
-void ngg_gs_setup_vertex_compaction(isel_context *ctx, Temp vertex_live, Temp tid_in_tg, Temp exporter_tid_in_tg)
-{
-   if_context ic;
-   Builder bld(ctx->program, ctx->block);
-   assert(vertex_live.regClass() == bld.lm);
-
-   begin_divergent_if_then(ctx, &ic, vertex_live);
-   bld.reset(ctx->block);
-
-   /* Setup the vertex compaction.
-    * Save the current thread's id for the thread which will export the current vertex.
-    * We reuse stream 1 of the primitive flag of the other thread's vertex for storing this.
-    */
-   Temp export_thread_lds_addr = ngg_gs_vertex_lds_addr(ctx, exporter_tid_in_tg);
-   tid_in_tg = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tid_in_tg, Operand(0u));
-   store_lds(ctx, 1u, tid_in_tg, 1u, export_thread_lds_addr, ctx->ngg_gs_primflags_offset + 1u, 1u);
-
-   begin_divergent_if_else(ctx, &ic);
-   end_divergent_if(ctx, &ic);
-   bld.reset(ctx->block);
-
-   /* Wait for all waves to setup the vertex compaction. */
-   create_workgroup_barrier(bld);
-}
-
-void ngg_gs_export_primitives(isel_context *ctx, Temp max_prmcnt, Temp tid_in_tg, Temp exporter_tid_in_tg,
-                              Temp prim_flag_0)
-{
-   if_context ic;
-   Builder bld(ctx->program, ctx->block);
-   unsigned total_vtx_per_prim = gs_outprim_vertices(ctx->shader->info.gs.output_primitive);
-   assert(total_vtx_per_prim <= 3);
-
-   Temp is_prim_export_thread = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.def(bld.lm), max_prmcnt, tid_in_tg);
-   begin_divergent_if_then(ctx, &ic, is_prim_export_thread);
-   bld.reset(ctx->block);
-
-   Temp is_null_prim = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(-1u), prim_flag_0);
-   Temp indices[3];
-
-   indices[total_vtx_per_prim - 1] = exporter_tid_in_tg;
-   if (total_vtx_per_prim >= 2)
-      indices[total_vtx_per_prim - 2] = bld.vsub32(bld.def(v1), exporter_tid_in_tg, Operand(1u));
-   if (total_vtx_per_prim == 3)
-      indices[total_vtx_per_prim - 3] = bld.vsub32(bld.def(v1), exporter_tid_in_tg, Operand(2u));
-
-   if (total_vtx_per_prim == 3) {
-      /* API GS outputs triangle strips, but NGG HW needs triangles.
-      * We already have triangles due to how we set the primitive flags, but we need to
-      * make sure the vertex order is so that the front/back is correct, and the provoking vertex is kept.
-      */
-      bool flatshade_first = !ctx->args->options->key.vs.provoking_vtx_last;
-
-      /* If the triangle is odd, this will swap its two non-provoking vertices. */
-      Temp is_odd = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), Operand(prim_flag_0), Operand(1u), Operand(1u));
-      if (flatshade_first) {
-         indices[1] = bld.vadd32(bld.def(v1), indices[1], Operand(is_odd));
-         indices[2] = bld.vsub32(bld.def(v1), indices[2], Operand(is_odd));
-      } else {
-         indices[0] = bld.vadd32(bld.def(v1), indices[0], Operand(is_odd));
-         indices[1] = bld.vsub32(bld.def(v1), indices[1], Operand(is_odd));
-      }
-   }
-
-   ngg_emit_prim_export(ctx, total_vtx_per_prim, indices, is_null_prim);
-
-   begin_divergent_if_else(ctx, &ic);
-   end_divergent_if(ctx, &ic);
-}
-
-void ngg_gs_export_vertices(isel_context *ctx, Temp wg_vtx_cnt, Temp tid_in_tg, Temp vertex_lds_addr)
-{
-   if_context ic;
-   Builder bld(ctx->program, ctx->block);
-
-   /* See if the current thread has to export a vertex. */
-   Temp is_vtx_export_thread = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.def(bld.lm), wg_vtx_cnt, tid_in_tg);
-   begin_divergent_if_then(ctx, &ic, is_vtx_export_thread);
-   bld.reset(ctx->block);
-
-   /* The index of the vertex that the current thread will export. */
-   Temp exported_vtx_idx;
-
-   if (ctx->ngg_gs_early_alloc) {
-      /* No vertex compaction necessary, the thread can export its own vertex. */
-      exported_vtx_idx = tid_in_tg;
-   } else {
-      /* Vertex compaction: read stream 1 of the primitive flags to see which vertex the current thread needs to export */
-      Operand m = load_lds_size_m0(bld);
-      exported_vtx_idx = bld.ds(aco_opcode::ds_read_u8, bld.def(v1), vertex_lds_addr, m, ctx->ngg_gs_primflags_offset + 1);
-   }
-
-   /* Get the LDS address of the vertex that the current thread must export. */
-   Temp exported_vtx_addr = ngg_gs_vertex_lds_addr(ctx, exported_vtx_idx);
-
-   /* Read the vertex attributes from LDS. */
-   unsigned out_idx = 0;
-   for (unsigned i = 0; i <= VARYING_SLOT_VAR31; i++) {
-      if (ctx->program->info->gs.output_streams[i] != 0)
-         continue;
-
-      /* Set the output mask to the GS output usage mask. */
-      unsigned rdmask =
-         ctx->outputs.mask[i] =
-         ctx->program->info->gs.output_usage_mask[i];
-
-      if (!rdmask)
-         continue;
-
-      for (unsigned j = 0; j < 4; j++) {
-         if (rdmask & (1 << j))
-            ctx->outputs.temps[i * 4u + j] =
-               load_lds(ctx, 4u, bld.tmp(v1), exported_vtx_addr, out_idx * 4u, 4u);
-
-         out_idx++;
-      }
-   }
-
-   /* Export the vertex parameters. */
-   create_vs_exports(ctx);
-
-   begin_divergent_if_else(ctx, &ic);
-   end_divergent_if(ctx, &ic);
-}
-
-void ngg_gs_prelude(isel_context *ctx)
-{
-   if (!ctx->ngg_gs_early_alloc)
-      return;
-
-   /* We know the GS writes the maximum possible number of vertices, so
-    * it's likely that most threads need to export a primitive, too.
-    * Thus, we won't have to worry about primitive compaction here.
-    */
-   Temp num_max_vertices = ngg_max_vertex_count(ctx);
-   ngg_emit_wave0_sendmsg_gs_alloc_req(ctx, num_max_vertices, num_max_vertices);
-}
-
-void ngg_gs_finale(isel_context *ctx)
-{
-   /* Sanity check. Make sure the vertex/primitive counts are set and the LDS is correctly initialized. */
-   assert(ctx->ngg_gs_known_vtxcnt[0]);
-
-   if_context ic;
-   Builder bld(ctx->program, ctx->block);
-
-   /* Wait for all waves to reach the epilogue. */
-   create_workgroup_barrier(bld);
-
-   /* Thread ID in the entire threadgroup */
-   Temp tid_in_tg = thread_id_in_threadgroup(ctx);
-   /* Number of threads that may need to export a vertex or primitive. */
-   Temp max_vtxcnt = ngg_max_vertex_count(ctx);
-   /* LDS address of the vertex corresponding to the current thread. */
-   Temp vertex_lds_addr = ngg_gs_vertex_lds_addr(ctx, tid_in_tg);
-   /* Primitive flag from stream 0 of the vertex corresponding to the current thread. */
-   Temp prim_flag_0 = ngg_gs_load_prim_flag_0(ctx, tid_in_tg, max_vtxcnt, vertex_lds_addr);
-
-   bld.reset(ctx->block);
-
-   /* NIR already filters out incomplete primitives and vertices,
-    * so any vertex whose primitive flag is non-zero is considered live/valid.
-    */
-   Temp vertex_live = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), Operand(prim_flag_0));
-
-   /* Total number of vertices emitted by the workgroup. */
-   Temp wg_vtx_cnt;
-   /* ID of the thread which will export the current thread's vertex. */
-   Temp exporter_tid_in_tg;
-
-   if (ctx->ngg_gs_early_alloc) {
-      /* There is no need for a scan or vertex compaction, we know that
-       * the GS writes all possible vertices so each thread can export its own vertex.
-       */
-      wg_vtx_cnt = max_vtxcnt;
-      exporter_tid_in_tg = tid_in_tg;
-   } else {
-      /* Perform a workgroup reduction and exclusive scan. */
-      std::pair<Temp, Temp> wg_scan = ngg_gs_workgroup_reduce_and_scan(ctx, vertex_live);
-      bld.reset(ctx->block);
-      /* Total number of vertices emitted by the workgroup. */
-      wg_vtx_cnt = wg_scan.first;
-      /* ID of the thread which will export the current thread's vertex. */
-      exporter_tid_in_tg = wg_scan.second;
-      /* Skip all exports when possible. */
-      Temp have_exports = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), wg_vtx_cnt, Operand(0u));
-      max_vtxcnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), max_vtxcnt, Operand(0u), bld.scc(have_exports));
-
-      ngg_emit_wave0_sendmsg_gs_alloc_req(ctx, wg_vtx_cnt, max_vtxcnt);
-      ngg_gs_setup_vertex_compaction(ctx, vertex_live, tid_in_tg, exporter_tid_in_tg);
-   }
-
-   ngg_gs_export_primitives(ctx, max_vtxcnt, tid_in_tg, exporter_tid_in_tg, prim_flag_0);
-   ngg_gs_export_vertices(ctx, wg_vtx_cnt, tid_in_tg, vertex_lds_addr);
-}
-
 } /* end namespace */
 
 void select_program(Program *program,
@@ -11753,7 +11100,7 @@ void select_program(Program *program,
 {
    isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args, false);
    if_context ic_merged_wave_info;
-   bool ngg_gs    = ctx.stage.hw == HWStage::NGG &&  ctx.stage.has(SWStage::GS);
+   bool ngg_gs = ctx.stage.hw == HWStage::NGG && ctx.stage.has(SWStage::GS);
 
    for (unsigned i = 0; i < shader_count; i++) {
       nir_shader *nir = shaders[i];
@@ -11776,9 +11123,6 @@ void select_program(Program *program,
          }
       }
 
-      if (!i && ngg_gs)
-         ngg_gs_prelude(&ctx);
-
       /* In a merged VS+TCS HS, the VS implementation can be completely empty. */
       nir_function_impl *func = nir_shader_get_entrypoint(nir);
       bool empty_shader = nir_cf_list_is_empty_block(&func->body) &&
@@ -11787,14 +11131,8 @@ void select_program(Program *program,
                            (nir->info.stage == MESA_SHADER_TESS_EVAL &&
                             ctx.stage == tess_eval_geometry_gs));
 
-      bool check_merged_wave_info = ctx.tcs_in_out_eq ? i == 0 : (shader_count >= 2 && !empty_shader);
-      bool endif_merged_wave_info = ctx.tcs_in_out_eq ? i == 1 : check_merged_wave_info;
-
-      if (i && ngg_gs) {
-         /* NGG GS waves need to wait for each other after the GS half is done. */
-         Builder bld(ctx.program, ctx.block);
-         create_workgroup_barrier(bld);
-      }
+      bool check_merged_wave_info = ctx.tcs_in_out_eq ? i == 0 : (shader_count >= 2 && !empty_shader && !(ngg_gs && i == 1));
+      bool endif_merged_wave_info = ctx.tcs_in_out_eq ? i == 1 : (check_merged_wave_info && !(ngg_gs && i == 1));
 
       if (check_merged_wave_info) {
          Temp cond = merged_wave_info_to_mask(&ctx, i);
@@ -11843,9 +11181,6 @@ void select_program(Program *program,
          end_divergent_if(&ctx, &ic_merged_wave_info);
       }
 
-      if (ngg_gs && nir->info.stage == MESA_SHADER_GEOMETRY)
-         ngg_gs_finale(&ctx);
-
       if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {
          /* Outputs of the previous stage are inputs to the next stage */
          ctx.inputs = ctx.outputs;
diff --git a/src/amd/compiler/aco_instruction_selection.h b/src/amd/compiler/aco_instruction_selection.h
index 0a5462c04c2..379da7772e0 100644
--- a/src/amd/compiler/aco_instruction_selection.h
+++ b/src/amd/compiler/aco_instruction_selection.h
@@ -93,15 +93,7 @@ struct isel_context {
    Temp persp_centroid, linear_centroid;
 
    /* GS inputs */
-   bool ngg_gs_early_alloc = false;
-   bool ngg_gs_known_vtxcnt[4] = {false, false, false, false};
    Temp gs_wave_id;
-   unsigned ngg_gs_emit_addr = 0;
-   unsigned ngg_gs_emit_vtx_bytes = 0;
-   unsigned ngg_gs_scratch_addr = 0;
-   unsigned ngg_gs_primflags_offset = 0;
-   int ngg_gs_const_vtxcnt[4];
-   int ngg_gs_const_prmcnt[4];
 
    /* VS output information */
    bool export_clip_dists;
diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp
index 2e44568f8eb..e92e6b3cc9b 100644
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -407,28 +407,7 @@ void setup_gs_variables(isel_context *ctx, nir_shader *nir)
       setup_vs_output_info(ctx, nir, false,
                            ctx->options->key.vs_common_out.export_clip_dists, outinfo);
 
-      unsigned ngg_gs_scratch_bytes = ctx->args->shader_info->so.num_outputs ? (44u * 4u) : (8u * 4u);
-      unsigned ngg_emit_bytes = ctx->args->shader_info->ngg_info.ngg_emit_size * 4u;
-      unsigned esgs_ring_bytes = ctx->args->shader_info->ngg_info.esgs_ring_size;
-
-      ctx->ngg_gs_primflags_offset = ctx->args->shader_info->gs.gsvs_vertex_size;
-      ctx->ngg_gs_emit_vtx_bytes = ctx->ngg_gs_primflags_offset + 4u;
-      ctx->ngg_gs_emit_addr = esgs_ring_bytes;
-      ctx->ngg_gs_scratch_addr = ctx->ngg_gs_emit_addr + ngg_emit_bytes;
-      ctx->ngg_gs_scratch_addr = ALIGN(ctx->ngg_gs_scratch_addr, 16u);
-
-      unsigned total_lds_bytes = ctx->ngg_gs_scratch_addr + ngg_gs_scratch_bytes;
-      assert(total_lds_bytes >= ctx->ngg_gs_emit_addr);
-      assert(total_lds_bytes >= ctx->ngg_gs_scratch_addr);
-      ctx->program->config->lds_size = DIV_ROUND_UP(total_lds_bytes, ctx->program->dev.lds_encoding_granule);
-
-      /* Make sure we have enough room for emitted GS vertices */
-      if (nir->info.gs.vertices_out)
-         assert((ngg_emit_bytes % (ctx->ngg_gs_emit_vtx_bytes * nir->info.gs.vertices_out)) == 0);
-
-      /* See if the number of vertices and primitives are compile-time known */
-      nir_gs_count_vertices_and_primitives(nir, ctx->ngg_gs_const_vtxcnt, ctx->ngg_gs_const_prmcnt, 4u);
-      ctx->ngg_gs_early_alloc = ctx->ngg_gs_const_vtxcnt[0] == nir->info.gs.vertices_out && ctx->ngg_gs_const_prmcnt[0] != -1;
+      ctx->program->config->lds_size = DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule);
    }
 
    if (ctx->stage.has(SWStage::VS))
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index fd5780a55e8..385cc6d4486 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -862,11 +862,16 @@ bool radv_lower_ngg(struct radv_device *device, struct nir_shader *nir, bool has
       info->is_ngg_passthrough = out_conf.passthrough;
       key->vs_common_out.as_ngg_passthrough = out_conf.passthrough;
    } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
-      if (!key->vs_common_out.as_ngg)
+      if (!info->is_ngg)
          return false;
 
-      /* TODO: lower NGG GS in NIR */
-      return false;
+      ac_nir_lower_ngg_gs(
+         nir, info->wave_size, max_workgroup_size,
+         info->ngg_info.esgs_ring_size,
+         info->gs.gsvs_vertex_size,
+         info->ngg_info.ngg_emit_size * 4u,
+         key->vs.provoking_vtx_last);
+      return true;
    } else {
       return false;
    }