Mesa (main): radv: Use new NIR lowering of NGG GS when ACO is used.
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Wed May 12 14:12:17 UTC 2021
Module: Mesa
Branch: main
Commit: bb127c2130c550b222541aaaac3da377083e8e56
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=bb127c2130c550b222541aaaac3da377083e8e56
Author: Timur Kristóf <timur.kristof at gmail.com>
Date: Thu Apr 22 14:44:28 2021 +0200
radv: Use new NIR lowering of NGG GS when ACO is used.
Signed-off-by: Timur Kristóf <timur.kristof at gmail.com>
Reviewed-by: Daniel Schürmann <daniel at schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10740>
---
src/amd/compiler/aco_instruction_selection.cpp | 679 +--------------------
src/amd/compiler/aco_instruction_selection.h | 8 -
.../compiler/aco_instruction_selection_setup.cpp | 23 +-
src/amd/vulkan/radv_shader.c | 11 +-
4 files changed, 16 insertions(+), 705 deletions(-)
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 089addd3313..bb60ca99536 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -4290,44 +4290,6 @@ Temp thread_id_in_threadgroup(isel_context *ctx)
return bld.vadd32(bld.def(v1), Operand(num_pre_threads), Operand(tid_in_wave));
}
-Temp wave_count_in_threadgroup(isel_context *ctx)
-{
- Builder bld(ctx->program, ctx->block);
- return bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
- get_arg(ctx, ctx->args->ac.merged_wave_info), Operand(28u | (4u << 16)));
-}
-
-Temp ngg_gs_vertex_lds_addr(isel_context *ctx, Temp vertex_idx)
-{
- Builder bld(ctx->program, ctx->block);
- unsigned write_stride_2exp = ffs(MAX2(ctx->shader->info.gs.vertices_out, 1)) - 1;
-
- /* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */
- if (write_stride_2exp) {
- Temp row = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(5u), vertex_idx);
- Temp swizzle = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand((1u << write_stride_2exp) - 1), row);
- vertex_idx = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), vertex_idx, swizzle);
- }
-
- Temp vertex_idx_bytes = bld.v_mul24_imm(bld.def(v1), vertex_idx, ctx->ngg_gs_emit_vtx_bytes);
- return bld.vadd32(bld.def(v1), vertex_idx_bytes, Operand(ctx->ngg_gs_emit_addr));
-}
-
-Temp ngg_gs_emit_vertex_lds_addr(isel_context *ctx, Temp emit_vertex_idx)
-{
- /* Should be used by GS threads only (not by the NGG GS epilogue).
- * Returns the LDS address of the given vertex index as emitted by the current GS thread.
- */
-
- Builder bld(ctx->program, ctx->block);
-
- Temp thread_id_in_tg = thread_id_in_threadgroup(ctx);
- Temp thread_vertices_addr = bld.v_mul24_imm(bld.def(v1), thread_id_in_tg, ctx->shader->info.gs.vertices_out);
- Temp vertex_idx = bld.vadd32(bld.def(v1), thread_vertices_addr, emit_vertex_idx);
-
- return ngg_gs_vertex_lds_addr(ctx, vertex_idx);
-}
-
Temp get_tess_rel_patch_id(isel_context *ctx)
{
Builder bld(ctx->program, ctx->block);
@@ -6988,115 +6950,6 @@ void visit_load_sample_mask_in(isel_context *ctx, nir_intrinsic_instr *instr) {
}
}
-unsigned gs_outprim_vertices(unsigned outprim)
-{
- switch (outprim) {
- case 0: /* GL_POINTS */
- return 1;
- case 3: /* GL_LINE_STRIP */
- return 2;
- case 5: /* GL_TRIANGLE_STRIP */
- return 3;
- default:
- unreachable("Unsupported GS output primitive type.");
- }
-}
-
-void ngg_visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *instr)
-{
- Builder bld(ctx->program, ctx->block);
- Temp emit_vertex_idx = get_ssa_temp(ctx, instr->src[0].ssa);
- Temp emit_vertex_addr = ngg_gs_emit_vertex_lds_addr(ctx, emit_vertex_idx);
- unsigned stream = nir_intrinsic_stream_id(instr);
- unsigned out_idx = 0;
-
- for (unsigned i = 0; i <= VARYING_SLOT_VAR31; i++) {
- if (ctx->program->info->gs.output_streams[i] != stream) {
- continue;
- } else if (!ctx->outputs.mask[i] && ctx->program->info->gs.output_usage_mask[i]) {
- /* The GS can write this output, but it's empty for the current vertex. */
- out_idx++;
- continue;
- }
-
- uint32_t wrmask = ctx->program->info->gs.output_usage_mask[i] &
- ctx->outputs.mask[i];
-
- /* Clear output for the next vertex. */
- ctx->outputs.mask[i] = 0;
-
- if (!wrmask)
- continue;
-
- for (unsigned j = 0; j < 4; j++) {
- if (wrmask & (1 << j)) {
- Temp elem = ctx->outputs.temps[i * 4u + j];
- store_lds(ctx, elem.bytes(), elem, 0x1u, emit_vertex_addr, out_idx * 4u, 4u);
- }
-
- out_idx++;
- }
- }
-
- /* Calculate per-vertex primitive flags based on current and total vertex count per primitive:
- * bit 0: whether this vertex finishes a primitive
- * bit 1: whether the primitive is odd (if we are emitting triangle strips, otherwise always 0)
- * bit 2: always 1 (so that we can use it for determining vertex liveness)
- */
- unsigned total_vtx_per_prim = gs_outprim_vertices(ctx->shader->info.gs.output_primitive);
- bool calc_odd = stream == 0 && total_vtx_per_prim == 3;
- Temp prim_flag;
-
- if (nir_src_is_const(instr->src[1])) {
- uint8_t current_vtx_per_prim = nir_src_as_uint(instr->src[1]);
- uint8_t completes_prim = (current_vtx_per_prim >= (total_vtx_per_prim - 1)) ? 1 : 0;
- uint8_t odd = (uint8_t)calc_odd & current_vtx_per_prim;
- uint8_t flag = completes_prim | (odd << 1) | (1 << 2);
- prim_flag = bld.copy(bld.def(v1b), Operand(flag));
- } else if (!instr->src[1].ssa->divergent) {
- Temp current_vtx_per_prim = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
- Temp completes_prim = bld.sopc(aco_opcode::s_cmp_le_u32, bld.def(s1, scc), Operand(total_vtx_per_prim - 1), current_vtx_per_prim);
- prim_flag = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(0b101u), Operand(0b100u), bld.scc(completes_prim));
- if (calc_odd) {
- Temp odd = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), current_vtx_per_prim, Operand(0u));
- prim_flag = bld.sop2(aco_opcode::s_lshl1_add_u32, bld.def(s1), bld.def(s1, scc), odd, prim_flag);
- }
- } else {
- Temp current_vtx_per_prim = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
- Temp completes_prim = bld.vopc(aco_opcode::v_cmp_le_u32, bld.hint_vcc(bld.def(bld.lm)), Operand(total_vtx_per_prim - 1), current_vtx_per_prim);
- prim_flag = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0b100u), Operand(0b101u), Operand(completes_prim));
- if (calc_odd) {
- Temp odd = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), current_vtx_per_prim);
- prim_flag = bld.vop3(aco_opcode::v_lshl_or_b32, bld.def(v1), odd, Operand(1u), prim_flag);
- }
- }
-
- /* Store the per-vertex primitive flags at the end of the vertex data */
- prim_flag = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), as_vgpr(ctx, prim_flag), Operand(0u));
- unsigned primflag_offset = ctx->ngg_gs_primflags_offset + stream;
- store_lds(ctx, 1, prim_flag, 1u, emit_vertex_addr, primflag_offset, 1);
-}
-
-void ngg_gs_clear_primflags(isel_context *ctx, Temp vtx_cnt, unsigned stream);
-void ngg_gs_write_shader_query(isel_context *ctx, nir_intrinsic_instr *instr);
-
-void ngg_visit_set_vertex_and_primitive_count(isel_context *ctx, nir_intrinsic_instr *instr)
-{
- unsigned stream = nir_intrinsic_stream_id(instr);
- if (stream > 0 && !ctx->args->shader_info->gs.num_stream_output_components[stream])
- return;
-
- ctx->ngg_gs_known_vtxcnt[stream] = true;
-
- /* Clear the primitive flags of non-emitted GS vertices. */
- if (!nir_src_is_const(instr->src[0]) || nir_src_as_uint(instr->src[0]) < ctx->shader->info.gs.vertices_out) {
- Temp vtx_cnt = get_ssa_temp(ctx, instr->src[0].ssa);
- ngg_gs_clear_primflags(ctx, vtx_cnt, stream);
- }
-
- ngg_gs_write_shader_query(ctx, instr);
-}
-
void visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *instr)
{
Builder bld(ctx->program, ctx->block);
@@ -8521,10 +8374,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
break;
}
case nir_intrinsic_emit_vertex_with_counter: {
- if (ctx->stage.hw == HWStage::NGG)
- ngg_visit_emit_vertex_with_counter(ctx, instr);
- else
- visit_emit_vertex_with_counter(ctx, instr);
+ assert(ctx->stage.hw == HWStage::GS);
+ visit_emit_vertex_with_counter(ctx, instr);
break;
}
case nir_intrinsic_end_primitive_with_counter: {
@@ -8535,8 +8386,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
break;
}
case nir_intrinsic_set_vertex_and_primitive_count: {
- if (ctx->stage.hw == HWStage::NGG)
- ngg_visit_set_vertex_and_primitive_count(ctx, instr);
+ assert(ctx->stage.hw == HWStage::GS);
/* unused in the legacy pipeline, the HW keeps track of this for us */
break;
}
@@ -11193,20 +11043,6 @@ Temp merged_wave_info_to_mask(isel_context *ctx, unsigned i)
return lanecount_to_mask(ctx, count);
}
-Temp ngg_max_vertex_count(isel_context *ctx)
-{
- Builder bld(ctx->program, ctx->block);
- return bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
- get_arg(ctx, ctx->args->ac.gs_tg_info), Operand(12u | (9u << 16u)));
-}
-
-Temp ngg_max_primitive_count(isel_context *ctx)
-{
- Builder bld(ctx->program, ctx->block);
- return bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
- get_arg(ctx, ctx->args->ac.gs_tg_info), Operand(22u | (9u << 16u)));
-}
-
void ngg_emit_sendmsg_gs_alloc_req(isel_context *ctx, Temp vtx_cnt, Temp prm_cnt)
{
assert(vtx_cnt.id() && prm_cnt.id());
@@ -11214,7 +11050,7 @@ void ngg_emit_sendmsg_gs_alloc_req(isel_context *ctx, Temp vtx_cnt, Temp prm_cnt
Builder bld(ctx->program, ctx->block);
Temp prm_cnt_0;
- if (ctx->program->chip_class == GFX10 && ctx->stage.has(SWStage::GS) && ctx->ngg_gs_const_prmcnt[0] <= 0) {
+ if (ctx->program->chip_class == GFX10 && ctx->stage.has(SWStage::GS)) {
/* Navi 1x workaround: make sure to always export at least 1 vertex and triangle */
prm_cnt_0 = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), prm_cnt, Operand(0u));
prm_cnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(1u), prm_cnt, bld.scc(prm_cnt_0));
@@ -11254,495 +11090,6 @@ void ngg_emit_sendmsg_gs_alloc_req(isel_context *ctx, Temp vtx_cnt, Temp prm_cnt
}
}
-void ngg_emit_wave0_sendmsg_gs_alloc_req(isel_context *ctx, Temp vtx_cnt = Temp(), Temp prm_cnt = Temp())
-{
- Builder bld(ctx->program, ctx->block);
-
- /* Get the id of the current wave within the threadgroup (workgroup) */
- Builder::Result wave_id_in_tg = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
- get_arg(ctx, ctx->args->ac.merged_wave_info), Operand(24u | (4u << 16)));
-
- /* Execute the following code only on the first wave (wave id 0),
- * use the SCC def to tell if the wave id is zero or not.
- */
- Temp waveid_as_cond = wave_id_in_tg.def(1).getTemp();
- if_context ic;
- begin_uniform_if_then(ctx, &ic, waveid_as_cond);
- begin_uniform_if_else(ctx, &ic);
- bld.reset(ctx->block);
-
- /* VS/TES: we infer the vertex and primitive count from arguments
- * GS: the caller needs to supply them
- */
- assert(ctx->stage.has(SWStage::GS)
- ? (vtx_cnt.id() && prm_cnt.id())
- : (!vtx_cnt.id() && !prm_cnt.id()));
-
- /* Number of vertices output by VS/TES */
- if (vtx_cnt.id() == 0)
- vtx_cnt = ngg_max_vertex_count(ctx);
-
- /* Number of primitives output by VS/TES */
- if (prm_cnt.id() == 0)
- prm_cnt = ngg_max_primitive_count(ctx);
-
- ngg_emit_sendmsg_gs_alloc_req(ctx, vtx_cnt, prm_cnt);
-
- end_uniform_if(ctx, &ic);
-}
-
-Temp ngg_pack_prim_exp_arg(isel_context *ctx, unsigned num_vertices, const Temp vtxindex[], const Temp is_null)
-{
- Builder bld(ctx->program, ctx->block);
-
- Temp tmp;
- Temp gs_invocation_id;
-
- if (ctx->stage == vertex_ngg)
- gs_invocation_id = get_arg(ctx, ctx->args->ac.gs_invocation_id);
-
- for (unsigned i = 0; i < num_vertices; ++i) {
- assert(vtxindex[i].id());
-
- if (i)
- tmp = bld.vop3(aco_opcode::v_lshl_or_b32, bld.def(v1), vtxindex[i], Operand(10u * i), tmp);
- else
- tmp = vtxindex[i];
-
- /* The initial edge flag is always false in tess eval shaders. */
- if (ctx->stage == vertex_ngg) {
- Temp edgeflag = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), gs_invocation_id, Operand(8u + i), Operand(1u));
- tmp = bld.vop3(aco_opcode::v_lshl_or_b32, bld.def(v1), edgeflag, Operand(10u * i + 9u), tmp);
- }
- }
-
- if (is_null.id())
- tmp = bld.vop3(aco_opcode::v_lshl_or_b32, bld.def(v1), is_null, Operand(31u), tmp);
-
- return tmp;
-}
-
-void ngg_emit_prim_export(isel_context *ctx, unsigned num_vertices_per_primitive, const Temp vtxindex[], const Temp is_null = Temp())
-{
- Builder bld(ctx->program, ctx->block);
- Temp prim_exp_arg;
-
- if (!ctx->stage.has(SWStage::GS) && ctx->args->options->key.vs_common_out.as_ngg_passthrough)
- prim_exp_arg = get_arg(ctx, ctx->args->ac.gs_vtx_offset[0]);
- else
- prim_exp_arg = ngg_pack_prim_exp_arg(ctx, num_vertices_per_primitive, vtxindex, is_null);
-
- bld.exp(aco_opcode::exp, prim_exp_arg, Operand(v1), Operand(v1), Operand(v1),
- 1 /* enabled mask */, V_008DFC_SQ_EXP_PRIM /* dest */,
- false /* compressed */, true/* done */, false /* valid mask */);
-}
-
-std::pair<Temp, Temp> ngg_gs_workgroup_reduce_and_scan(isel_context *ctx, Temp src_mask)
-{
- /* Workgroup scan for NGG GS.
- * This performs a reduction along with an exclusive scan addition accross the workgroup.
- * Assumes that all lanes are enabled (exec = -1) where this is emitted.
- *
- * Input: (1) per-lane bool
- * -- 1 if the lane has a live/valid vertex, 0 otherwise
- * Output: (1) result of a reduction over the entire workgroup,
- * -- the total number of vertices emitted by the workgroup
- * (2) result of an exclusive scan over the entire workgroup
- * -- used for vertex compaction, in order to determine
- * which lane should export the current lane's vertex
- */
-
- Builder bld(ctx->program, ctx->block);
- assert(src_mask.regClass() == bld.lm);
-
- /* Subgroup reduction and exclusive scan on the per-lane boolean. */
- Temp sg_reduction = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), src_mask);
- Temp sg_excl = emit_mbcnt(ctx, bld.tmp(v1), Operand(src_mask));
-
- if (ctx->program->workgroup_size <= ctx->program->wave_size)
- return std::make_pair(sg_reduction, sg_excl);
-
- if_context ic;
-
- /* Determine if the current lane is the first. */
- Temp is_first_lane = bld.copy(bld.def(bld.lm), Operand(1u, ctx->program->wave_size == 64));
- Temp wave_id_in_tg = wave_id_in_threadgroup(ctx);
- begin_divergent_if_then(ctx, &ic, is_first_lane);
- bld.reset(ctx->block);
-
- /* The first lane of each wave stores the result of its subgroup reduction to LDS (NGG scratch). */
- Temp wave_id_in_tg_lds_addr = bld.vop2_e64(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), wave_id_in_tg);
- store_lds(ctx, 4u, as_vgpr(ctx, sg_reduction), 0x1u, wave_id_in_tg_lds_addr, ctx->ngg_gs_scratch_addr, 4u);
-
- /* Wait for all waves to write to LDS. */
- create_workgroup_barrier(bld);
-
- /* Number of LDS dwords written by all waves (if there is only 1, that is already handled above) */
- unsigned num_lds_dwords = DIV_ROUND_UP(MIN2(ctx->program->workgroup_size, 256), ctx->program->wave_size);
- assert(num_lds_dwords >= 2 && num_lds_dwords <= 8);
-
- /* The first lane of each wave loads every wave's results from LDS, to avoid bank conflicts */
- Temp reduction_per_wave_vector = load_lds(ctx, 4u * num_lds_dwords, bld.tmp(RegClass(RegType::vgpr, num_lds_dwords)),
- bld.copy(bld.def(v1), Operand(0u)), ctx->ngg_gs_scratch_addr, 16u);
-
- begin_divergent_if_else(ctx, &ic);
- end_divergent_if(ctx, &ic);
- bld.reset(ctx->block);
-
- /* Create phis which get us the above reduction results, or undef. */
- bld.reset(&ctx->block->instructions, ctx->block->instructions.begin());
- reduction_per_wave_vector = bld.pseudo(aco_opcode::p_phi, bld.def(reduction_per_wave_vector.regClass()), reduction_per_wave_vector, Operand(reduction_per_wave_vector.regClass()));
- bld.reset(ctx->block);
-
- emit_split_vector(ctx, reduction_per_wave_vector, num_lds_dwords);
- Temp reduction_per_wave[8];
-
- for (unsigned i = 0; i < num_lds_dwords; ++i) {
- Temp reduction_current_wave = emit_extract_vector(ctx, reduction_per_wave_vector, i, v1);
- reduction_per_wave[i] = bld.readlane(bld.def(s1), reduction_current_wave, Operand(0u));
- }
-
- Temp wave_count = wave_count_in_threadgroup(ctx);
- Temp reduction_result = reduction_per_wave[0];
- Temp excl_base;
-
- for (unsigned i = 0; i < num_lds_dwords; ++i) {
- /* Workgroup reduction:
- * Add the reduction results from all waves (up to and including wave_count).
- */
- if (i != 0) {
- Temp should_add = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), wave_count, Operand(i + 1u));
- Temp addition = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), reduction_per_wave[i], Operand(0u), bld.scc(should_add));
- reduction_result = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), reduction_result, addition);
- }
-
- /* Base of workgroup exclusive scan:
- * Add the reduction results from waves up to and excluding wave_id_in_tg.
- */
- if (i != (num_lds_dwords - 1)) {
- Temp should_add = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), wave_id_in_tg, Operand(i + 1u));
- Temp addition = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), reduction_per_wave[i], Operand(0u), bld.scc(should_add));
- excl_base = !excl_base.id() ? addition : bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), excl_base, addition);
- }
- }
-
- assert(excl_base.id());
-
- /* WG exclusive scan result: base + subgroup exclusive result. */
- Temp wg_excl = bld.vadd32(bld.def(v1), Operand(excl_base), Operand(sg_excl));
-
- return std::make_pair(reduction_result, wg_excl);
-}
-
-void ngg_gs_clear_primflags(isel_context *ctx, Temp vtx_cnt, unsigned stream)
-{
- loop_context lc;
- if_context ic;
- Builder bld(ctx->program, ctx->block);
- Temp zero = bld.copy(bld.def(v1b), Operand(uint8_t(0)));
- Temp counter_init = bld.copy(bld.def(v1), as_vgpr(ctx, vtx_cnt));
-
- begin_loop(ctx, &lc);
-
- Temp incremented_counter = bld.tmp(counter_init.regClass());
- bld.reset(&ctx->block->instructions, ctx->block->instructions.begin());
- Temp counter = bld.pseudo(aco_opcode::p_phi, bld.def(counter_init.regClass()), Operand(counter_init), incremented_counter);
- bld.reset(ctx->block);
- Temp break_cond = bld.vopc(aco_opcode::v_cmp_le_u32, bld.def(bld.lm), Operand(ctx->shader->info.gs.vertices_out), counter);
-
- /* Break when vertices_out <= counter */
- begin_divergent_if_then(ctx, &ic, break_cond);
- emit_loop_break(ctx);
- begin_divergent_if_else(ctx, &ic);
- end_divergent_if(ctx, &ic);
- bld.reset(ctx->block);
-
- /* Store zero to the primitive flag of the current vertex for the current stream */
- Temp emit_vertex_addr = ngg_gs_emit_vertex_lds_addr(ctx, counter);
- unsigned primflag_offset = ctx->ngg_gs_primflags_offset + stream;
- store_lds(ctx, 1, zero, 0xf, emit_vertex_addr, primflag_offset, 1);
-
- /* Increment counter */
- bld.vadd32(Definition(incremented_counter), counter, Operand(1u));
-
- end_loop(ctx, &lc);
-}
-
-void ngg_gs_write_shader_query(isel_context *ctx, nir_intrinsic_instr *instr)
-{
- /* Each subgroup uses a single GDS atomic to collect the total number of primitives.
- * TODO: Consider using primitive compaction at the end instead.
- */
-
- unsigned total_vtx_per_prim = gs_outprim_vertices(ctx->shader->info.gs.output_primitive);
- if_context ic_shader_query;
- Builder bld(ctx->program, ctx->block);
-
- Temp shader_query = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), get_arg(ctx, ctx->args->ngg_gs_state), Operand(0u));
- begin_uniform_if_then(ctx, &ic_shader_query, shader_query);
- bld.reset(ctx->block);
-
- Temp sg_prm_cnt;
-
- /* Calculate the "real" number of emitted primitives from the emitted GS vertices and primitives.
- * GS emits points, line strips or triangle strips.
- * Real primitives are points, lines or triangles.
- */
- if (nir_src_is_const(instr->src[0]) && nir_src_is_const(instr->src[1])) {
- unsigned gs_vtx_cnt = nir_src_as_uint(instr->src[0]);
- unsigned gs_prm_cnt = nir_src_as_uint(instr->src[1]);
- Temp prm_cnt = bld.copy(bld.def(s1), Operand(gs_vtx_cnt - gs_prm_cnt * (total_vtx_per_prim - 1u)));
- Temp thread_cnt = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), Operand(exec, bld.lm));
- sg_prm_cnt = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), prm_cnt, thread_cnt);
- } else {
- Temp gs_vtx_cnt = get_ssa_temp(ctx, instr->src[0].ssa);
- Temp prm_cnt = get_ssa_temp(ctx, instr->src[1].ssa);
- if (total_vtx_per_prim > 1)
- prm_cnt = bld.vop3(aco_opcode::v_mad_i32_i24, bld.def(v1), prm_cnt, Operand(-1u * (total_vtx_per_prim - 1)), gs_vtx_cnt);
- else
- prm_cnt = as_vgpr(ctx, prm_cnt);
-
- /* Reduction calculates the primitive count for the entire subgroup. */
- sg_prm_cnt = emit_reduction_instr(ctx, aco_opcode::p_reduce, ReduceOp::iadd32,
- ctx->program->wave_size, bld.def(s1), prm_cnt);
- }
-
- Temp first_lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
- Temp is_first_lane = bld.sop2(Builder::s_lshl, bld.def(bld.lm), bld.def(s1, scc),
- Operand(1u, ctx->program->wave_size == 64), first_lane);
-
- if_context ic_last_lane;
- begin_divergent_if_then(ctx, &ic_last_lane, is_first_lane);
- bld.reset(ctx->block);
-
- Temp gds_addr = bld.copy(bld.def(v1), Operand(0u));
- Operand m = bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand(0x100u)));
- bld.ds(aco_opcode::ds_add_u32, gds_addr, as_vgpr(ctx, sg_prm_cnt), m, 0u, 0u, true);
-
- begin_divergent_if_else(ctx, &ic_last_lane);
- end_divergent_if(ctx, &ic_last_lane);
-
- begin_uniform_if_else(ctx, &ic_shader_query);
- end_uniform_if(ctx, &ic_shader_query);
-}
-
-Temp ngg_gs_load_prim_flag_0(isel_context *ctx, Temp tid_in_tg, Temp max_vtxcnt, Temp vertex_lds_addr)
-{
- if_context ic;
- Builder bld(ctx->program, ctx->block);
-
- Temp is_vertex_emit_thread = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.def(bld.lm), max_vtxcnt, tid_in_tg);
- begin_divergent_if_then(ctx, &ic, is_vertex_emit_thread);
- bld.reset(ctx->block);
-
- Operand m = load_lds_size_m0(bld);
- Temp prim_flag_0 = bld.ds(aco_opcode::ds_read_u8, bld.def(v1), vertex_lds_addr, m, ctx->ngg_gs_primflags_offset);
-
- begin_divergent_if_else(ctx, &ic);
- end_divergent_if(ctx, &ic);
-
- bld.reset(&ctx->block->instructions, ctx->block->instructions.begin());
- prim_flag_0 = bld.pseudo(aco_opcode::p_phi, bld.def(prim_flag_0.regClass()), Operand(prim_flag_0), Operand(0u));
-
- return prim_flag_0;
-}
-
-void ngg_gs_setup_vertex_compaction(isel_context *ctx, Temp vertex_live, Temp tid_in_tg, Temp exporter_tid_in_tg)
-{
- if_context ic;
- Builder bld(ctx->program, ctx->block);
- assert(vertex_live.regClass() == bld.lm);
-
- begin_divergent_if_then(ctx, &ic, vertex_live);
- bld.reset(ctx->block);
-
- /* Setup the vertex compaction.
- * Save the current thread's id for the thread which will export the current vertex.
- * We reuse stream 1 of the primitive flag of the other thread's vertex for storing this.
- */
- Temp export_thread_lds_addr = ngg_gs_vertex_lds_addr(ctx, exporter_tid_in_tg);
- tid_in_tg = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tid_in_tg, Operand(0u));
- store_lds(ctx, 1u, tid_in_tg, 1u, export_thread_lds_addr, ctx->ngg_gs_primflags_offset + 1u, 1u);
-
- begin_divergent_if_else(ctx, &ic);
- end_divergent_if(ctx, &ic);
- bld.reset(ctx->block);
-
- /* Wait for all waves to setup the vertex compaction. */
- create_workgroup_barrier(bld);
-}
-
-void ngg_gs_export_primitives(isel_context *ctx, Temp max_prmcnt, Temp tid_in_tg, Temp exporter_tid_in_tg,
- Temp prim_flag_0)
-{
- if_context ic;
- Builder bld(ctx->program, ctx->block);
- unsigned total_vtx_per_prim = gs_outprim_vertices(ctx->shader->info.gs.output_primitive);
- assert(total_vtx_per_prim <= 3);
-
- Temp is_prim_export_thread = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.def(bld.lm), max_prmcnt, tid_in_tg);
- begin_divergent_if_then(ctx, &ic, is_prim_export_thread);
- bld.reset(ctx->block);
-
- Temp is_null_prim = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(-1u), prim_flag_0);
- Temp indices[3];
-
- indices[total_vtx_per_prim - 1] = exporter_tid_in_tg;
- if (total_vtx_per_prim >= 2)
- indices[total_vtx_per_prim - 2] = bld.vsub32(bld.def(v1), exporter_tid_in_tg, Operand(1u));
- if (total_vtx_per_prim == 3)
- indices[total_vtx_per_prim - 3] = bld.vsub32(bld.def(v1), exporter_tid_in_tg, Operand(2u));
-
- if (total_vtx_per_prim == 3) {
- /* API GS outputs triangle strips, but NGG HW needs triangles.
- * We already have triangles due to how we set the primitive flags, but we need to
- * make sure the vertex order is so that the front/back is correct, and the provoking vertex is kept.
- */
- bool flatshade_first = !ctx->args->options->key.vs.provoking_vtx_last;
-
- /* If the triangle is odd, this will swap its two non-provoking vertices. */
- Temp is_odd = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), Operand(prim_flag_0), Operand(1u), Operand(1u));
- if (flatshade_first) {
- indices[1] = bld.vadd32(bld.def(v1), indices[1], Operand(is_odd));
- indices[2] = bld.vsub32(bld.def(v1), indices[2], Operand(is_odd));
- } else {
- indices[0] = bld.vadd32(bld.def(v1), indices[0], Operand(is_odd));
- indices[1] = bld.vsub32(bld.def(v1), indices[1], Operand(is_odd));
- }
- }
-
- ngg_emit_prim_export(ctx, total_vtx_per_prim, indices, is_null_prim);
-
- begin_divergent_if_else(ctx, &ic);
- end_divergent_if(ctx, &ic);
-}
-
-void ngg_gs_export_vertices(isel_context *ctx, Temp wg_vtx_cnt, Temp tid_in_tg, Temp vertex_lds_addr)
-{
- if_context ic;
- Builder bld(ctx->program, ctx->block);
-
- /* See if the current thread has to export a vertex. */
- Temp is_vtx_export_thread = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.def(bld.lm), wg_vtx_cnt, tid_in_tg);
- begin_divergent_if_then(ctx, &ic, is_vtx_export_thread);
- bld.reset(ctx->block);
-
- /* The index of the vertex that the current thread will export. */
- Temp exported_vtx_idx;
-
- if (ctx->ngg_gs_early_alloc) {
- /* No vertex compaction necessary, the thread can export its own vertex. */
- exported_vtx_idx = tid_in_tg;
- } else {
- /* Vertex compaction: read stream 1 of the primitive flags to see which vertex the current thread needs to export */
- Operand m = load_lds_size_m0(bld);
- exported_vtx_idx = bld.ds(aco_opcode::ds_read_u8, bld.def(v1), vertex_lds_addr, m, ctx->ngg_gs_primflags_offset + 1);
- }
-
- /* Get the LDS address of the vertex that the current thread must export. */
- Temp exported_vtx_addr = ngg_gs_vertex_lds_addr(ctx, exported_vtx_idx);
-
- /* Read the vertex attributes from LDS. */
- unsigned out_idx = 0;
- for (unsigned i = 0; i <= VARYING_SLOT_VAR31; i++) {
- if (ctx->program->info->gs.output_streams[i] != 0)
- continue;
-
- /* Set the output mask to the GS output usage mask. */
- unsigned rdmask =
- ctx->outputs.mask[i] =
- ctx->program->info->gs.output_usage_mask[i];
-
- if (!rdmask)
- continue;
-
- for (unsigned j = 0; j < 4; j++) {
- if (rdmask & (1 << j))
- ctx->outputs.temps[i * 4u + j] =
- load_lds(ctx, 4u, bld.tmp(v1), exported_vtx_addr, out_idx * 4u, 4u);
-
- out_idx++;
- }
- }
-
- /* Export the vertex parameters. */
- create_vs_exports(ctx);
-
- begin_divergent_if_else(ctx, &ic);
- end_divergent_if(ctx, &ic);
-}
-
-void ngg_gs_prelude(isel_context *ctx)
-{
- if (!ctx->ngg_gs_early_alloc)
- return;
-
- /* We know the GS writes the maximum possible number of vertices, so
- * it's likely that most threads need to export a primitive, too.
- * Thus, we won't have to worry about primitive compaction here.
- */
- Temp num_max_vertices = ngg_max_vertex_count(ctx);
- ngg_emit_wave0_sendmsg_gs_alloc_req(ctx, num_max_vertices, num_max_vertices);
-}
-
-void ngg_gs_finale(isel_context *ctx)
-{
- /* Sanity check. Make sure the vertex/primitive counts are set and the LDS is correctly initialized. */
- assert(ctx->ngg_gs_known_vtxcnt[0]);
-
- if_context ic;
- Builder bld(ctx->program, ctx->block);
-
- /* Wait for all waves to reach the epilogue. */
- create_workgroup_barrier(bld);
-
- /* Thread ID in the entire threadgroup */
- Temp tid_in_tg = thread_id_in_threadgroup(ctx);
- /* Number of threads that may need to export a vertex or primitive. */
- Temp max_vtxcnt = ngg_max_vertex_count(ctx);
- /* LDS address of the vertex corresponding to the current thread. */
- Temp vertex_lds_addr = ngg_gs_vertex_lds_addr(ctx, tid_in_tg);
- /* Primitive flag from stream 0 of the vertex corresponding to the current thread. */
- Temp prim_flag_0 = ngg_gs_load_prim_flag_0(ctx, tid_in_tg, max_vtxcnt, vertex_lds_addr);
-
- bld.reset(ctx->block);
-
- /* NIR already filters out incomplete primitives and vertices,
- * so any vertex whose primitive flag is non-zero is considered live/valid.
- */
- Temp vertex_live = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), Operand(prim_flag_0));
-
- /* Total number of vertices emitted by the workgroup. */
- Temp wg_vtx_cnt;
- /* ID of the thread which will export the current thread's vertex. */
- Temp exporter_tid_in_tg;
-
- if (ctx->ngg_gs_early_alloc) {
- /* There is no need for a scan or vertex compaction, we know that
- * the GS writes all possible vertices so each thread can export its own vertex.
- */
- wg_vtx_cnt = max_vtxcnt;
- exporter_tid_in_tg = tid_in_tg;
- } else {
- /* Perform a workgroup reduction and exclusive scan. */
- std::pair<Temp, Temp> wg_scan = ngg_gs_workgroup_reduce_and_scan(ctx, vertex_live);
- bld.reset(ctx->block);
- /* Total number of vertices emitted by the workgroup. */
- wg_vtx_cnt = wg_scan.first;
- /* ID of the thread which will export the current thread's vertex. */
- exporter_tid_in_tg = wg_scan.second;
- /* Skip all exports when possible. */
- Temp have_exports = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), wg_vtx_cnt, Operand(0u));
- max_vtxcnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), max_vtxcnt, Operand(0u), bld.scc(have_exports));
-
- ngg_emit_wave0_sendmsg_gs_alloc_req(ctx, wg_vtx_cnt, max_vtxcnt);
- ngg_gs_setup_vertex_compaction(ctx, vertex_live, tid_in_tg, exporter_tid_in_tg);
- }
-
- ngg_gs_export_primitives(ctx, max_vtxcnt, tid_in_tg, exporter_tid_in_tg, prim_flag_0);
- ngg_gs_export_vertices(ctx, wg_vtx_cnt, tid_in_tg, vertex_lds_addr);
-}
-
} /* end namespace */
void select_program(Program *program,
@@ -11753,7 +11100,7 @@ void select_program(Program *program,
{
isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args, false);
if_context ic_merged_wave_info;
- bool ngg_gs = ctx.stage.hw == HWStage::NGG && ctx.stage.has(SWStage::GS);
+ bool ngg_gs = ctx.stage.hw == HWStage::NGG && ctx.stage.has(SWStage::GS);
for (unsigned i = 0; i < shader_count; i++) {
nir_shader *nir = shaders[i];
@@ -11776,9 +11123,6 @@ void select_program(Program *program,
}
}
- if (!i && ngg_gs)
- ngg_gs_prelude(&ctx);
-
/* In a merged VS+TCS HS, the VS implementation can be completely empty. */
nir_function_impl *func = nir_shader_get_entrypoint(nir);
bool empty_shader = nir_cf_list_is_empty_block(&func->body) &&
@@ -11787,14 +11131,8 @@ void select_program(Program *program,
(nir->info.stage == MESA_SHADER_TESS_EVAL &&
ctx.stage == tess_eval_geometry_gs));
- bool check_merged_wave_info = ctx.tcs_in_out_eq ? i == 0 : (shader_count >= 2 && !empty_shader);
- bool endif_merged_wave_info = ctx.tcs_in_out_eq ? i == 1 : check_merged_wave_info;
-
- if (i && ngg_gs) {
- /* NGG GS waves need to wait for each other after the GS half is done. */
- Builder bld(ctx.program, ctx.block);
- create_workgroup_barrier(bld);
- }
+ bool check_merged_wave_info = ctx.tcs_in_out_eq ? i == 0 : (shader_count >= 2 && !empty_shader && !(ngg_gs && i == 1));
+ bool endif_merged_wave_info = ctx.tcs_in_out_eq ? i == 1 : (check_merged_wave_info && !(ngg_gs && i == 1));
if (check_merged_wave_info) {
Temp cond = merged_wave_info_to_mask(&ctx, i);
@@ -11843,9 +11181,6 @@ void select_program(Program *program,
end_divergent_if(&ctx, &ic_merged_wave_info);
}
- if (ngg_gs && nir->info.stage == MESA_SHADER_GEOMETRY)
- ngg_gs_finale(&ctx);
-
if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {
/* Outputs of the previous stage are inputs to the next stage */
ctx.inputs = ctx.outputs;
diff --git a/src/amd/compiler/aco_instruction_selection.h b/src/amd/compiler/aco_instruction_selection.h
index 0a5462c04c2..379da7772e0 100644
--- a/src/amd/compiler/aco_instruction_selection.h
+++ b/src/amd/compiler/aco_instruction_selection.h
@@ -93,15 +93,7 @@ struct isel_context {
Temp persp_centroid, linear_centroid;
/* GS inputs */
- bool ngg_gs_early_alloc = false;
- bool ngg_gs_known_vtxcnt[4] = {false, false, false, false};
Temp gs_wave_id;
- unsigned ngg_gs_emit_addr = 0;
- unsigned ngg_gs_emit_vtx_bytes = 0;
- unsigned ngg_gs_scratch_addr = 0;
- unsigned ngg_gs_primflags_offset = 0;
- int ngg_gs_const_vtxcnt[4];
- int ngg_gs_const_prmcnt[4];
/* VS output information */
bool export_clip_dists;
diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp
index 2e44568f8eb..e92e6b3cc9b 100644
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -407,28 +407,7 @@ void setup_gs_variables(isel_context *ctx, nir_shader *nir)
setup_vs_output_info(ctx, nir, false,
ctx->options->key.vs_common_out.export_clip_dists, outinfo);
- unsigned ngg_gs_scratch_bytes = ctx->args->shader_info->so.num_outputs ? (44u * 4u) : (8u * 4u);
- unsigned ngg_emit_bytes = ctx->args->shader_info->ngg_info.ngg_emit_size * 4u;
- unsigned esgs_ring_bytes = ctx->args->shader_info->ngg_info.esgs_ring_size;
-
- ctx->ngg_gs_primflags_offset = ctx->args->shader_info->gs.gsvs_vertex_size;
- ctx->ngg_gs_emit_vtx_bytes = ctx->ngg_gs_primflags_offset + 4u;
- ctx->ngg_gs_emit_addr = esgs_ring_bytes;
- ctx->ngg_gs_scratch_addr = ctx->ngg_gs_emit_addr + ngg_emit_bytes;
- ctx->ngg_gs_scratch_addr = ALIGN(ctx->ngg_gs_scratch_addr, 16u);
-
- unsigned total_lds_bytes = ctx->ngg_gs_scratch_addr + ngg_gs_scratch_bytes;
- assert(total_lds_bytes >= ctx->ngg_gs_emit_addr);
- assert(total_lds_bytes >= ctx->ngg_gs_scratch_addr);
- ctx->program->config->lds_size = DIV_ROUND_UP(total_lds_bytes, ctx->program->dev.lds_encoding_granule);
-
- /* Make sure we have enough room for emitted GS vertices */
- if (nir->info.gs.vertices_out)
- assert((ngg_emit_bytes % (ctx->ngg_gs_emit_vtx_bytes * nir->info.gs.vertices_out)) == 0);
-
- /* See if the number of vertices and primitives are compile-time known */
- nir_gs_count_vertices_and_primitives(nir, ctx->ngg_gs_const_vtxcnt, ctx->ngg_gs_const_prmcnt, 4u);
- ctx->ngg_gs_early_alloc = ctx->ngg_gs_const_vtxcnt[0] == nir->info.gs.vertices_out && ctx->ngg_gs_const_prmcnt[0] != -1;
+ ctx->program->config->lds_size = DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule);
}
if (ctx->stage.has(SWStage::VS))
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index fd5780a55e8..385cc6d4486 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -862,11 +862,16 @@ bool radv_lower_ngg(struct radv_device *device, struct nir_shader *nir, bool has
info->is_ngg_passthrough = out_conf.passthrough;
key->vs_common_out.as_ngg_passthrough = out_conf.passthrough;
} else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
- if (!key->vs_common_out.as_ngg)
+ if (!info->is_ngg)
return false;
- /* TODO: lower NGG GS in NIR */
- return false;
+ ac_nir_lower_ngg_gs(
+ nir, info->wave_size, max_workgroup_size,
+ info->ngg_info.esgs_ring_size,
+ info->gs.gsvs_vertex_size,
+ info->ngg_info.ngg_emit_size * 4u,
+ key->vs.provoking_vtx_last);
+ return true;
} else {
return false;
}
More information about the mesa-commit
mailing list