Mesa (main): radv: Use new NGG NIR lowering for VS/TES when ACO is used.
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Wed May 12 14:12:17 UTC 2021
Module: Mesa
Branch: main
Commit: 9732881729bd0da8096e03efe77e511d4d6f815d
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=9732881729bd0da8096e03efe77e511d4d6f815d
Author: Timur Kristóf <timur.kristof at gmail.com>
Date: Thu Apr 15 17:21:57 2021 +0200
radv: Use new NGG NIR lowering for VS/TES when ACO is used.
Signed-off-by: Timur Kristóf <timur.kristof at gmail.com>
Reviewed-by: Daniel Schürmann <daniel at schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10740>
---
src/amd/compiler/aco_instruction_selection.cpp | 146 ++-------------------
src/amd/compiler/aco_instruction_selection.h | 1 -
.../compiler/aco_instruction_selection_setup.cpp | 15 +--
src/amd/vulkan/radv_pipeline.c | 3 +-
src/amd/vulkan/radv_shader.c | 64 +++++++++
src/amd/vulkan/radv_shader.h | 5 +
6 files changed, 90 insertions(+), 144 deletions(-)
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 6f7a97c7a69..089addd3313 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -4396,6 +4396,8 @@ bool load_input_from_temps(isel_context *ctx, nir_intrinsic_instr *instr, Temp d
return true;
}
+static void export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *next_pos);
+
void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr)
{
if (ctx->stage == vertex_vs ||
@@ -4413,6 +4415,11 @@ void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr)
} else {
unreachable("Shader stage not implemented");
}
+
+ /* For NGG VS and TES shaders the primitive ID is exported manually after the other exports so we have to emit an exp here manually */
+ if (ctx->stage.hw == HWStage::NGG && (ctx->stage.has(SWStage::VS) || ctx->stage.has(SWStage::TES)) &&
+ nir_intrinsic_io_semantics(instr).location == VARYING_SLOT_PRIMITIVE_ID)
+ export_vs_varying(ctx, VARYING_SLOT_PRIMITIVE_ID, false, NULL);
}
void emit_interp_instr(isel_context *ctx, unsigned idx, unsigned component, Temp src, Temp dst, Temp prim_mask)
@@ -8496,8 +8503,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
break;
default:
if (ctx->stage.hw == HWStage::NGG && !ctx->stage.has(SWStage::GS)) {
- /* This is actually the same as gs_prim_id, but we call it differently when there is no SW GS. */
- bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.vs_prim_id));
+ /* In case of NGG, the GS threads always have the primitive ID even if there is no SW GS. */
+ bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id));
break;
}
unreachable("Unimplemented shader stage for nir_intrinsic_load_primitive_id");
@@ -11330,126 +11337,6 @@ void ngg_emit_prim_export(isel_context *ctx, unsigned num_vertices_per_primitive
false /* compressed */, true/* done */, false /* valid mask */);
}
-void ngg_nogs_export_primitives(isel_context *ctx)
-{
- /* Emit the things that NGG GS threads need to do, for shaders that don't have SW GS.
- * These must always come before VS exports.
- *
- * It is recommended to do these as early as possible. They can be at the beginning when
- * there is no SW GS and the shader doesn't write edge flags.
- */
-
- if_context ic;
- Temp is_gs_thread = merged_wave_info_to_mask(ctx, 1);
- begin_divergent_if_then(ctx, &ic, is_gs_thread);
-
- Builder bld(ctx->program, ctx->block);
- constexpr unsigned max_vertices_per_primitive = 3;
- unsigned num_vertices_per_primitive = max_vertices_per_primitive;
-
- assert(!ctx->stage.has(SWStage::GS));
-
- if (ctx->stage == vertex_ngg) {
- /* TODO: optimize for points & lines */
- } else if (ctx->stage == tess_eval_ngg) {
- if (ctx->shader->info.tess.point_mode)
- num_vertices_per_primitive = 1;
- else if (ctx->shader->info.tess.primitive_mode == GL_ISOLINES)
- num_vertices_per_primitive = 2;
- } else {
- unreachable("Unsupported NGG non-GS shader stage");
- }
-
- Temp vtxindex[max_vertices_per_primitive];
- if (!ctx->args->options->key.vs_common_out.as_ngg_passthrough) {
- vtxindex[0] = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffffu),
- get_arg(ctx, ctx->args->ac.gs_vtx_offset[0]));
- vtxindex[1] = num_vertices_per_primitive < 2 ? Temp(0, v1) :
- bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
- get_arg(ctx, ctx->args->ac.gs_vtx_offset[0]), Operand(16u), Operand(16u));
- vtxindex[2] = num_vertices_per_primitive < 3 ? Temp(0, v1) :
- bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffffu),
- get_arg(ctx, ctx->args->ac.gs_vtx_offset[2]));
- }
-
- /* Export primitive data to the index buffer. */
- ngg_emit_prim_export(ctx, num_vertices_per_primitive, vtxindex);
-
- /* Export primitive ID. */
- if (ctx->stage == vertex_ngg && ctx->args->options->key.vs_common_out.export_prim_id) {
- /* Copy Primitive IDs from GS threads to the LDS address corresponding to the ES thread of the provoking vertex. */
- Temp prim_id = get_arg(ctx, ctx->args->ac.gs_prim_id);
- unsigned provoking_vtx_in_prim = 0;
-
- /* For provoking vertex last mode, use num_vtx_in_prim - 1. */
- if (ctx->args->options->key.vs.provoking_vtx_last)
- provoking_vtx_in_prim = ctx->args->options->key.vs.outprim;
-
- Temp provoking_vtx_index = vtxindex[provoking_vtx_in_prim];
- Temp addr = bld.v_mul_imm(bld.def(v1), provoking_vtx_index, 4u);
-
- store_lds(ctx, 4, prim_id, 0x1u, addr, 0u, 4u);
- }
-
- begin_divergent_if_else(ctx, &ic);
- end_divergent_if(ctx, &ic);
-}
-
-void ngg_nogs_export_prim_id(isel_context *ctx)
-{
- assert(ctx->args->options->key.vs_common_out.export_prim_id);
- Temp prim_id;
-
- if (ctx->stage == vertex_ngg) {
- /* Wait for GS threads to store primitive ID in LDS. */
- Builder bld(ctx->program, ctx->block);
- create_workgroup_barrier(bld);
-
- /* Calculate LDS address where the GS threads stored the primitive ID. */
- Temp thread_id_in_tg = thread_id_in_threadgroup(ctx);
- Temp addr = bld.v_mul24_imm(bld.def(v1), thread_id_in_tg, 4u);
-
- /* Load primitive ID from LDS. */
- prim_id = load_lds(ctx, 4, bld.tmp(v1), addr, 0u, 4u);
- } else if (ctx->stage == tess_eval_ngg) {
- /* TES: Just use the patch ID as the primitive ID. */
- prim_id = get_arg(ctx, ctx->args->ac.tes_patch_id);
- } else {
- unreachable("unsupported NGG non-GS shader stage.");
- }
-
- ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
- ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] = prim_id;
-
- export_vs_varying(ctx, VARYING_SLOT_PRIMITIVE_ID, false, nullptr);
-}
-
-void ngg_nogs_prelude(isel_context *ctx)
-{
- ngg_emit_wave0_sendmsg_gs_alloc_req(ctx);
-
- if (ctx->ngg_nogs_early_prim_export)
- ngg_nogs_export_primitives(ctx);
-}
-
-void ngg_nogs_late_export_finale(isel_context *ctx)
-{
- assert(!ctx->ngg_nogs_early_prim_export);
-
- /* Export VS/TES primitives. */
- ngg_nogs_export_primitives(ctx);
-
- /* Export the primitive ID for VS - needs to read LDS written by GS threads. */
- if (ctx->args->options->key.vs_common_out.export_prim_id && ctx->stage.has(SWStage::VS)) {
- if_context ic;
- Temp is_es_thread = merged_wave_info_to_mask(ctx, 0);
- begin_divergent_if_then(ctx, &ic, is_es_thread);
- ngg_nogs_export_prim_id(ctx);
- begin_divergent_if_else(ctx, &ic);
- end_divergent_if(ctx, &ic);
- }
-}
-
std::pair<Temp, Temp> ngg_gs_workgroup_reduce_and_scan(isel_context *ctx, Temp src_mask)
{
/* Workgroup scan for NGG GS.
@@ -11866,7 +11753,6 @@ void select_program(Program *program,
{
isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args, false);
if_context ic_merged_wave_info;
- bool ngg_no_gs = ctx.stage.hw == HWStage::NGG && !ctx.stage.has(SWStage::GS);
bool ngg_gs = ctx.stage.hw == HWStage::NGG && ctx.stage.has(SWStage::GS);
for (unsigned i = 0; i < shader_count; i++) {
@@ -11890,9 +11776,7 @@ void select_program(Program *program,
}
}
- if (ngg_no_gs)
- ngg_nogs_prelude(&ctx);
- else if (!i && ngg_gs)
+ if (!i && ngg_gs)
ngg_gs_prelude(&ctx);
/* In a merged VS+TCS HS, the VS implementation can be completely empty. */
@@ -11903,7 +11787,7 @@ void select_program(Program *program,
(nir->info.stage == MESA_SHADER_TESS_EVAL &&
ctx.stage == tess_eval_geometry_gs));
- bool check_merged_wave_info = ctx.tcs_in_out_eq ? i == 0 : ((shader_count >= 2 && !empty_shader) || ngg_no_gs);
+ bool check_merged_wave_info = ctx.tcs_in_out_eq ? i == 0 : (shader_count >= 2 && !empty_shader);
bool endif_merged_wave_info = ctx.tcs_in_out_eq ? i == 1 : check_merged_wave_info;
if (i && ngg_gs) {
@@ -11943,10 +11827,6 @@ void select_program(Program *program,
if (ctx.stage.hw == HWStage::VS) {
create_vs_exports(&ctx);
- } else if (ngg_no_gs) {
- create_vs_exports(&ctx);
- if (ctx.args->options->key.vs_common_out.export_prim_id && (ctx.ngg_nogs_early_prim_export || ctx.stage.has(SWStage::TES)))
- ngg_nogs_export_prim_id(&ctx);
} else if (nir->info.stage == MESA_SHADER_GEOMETRY && !ngg_gs) {
Builder bld(ctx.program, ctx.block);
bld.barrier(aco_opcode::p_barrier,
@@ -11963,9 +11843,7 @@ void select_program(Program *program,
end_divergent_if(&ctx, &ic_merged_wave_info);
}
- if (ngg_no_gs && !ctx.ngg_nogs_early_prim_export)
- ngg_nogs_late_export_finale(&ctx);
- else if (ngg_gs && nir->info.stage == MESA_SHADER_GEOMETRY)
+ if (ngg_gs && nir->info.stage == MESA_SHADER_GEOMETRY)
ngg_gs_finale(&ctx);
if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {
diff --git a/src/amd/compiler/aco_instruction_selection.h b/src/amd/compiler/aco_instruction_selection.h
index 93583ecc40c..0a5462c04c2 100644
--- a/src/amd/compiler/aco_instruction_selection.h
+++ b/src/amd/compiler/aco_instruction_selection.h
@@ -93,7 +93,6 @@ struct isel_context {
Temp persp_centroid, linear_centroid;
/* GS inputs */
- bool ngg_nogs_early_prim_export = false;
bool ngg_gs_early_alloc = false;
bool ngg_gs_known_vtxcnt[4] = {false, false, false, false};
Temp gs_wave_id;
diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp
index 292dd66ebff..2e44568f8eb 100644
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -390,15 +390,11 @@ setup_vs_variables(isel_context *ctx, nir_shader *nir)
/* TODO: NGG streamout */
if (ctx->stage.hw == HWStage::NGG)
assert(!ctx->args->shader_info->so.num_outputs);
-
- /* TODO: check if the shader writes edge flags (not in Vulkan) */
- ctx->ngg_nogs_early_prim_export = exec_list_is_singular(&nir_shader_get_entrypoint(nir)->body);
}
- if (ctx->stage == vertex_ngg && ctx->args->options->key.vs_common_out.export_prim_id) {
- /* We need to store the primitive IDs in LDS */
- unsigned lds_size = ctx->program->info->ngg_info.esgs_ring_size;
- ctx->program->config->lds_size = DIV_ROUND_UP(lds_size, ctx->program->dev.lds_encoding_granule);
+ if (ctx->stage == vertex_ngg) {
+ ctx->program->config->lds_size = DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule);
+ assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) < (32 * 1024));
}
}
@@ -463,8 +459,11 @@ setup_tes_variables(isel_context *ctx, nir_shader *nir)
/* TODO: NGG streamout */
if (ctx->stage.hw == HWStage::NGG)
assert(!ctx->args->shader_info->so.num_outputs);
+ }
- ctx->ngg_nogs_early_prim_export = exec_list_is_singular(&nir_shader_get_entrypoint(nir)->body);
+ if (ctx->stage == tess_eval_ngg) {
+ ctx->program->config->lds_size = DIV_ROUND_UP(nir->info.shared_size, ctx->program->dev.lds_encoding_granule);
+ assert((ctx->program->config->lds_size * ctx->program->dev.lds_encoding_granule) < (32 * 1024));
}
}
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 288c232cbbf..6e75338e7f1 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -3434,6 +3434,7 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device,
/* Lower I/O intrinsics to memory instructions. */
bool io_to_mem = radv_lower_io_to_mem(device, nir[i], &infos[i], pipeline_key);
+ bool lowered_ngg = radv_lower_ngg(device, nir[i], !!nir[MESA_SHADER_GEOMETRY], &infos[i], pipeline_key, &keys[i]);
/* optimize the lowered ALU operations */
bool more_algebraic = true;
@@ -3446,7 +3447,7 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device,
NIR_PASS(more_algebraic, nir[i], nir_opt_algebraic);
}
- if (io_to_mem || i == MESA_SHADER_COMPUTE)
+ if (io_to_mem || lowered_ngg || i == MESA_SHADER_COMPUTE)
NIR_PASS_V(nir[i], nir_opt_offsets);
/* Do late algebraic optimization to turn add(a,
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index 1ba5875e905..fd5780a55e8 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -810,6 +810,70 @@ radv_lower_io_to_mem(struct radv_device *device, struct nir_shader *nir,
return false;
}
+bool radv_lower_ngg(struct radv_device *device, struct nir_shader *nir, bool has_gs,
+ struct radv_shader_info *info,
+ const struct radv_pipeline_key *pl_key,
+ struct radv_shader_variant_key *key)
+{
+ /* TODO: support the LLVM backend with the NIR lowering */
+ if (radv_use_llvm_for_stage(device, nir->info.stage))
+ return false;
+
+ ac_nir_ngg_config out_conf = {0};
+ const struct gfx10_ngg_info *ngg_info = &info->ngg_info;
+ unsigned num_gs_invocations = (nir->info.stage != MESA_SHADER_GEOMETRY || ngg_info->max_vert_out_per_gs_instance) ? 1 : info->gs.invocations;
+ unsigned max_workgroup_size = MAX4(ngg_info->hw_max_esverts, /* Invocations that process an input vertex */
+ ngg_info->max_out_verts, /* Invocations that export an output vertex */
+ ngg_info->max_gsprims * num_gs_invocations, /* Invocations that process an input primitive */
+ ngg_info->max_gsprims * num_gs_invocations * ngg_info->prim_amp_factor /* Invocations that produce an output primitive */);
+
+ /* Maximum HW limit for NGG workgroups */
+ assert(max_workgroup_size <= 256);
+
+ if (nir->info.stage == MESA_SHADER_VERTEX ||
+ nir->info.stage == MESA_SHADER_TESS_EVAL) {
+ if (has_gs || !key->vs_common_out.as_ngg)
+ return false;
+
+ unsigned num_vertices_per_prim = 3;
+
+ if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
+ if (nir->info.tess.point_mode)
+ num_vertices_per_prim = 1;
+ else if (nir->info.tess.primitive_mode == GL_ISOLINES)
+ num_vertices_per_prim = 2;
+ } else if (nir->info.stage == MESA_SHADER_VERTEX) {
+ /* Need to add 1, because: V_028A6C_POINTLIST=0, V_028A6C_LINESTRIP=1, V_028A6C_TRISTRIP=2, etc. */
+ num_vertices_per_prim = key->vs.outprim + 1;
+ }
+
+ out_conf =
+ ac_nir_lower_ngg_nogs(
+ nir,
+ ngg_info->hw_max_esverts,
+ num_vertices_per_prim,
+ max_workgroup_size,
+ info->wave_size,
+ false,
+ key->vs_common_out.as_ngg_passthrough,
+ key->vs_common_out.export_prim_id,
+ key->vs.provoking_vtx_last);
+
+ info->is_ngg_passthrough = out_conf.passthrough;
+ key->vs_common_out.as_ngg_passthrough = out_conf.passthrough;
+ } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
+ if (!key->vs_common_out.as_ngg)
+ return false;
+
+ /* TODO: lower NGG GS in NIR */
+ return false;
+ } else {
+ return false;
+ }
+
+ return true;
+}
+
static void *
radv_alloc_shader_memory(struct radv_device *device, struct radv_shader_variant *shader)
{
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index ba360ffa88c..712d1ad6ffa 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -556,4 +556,9 @@ void radv_lower_io(struct radv_device *device, nir_shader *nir);
bool radv_lower_io_to_mem(struct radv_device *device, struct nir_shader *nir,
struct radv_shader_info *info, const struct radv_pipeline_key *pl_key);
+bool radv_lower_ngg(struct radv_device *device, struct nir_shader *nir, bool has_gs,
+ struct radv_shader_info *info,
+ const struct radv_pipeline_key *pl_key,
+ struct radv_shader_variant_key *key);
+
#endif
More information about the mesa-commit
mailing list