Mesa (main): aco, nir, ac: Simplify sequence of getting initial NGG VS edge flags.
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Mon Aug 2 11:58:52 UTC 2021
Module: Mesa
Branch: main
Commit: 1bbea90f50947f208d20a725e8043a73ce7494c2
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=1bbea90f50947f208d20a725e8043a73ce7494c2
Author: Timur Kristóf <timur.kristof at gmail.com>
Date: Thu Jul 15 13:56:18 2021 +0200
aco, nir, ac: Simplify sequence of getting initial NGG VS edge flags.
Instead of emitting v_bfe + v_lshl_or for each vertex, get the edge flags
of all 3 vertices at once. This takes fewer VALU instructions than
before.
Fossil DB results on Sienna Cichlid (with NGGC on):
Totals from 56917 (44.24% of 128647) affected shaders:
CodeSize: 161028288 -> 158751628 (-1.41%)
Instrs: 30917985 -> 30519571 (-1.29%)
Latency: 130617204 -> 129975532 (-0.49%); split: -0.50%, +0.01%
InvThroughput: 21280238 -> 20927401 (-1.66%)
Copies: 3011120 -> 3011125 (+0.00%); split: -0.00%, +0.00%
No Fossil DB changes with NGGC off.
Signed-off-by: Timur Kristóf <timur.kristof at gmail.com>
Reviewed-by: Daniel Schürmann <daniel at schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11908>
---
src/amd/common/ac_nir_lower_ngg.c | 13 ++++---------
src/amd/compiler/aco_instruction_selection.cpp | 12 +++++++-----
src/amd/compiler/aco_instruction_selection_setup.cpp | 2 +-
src/compiler/nir/nir_divergence_analysis.c | 2 +-
src/compiler/nir/nir_intrinsics.py | 4 ++--
5 files changed, 15 insertions(+), 18 deletions(-)
diff --git a/src/amd/common/ac_nir_lower_ngg.c b/src/amd/common/ac_nir_lower_ngg.c
index 2d35d65bbec..c15fac1aee3 100644
--- a/src/amd/common/ac_nir_lower_ngg.c
+++ b/src/amd/common/ac_nir_lower_ngg.c
@@ -267,18 +267,13 @@ static nir_ssa_def *
emit_pack_ngg_prim_exp_arg(nir_builder *b, unsigned num_vertices_per_primitives,
nir_ssa_def *vertex_indices[3], nir_ssa_def *is_null_prim)
{
- nir_ssa_def *arg = vertex_indices[0];
+ nir_ssa_def *arg = b->shader->info.stage == MESA_SHADER_VERTEX
+ ? nir_build_load_initial_edgeflags_amd(b)
+ : nir_imm_int(b, 0);
for (unsigned i = 0; i < num_vertices_per_primitives; ++i) {
assert(vertex_indices[i]);
-
- if (i)
- arg = nir_ior(b, arg, nir_ishl(b, vertex_indices[i], nir_imm_int(b, 10u * i)));
-
- if (b->shader->info.stage == MESA_SHADER_VERTEX) {
- nir_ssa_def *edgeflag = nir_build_load_initial_edgeflag_amd(b, 32, nir_imm_int(b, i));
- arg = nir_ior(b, arg, nir_ishl(b, edgeflag, nir_imm_int(b, 10u * i + 9u)));
- }
+ arg = nir_ior(b, arg, nir_ishl(b, vertex_indices[i], nir_imm_int(b, 10u * i)));
}
if (is_null_prim) {
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index bfed664b09b..57793000320 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -8938,14 +8938,16 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
Operand::c32(pos | (9u << 16u)));
break;
}
- case nir_intrinsic_load_initial_edgeflag_amd: {
+ case nir_intrinsic_load_initial_edgeflags_amd: {
assert(ctx->stage.hw == HWStage::NGG);
- assert(nir_src_is_const(instr->src[0]));
- unsigned i = nir_src_as_uint(instr->src[0]);
Temp gs_invocation_id = get_arg(ctx, ctx->args->ac.gs_invocation_id);
- bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
- gs_invocation_id, Operand::c32(8u + i), Operand::c32(1u));
+ /* Get initial edgeflags for each vertex at bits 8, 9, 10 of gs_invocation_id. */
+ Temp flags = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x700u), gs_invocation_id);
+ /* Move the bits to their desired position: 8->9, 9->19, 10->29. */
+ flags = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), Operand::c32(0x80402u), flags);
+ /* Remove garbage bits that are a byproduct of the multiplication. */
+ bld.vop2(aco_opcode::v_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand::c32(0x20080200), flags);
break;
}
case nir_intrinsic_load_packed_passthrough_primitive_amd: {
diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp
index af3508ea62c..f86a5a5a331 100644
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -766,7 +766,7 @@ init_context(isel_context* ctx, nir_shader* shader)
case nir_intrinsic_load_buffer_amd:
case nir_intrinsic_load_tess_rel_patch_id_amd:
case nir_intrinsic_load_gs_vertex_offset_amd:
- case nir_intrinsic_load_initial_edgeflag_amd:
+ case nir_intrinsic_load_initial_edgeflags_amd:
case nir_intrinsic_load_packed_passthrough_primitive_amd:
case nir_intrinsic_gds_atomic_add_amd:
case nir_intrinsic_load_sbt_amd:
diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c
index 7eb8d537084..9aecf8aee11 100644
--- a/src/compiler/nir/nir_divergence_analysis.c
+++ b/src/compiler/nir/nir_divergence_analysis.c
@@ -513,7 +513,7 @@ visit_intrinsic(nir_shader *shader, nir_intrinsic_instr *instr)
case nir_intrinsic_has_input_vertex_amd:
case nir_intrinsic_has_input_primitive_amd:
case nir_intrinsic_load_packed_passthrough_primitive_amd:
- case nir_intrinsic_load_initial_edgeflag_amd:
+ case nir_intrinsic_load_initial_edgeflags_amd:
case nir_intrinsic_gds_atomic_add_amd:
is_divergent = true;
break;
diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index 3d2a93236e3..13314a8005c 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -1196,8 +1196,8 @@ intrinsic("load_cull_small_primitives_enabled_amd", dest_comp=1, bit_sizes=[1],
intrinsic("load_cull_any_enabled_amd", dest_comp=1, bit_sizes=[1], flags=[CAN_ELIMINATE])
# Small primitive culling precision
intrinsic("load_cull_small_prim_precision_amd", dest_comp=1, bit_sizes=[32], flags=[CAN_ELIMINATE, CAN_REORDER])
-# Initial edge flag in a Vertex Shader. src = {vertex index}.
-intrinsic("load_initial_edgeflag_amd", src_comp=[1], dest_comp=1, indices=[])
+# Initial edge flags in a Vertex Shader, packed into the format the HW needs for primitive export.
+intrinsic("load_initial_edgeflags_amd", src_comp=[], dest_comp=1, bit_sizes=[32], indices=[])
# Exports the current invocation's vertex. This is a placeholder where all vertex attribute export instructions should be emitted.
intrinsic("export_vertex_amd", src_comp=[], indices=[])
# Exports the current invocation's primitive. src[] = {packed_primitive_data}.
More information about the mesa-commit
mailing list