Mesa (main): aco: Allow elect to take advantage of knowing when all lanes are active.
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Fri Jul 16 14:48:33 UTC 2021
Module: Mesa
Branch: main
Commit: e66f54e5c83fd545e1a4062e683b584a35dacc00
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=e66f54e5c83fd545e1a4062e683b584a35dacc00
Author: Timur Kristóf <timur.kristof at gmail.com>
Date: Fri Jun 18 15:21:43 2021 +0200
aco: Allow elect to take advantage of knowing when all lanes are active.
Implement elect using a pseudo-op which is lowered during the
insert_exec_mask pass. This makes it possible to emit a more
efficient sequence when the exec mask is known to be constant
with all lanes active.
Fossil DB results on Sienna Cichlid:
Totals from 211 (0.16% of 128647) affected shaders:
CodeSize: 2254356 -> 2240468 (-0.62%); split: -0.62%, +0.00%
Instrs: 438471 -> 434996 (-0.79%); split: -0.80%, +0.01%
Latency: 2717082 -> 2709400 (-0.28%); split: -0.28%, +0.00%
InvThroughput: 566987 -> 566342 (-0.11%); split: -0.11%, +0.00%
Copies: 40058 -> 40162 (+0.26%)
Branches: 31209 -> 31211 (+0.01%)
PreSGPRs: 9927 -> 10125 (+1.99%)
Signed-off-by: Timur Kristóf <timur.kristof at gmail.com>
Reviewed-by: Daniel Schürmann <daniel at schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11458>
---
src/amd/compiler/aco_insert_exec_mask.cpp | 14 ++++++++++++++
src/amd/compiler/aco_instruction_selection.cpp | 7 +++----
src/amd/compiler/aco_opcodes.py | 3 +++
3 files changed, 20 insertions(+), 4 deletions(-)
diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp
index fb4ad4fc871..56e7045b852 100644
--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@@ -813,6 +813,20 @@ process_instructions(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instructio
instr->opcode = aco_opcode::p_exit_early_if;
instr->operands[0] = bld.scc(exit_cond);
state = Exact;
+
+ } else if (instr->opcode == aco_opcode::p_elect) {
+ bool all_lanes_enabled = ctx.info[block->index].exec.back().first.constantEquals(-1u);
+ Definition dst = instr->definitions[0];
+
+ if (all_lanes_enabled) {
+ bld.copy(Definition(dst), Operand::c32_or_c64(1u, dst.size() == 2));
+ } else {
+ Temp first_lane_idx = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
+ bld.sop2(Builder::s_lshl, Definition(dst), bld.def(s1, scc),
+ Operand::c32_or_c64(1u, dst.size() == 2), Operand(first_lane_idx));
+ }
+ instr.reset();
+ continue;
}
bld.insert(std::move(instr));
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 3986850142d..70bf5e8962f 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -8722,10 +8722,9 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
break;
}
case nir_intrinsic_elect: {
- Temp first = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
- emit_wqm(
- bld, bld.sop2(Builder::s_lshl, bld.def(bld.lm), bld.def(s1, scc), Operand::c32(1u), first),
- get_ssa_temp(ctx, &instr->dest.ssa));
+ Temp elected = bld.pseudo(aco_opcode::p_elect, bld.def(bld.lm));
+ emit_wqm(bld, elected, get_ssa_temp(ctx, &instr->dest.ssa));
+ ctx->block->kind |= block_kind_needs_lowering;
break;
}
case nir_intrinsic_shader_clock: {
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py
index d6d796b195d..d9ab6a435ef 100644
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@@ -319,6 +319,9 @@ opcode("p_exit_early_if")
# simulates proper bpermute behavior when it's unsupported, eg. GFX10 wave64
opcode("p_bpermute")
+# creates a lane mask where only the first active lane is selected
+opcode("p_elect")
+
opcode("p_constaddr")
# These don't have to be pseudo-ops, but it makes optimization easier to only
More information about the mesa-commit mailing list