Mesa (main): aco: Allow elect to take advantage of knowing when all lanes are active.

Fri Jul 16 14:48:33 UTC 2021

Module: Mesa
Branch: main
Commit: e66f54e5c83fd545e1a4062e683b584a35dacc00
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=e66f54e5c83fd545e1a4062e683b584a35dacc00

Author: Timur Kristóf <timur.kristof at gmail.com>
Date:   Fri Jun 18 15:21:43 2021 +0200

aco: Allow elect to take advantage of knowing when all lanes are active.

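Implement elect using a pseudo-op (p_elect) which is lowered during
the insert_exec_mask pass. This makes it possible to emit a better
instruction sequence when the exec mask is known to be constant with
all lanes enabled: the result is then a copy of the constant 1
instead of s_ff1_i32 followed by s_lshl.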

Fossil DB results on Sienna Cichlid:
Totals from 211 (0.16% of 128647) affected shaders:
CodeSize: 2254356 -> 2240468 (-0.62%); split: -0.62%, +0.00%
Instrs: 438471 -> 434996 (-0.79%); split: -0.80%, +0.01%
Latency: 2717082 -> 2709400 (-0.28%); split: -0.28%, +0.00%
InvThroughput: 566987 -> 566342 (-0.11%); split: -0.11%, +0.00%
Copies: 40058 -> 40162 (+0.26%)
Branches: 31209 -> 31211 (+0.01%)
PreSGPRs: 9927 -> 10125 (+1.99%)

Signed-off-by: Timur Kristóf <timur.kristof at gmail.com>
Reviewed-by: Daniel Schürmann <daniel at schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11458>

---

 src/amd/compiler/aco_insert_exec_mask.cpp      | 14 ++++++++++++++
 src/amd/compiler/aco_instruction_selection.cpp |  7 +++----
 src/amd/compiler/aco_opcodes.py                |  3 +++
 3 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp
index fb4ad4fc871..56e7045b852 100644
--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@@ -813,6 +813,20 @@ process_instructions(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instructio
          instr->opcode = aco_opcode::p_exit_early_if;
          instr->operands[0] = bld.scc(exit_cond);
          state = Exact;
+
+      } else if (instr->opcode == aco_opcode::p_elect) {
+         bool all_lanes_enabled = ctx.info[block->index].exec.back().first.constantEquals(-1u);
+         Definition dst = instr->definitions[0];
+
+         if (all_lanes_enabled) {
+            bld.copy(Definition(dst), Operand::c32_or_c64(1u, dst.size() == 2));
+         } else {
+            Temp first_lane_idx = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
+            bld.sop2(Builder::s_lshl, Definition(dst), bld.def(s1, scc),
+                     Operand::c32_or_c64(1u, dst.size() == 2), Operand(first_lane_idx));
+         }
+         instr.reset();
+         continue;
       }
 
       bld.insert(std::move(instr));
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 3986850142d..70bf5e8962f 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -8722,10 +8722,9 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
       break;
    }
    case nir_intrinsic_elect: {
-      Temp first = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
-      emit_wqm(
-         bld, bld.sop2(Builder::s_lshl, bld.def(bld.lm), bld.def(s1, scc), Operand::c32(1u), first),
-         get_ssa_temp(ctx, &instr->dest.ssa));
+      Temp elected = bld.pseudo(aco_opcode::p_elect, bld.def(bld.lm));
+      emit_wqm(bld, elected, get_ssa_temp(ctx, &instr->dest.ssa));
+      ctx->block->kind |= block_kind_needs_lowering;
       break;
    }
    case nir_intrinsic_shader_clock: {
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py
index d6d796b195d..d9ab6a435ef 100644
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@@ -319,6 +319,9 @@ opcode("p_exit_early_if")
 # simulates proper bpermute behavior when it's unsupported, eg. GFX10 wave64
 opcode("p_bpermute")
 
+# creates a lane mask where only the first active lane is selected
+opcode("p_elect")
+
 opcode("p_constaddr")
 
 # These don't have to be pseudo-ops, but it makes optimization easier to only