Mesa (master): aco: optimize boolean phis with uniform selections

Fri Jul 10 22:47:16 UTC 2020

Module: Mesa
Branch: master
Commit: 9a089baff1af757b1c0f033f4bb16cb2c8864271
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=9a089baff1af757b1c0f033f4bb16cb2c8864271

Author: Rhys Perry <pendingchaos02 at gmail.com>
Date:   Thu Jan  9 16:51:34 2020 +0000

aco: optimize boolean phis with uniform selections

Even though the value of a boolean phi can be divergent, the control flow
that selects between its operands can be (at least partially) uniform. For
example, we don't have to create any s_andn2_b64/s_and_b64/s_or_b64
instructions with this code:
a = ...
loop {
    b = bool_phi a, c
    if (uniform)
        break
    c = ...
}
d = phi c

fossil-db (Navi):
Totals from 5506 (4.05% of 135946) affected shaders:
SGPRs: 605720 -> 604024 (-0.28%)
SpillSGPRs: 52025 -> 51733 (-0.56%)
CodeSize: 65221188 -> 64957808 (-0.40%); split: -0.41%, +0.00%
Instrs: 12637881 -> 12584610 (-0.42%); split: -0.42%, +0.00%

Signed-off-by: Rhys Perry <pendingchaos02 at gmail.com>
Reviewed-by: Daniel Schürmann <daniel at schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3388>

---

 src/amd/compiler/aco_lower_phis.cpp | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/src/amd/compiler/aco_lower_phis.cpp b/src/amd/compiler/aco_lower_phis.cpp
index a7d2b6dce72..ad0c620a69f 100644
--- a/src/amd/compiler/aco_lower_phis.cpp
+++ b/src/amd/compiler/aco_lower_phis.cpp
@@ -35,6 +35,9 @@
 namespace aco {
 
 struct ssa_state {
+   bool checked_preds_for_uniform;
+   bool all_preds_uniform;
+
    bool needs_init;
    uint64_t cur_undef_operands;
 
@@ -152,6 +155,19 @@ void lower_divergent_bool_phi(Program *program, ssa_state *state, Block *block,
 {
    Builder bld(program);
 
+   if (!state->checked_preds_for_uniform) {
+      state->all_preds_uniform = !(block->kind & block_kind_merge);
+      for (unsigned pred : block->logical_preds)
+         state->all_preds_uniform = state->all_preds_uniform && (program->blocks[pred].kind & block_kind_uniform);
+      state->checked_preds_for_uniform = true;
+   }
+
+   if (state->all_preds_uniform) {
+      assert(block->logical_preds.size() == block->linear_preds.size());
+      phi->opcode = aco_opcode::p_linear_phi;
+      return;
+   }
+
    state->latest.resize(program->blocks.size());
 
    uint64_t undef_operands = 0;
@@ -180,14 +196,23 @@ void lower_divergent_bool_phi(Program *program, ssa_state *state, Block *block,
       state->writes[block->logical_preds[i]] = program->allocateId();
    }
 
+   bool uniform_merge = block->kind & block_kind_loop_header;
+
    for (unsigned i = 0; i < phi->operands.size(); i++) {
       Block *pred = &program->blocks[block->logical_preds[i]];
 
+      bool need_get_ssa = !uniform_merge;
+      if (block->kind & block_kind_loop_header && !(pred->kind & block_kind_uniform))
+         uniform_merge = false;
+
       if (phi->operands[i].isUndefined())
          continue;
 
-      Operand cur = get_ssa(program, pred->index, state, true);
+      Operand cur(bld.lm);
+      if (need_get_ssa)
+         cur = get_ssa(program, pred->index, state, true);
       assert(cur.regClass() == bld.lm);
+
       Temp new_cur = {state->writes.at(pred->index), program->lane_mask};
       assert(new_cur.regClass() == bld.lm);
 
@@ -241,6 +266,7 @@ void lower_phis(Program* program)
    ssa_state state;
 
    for (Block& block : program->blocks) {
+      state.checked_preds_for_uniform = false;
       state.needs_init = true;
       for (aco_ptr<Instruction>& phi : block.instructions) {
          if (phi->opcode == aco_opcode::p_phi) {