Mesa (main): aco: make optimize_postRA() work across blocks

Thu Aug 19 18:36:04 UTC 2021

Module: Mesa
Branch: main
Commit: 2e56e2342094e8ec90afa5265b1c43503f662939
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=2e56e2342094e8ec90afa5265b1c43503f662939

Author: Rhys Perry <pendingchaos02 at gmail.com>
Date:   Thu Jul  8 17:43:37 2021 +0100

aco: make optimize_postRA() work across blocks

fossil-db (Sienna Cichlid):
Totals from 46 (0.03% of 150170) affected shaders:
CodeSize: 103672 -> 103488 (-0.18%)
Instrs: 21968 -> 21922 (-0.21%)

Signed-off-by: Rhys Perry <pendingchaos02 at gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11924>

---

 src/amd/compiler/aco_optimizer_postRA.cpp | 109 +++++++++++++++++++++---------
 1 file changed, 76 insertions(+), 33 deletions(-)

diff --git a/src/amd/compiler/aco_optimizer_postRA.cpp b/src/amd/compiler/aco_optimizer_postRA.cpp
index d086eff7cef..e612292e822 100644
--- a/src/amd/compiler/aco_optimizer_postRA.cpp
+++ b/src/amd/compiler/aco_optimizer_postRA.cpp
@@ -34,26 +34,65 @@ namespace {
 
 constexpr const size_t max_reg_cnt = 512;
 
-enum {
-   not_written_in_block = -1,
-   clobbered = -2,
-   const_or_undef = -3,
-   written_by_multiple_instrs = -4,
+struct Idx {
+   bool operator==(const Idx& other) const { return block == other.block && instr == other.instr; }
+   bool operator!=(const Idx& other) const { return !operator==(other); }
+
+   bool found() const { return block != UINT32_MAX; }
+
+   uint32_t block;
+   uint32_t instr;
 };
 
+Idx not_written_in_block{UINT32_MAX, 0};
+Idx clobbered{UINT32_MAX, 1};
+Idx const_or_undef{UINT32_MAX, 2};
+Idx written_by_multiple_instrs{UINT32_MAX, 3};
+
+bool
+is_instr_after(Idx second, Idx first)
+{
+   if (first == not_written_in_block && second != not_written_in_block)
+      return true;
+
+   if (!first.found() || !second.found())
+      return false;
+
+   return second.block > first.block || (second.block == first.block && second.instr > first.instr);
+}
+
 struct pr_opt_ctx {
    Program* program;
    Block* current_block;
-   int current_instr_idx;
+   uint32_t current_instr_idx;
    std::vector<uint16_t> uses;
-   std::array<int, max_reg_cnt * 4u> instr_idx_by_regs;
+   std::vector<std::array<Idx, max_reg_cnt>> instr_idx_by_regs;
 
    void reset_block(Block* block)
    {
       current_block = block;
-      current_instr_idx = -1;
-      std::fill(instr_idx_by_regs.begin(), instr_idx_by_regs.end(), not_written_in_block);
+      current_instr_idx = 0;
+
+      if ((block->kind & block_kind_loop_header) || block->linear_preds.empty()) {
+         std::fill(instr_idx_by_regs[block->index].begin(), instr_idx_by_regs[block->index].end(),
+                   not_written_in_block);
+      } else {
+         unsigned first_pred = block->linear_preds[0];
+         for (unsigned i = 0; i < max_reg_cnt; i++) {
+            bool all_same = std::all_of(
+               std::next(block->linear_preds.begin()), block->linear_preds.end(),
+               [&](unsigned pred)
+               { return instr_idx_by_regs[pred][i] == instr_idx_by_regs[first_pred][i]; });
+
+            if (all_same)
+               instr_idx_by_regs[block->index][i] = instr_idx_by_regs[first_pred][i];
+            else
+               instr_idx_by_regs[block->index][i] = not_written_in_block;
+         }
+      }
    }
+
+   Instruction* get(Idx idx) { return program->blocks[idx.block].instructions[idx.instr].get(); }
 };
 
 void
@@ -65,36 +104,38 @@ save_reg_writes(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
 
       unsigned dw_size = DIV_ROUND_UP(def.bytes(), 4u);
       unsigned r = def.physReg().reg();
-      int idx = ctx.current_instr_idx;
+      Idx idx{ctx.current_block->index, ctx.current_instr_idx};
 
       if (def.regClass().is_subdword())
          idx = clobbered;
 
       assert(def.size() == dw_size || def.regClass().is_subdword());
-      std::fill(&ctx.instr_idx_by_regs[r], &ctx.instr_idx_by_regs[r + dw_size], idx);
+      std::fill(&ctx.instr_idx_by_regs[ctx.current_block->index][r],
+                &ctx.instr_idx_by_regs[ctx.current_block->index][r + dw_size], idx);
    }
 }
 
-int
+Idx
 last_writer_idx(pr_opt_ctx& ctx, PhysReg physReg, RegClass rc)
 {
    /* Verify that all of the operand's registers are written by the same instruction. */
-   int instr_idx = ctx.instr_idx_by_regs[physReg.reg()];
+   Idx instr_idx = ctx.instr_idx_by_regs[ctx.current_block->index][physReg.reg()];
    unsigned dw_size = DIV_ROUND_UP(rc.bytes(), 4u);
    unsigned r = physReg.reg();
-   bool all_same = std::all_of(&ctx.instr_idx_by_regs[r], &ctx.instr_idx_by_regs[r + dw_size],
-                               [instr_idx](int i) { return i == instr_idx; });
+   bool all_same = std::all_of(&ctx.instr_idx_by_regs[ctx.current_block->index][r],
+                               &ctx.instr_idx_by_regs[ctx.current_block->index][r + dw_size],
+                               [instr_idx](Idx i) { return i == instr_idx; });
 
    return all_same ? instr_idx : written_by_multiple_instrs;
 }
 
-int
+Idx
 last_writer_idx(pr_opt_ctx& ctx, const Operand& op)
 {
    if (op.isConstant() || op.isUndefined())
       return const_or_undef;
 
-   int instr_idx = ctx.instr_idx_by_regs[op.physReg().reg()];
+   Idx instr_idx = ctx.instr_idx_by_regs[ctx.current_block->index][op.physReg().reg()];
 
 #ifndef NDEBUG
    /* Debug mode:  */
@@ -129,21 +170,22 @@ try_apply_branch_vcc(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
        instr->operands[0].physReg() != scc)
       return;
 
-   int op0_instr_idx = last_writer_idx(ctx, instr->operands[0]);
-   int last_vcc_wr_idx = last_writer_idx(ctx, vcc, ctx.program->lane_mask);
-   int last_exec_wr_idx = last_writer_idx(ctx, exec, ctx.program->lane_mask);
+   Idx op0_instr_idx = last_writer_idx(ctx, instr->operands[0]);
+   Idx last_vcc_wr_idx = last_writer_idx(ctx, vcc, ctx.program->lane_mask);
+   Idx last_exec_wr_idx = last_writer_idx(ctx, exec, ctx.program->lane_mask);
 
    /* We need to make sure:
     * - the operand register used by the branch, and VCC were both written in the current block
     * - VCC was NOT written after the operand register
     * - EXEC is sane and was NOT written after the operand register
     */
-   if (op0_instr_idx < 0 || last_vcc_wr_idx < 0 || last_vcc_wr_idx > op0_instr_idx ||
-       last_exec_wr_idx > last_vcc_wr_idx || last_exec_wr_idx < not_written_in_block)
+   if (!op0_instr_idx.found() || !last_vcc_wr_idx.found() ||
+       !is_instr_after(last_vcc_wr_idx, last_exec_wr_idx) ||
+       !is_instr_after(op0_instr_idx, last_vcc_wr_idx))
       return;
 
-   aco_ptr<Instruction>& op0_instr = ctx.current_block->instructions[op0_instr_idx];
-   aco_ptr<Instruction>& last_vcc_wr = ctx.current_block->instructions[last_vcc_wr_idx];
+   Instruction* op0_instr = ctx.get(op0_instr_idx);
+   Instruction* last_vcc_wr = ctx.get(last_vcc_wr_idx);
 
    if ((op0_instr->opcode != aco_opcode::s_and_b64 /* wave64 */ &&
         op0_instr->opcode != aco_opcode::s_and_b32 /* wave32 */) ||
@@ -192,12 +234,12 @@ try_optimize_scc_nocompare(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
          return;
 
       /* Make sure both SCC and Operand 0 are written by the same instruction. */
-      int wr_idx = last_writer_idx(ctx, instr->operands[0]);
-      int sccwr_idx = last_writer_idx(ctx, scc, s1);
-      if (wr_idx < 0 || wr_idx != sccwr_idx)
+      Idx wr_idx = last_writer_idx(ctx, instr->operands[0]);
+      Idx sccwr_idx = last_writer_idx(ctx, scc, s1);
+      if (!wr_idx.found() || wr_idx != sccwr_idx)
          return;
 
-      aco_ptr<Instruction>& wr_instr = ctx.current_block->instructions[wr_idx];
+      Instruction* wr_instr = ctx.get(wr_idx);
       if (!wr_instr->isSALU() || wr_instr->definitions.size() < 2 ||
           wr_instr->definitions[1].physReg() != scc)
          return;
@@ -259,11 +301,11 @@ try_optimize_scc_nocompare(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
          scc_op_idx = 2;
       }
 
-      int wr_idx = last_writer_idx(ctx, instr->operands[scc_op_idx]);
-      if (wr_idx < 0)
+      Idx wr_idx = last_writer_idx(ctx, instr->operands[scc_op_idx]);
+      if (!wr_idx.found())
          return;
 
-      aco_ptr<Instruction>& wr_instr = ctx.current_block->instructions[wr_idx];
+      Instruction* wr_instr = ctx.get(wr_idx);
 
       /* Check if we found the pattern above. */
       if (wr_instr->opcode != aco_opcode::s_cmp_eq_u32 &&
@@ -299,14 +341,14 @@ try_optimize_scc_nocompare(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
 void
 process_instruction(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
 {
-   ctx.current_instr_idx++;
-
    try_apply_branch_vcc(ctx, instr);
 
    try_optimize_scc_nocompare(ctx, instr);
 
    if (instr)
       save_reg_writes(ctx, instr);
+
+   ctx.current_instr_idx++;
 }
 
 } // namespace
@@ -317,6 +359,7 @@ optimize_postRA(Program* program)
    pr_opt_ctx ctx;
    ctx.program = program;
    ctx.uses = dead_code_analysis(program);
+   ctx.instr_idx_by_regs.resize(program->blocks.size());
 
    /* Forward pass
     * Goes through each instruction exactly once, and can transform