Mesa (master): aco: Transform uniform bitwise instructions to 32-bit if possible.

Fri Jan 24 15:29:54 UTC 2020

Module: Mesa
Branch: master
Commit: 8a32f57fff56b3b94f1b5589feba38016f39427c
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=8a32f57fff56b3b94f1b5589feba38016f39427c

Author: Timur Kristóf <timur.kristof at gmail.com>
Date:   Thu Jan 16 19:32:31 2020 +0100

aco: Transform uniform bitwise instructions to 32-bit if possible.

This allows removing superfluous s_cselect instructions
that come from turning booleans into 64-bit vector condition.

v2 by Daniel Schürmann:
- Make the code massively simpler
v3 by Timur Kristóf:
- Fix regressions, make it work in wave32 mode
- Eliminate extra moves by not always using the SCC definition
- Use s_absdiff_i32 for uniform XOR
- Skip the transformation for uncommon or invalid instructions

Signed-off-by: Timur Kristóf <timur.kristof at gmail.com>
Reviewed-by: Daniel Schürmann <daniel at schuermann.dev>
Tested-by: Marge Bot <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3450>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3450>

---

 src/amd/compiler/aco_optimizer.cpp | 87 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)

diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index 1088d8aec9e..73b9c653a64 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -86,6 +86,7 @@ enum Label {
    label_uniform_bitwise = 1 << 23,
    label_scc_invert = 1 << 24,
    label_vcc_hint = 1 << 25,
+   label_scc_needed = 1 << 26,
 };
 
 static constexpr uint32_t instr_labels = label_vec | label_mul | label_mad | label_omod_success | label_clamp_success |
@@ -384,6 +385,16 @@ struct ssa_info {
       return label & label_fcmp;
    }
 
+   void set_scc_needed()
+   {
+      add_label(label_scc_needed);
+   }
+
+   bool is_scc_needed()
+   {
+      return label & label_scc_needed;
+   }
+
    void set_scc_invert(Temp scc_inv)
    {
       add_label(label_scc_invert);
@@ -2458,6 +2469,53 @@ void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr
    }
 }
 
+bool to_uniform_bool_instr(opt_ctx &ctx, aco_ptr<Instruction> &instr)
+{
+   switch (instr->opcode) {
+      case aco_opcode::s_and_b32:
+      case aco_opcode::s_and_b64:
+         instr->opcode = aco_opcode::s_and_b32;
+         break;
+      case aco_opcode::s_or_b32:
+      case aco_opcode::s_or_b64:
+         instr->opcode = aco_opcode::s_or_b32;
+         break;
+      case aco_opcode::s_xor_b32:
+      case aco_opcode::s_xor_b64:
+         instr->opcode = aco_opcode::s_absdiff_i32;
+         break;
+      default:
+         /* Don't transform other instructions. They are very unlikely to appear here. */
+         return false;
+   }
+
+   for (Operand &op : instr->operands) {
+      ctx.uses[op.tempId()]--;
+
+      if (ctx.info[op.tempId()].is_uniform_bool()) {
+         /* Just use the uniform boolean temp. */
+         op.setTemp(ctx.info[op.tempId()].temp);
+      } else if (ctx.info[op.tempId()].is_uniform_bitwise()) {
+         /* Use the SCC definition of the predecessor instruction.
+          * This allows the predecessor to get picked up by the same optimization (if it has no divergent users),
+          * and it also makes sure that the current instruction will keep working even if the predecessor won't be transformed.
+          */
+         Instruction *pred_instr = ctx.info[op.tempId()].instr;
+         assert(pred_instr->definitions.size() >= 2);
+         assert(pred_instr->definitions[1].isFixed() && pred_instr->definitions[1].physReg() == scc);
+         op.setTemp(pred_instr->definitions[1].getTemp());
+      } else {
+         unreachable("Invalid operand on uniform bitwise instruction.");
+      }
+
+      ctx.uses[op.tempId()]++;
+   }
+
+   instr->definitions[0].setTemp(Temp(instr->definitions[0].tempId(), s1));
+   assert(instr->operands[0].regClass() == s1);
+   assert(instr->operands[1].regClass() == s1);
+   return true;
+}
 
 void select_instruction(opt_ctx &ctx, aco_ptr<Instruction>& instr)
 {
@@ -2571,10 +2629,39 @@ void select_instruction(opt_ctx &ctx, aco_ptr<Instruction>& instr)
       }
    }
 
+   /* Mark SCC needed, so the uniform boolean transformation won't swap the definitions when it isn't beneficial */
+   if (instr->format == Format::PSEUDO_BRANCH &&
+       instr->operands.size() &&
+       instr->operands[0].isTemp()) {
+      ctx.info[instr->operands[0].tempId()].set_scc_needed();
+      return;
+   } else if ((instr->opcode == aco_opcode::s_cselect_b64 ||
+               instr->opcode == aco_opcode::s_cselect_b32) &&
+              instr->operands[2].isTemp()) {
+      ctx.info[instr->operands[2].tempId()].set_scc_needed();
+   }
+
    /* check for literals */
    if (!instr->isSALU() && !instr->isVALU())
       return;
 
+   /* Transform uniform bitwise boolean operations to 32-bit when there are no divergent uses. */
+   if (instr->definitions.size() &&
+       ctx.uses[instr->definitions[0].tempId()] == 0 &&
+       ctx.info[instr->definitions[0].tempId()].is_uniform_bitwise()) {
+      bool transform_done = to_uniform_bool_instr(ctx, instr);
+
+      if (transform_done && !ctx.info[instr->definitions[1].tempId()].is_scc_needed()) {
+         /* Swap the two definition IDs in order to avoid overusing the SCC. This reduces extra moves generated by RA. */
+         uint32_t def0_id = instr->definitions[0].getTemp().id();
+         uint32_t def1_id = instr->definitions[1].getTemp().id();
+         instr->definitions[0].setTemp(Temp(def1_id, s1));
+         instr->definitions[1].setTemp(Temp(def0_id, s1));
+      }
+
+      return;
+   }
+
    if (instr->isSDWA() || instr->isDPP() || (instr->isVOP3() && ctx.program->chip_class < GFX10))
       return; /* some encodings can't ever take literals */