Mesa (main): aco: handle DPP in the optimizer

Thu Aug 19 18:36:04 UTC 2021

Module: Mesa
Branch: main
Commit: b97cfd72afc2f693a21e8b662dc965fc8e3b63fb
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=b97cfd72afc2f693a21e8b662dc965fc8e3b63fb

Author: Rhys Perry <pendingchaos02 at gmail.com>
Date:   Mon Jul 19 14:26:42 2021 +0100

aco: handle DPP in the optimizer

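Several optimizations are broken when DPP is involved. Carry the DPP
fields over in combine_inverse_comparison() and skip the remaining
combines when one of the instructions uses DPP.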

fossil-db (Sienna Cichlid):
Totals from 100 (0.07% of 150170) affected shaders:
CodeSize: 325204 -> 325192 (-0.00%); split: -0.06%, +0.05%
Instrs: 62773 -> 62664 (-0.17%); split: -0.18%, +0.00%
Latency: 295348 -> 295266 (-0.03%); split: -0.03%, +0.00%
InvThroughput: 73990 -> 73946 (-0.06%); split: -0.06%, +0.01%
Copies: 1650 -> 1609 (-2.48%); split: -2.55%, +0.06%
PreSGPRs: 3554 -> 3520 (-0.96%)

The fossil-db changes are probably because v_sub_f32_dpp(v_mul_f32) is
no longer combined into a MAD and then split back into separate
instructions.

Signed-off-by: Rhys Perry <pendingchaos02 at gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11924>

---

 src/amd/compiler/aco_optimizer.cpp | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index fc4939a62d7..5be7e3fd19c 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -2066,6 +2066,17 @@ combine_inverse_comparison(opt_ctx& ctx, aco_ptr<Instruction>& instr)
       new_sdwa->clamp = cmp_sdwa.clamp;
       new_sdwa->omod = cmp_sdwa.omod;
       new_instr = new_sdwa;
+   } else if (cmp->isDPP()) {
+      DPP_instruction* new_dpp = create_instruction<DPP_instruction>(
+         new_opcode, (Format)((uint16_t)Format::DPP | (uint16_t)Format::VOPC), 2, 1);
+      DPP_instruction& cmp_dpp = cmp->dpp();
+      memcpy(new_dpp->abs, cmp_dpp.abs, sizeof(new_dpp->abs));
+      memcpy(new_dpp->neg, cmp_dpp.neg, sizeof(new_dpp->neg));
+      new_dpp->dpp_ctrl = cmp_dpp.dpp_ctrl;
+      new_dpp->row_mask = cmp_dpp.row_mask;
+      new_dpp->bank_mask = cmp_dpp.bank_mask;
+      new_dpp->bound_ctrl = cmp_dpp.bound_ctrl;
+      new_instr = new_dpp;
    } else {
       new_instr = create_instruction<VOPC_instruction>(new_opcode, Format::VOPC, 2, 1);
       instr->definitions[0].setHint(vcc);
@@ -2105,6 +2116,8 @@ match_op3_for_vop3(opt_ctx& ctx, aco_opcode op1, aco_opcode op2, Instruction* op
 
    if (op1_instr->isSDWA() || op2_instr->isSDWA())
       return false;
+   if (op1_instr->isDPP() || op2_instr->isDPP())
+      return false;
 
    /* don't support inbetween clamp/omod */
    if (op2_vop3 && (op2_vop3->clamp || op2_vop3->omod))
@@ -2216,7 +2229,7 @@ combine_add_or_then_and_lshl(opt_ctx& ctx, aco_ptr<Instruction>& instr)
    if (combine_three_valu_op(ctx, instr, aco_opcode::v_lshlrev_b32, new_op_lshl, "210", 1 | 2))
       return true;
 
-   if (instr->isSDWA())
+   if (instr->isSDWA() || instr->isDPP())
       return false;
 
    /* v_or_b32(p_extract(a, 0, 8/16, 0), b) -> v_and_or_b32(a, 0xff/0xffff, b)
@@ -2476,6 +2489,7 @@ combine_add_bcnt(opt_ctx& ctx, aco_ptr<Instruction>& instr)
    for (unsigned i = 0; i < 2; i++) {
       Instruction* op_instr = follow_operand(ctx, instr->operands[i]);
       if (op_instr && op_instr->opcode == aco_opcode::v_bcnt_u32_b32 &&
+          !op_instr->usesModifiers() &&
           op_instr->operands[0].isTemp() &&
           op_instr->operands[0].getTemp().type() == RegType::vgpr &&
           op_instr->operands[1].constantEquals(0)) {
@@ -3145,7 +3159,7 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
       instr->definitions[0].setHint(vcc);
    }
 
-   if (instr->isSDWA())
+   if (instr->isSDWA() || instr->isDPP())
       return;
 
    /* TODO: There are still some peephole optimizations that could be done:
@@ -3172,7 +3186,7 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
          return;
       if (mul_instr->isVOP3() && mul_instr->vop3().clamp)
          return;
-      if (mul_instr->isSDWA())
+      if (mul_instr->isSDWA() || mul_instr->isDPP())
          return;
 
       /* convert to mul(neg(a), b) */
@@ -3231,7 +3245,7 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
             continue;
 
          Operand op[3] = {info.instr->operands[0], info.instr->operands[1], instr->operands[1 - i]};
-         if (info.instr->isSDWA() || !check_vop3_operands(ctx, 3, op) ||
+         if (info.instr->isSDWA() || info.instr->isDPP() || !check_vop3_operands(ctx, 3, op) ||
              ctx.uses[instr->operands[i].tempId()] >= uses)
             continue;