Mesa (main): aco: remove DPP when applying constants/literals/sgprs

Tue Aug 31 17:15:49 UTC 2021

Module: Mesa
Branch: main
Commit: 33ddbd220f26391fd117f484f6b566d17d942091
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=33ddbd220f26391fd117f484f6b566d17d942091

Author: Rhys Perry <pendingchaos02 at gmail.com>
Date:   Mon Aug 30 10:30:45 2021 +0100

aco: remove DPP when applying constants/literals/sgprs

Signed-off-by: Rhys Perry <pendingchaos02 at gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12601>

---

 src/amd/compiler/aco_ir.h                 |  6 +++++
 src/amd/compiler/aco_optimizer.cpp        | 13 ++++++++---
 src/amd/compiler/tests/test_optimizer.cpp | 39 ++++++++++++++++++++++---------
 3 files changed, 44 insertions(+), 14 deletions(-)

diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 2b0f7d34550..c96dcce7892 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -290,6 +290,12 @@ asSDWA(Format format)
    return (Format)((uint32_t)Format::SDWA | (uint32_t)format);
 }
 
+constexpr Format
+withoutDPP(Format format)
+{
+   return (Format)((uint32_t)format & ~(uint32_t)Format::DPP);
+}
+
 enum class RegType {
    none = 0,
    sgpr,
diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index 634c0939ea1..27c993e83fc 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -560,10 +560,11 @@ pseudo_propagate_temp(opt_ctx& ctx, aco_ptr<Instruction>& instr, Temp temp, unsi
    return true;
 }
 
+/* The result assumes the caller removes the DPP modifier before applying an SGPR. */
 bool
 can_apply_sgprs(opt_ctx& ctx, aco_ptr<Instruction>& instr)
 {
-   if ((instr->isSDWA() && ctx.program->chip_class < GFX9) || instr->isDPP())
+   if (instr->isSDWA() && ctx.program->chip_class < GFX9)
       return false;
    return instr->opcode != aco_opcode::v_readfirstlane_b32 &&
           instr->opcode != aco_opcode::v_readlane_b32 &&
@@ -1010,6 +1011,7 @@ label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
          /* applying SGPRs to VOP1 doesn't increase code size and DCE is helped by doing it earlier */
          if (info.is_temp() && info.temp.type() == RegType::sgpr && can_apply_sgprs(ctx, instr) &&
              instr->operands.size() == 1) {
+            instr->format = withoutDPP(instr->format);
             instr->operands[i].setTemp(info.temp);
             info = ctx.info[info.temp.id()];
          }
@@ -1058,13 +1060,14 @@ label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
 
          unsigned bits = get_operand_size(instr, i);
          if (info.is_constant(bits) && alu_can_accept_constant(instr->opcode, i) &&
-             (!instr->isSDWA() || ctx.program->chip_class >= GFX9) && !instr->isDPP()) {
+             (!instr->isSDWA() || ctx.program->chip_class >= GFX9)) {
             Operand op = get_constant_op(ctx, info, bits);
             perfwarn(ctx.program, instr->opcode == aco_opcode::v_cndmask_b32 && i == 2,
                      "v_cndmask_b32 with a constant selector", instr.get());
             if (i == 0 || instr->isSDWA() || instr->isVOP3P() ||
                 instr->opcode == aco_opcode::v_readlane_b32 ||
                 instr->opcode == aco_opcode::v_writelane_b32) {
+               instr->format = withoutDPP(instr->format);
                instr->operands[i] = op;
                continue;
             } else if (!instr->isVOP3() && can_swap_operands(instr, &instr->opcode)) {
@@ -2740,6 +2743,9 @@ apply_sgprs(opt_ctx& ctx, aco_ptr<Instruction>& instr)
       if (new_sgpr && num_sgprs >= max_sgprs)
          continue;
 
+      if (sgpr_idx == 0)
+         instr->format = withoutDPP(instr->format);
+
       if (sgpr_idx == 0 || instr->isVOP3() || instr->isSDWA() || instr->isVOP3P() ||
           info.is_extract()) {
          /* can_apply_extract() checks SGPR encoding restrictions */
@@ -3734,7 +3740,7 @@ select_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
       }
    }
 
-   if (instr->isSDWA() || instr->isDPP() || (instr->isVOP3() && ctx.program->chip_class < GFX10) ||
+   if (instr->isSDWA() || (instr->isVOP3() && ctx.program->chip_class < GFX10) ||
        (instr->isVOP3P() && ctx.program->chip_class < GFX10))
       return; /* some encodings can't ever take literals */
 
@@ -3858,6 +3864,7 @@ apply_literals(opt_ctx& ctx, aco_ptr<Instruction>& instr)
          unsigned bits = get_operand_size(instr, i);
          if (op.isTemp() && ctx.info[op.tempId()].is_literal(bits) && ctx.uses[op.tempId()] == 0) {
             Operand literal = Operand::c32(ctx.info[op.tempId()].val);
+            instr->format = withoutDPP(instr->format);
             if (instr->isVALU() && i > 0 && instr->format != Format::VOP3P)
                to_VOP3(ctx, instr);
             instr->operands[i] = literal;
diff --git a/src/amd/compiler/tests/test_optimizer.cpp b/src/amd/compiler/tests/test_optimizer.cpp
index ce4e925b779..9609fea4f2b 100644
--- a/src/amd/compiler/tests/test_optimizer.cpp
+++ b/src/amd/compiler/tests/test_optimizer.cpp
@@ -1037,23 +1037,40 @@ BEGIN_TEST(optimize.dpp_prop)
    if (!setup_cs("v1 s1", GFX10))
       return;
 
-   //! v1: %zero = p_parallelcopy 0
-   //! v1: %res0 = v_mul_f32 %zero, %a row_shl:1 bound_ctrl:1
+   //! v1: %one = p_parallelcopy 1
+   //! v1: %res0 = v_mul_f32 1, %a
    //! p_unit_test 0, %res0
-   Temp zero = bld.copy(bld.def(v1), Operand::zero());
-   writeout(0, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), zero, inputs[0], dpp_row_sl(1)));
+   Temp one = bld.copy(bld.def(v1), Operand::c32(1));
+   writeout(0, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), one, inputs[0], dpp_row_sl(1)));
 
-   //! v1: %literal = p_parallelcopy 0x12345678
-   //! v1: %res1 = v_mul_f32 %literal, %a row_shl:1 bound_ctrl:1
+   //! v1: %res1 = v_mul_f32 %a, %one row_shl:1 bound_ctrl:1
    //! p_unit_test 1, %res1
-   Temp literal = bld.copy(bld.def(v1), Operand::c32(0x12345678u));
-   writeout(1, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), literal, inputs[0], dpp_row_sl(1)));
+   writeout(1, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], one, dpp_row_sl(1)));
 
-   //! v1: %b_v = p_parallelcopy %b
-   //! v1: %res2 = v_mul_f32 %b_v, %a row_shl:1 bound_ctrl:1
+   //! v1: %res2 = v_mul_f32 0x12345678, %a
    //! p_unit_test 2, %res2
+   Temp literal1 = bld.copy(bld.def(v1), Operand::c32(0x12345678u));
+   writeout(2, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), literal1, inputs[0], dpp_row_sl(1)));
+
+   //! v1: %literal2 = p_parallelcopy 0x12345679
+   //! v1: %res3 = v_mul_f32 %a, %literal2 row_shl:1 bound_ctrl:1
+   //! p_unit_test 3, %res3
+   Temp literal2 = bld.copy(bld.def(v1), Operand::c32(0x12345679u));
+   writeout(3, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], literal2, dpp_row_sl(1)));
+
+   //! v1: %b_v = p_parallelcopy %b
+   //! v1: %res4 = v_mul_f32 %b, %a
+   //! p_unit_test 4, %res4
    Temp b_v = bld.copy(bld.def(v1), inputs[1]);
-   writeout(2, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), b_v, inputs[0], dpp_row_sl(1)));
+   writeout(4, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), b_v, inputs[0], dpp_row_sl(1)));
+
+   //! v1: %res5 = v_mul_f32 %a, %b_v row_shl:1 bound_ctrl:1
+   //! p_unit_test 5, %res5
+   writeout(5, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], b_v, dpp_row_sl(1)));
+
+   //! v1: %res6 = v_rcp_f32 %b
+   //! p_unit_test 6, %res6
+   writeout(6, bld.vop1_dpp(aco_opcode::v_rcp_f32, bld.def(v1), b_v, dpp_row_sl(1)));
 
    finish_opt_test();
 END_TEST