Mesa (master): aco: optimize v_add(v_bcnt(a, 0), b) to v_bcnt(a, b)

Fri Nov 13 07:38:36 UTC 2020

Module: Mesa
Branch: master
Commit: 68488fd3835f2d1a63562ae16777987af3ff7e01
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=68488fd3835f2d1a63562ae16777987af3ff7e01

Author: Samuel Pitoiset <samuel.pitoiset at gmail.com>
Date:   Wed Nov 11 18:42:35 2020 +0100

aco: optimize v_add(v_bcnt(a, 0), b) to v_bcnt(a, b)

The first operand of v_bcnt should always be a VGPR, because if it's
an SGPR, isel selects s_bcnt1 instead. I added a sanity check anyway
to prevent any problems.

fossils-db (Vega10):
Totals from 23 (0.02% of 139517) affected shaders:
CodeSize: 106828 -> 106664 (-0.15%)
Instrs: 20242 -> 20201 (-0.20%)
Cycles: 213112 -> 211352 (-0.83%)
VMEM: 3200 -> 3184 (-0.50%)
SMEM: 928 -> 927 (-0.11%)

Helps Control, Assassin's Creed Origins and Youngblood.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02 at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7568>

---

 src/amd/compiler/aco_optimizer.cpp        | 38 +++++++++++++++++++++++++++-
 src/amd/compiler/tests/test_optimizer.cpp | 42 +++++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+), 1 deletion(-)

diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index 04fed5cf5ea..506653cccb0 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -1469,6 +1469,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
    case aco_opcode::s_lshl_b32:
    case aco_opcode::v_or_b32:
    case aco_opcode::v_lshlrev_b32:
+   case aco_opcode::v_bcnt_u32_b32:
       ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
       break;
    case aco_opcode::v_min_f32:
@@ -2300,6 +2301,39 @@ bool combine_add_sub_b2i(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode n
    return false;
 }
 
+bool combine_add_bcnt(opt_ctx& ctx, aco_ptr<Instruction>& instr)
+{ /* Rewrites v_add(v_bcnt(a, 0), b) as v_bcnt(a, b); returns true iff instr was replaced. */
+   if (instr->usesModifiers()) /* leave adds that carry any modifiers untouched */
+      return false;
+
+   /* Do not combine if the carry-out is used: the replacement v_bcnt only receives definitions[0]. */
+   if ((instr->opcode == aco_opcode::v_add_co_u32 ||
+        instr->opcode == aco_opcode::v_add_co_u32_e64) &&
+       ctx.uses[instr->definitions[1].tempId()])
+      return false;
+
+   for (unsigned i = 0; i < 2; i++) { /* either add operand may be the v_bcnt result */
+      Instruction *op_instr = follow_operand(ctx, instr->operands[i]); /* presumably the defining instruction, or null if it cannot be followed -- see follow_operand */
+      if (op_instr &&
+          op_instr->opcode == aco_opcode::v_bcnt_u32_b32 &&
+          op_instr->operands[0].isTemp() &&
+          op_instr->operands[0].getTemp().type() == RegType::vgpr && /* sanity check: an SGPR source is expected to have been selected as s_bcnt1 */
+          op_instr->operands[1].constantEquals(0)) { /* only a v_bcnt that adds constant 0 can absorb the other addend */
+         aco_ptr<Instruction> new_instr{create_instruction<VOP3A_instruction>(aco_opcode::v_bcnt_u32_b32, Format::VOP3, 2, 1)};
+         ctx.uses[instr->operands[i].tempId()]--; /* the add no longer reads the old v_bcnt result */
+         new_instr->operands[0] = op_instr->operands[0]; /* value whose bits are counted */
+         new_instr->operands[1] = instr->operands[!i]; /* the other add operand takes the place of the constant 0 */
+         new_instr->definitions[0] = instr->definitions[0]; /* result keeps the add's destination */
+         instr = std::move(new_instr);
+         ctx.info[instr->definitions[0].tempId()].label = 0; /* clear the label bits recorded for this definition */
+
+         return true;
+      }
+   }
+
+   return false;
+}
+
 bool get_minmax_info(aco_opcode op, aco_opcode *min, aco_opcode *max, aco_opcode *min3, aco_opcode *max3, aco_opcode *med3, bool *some_gfx9_only)
 {
    switch (op) {
@@ -2848,6 +2882,7 @@ void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr
       else combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xor3_b32, "012", 1 | 2);
    } else if (instr->opcode == aco_opcode::v_add_u32) {
       if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) ;
+      else if (combine_add_bcnt(ctx, instr)) ;
       else if (ctx.program->chip_class >= GFX9) {
          if (combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xad_u32, "120", 1 | 2)) ;
          else if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xad_u32, "120", 1 | 2)) ;
@@ -2860,7 +2895,8 @@ void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr
       }
    } else if (instr->opcode == aco_opcode::v_add_co_u32 ||
               instr->opcode == aco_opcode::v_add_co_u32_e64) {
-      combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2);
+      if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) ;
+      else combine_add_bcnt(ctx, instr);
    } else if (instr->opcode == aco_opcode::v_sub_u32 ||
               instr->opcode == aco_opcode::v_sub_co_u32 ||
               instr->opcode == aco_opcode::v_sub_co_u32_e64) {
diff --git a/src/amd/compiler/tests/test_optimizer.cpp b/src/amd/compiler/tests/test_optimizer.cpp
index 3bf73316908..2453567f007 100644
--- a/src/amd/compiler/tests/test_optimizer.cpp
+++ b/src/amd/compiler/tests/test_optimizer.cpp
@@ -222,3 +222,45 @@ BEGIN_TEST(optimize.mad_u32_u16)
       finish_opt_test();
    }
 END_TEST
+
+BEGIN_TEST(optimize.bcnt)
+   for (unsigned i = GFX8; i <= GFX10; i++) { /* run the same checks on every chip class from GFX8 to GFX10 */
+      //>> v1: %a, s1: %b, s2: %_:exec = p_startpgm
+      if (!setup_cs("v1 s1", (chip_class)i))
+         continue;
+
+      Temp bcnt; /* result of the v_bcnt feeding the add in each case below */
+
+      //! v1: %res0 = v_bcnt_u32_b32 %a, %a
+      //! p_unit_test 0, %res0
+      bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand(0u));
+      writeout(0, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0]))); /* combined: other addend is the VGPR input */
+
+      //! v1: %res1 = v_bcnt_u32_b32 %a, %b
+      //! p_unit_test 1, %res1
+      bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand(0u));
+      writeout(1, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[1]))); /* combined: other addend is the SGPR input */
+
+      //! v1: %res2 = v_bcnt_u32_b32 %a, 42
+      //! p_unit_test 2, %res2
+      bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand(0u));
+      writeout(2, bld.vadd32(bld.def(v1), bcnt, Operand(42u))); /* combined: other addend is a constant */
+
+      //! v1: %bcnt3 = v_bcnt_u32_b32 %b, 0
+      //~gfx8! v1: %res3, s2: %_ = v_add_co_u32 %bcnt3, %a
+      //~gfx(9|10)! v1: %res3 = v_add_u32 %bcnt3, %a
+      //! p_unit_test 3, %res3
+      bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[1]), Operand(0u)); /* SGPR as the counted operand: must not be combined */
+      writeout(3, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0])));
+
+      //! v1: %bcnt4 = v_bcnt_u32_b32 %a, 0
+      //~gfx(8|9)! v1: %add4, s2: %carry = v_add_co_u32 %bcnt4, %a
+      //~gfx10! v1: %add4, s2: %carry = v_add_co_u32_e64 %bcnt4, %a
+      //! p_unit_test 4, %carry
+      bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand(0u));
+      Temp carry = bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0]), true).def(1).getTemp(); /* carry-out is consumed: must not be combined */
+      writeout(4, carry);
+
+      finish_opt_test();
+   }
+END_TEST