Mesa (master): aco: optimize v_mad_u32_u16 with acc=0 to v_mul_u32_u24

Thu Nov 12 12:42:17 UTC 2020

Module: Mesa
Branch: master
Commit: bbdafd6ab3541d2a61fe3c2389f6dec55814c34f
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=bbdafd6ab3541d2a61fe3c2389f6dec55814c34f

Author: Samuel Pitoiset <samuel.pitoiset at gmail.com>
Date:   Mon Nov  2 16:44:04 2020 +0100

aco: optimize v_mad_u32_u16 with acc=0 to v_mul_u32_u24

v_mad_u32_u16 will be selected by isel to keep the range analysis
information around and to combine more v_add_u32+v_mad_u32_u16
together. When it's not possible to optimize that pattern, fall back
to v_mul_u32_u24, which is VOP2 instead of VOP3.

No fossils-db changes.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof at gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02 at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7425>

---

 src/amd/compiler/aco_optimizer.cpp        | 47 +++++++++++++++++++++++++++++++
 src/amd/compiler/tests/test_optimizer.cpp | 42 +++++++++++++++++++++++++++
 2 files changed, 89 insertions(+)

diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index 10592a4d1ef..f427b02c926 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -2939,6 +2939,50 @@ bool to_uniform_bool_instr(opt_ctx &ctx, aco_ptr<Instruction> &instr)
    return true;
 }
 
+void select_mul_u32_u24(opt_ctx &ctx, aco_ptr<Instruction>& instr)
+{
+   if (instr->usesModifiers())
+      return;
+
+   /* isel selects v_mad_u32_u16 with a zero accumulator so that it can be
+    * combined with a v_add_u32. When that was not possible, the optimizer
+    * falls back to v_mul_u32_u24 here, which requires acc == 0.
+    */
+   if (!instr->operands[2].constantEquals(0))
+      return;
+
+   /* Only valid if the upper 16 bits of both operands are zero (because
+    * v_mul_u32_u24 doesn't mask them). Only temporaries are checked here.
+    */
+   for (unsigned i = 0; i < 2; i++) {
+      if (instr->operands[i].isTemp() && !instr->operands[i].is16bit())
+         return;
+   }
+
+   bool swap = false;
+
+   /* VOP2 can only take constants/sgprs in operand 0, so swap if needed. */
+   if ((instr->operands[1].isConstant() ||
+       (instr->operands[1].hasRegClass() &&
+      instr->operands[1].regClass().type() == RegType::sgpr))) {
+      swap = true;
+      if ((instr->operands[0].isConstant() ||
+          (instr->operands[0].hasRegClass() &&
+           instr->operands[0].regClass().type() == RegType::sgpr))) {
+         /* VOP2 can't take two constants/sgprs; keep v_mad_u32_u16 because
+          * v_mul_u32_u24 has no advantage in that case.
+          */
+         return;
+      }
+   }
+
+   VOP2_instruction *new_instr = create_instruction<VOP2_instruction>(aco_opcode::v_mul_u32_u24, Format::VOP2, 2, 1);
+   new_instr->operands[0] = instr->operands[swap]; /* a constant/sgpr operand, if any, goes first */
+   new_instr->operands[1] = instr->operands[!swap];
+   new_instr->definitions[0] = instr->definitions[0];
+   instr.reset(new_instr);
+}
+
 void select_instruction(opt_ctx &ctx, aco_ptr<Instruction>& instr)
 {
    const uint32_t threshold = 4;
@@ -3102,6 +3146,9 @@ void select_instruction(opt_ctx &ctx, aco_ptr<Instruction>& instr)
       return;
    }
 
+   if (instr->opcode == aco_opcode::v_mad_u32_u16)
+      select_mul_u32_u24(ctx, instr);
+
    if (instr->isSDWA() || instr->isDPP() || (instr->isVOP3() && ctx.program->chip_class < GFX10))
       return; /* some encodings can't ever take literals */
 
diff --git a/src/amd/compiler/tests/test_optimizer.cpp b/src/amd/compiler/tests/test_optimizer.cpp
index 4be2f6226ae..4ac8dc4dd11 100644
--- a/src/amd/compiler/tests/test_optimizer.cpp
+++ b/src/amd/compiler/tests/test_optimizer.cpp
@@ -156,3 +156,45 @@ BEGIN_TEST(optimize.add_lshl)
       finish_opt_test();
    }
 END_TEST
+
+Temp create_mad_u32_u16(Operand a, Operand b, Operand c, bool is16bit = true)
+{ /* Test helper: emits v_mad_u32_u16 a, b, c into a new v1 definition. */
+   a.set16bit(is16bit); /* only the two multiplicands are flagged; c is passed as given */
+   b.set16bit(is16bit);
+
+   return bld.vop3(aco_opcode::v_mad_u32_u16, bld.def(v1), a, b, c);
+}
+
+BEGIN_TEST(optimize.mad_u32_u16)
+   for (unsigned i = GFX9; i <= GFX10; i++) { /* every chip class from GFX9 through GFX10 */
+      //>> v1: %a, v1: %b, s1: %c, s2: %_:exec = p_startpgm
+      if (!setup_cs("v1 v1 s1", (chip_class)i))
+         continue;
+
+      //! v1: %res0 = v_mul_u32_u24 (is16bit)%a, (is16bit)%b
+      //! p_unit_test 0, %res0
+      writeout(0, create_mad_u32_u16(Operand(inputs[0]), Operand(inputs[1]), Operand(0u))); /* vgpr * vgpr */
+
+      //! v1: %res1 = v_mul_u32_u24 42, (is16bit)%a
+      //! p_unit_test 1, %res1
+      writeout(1, create_mad_u32_u16(Operand(42u), Operand(inputs[0]), Operand(0u))); /* constant already in operand 0 */
+
+      //! v1: %res2 = v_mul_u32_u24 42, (is16bit)%a
+      //! p_unit_test 2, %res2
+      writeout(2, create_mad_u32_u16(Operand(inputs[0]), Operand(42u), Operand(0u))); /* constant in operand 1: expect a swap */
+
+      //! v1: %res3 = v_mul_u32_u24 (is16bit)%c, (is16bit)%a
+      //! p_unit_test 3, %res3
+      writeout(3, create_mad_u32_u16(Operand(inputs[2]), Operand(inputs[0]), Operand(0u))); /* sgpr in operand 0 */
+
+      //! v1: %res4 = v_mad_u32_u16 42, (is16bit)%c, 0
+      //! p_unit_test 4, %res4
+      writeout(4, create_mad_u32_u16(Operand(42u), Operand(inputs[2]), Operand(0u))); /* constant and sgpr: mad is kept */
+
+      //! v1: %res5 = v_mad_u32_u16 42, %a, 0
+      //! p_unit_test 5, %res5
+      writeout(5, create_mad_u32_u16(Operand(42u), Operand(inputs[0]), Operand(0u), false)); /* %a not flagged 16-bit: mad is kept */
+
+      finish_opt_test();
+   }
+END_TEST