Mesa (main): aco: refactor nir_op_imul selection

Fri Aug 27 20:15:49 UTC 2021

Module: Mesa
Branch: main
Commit: 23d5865f422f1e382d6786c75976f2788865dd59
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=23d5865f422f1e382d6786c75976f2788865dd59

Author: Daniel Schürmann <daniel at schuermann.dev>
Date:   Thu Jul  1 12:42:08 2021 +0200

aco: refactor nir_op_imul selection

Previously, the optimization to use v_mul_lo_u16 for
32-bit multiplications was done in instruction selection.
It was moved to the optimizer to simplify some case distinctions.

The mixed results are due to increased use of SDWA.

Totals from 2616 (1.74% of 150170) affected shaders: (GFX10.3)
VGPRs: 143888 -> 143872 (-0.01%); split: -0.02%, +0.01%
CodeSize: 5604032 -> 5604080 (+0.00%); split: -0.01%, +0.01%
Instrs: 1086798 -> 1083915 (-0.27%); split: -0.27%, +0.01%
Latency: 8215793 -> 8213023 (-0.03%); split: -0.10%, +0.07%
InvThroughput: 20765157 -> 20773766 (+0.04%); split: -0.02%, +0.06%
VClause: 35256 -> 35260 (+0.01%); split: -0.02%, +0.03%
SClause: 29021 -> 29024 (+0.01%); split: -0.00%, +0.01%
Copies: 74163 -> 74306 (+0.19%); split: -0.05%, +0.24%

Reviewed-by: Rhys Perry <pendingchaos02 at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11678>

---

 src/amd/compiler/aco_instruction_selection.cpp | 24 ++-------
 src/amd/compiler/aco_optimizer.cpp             | 51 --------------------
 src/amd/compiler/tests/test_optimizer.cpp      | 67 --------------------------
 3 files changed, 4 insertions(+), 138 deletions(-)

diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index eb01a77e266..631e3c550e1 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -1945,29 +1945,13 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
       } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_lo_u16, dst);
       } else if (dst.type() == RegType::vgpr) {
-         Temp src0 = get_alu_src(ctx, instr->src[0]);
-         Temp src1 = get_alu_src(ctx, instr->src[1]);
          uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
          uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);
 
-         if (src0_ub <= 0xffff && src1_ub <= 0xffff && src0_ub * src1_ub <= 0xffff &&
-             (ctx->options->chip_class == GFX8 || ctx->options->chip_class == GFX9)) {
-            /* If the 16-bit multiplication can't overflow, emit v_mul_lo_u16
-             * but only on GFX8-9 because GFX10 doesn't zero the upper 16
-             * bits.
-             */
-            emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true /* commutative */,
-                                  false, false, true /* nuw */);
-         } else if (src0_ub <= 0xffff && src1_ub <= 0xffff && ctx->options->chip_class >= GFX9) {
-            /* Initialize the accumulator to 0 to allow further combinations
-             * in the optimizer.
-             */
-            Operand op0(src0);
-            Operand op1(src1);
-            bld.vop3(aco_opcode::v_mad_u32_u16, Definition(dst), bld.set16bit(op0),
-                     bld.set16bit(op1), Operand::zero());
-         } else if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {
-            emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_u32_u24, dst, true);
+         if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {
+            bool nuw_16bit = src0_ub <= 0xffff && src1_ub <= 0xffff && src0_ub * src1_ub <= 0xffff;
+            emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_u32_u24, dst,
+                                  true /* commutative */, false, false, nuw_16bit);
          } else if (nir_src_is_const(instr->src[0].src)) {
             bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[1]),
                           nir_src_as_uint(instr->src[0].src), false);
diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index e9f8514befb..8c9e50c96c6 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -3397,8 +3397,6 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
                                           "012", 1 | 2)) {
          } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add3_u32,
                                           "012", 1 | 2)) {
-         } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_mul_lo_u16,
-                                          aco_opcode::v_mad_u32_u16, "120", 1 | 2)) {
          } else if (combine_add_or_then_and_lshl(ctx, instr)) {
          }
       }
@@ -3506,52 +3504,6 @@ to_uniform_bool_instr(opt_ctx& ctx, aco_ptr<Instruction>& instr)
    return true;
 }
 
-void
-select_mul_u32_u24(opt_ctx& ctx, aco_ptr<Instruction>& instr)
-{
-   if (instr->usesModifiers())
-      return;
-
-   /* Only valid if the accumulator is zero (this is selected by isel to
-    * combine more v_add_u32+v_mad_u32_u16 together), but the optimizer
-    * fallbacks here when not possible.
-    */
-   if (!instr->operands[2].constantEquals(0))
-      return;
-
-   /* Only valid if the upper 16-bits of both operands are zero (because
-    * v_mul_u32_u24 doesn't mask them).
-    */
-   for (unsigned i = 0; i < 2; i++) {
-      if (instr->operands[i].isTemp() && !instr->operands[i].is16bit())
-         return;
-   }
-
-   bool swap = false;
-
-   /* VOP2 instructions can only take constants/sgprs in operand 0. */
-   if ((instr->operands[1].isConstant() ||
-        (instr->operands[1].hasRegClass() &&
-         instr->operands[1].regClass().type() == RegType::sgpr))) {
-      swap = true;
-      if ((instr->operands[0].isConstant() ||
-           (instr->operands[0].hasRegClass() &&
-            instr->operands[0].regClass().type() == RegType::sgpr))) {
-         /* VOP2 can't take both constants/sgprs, keep v_mad_u32_u16 because
-          * v_mul_u32_u24 has no advantages.
-          */
-         return;
-      }
-   }
-
-   VOP2_instruction* new_instr =
-      create_instruction<VOP2_instruction>(aco_opcode::v_mul_u32_u24, Format::VOP2, 2, 1);
-   new_instr->operands[0] = instr->operands[swap];
-   new_instr->operands[1] = instr->operands[!swap];
-   new_instr->definitions[0] = instr->definitions[0];
-   instr.reset(new_instr);
-}
-
 void
 select_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
 {
@@ -3732,9 +3684,6 @@ select_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
       return;
    }
 
-   if (instr->opcode == aco_opcode::v_mad_u32_u16)
-      select_mul_u32_u24(ctx, instr);
-
    /* Combine DPP copies into VALU. This should be done after creating MAD/FMA. */
    if (instr->isVALU()) {
       for (unsigned i = 0; i < instr->operands.size(); i++) {
diff --git a/src/amd/compiler/tests/test_optimizer.cpp b/src/amd/compiler/tests/test_optimizer.cpp
index e1953b0ec31..9917404354c 100644
--- a/src/amd/compiler/tests/test_optimizer.cpp
+++ b/src/amd/compiler/tests/test_optimizer.cpp
@@ -395,73 +395,6 @@ BEGIN_TEST(optimize.add_lshl)
    }
 END_TEST
 
-Temp create_mad_u32_u16(Operand a, Operand b, Operand c, bool is16bit = true)
-{
-   a.set16bit(is16bit);
-   b.set16bit(is16bit);
-
-   return bld.vop3(aco_opcode::v_mad_u32_u16, bld.def(v1), a, b, c);
-}
-
-BEGIN_TEST(optimize.mad_u32_u16)
-   for (unsigned i = GFX9; i <= GFX10; i++) {
-      //>> v1: %a, v1: %b, s1: %c = p_startpgm
-      if (!setup_cs("v1 v1 s1", (chip_class)i))
-         continue;
-
-      //! v1: %res0 = v_mul_u32_u24 (is16bit)%a, (is16bit)%b
-      //! p_unit_test 0, %res0
-      writeout(0, create_mad_u32_u16(Operand(inputs[0]), Operand(inputs[1]), Operand::zero()));
-
-      //! v1: %res1 = v_mul_u32_u24 42, (is16bit)%a
-      //! p_unit_test 1, %res1
-      writeout(1, create_mad_u32_u16(Operand::c32(42u), Operand(inputs[0]), Operand::zero()));
-
-      //! v1: %res2 = v_mul_u32_u24 42, (is16bit)%a
-      //! p_unit_test 2, %res2
-      writeout(2, create_mad_u32_u16(Operand(inputs[0]), Operand::c32(42u), Operand::zero()));
-
-      //! v1: %res3 = v_mul_u32_u24 (is16bit)%c, (is16bit)%a
-      //! p_unit_test 3, %res3
-      writeout(3, create_mad_u32_u16(Operand(inputs[2]), Operand(inputs[0]), Operand::zero()));
-
-      //! v1: %res4 = v_mad_u32_u16 42, (is16bit)%c, 0
-      //! p_unit_test 4, %res4
-      writeout(4, create_mad_u32_u16(Operand::c32(42u), Operand(inputs[2]), Operand::zero()));
-
-      //! v1: %res5 = v_mad_u32_u16 42, %a, 0
-      //! p_unit_test 5, %res5
-      writeout(5,
-               create_mad_u32_u16(Operand::c32(42u), Operand(inputs[0]), Operand::zero(), false));
-
-      //~gfx9! v1: %mul6 = v_mul_lo_u16 %a, %b
-      //~gfx9! v1: %res6 = v_add_u32 %mul6, %b
-      //~gfx10! v1: %mul6 = v_mul_lo_u16_e64 %a, %b
-      //~gfx10! v1: %res6 = v_add_u32 %mul6, %b
-      //! p_unit_test 6, %res6
-      Temp mul;
-      if (i >= GFX10) {
-         mul = bld.vop3(aco_opcode::v_mul_lo_u16_e64, bld.def(v1), inputs[0], inputs[1]);
-      } else {
-         mul = bld.vop2(aco_opcode::v_mul_lo_u16, bld.def(v1), inputs[0], inputs[1]);
-      }
-      writeout(6, bld.vadd32(bld.def(v1), mul, inputs[1]));
-
-      //~gfx9! v1: %res7 = v_mad_u32_u16 %a, %b, %b
-      //~gfx10! v1: (nuw)%mul7 = v_mul_lo_u16_e64 %a, %b
-      //~gfx10! v1: %res7 = v_add_u32 %mul7, %b
-      //! p_unit_test 7, %res7
-      if (i >= GFX10) {
-         mul = bld.nuw().vop3(aco_opcode::v_mul_lo_u16_e64, bld.def(v1), inputs[0], inputs[1]);
-      } else {
-         mul = bld.nuw().vop2(aco_opcode::v_mul_lo_u16, bld.def(v1), inputs[0], inputs[1]);
-      }
-      writeout(7, bld.vadd32(bld.def(v1), mul, inputs[1]));
-
-      finish_opt_test();
-   }
-END_TEST
-
 BEGIN_TEST(optimize.bcnt)
    for (unsigned i = GFX8; i <= GFX10; i++) {
       //>> v1: %a, s1: %b = p_startpgm