Mesa (master): aco/optimizer: convert extract_vector with index 0 into parallelcopies if possible

Thu Jan 21 11:13:24 UTC 2021

Module: Mesa
Branch: master
Commit: 7dcb9a0d8c90d4aecf325822eb8b2d121a020d1c
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=7dcb9a0d8c90d4aecf325822eb8b2d121a020d1c

Author: Daniel Schürmann <daniel at schuermann.dev>
Date:   Fri Jan 15 09:23:04 2021 +0100

aco/optimizer: convert extract_vector with index 0 into parallelcopies if possible

Totals from 273 (0.20% of 139391) affected shaders (Navi10):
VGPRs: 11600 -> 11792 (+1.66%)
CodeSize: 1389304 -> 1383152 (-0.44%); split: -0.53%, +0.08%
MaxWaves: 3848 -> 3752 (-2.49%)
Instrs: 240228 -> 239478 (-0.31%); split: -0.37%, +0.06%
Cycles: 20637708 -> 20580024 (-0.28%); split: -0.46%, +0.18%
VMEM: 39164 -> 38831 (-0.85%); split: +0.06%, -0.91%
SMEM: 21743 -> 22204 (+2.12%)
VClause: 4787 -> 4783 (-0.08%)
Copies: 39057 -> 38308 (-1.92%); split: -2.28%, +0.37%
Branches: 6556 -> 6557 (+0.02%)

Reviewed-by: Rhys Perry <pendingchaos02 at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8260>

---

 src/amd/compiler/aco_optimizer.cpp        | 59 ++++++++++++++-----------------
 src/amd/compiler/tests/test_optimizer.cpp |  3 +-
 2 files changed, 28 insertions(+), 34 deletions(-)

diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index a88747775ff..b1e786408a1 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -1224,42 +1224,37 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
       const unsigned index = instr->operands[1].constantValue();
       const unsigned dst_offset = index * instr->definitions[0].bytes();
 
-      if (info.is_constant_or_literal(32)) {
-         uint32_t mask = u_bit_consecutive(0, instr->definitions[0].bytes() * 8u);
-         ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, (info.val >> (dst_offset * 8u)) & mask);
-         break;
-      } else if (!info.is_vec()) {
-         break;
-      }
-
-      /* check if we index directly into a vector element */
-      Instruction* vec = info.instr;
-      unsigned offset = 0;
-
-      for (const Operand& op : vec->operands) {
-         if (offset < dst_offset) {
-            offset += op.bytes();
-            continue;
-         } else if (offset != dst_offset || op.bytes() != instr->definitions[0].bytes()) {
+      if (info.is_vec()) {
+         /* check if we index directly into a vector element */
+         Instruction* vec = info.instr;
+         unsigned offset = 0;
+
+         for (const Operand& op : vec->operands) {
+            if (offset < dst_offset) {
+               offset += op.bytes();
+               continue;
+            } else if (offset != dst_offset || op.bytes() != instr->definitions[0].bytes()) {
+               break;
+            }
+            instr->operands[0] = op;
             break;
          }
+      } else if (info.is_constant_or_literal(32)) {
+         /* propagate constants */
+         uint32_t mask = u_bit_consecutive(0, instr->definitions[0].bytes() * 8u);
+         uint32_t val = (info.val >> (dst_offset * 8u)) & mask;
+         instr->operands[0] = Operand::get_const(ctx.program->chip_class, val, instr->definitions[0].bytes());;
+      } else if (index == 0 && instr->operands[0].size() == instr->definitions[0].size()) {
+         ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
+      }
 
-         /* convert this extract into a copy instruction */
-         instr->opcode = aco_opcode::p_parallelcopy;
-         instr->operands.pop_back();
-         instr->operands[0] = op;
-
-         if (op.isConstant()) {
-            ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, op.constantValue64());
-         } else if (op.isUndefined()) {
-            ctx.info[instr->definitions[0].tempId()].set_undefined();
-         } else {
-            assert(op.isTemp());
-            ctx.info[instr->definitions[0].tempId()].set_temp(op.getTemp());
-         }
+      if (instr->operands[0].bytes() != instr->definitions[0].bytes())
          break;
-      }
-      break;
+
+      /* convert this extract into a copy instruction */
+      instr->opcode = aco_opcode::p_parallelcopy;
+      instr->operands.pop_back();
+      FALLTHROUGH;
    }
    case aco_opcode::p_parallelcopy: /* propagate */
       if (instr->operands[0].isTemp() && ctx.info[instr->operands[0].tempId()].is_vec() &&
diff --git a/src/amd/compiler/tests/test_optimizer.cpp b/src/amd/compiler/tests/test_optimizer.cpp
index e885001c614..679812faac8 100644
--- a/src/amd/compiler/tests/test_optimizer.cpp
+++ b/src/amd/compiler/tests/test_optimizer.cpp
@@ -676,8 +676,7 @@ BEGIN_TEST(optimize.const_comparison_ordering)
    writeout(9, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
 
    /* bit sizes */
-   //! v2b: %b16 = p_extract_vector %b, 0
-   //! s2: %res10 = v_cmp_nge_f16 4.0, %b16
+   //! s2: %res10 = v_cmp_nge_f16 4.0, %b
    //! p_unit_test 10, %res10
    Temp input1_16 = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), inputs[1], Operand(0u));
    writeout(10, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),