Mesa (master): aco: add and use scratch SGPR to lower subdword p_create_vector on GFX6/7

Tue Jun 9 21:50:58 UTC 2020

Module: Mesa
Branch: master
Commit: b21d2d9a9f1f9042def069f51ae46bd64848c853
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=b21d2d9a9f1f9042def069f51ae46bd64848c853

Author: Daniel Schürmann <daniel at schuermann.dev>
Date:   Wed May 27 18:31:33 2020 +0100

aco: add and use scratch SGPR to lower subdword p_create_vector on GFX6/7

This is needed to correctly lower corner cases in which the same
operand occurs more than once,
e.g. v0 = p_create_vector(v0[0:8], v0[0:8], v0[0:8], v0[0:8])

Reviewed-by: Rhys Perry <pendingchaos02 at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5226>

---

 src/amd/compiler/aco_lower_to_hw_instr.cpp   | 21 +++++++++++++++------
 src/amd/compiler/aco_register_allocation.cpp | 12 ++++++++++--
 2 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index f0d6ceecc46..b0b8701720b 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -1024,7 +1024,7 @@ uint32_t get_intersection_mask(int a_start, int a_size,
    return u_bit_consecutive(intersection_start, intersection_end - intersection_start) & mask;
 }
 
-bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool *preserve_scc)
+bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool *preserve_scc, PhysReg scratch_sgpr)
 {
    bool did_copy = false;
    for (unsigned offset = 0; offset < copy.bytes;) {
@@ -1059,9 +1059,18 @@ bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool
             assert(op.physReg().byte() == 0);
             def = Definition(def.physReg().advance(-def.physReg().byte()), v1);
             bld.vop2(aco_opcode::v_and_b32, def, Operand((1 << bits) - 1u), Operand(def.physReg(), op.regClass()));
-            bld.vop2(aco_opcode::v_lshlrev_b32, Definition(op.physReg(), def.regClass()), Operand(bits), op);
-            bld.vop2(aco_opcode::v_or_b32, def, Operand(def.physReg(), op.regClass()), op);
-            bld.vop2(aco_opcode::v_lshrrev_b32, Definition(op.physReg(), def.regClass()), Operand(bits), op);
+            if (def.physReg().reg() == op.physReg().reg()) {
+               if (bits < 24) {
+                  bld.vop2(aco_opcode::v_mul_u32_u24, def, Operand((1 << bits) + 1u), op);
+               } else {
+                  bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1), Operand((1 << bits) + 1u));
+                  bld.vop3(aco_opcode::v_mul_lo_u32, def, Operand(scratch_sgpr, s1), op);
+               }
+            } else {
+               bld.vop2(aco_opcode::v_lshlrev_b32, Definition(op.physReg(), def.regClass()), Operand(bits), op);
+               bld.vop2(aco_opcode::v_or_b32, def, Operand(def.physReg(), op.regClass()), op);
+               bld.vop2(aco_opcode::v_lshrrev_b32, Definition(op.physReg(), def.regClass()), Operand(bits), op);
+            }
          } else {
             bld.vop1(aco_opcode::v_mov_b32, def, op);
          }
@@ -1172,7 +1181,7 @@ void do_swap(lower_context *ctx, Builder& bld, const copy_operation& copy, bool
    copy_operation tmp_copy = copy;
    tmp_copy.op.setFixed(copy.def.physReg());
    tmp_copy.def.setFixed(copy.op.physReg());
-   do_copy(ctx, bld, tmp_copy, &preserve_scc);
+   do_copy(ctx, bld, tmp_copy, &preserve_scc, pi->scratch_sgpr);
 }
 
 void handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx, chip_class chip_class, Pseudo_instruction *pi)
@@ -1337,7 +1346,7 @@ void handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context*
          }
       }
 
-      bool did_copy = do_copy(ctx, bld, it->second, &preserve_scc);
+      bool did_copy = do_copy(ctx, bld, it->second, &preserve_scc, pi->scratch_sgpr);
 
       std::pair<PhysReg, copy_operation> copy = *it;
 
diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp
index f00001285a0..5b843070e4f 100644
--- a/src/amd/compiler/aco_register_allocation.cpp
+++ b/src/amd/compiler/aco_register_allocation.cpp
@@ -1196,13 +1196,18 @@ void handle_pseudo(ra_ctx& ctx,
    }
    /* if all operands are constant, no need to care either */
    bool reads_sgpr = false;
+   bool reads_subdword = false;
    for (Operand& op : instr->operands) {
       if (op.isTemp() && op.getTemp().type() == RegType::sgpr) {
          reads_sgpr = true;
          break;
       }
+      if (op.isTemp() && op.regClass().is_subdword())
+         reads_subdword = true;
    }
-   if (!(writes_sgpr && reads_sgpr))
+   bool needs_scratch_reg = (writes_sgpr && reads_sgpr) ||
+                            (ctx.program->chip_class <= GFX7 && reads_subdword);
+   if (!needs_scratch_reg)
       return;
 
    Pseudo_instruction *pi = (Pseudo_instruction *)instr;
@@ -1216,7 +1221,10 @@ void handle_pseudo(ra_ctx& ctx,
          reg = ctx.max_used_sgpr + 1;
          for (; reg < ctx.program->max_reg_demand.sgpr && reg_file[reg]; reg++)
             ;
-         assert(reg < ctx.program->max_reg_demand.sgpr);
+         if (reg == ctx.program->max_reg_demand.sgpr) {
+            assert(reads_subdword && reg_file[m0] == 0);
+            reg = m0;
+         }
       }
 
       adjust_max_used_regs(ctx, s1, reg);