Mesa (master): aco: calculate all p_as_uniform and v_readfirstlane_b32 sources in WQM

Fri Feb 26 14:22:53 UTC 2021

Module: Mesa
Branch: master
Commit: 5f1b3544729178715a1ed0714bd1029737089824
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=5f1b3544729178715a1ed0714bd1029737089824

Author: Rhys Perry <pendingchaos02 at gmail.com>
Date:   Thu Feb 25 15:37:17 2021 +0000

aco: calculate all p_as_uniform and v_readfirstlane_b32 sources in WQM

We should avoid a situation where a v_readfirstlane_b32 is in WQM but its
source is calculated in Exact.

Fixes hang when running Assassin's Creed: Valhalla benchmark.

fossil-db (GFX10.3):
Totals from 1021 (0.70% of 146267) affected shaders:
CodeSize: 7835228 -> 7842992 (+0.10%); split: -0.00%, +0.10%
Instrs: 1519208 -> 1521149 (+0.13%); split: -0.00%, +0.13%
SClause: 78921 -> 78920 (-0.00%)
Copies: 44456 -> 45421 (+2.17%); split: -0.05%, +2.22%
Branches: 12987 -> 13933 (+7.28%)
PreSGPRs: 47599 -> 47813 (+0.45%)
Cycles: 10037540 -> 10045304 (+0.08%); split: -0.00%, +0.08%
VMEM: 538381 -> 538777 (+0.07%); split: +0.11%, -0.03%
SMEM: 84553 -> 84554 (+0.00%); split: +0.01%, -0.01%

Signed-off-by: Rhys Perry <pendingchaos02 at gmail.com>
Reviewed-by: Daniel Schürmann <daniel at schuermann.dev>
Cc: mesa-stable
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9288>

---

 src/amd/compiler/aco_insert_exec_mask.cpp      | 3 ++-
 src/amd/compiler/aco_instruction_selection.cpp | 8 ++++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp
index ea6e1218a90..49367e39830 100644
--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@@ -143,7 +143,8 @@ void get_block_needs(wqm_ctx &ctx, exec_ctx &exec_ctx, Block* block)
       aco_ptr<Instruction>& instr = block->instructions[i];
 
       WQMState needs = needs_exact(instr) ? Exact : Unspecified;
-      bool propagate_wqm = instr->opcode == aco_opcode::p_wqm;
+      bool propagate_wqm = instr->opcode == aco_opcode::p_wqm ||
+                           instr->opcode == aco_opcode::p_as_uniform;
       bool preserve_wqm = instr->opcode == aco_opcode::p_discard_if;
       bool pred_by_exec = needs_exec_mask(instr.get());
       for (const Definition& definition : instr->definitions) {
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index c2c4ed868d4..2422d46177f 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -736,8 +736,10 @@ Temp convert_pointer_to_64_bit(isel_context *ctx, Temp ptr, bool non_uniform=fal
    if (ptr.size() == 2)
       return ptr;
    Builder bld(ctx->program, ctx->block);
-   if (ptr.type() == RegType::vgpr && !non_uniform)
+   if (ptr.type() == RegType::vgpr && !non_uniform) {
       ptr = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), ptr);
+      ptr = emit_wqm(bld, ptr);
+   }
    return bld.pseudo(aco_opcode::p_create_vector, bld.def(RegClass(ptr.type(), 2)),
                      ptr, Operand((unsigned)ctx->options->address32_hi));
 }
@@ -5696,8 +5698,10 @@ Temp get_sampler_desc(isel_context *ctx, nir_deref_instr *deref_instr,
             constant_index += array_size * const_value->u32;
          } else {
             Temp indirect = get_ssa_temp(ctx, deref_instr->arr.index.ssa);
-            if (indirect.type() == RegType::vgpr)
+            if (indirect.type() == RegType::vgpr) {
                indirect = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), indirect);
+               indirect = emit_wqm(bld, indirect);
+            }
 
             if (array_size != 1)
                indirect = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(array_size), indirect);