Mesa (master): aco: only break SMEM clauses if XNACK is enabled (mostly APUs)

Wed Apr 1 18:31:24 UTC 2020

Module: Mesa
Branch: master
Commit: 2f424c83e072f6a21d15af1064f6e744e801fbfa
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=2f424c83e072f6a21d15af1064f6e744e801fbfa

Author: Samuel Pitoiset <samuel.pitoiset at gmail.com>
Date:   Fri Mar 27 15:16:39 2020 +0100

aco: only break SMEM clauses if XNACK is enabled (mostly APUs)

According to LLVM, it seems only required for APUs like RAVEN, but
we still ensure that SMEM stores are in their own clause.

pipeline-db (VEGA10):
Totals from affected shaders:
SGPRS: 1775364 -> 1775364 (0.00 %)
VGPRS: 1287176 -> 1287176 (0.00 %)
Spilled SGPRs: 725 -> 725 (0.00 %)
Spilled VGPRs: 0 -> 0 (0.00 %)
Code Size: 65386620 -> 65107460 (-0.43 %) bytes
Max Waves: 287099 -> 287099 (0.00 %)

pipeline-db (POLARIS10):
Totals from affected shaders:
SGPRS: 1797743 -> 1797743 (0.00 %)
VGPRS: 1271108 -> 1271108 (0.00 %)
Spilled SGPRs: 730 -> 730 (0.00 %)
Spilled VGPRs: 0 -> 0 (0.00 %)
Code Size: 64046244 -> 63782324 (-0.41 %) bytes
Max Waves: 254875 -> 254875 (0.00 %)

This only affects GFX6-GFX9 chips because the compiler uses a
different pass for GFX10.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
Reviewed-by: Daniel Schürmann <daniel at schuermann.dev>
Tested-by: Marge Bot <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4349>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4349>

---

 src/amd/compiler/aco_insert_NOPs.cpp               | 77 ++++++++++++++--------
 .../compiler/aco_instruction_selection_setup.cpp   | 20 ++++++
 src/amd/compiler/aco_ir.h                          |  3 +-
 src/amd/compiler/aco_live_var_analysis.cpp         |  6 +-
 4 files changed, 75 insertions(+), 31 deletions(-)

diff --git a/src/amd/compiler/aco_insert_NOPs.cpp b/src/amd/compiler/aco_insert_NOPs.cpp
index 75dbe852174..bb703d7481e 100644
--- a/src/amd/compiler/aco_insert_NOPs.cpp
+++ b/src/amd/compiler/aco_insert_NOPs.cpp
@@ -274,6 +274,41 @@ bool test_bitset_range(BITSET_WORD *words, unsigned start, unsigned size) {
    }
 }
 
+/* A SMEM clause is any group of consecutive SMEM instructions. The
+ * instructions in this group may return out of order and/or may be replayed.
+ *
+ * To fix this potential hazard correctly, we have to make sure that when a
+ * clause has more than one instruction, no instruction in the clause writes
+ * to a register that is read by another instruction in the clause (including
+ * itself). In this case, we have to break the SMEM clause by inserting non
+ * SMEM instructions.
+ *
+ * SMEM clauses are only present on GFX8+, and only matter when XNACK is set.
+ */
+void handle_smem_clause_hazards(Program *program, NOP_ctx_gfx6 &ctx,
+                                aco_ptr<Instruction>& instr, int *NOPs)
+{
+   /* break off from previous SMEM clause if needed */
+   if (!*NOPs & (ctx.smem_clause || ctx.smem_write)) {
+      /* Don't allow clauses with store instructions since the clause's
+       * instructions may use the same address. */
+      if (ctx.smem_write || instr->definitions.empty() || instr_info.is_atomic[(unsigned)instr->opcode]) {
+         *NOPs = 1;
+      } else if (program->xnack_enabled) {
+         for (Operand op : instr->operands) {
+            if (!op.isConstant() && test_bitset_range(ctx.smem_clause_write, op.physReg(), op.size())) {
+               *NOPs = 1;
+               break;
+            }
+         }
+
+         Definition def = instr->definitions[0];
+         if (!*NOPs && test_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size()))
+            *NOPs = 1;
+      }
+   }
+}
+
 /* TODO: we don't handle accessing VCC using the actual SGPR instead of using the alias */
 void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &ctx,
                              aco_ptr<Instruction>& instr, std::vector<aco_ptr<Instruction>>& new_instructions)
@@ -300,24 +335,7 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c
          }
       }
 
-      /* break off from prevous SMEM clause if needed */
-      if (!NOPs & (ctx.smem_clause || ctx.smem_write)) {
-         /* Don't allow clauses with store instructions since the clause's
-          * instructions may use the same address. */
-         if (ctx.smem_write || instr->definitions.empty() || instr_info.is_atomic[(unsigned)instr->opcode]) {
-            NOPs = 1;
-         } else {
-            for (Operand op : instr->operands) {
-               if (!op.isConstant() && test_bitset_range(ctx.smem_clause_write, op.physReg(), op.size())) {
-                  NOPs = 1;
-                  break;
-               }
-            }
-            Definition def = instr->definitions[0];
-            if (!NOPs && test_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size()))
-               NOPs = 1;
-         }
-      }
+      handle_smem_clause_hazards(program, ctx, instr, &NOPs);
    } else if (instr->isSALU()) {
       if (instr->opcode == aco_opcode::s_setreg_b32 || instr->opcode == aco_opcode::s_setreg_imm32_b32 ||
           instr->opcode == aco_opcode::s_getreg_b32) {
@@ -414,8 +432,11 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c
    if ((ctx.smem_clause || ctx.smem_write) && (NOPs || instr->format != Format::SMEM)) {
       ctx.smem_clause = false;
       ctx.smem_write = false;
-      BITSET_ZERO(ctx.smem_clause_read_write);
-      BITSET_ZERO(ctx.smem_clause_write);
+
+      if (program->xnack_enabled) {
+         BITSET_ZERO(ctx.smem_clause_read_write);
+         BITSET_ZERO(ctx.smem_clause_write);
+      }
    }
 
    if (instr->format == Format::SMEM) {
@@ -424,15 +445,17 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c
       } else {
          ctx.smem_clause = true;
 
-         for (Operand op : instr->operands) {
-            if (!op.isConstant()) {
-               set_bitset_range(ctx.smem_clause_read_write, op.physReg(), op.size());
+         if (program->xnack_enabled) {
+            for (Operand op : instr->operands) {
+               if (!op.isConstant()) {
+                  set_bitset_range(ctx.smem_clause_read_write, op.physReg(), op.size());
+               }
             }
-         }
 
-         Definition def = instr->definitions[0];
-         set_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size());
-         set_bitset_range(ctx.smem_clause_write, def.physReg(), def.size());
+            Definition def = instr->definitions[0];
+            set_bitset_range(ctx.smem_clause_read_write, def.physReg(), def.size());
+            set_bitset_range(ctx.smem_clause_write, def.physReg(), def.size());
+         }
       }
    } else if (instr->isVALU()) {
       for (Definition def : instr->definitions) {
diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp
index d365f79698a..462cd48d960 100644
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -1150,6 +1150,24 @@ setup_nir(isel_context *ctx, nir_shader *nir)
    nir_index_ssa_defs(func);
 }
 
+void
+setup_xnack(Program *program)
+{
+   switch (program->family) {
+   /* GFX8 APUs */
+   case CHIP_CARRIZO:
+   case CHIP_STONEY:
+   /* GFX9 APUS */
+   case CHIP_RAVEN:
+   case CHIP_RAVEN2:
+   case CHIP_RENOIR:
+      program->xnack_enabled = true;
+      break;
+   default:
+      break;
+   }
+}
+
 isel_context
 setup_isel_context(Program* program,
                    unsigned shader_count,
@@ -1308,6 +1326,8 @@ setup_isel_context(Program* program,
    ctx.block->loop_nest_depth = 0;
    ctx.block->kind = block_kind_top_level;
 
+   setup_xnack(program);
+
    return ctx;
 }
 
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 73a1d394eff..ace84db1018 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -1252,8 +1252,9 @@ public:
    uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
    unsigned workgroup_size; /* if known; otherwise UINT_MAX */
 
+   bool xnack_enabled = false;
+
    bool needs_vcc = false;
-   bool needs_xnack_mask = false;
    bool needs_flat_scr = false;
 
    uint32_t allocateId()
diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp
index e223d6d5f84..106c5eb3166 100644
--- a/src/amd/compiler/aco_live_var_analysis.cpp
+++ b/src/amd/compiler/aco_live_var_analysis.cpp
@@ -302,19 +302,19 @@ uint16_t get_extra_sgprs(Program *program)
 {
    if (program->chip_class >= GFX10) {
       assert(!program->needs_flat_scr);
-      assert(!program->needs_xnack_mask);
+      assert(!program->xnack_enabled);
       return 2;
    } else if (program->chip_class >= GFX8) {
       if (program->needs_flat_scr)
          return 6;
-      else if (program->needs_xnack_mask)
+      else if (program->xnack_enabled)
          return 4;
       else if (program->needs_vcc)
          return 2;
       else
          return 0;
    } else {
-      assert(!program->needs_xnack_mask);
+      assert(!program->xnack_enabled);
       if (program->needs_flat_scr)
          return 4;
       else if (program->needs_vcc)