Mesa (master): aco: take LDS into account when calculating num_waves

Wed Oct 23 18:28:03 UTC 2019

Module: Mesa
Branch: master
Commit: fc04a2fc3120ad77eee7431ed9d7d1300183647a
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=fc04a2fc3120ad77eee7431ed9d7d1300183647a

Author: Rhys Perry <pendingchaos02 at gmail.com>
Date:   Fri Oct 18 19:06:10 2019 +0100

aco: take LDS into account when calculating num_waves

pipeline-db (Vega):
SGPRS: 344 -> 344 (0.00 %)
VGPRS: 424 -> 524 (23.58 %)
Spilled SGPRs: 84 -> 80 (-4.76 %)
Spilled VGPRs: 0 -> 0 (0.00 %)
Private memory VGPRs: 0 -> 0 (0.00 %)
Scratch size: 0 -> 0 (0.00 %) dwords per thread
Code Size: 52812 -> 52484 (-0.62 %) bytes
LDS: 135 -> 135 (0.00 %) blocks
Max Waves: 56 -> 53 (-5.36 %)

v2: consider WGP, rework to be clearer and apply the
    "maximum 16 workgroups per CU" limit properly
v2: use "SIMD" instead of "EU"
v2: fix spiller by introducing "Program::max_waves"
v2: rename "lds_size" to "lds_limit"
v3: make max_waves actually independant of register usage
v3: fix issue where max_waves was way too high
v3: use DIV_ROUND_UP(a, b) instead of max(a / b, 1)
v3: rename "workgroups_per_cu" to "workgroups_per_cu_wgp"
v4: fix typo from "workgroups_per_cu" rename

Signed-off-by: Rhys Perry <pendingchaos02 at gmail.com>
Reviewed-by: Daniel Schürmann <daniel at schuermann.dev> (v3)

---

 .../compiler/aco_instruction_selection_setup.cpp   |  9 +++---
 src/amd/compiler/aco_ir.h                          |  4 +++
 src/amd/compiler/aco_live_var_analysis.cpp         | 34 ++++++++++++++++++++--
 src/amd/compiler/aco_spill.cpp                     |  2 +-
 4 files changed, 42 insertions(+), 7 deletions(-)

diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp
index d7a193552ba..0104fd36f49 100644
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -1192,10 +1192,8 @@ setup_variables(isel_context *ctx, nir_shader *nir)
       break;
    }
    case MESA_SHADER_COMPUTE: {
-      unsigned lds_allocation_size_unit = 4 * 64;
-      if (ctx->program->chip_class >= GFX7)
-         lds_allocation_size_unit = 4 * 128;
-      ctx->program->config->lds_size = (nir->info.cs.shared_size + lds_allocation_size_unit - 1) / lds_allocation_size_unit;
+      ctx->program->config->lds_size = (nir->info.cs.shared_size + ctx->program->lds_alloc_granule - 1) /
+                                       ctx->program->lds_alloc_granule;
       break;
    }
    case MESA_SHADER_VERTEX: {
@@ -1255,6 +1253,9 @@ setup_isel_context(Program* program,
    program->family = options->family;
    program->wave_size = options->wave_size;
 
+   program->lds_alloc_granule = options->chip_class >= GFX7 ? 512 : 256;
+   program->lds_limit = options->chip_class >= GFX7 ? 65536 : 32768;
+
    if (options->chip_class >= GFX10) {
       program->physical_sgprs = 2560; /* doesn't matter as long as it's at least 128 * 20 */
       program->sgpr_alloc_granule = 127;
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 3606b33402e..739ef869e6a 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -1063,6 +1063,7 @@ public:
    std::vector<Block> blocks;
    RegisterDemand max_reg_demand = RegisterDemand();
    uint16_t num_waves = 0;
+   uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
    ac_shader_config* config;
    struct radv_shader_info *info;
    enum chip_class chip_class;
@@ -1075,6 +1076,9 @@ public:
 
    std::vector<uint8_t> constant_data;
 
+   uint16_t lds_alloc_granule;
+   uint32_t lds_limit; /* in bytes */
+
    uint16_t physical_sgprs;
    uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
    uint16_t sgpr_limit;
diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp
index 3fe413256e7..4d689db7070 100644
--- a/src/amd/compiler/aco_live_var_analysis.cpp
+++ b/src/amd/compiler/aco_live_var_analysis.cpp
@@ -234,7 +234,14 @@ uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves)
 
 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
 {
-   // TODO: also take shared mem into account
+   /* TODO: max_waves_per_simd, simd_per_cu and the number of physical vgprs for Navi */
+   unsigned max_waves_per_simd = 10;
+   unsigned simd_per_cu = 4;
+
+   bool wgp = program->chip_class >= GFX10; /* assume WGP is used on Navi */
+   unsigned simd_per_cu_wgp = wgp ? simd_per_cu * 2 : simd_per_cu;
+   unsigned lds_limit = wgp ? program->lds_limit * 2 : program->lds_limit;
+
    const int16_t vgpr_alloc = std::max<int16_t>(4, (new_demand.vgpr + 3) & ~3);
    /* this won't compile, register pressure reduction necessary */
    if (new_demand.vgpr > 256 || new_demand.sgpr > program->sgpr_limit) {
@@ -243,8 +250,31 @@ void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
    } else {
       program->num_waves = program->physical_sgprs / get_sgpr_alloc(program, new_demand.sgpr);
       program->num_waves = std::min<uint16_t>(program->num_waves, 256 / vgpr_alloc);
-      program->num_waves = std::min<uint16_t>(program->num_waves, 10);
+      program->max_waves = max_waves_per_simd;
+
+      /* adjust max_waves for workgroup and LDS limits */
+      unsigned workgroup_size = program->wave_size;
+      if (program->stage == compute_cs) {
+         unsigned* bsize = program->info->cs.block_size;
+         workgroup_size = bsize[0] * bsize[1] * bsize[2];
+      }
+      unsigned waves_per_workgroup = align(workgroup_size, program->wave_size) / program->wave_size;
+
+      unsigned workgroups_per_cu_wgp = max_waves_per_simd * simd_per_cu_wgp / waves_per_workgroup;
+      if (program->config->lds_size) {
+         unsigned lds = program->config->lds_size * program->lds_alloc_granule;
+         workgroups_per_cu_wgp = std::min(workgroups_per_cu_wgp, lds_limit / lds);
+      }
+      if (waves_per_workgroup > 1 && program->chip_class < GFX10)
+         workgroups_per_cu_wgp = std::min(workgroups_per_cu_wgp, 16u); /* TODO: is this a SI-only limit? what about Navi? */
+
+      /* in cases like waves_per_workgroup=3 or lds=65536 and
+       * waves_per_workgroup=1, we want the maximum possible number of waves per
+       * SIMD and not the minimum. so DIV_ROUND_UP is used */
+      program->max_waves = std::min<uint16_t>(program->max_waves, DIV_ROUND_UP(workgroups_per_cu_wgp * waves_per_workgroup, simd_per_cu_wgp));
 
+      /* incorporate max_waves and calculate max_reg_demand */
+      program->num_waves = std::min<uint16_t>(program->num_waves, program->max_waves);
       program->max_reg_demand.vgpr = int16_t((256 / program->num_waves) & ~3);
       program->max_reg_demand.sgpr = get_addr_sgpr_from_waves(program, program->num_waves);
    }
diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp
index 56167e36d6d..fefa8a8221b 100644
--- a/src/amd/compiler/aco_spill.cpp
+++ b/src/amd/compiler/aco_spill.cpp
@@ -1580,7 +1580,7 @@ void spill(Program* program, live& live_vars, const struct radv_nir_compiler_opt
    int spills_to_vgpr = (max_reg_demand.sgpr - program->sgpr_limit + 63) / 64;
 
    /* test if it possible to increase occupancy with little spilling */
-   for (unsigned num_waves_next = 2; num_waves_next <= 8; num_waves_next++) {
+   for (unsigned num_waves_next = 2; num_waves_next <= program->max_waves; num_waves_next++) {
       RegisterDemand target_pressure_next = {int16_t((256 / num_waves_next) & ~3),
                                              int16_t(get_addr_sgpr_from_waves(program, num_waves_next))};