Mesa (main): aco: remove 'max_waves' and use 'num_waves' to adjust for LDS and workgroup size

Fri Apr 29 15:56:51 UTC 2022

Module: Mesa
Branch: main
Commit: 6220046ad198ed8914c7a04ed3d5dd3cf6efc3c2
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=6220046ad198ed8914c7a04ed3d5dd3cf6efc3c2

Author: Daniel Schürmann <daniel at schuermann.dev>
Date:   Tue Apr 19 16:58:26 2022 +0200

aco: remove 'max_waves' and use 'num_waves' to adjust for LDS and workgroup size

Totals from 21 (0.02% of 134913) affected shaders: (GFX10.3)
VGPRs: 1024 -> 1176 (+14.84%)
CodeSize: 127824 -> 127664 (-0.13%); split: -0.17%, +0.04%
MaxWaves: 416 -> 378 (-9.13%)
Instrs: 22521 -> 22502 (-0.08%); split: -0.17%, +0.09%
Latency: 146386 -> 143154 (-2.21%); split: -2.21%, +0.00%
InvThroughput: 28379 -> 28944 (+1.99%); split: -0.23%, +2.22%
VClause: 575 -> 579 (+0.70%); split: -0.87%, +1.57%
SClause: 692 -> 645 (-6.79%)
Copies: 780 -> 747 (-4.23%); split: -4.74%, +0.51%

Reviewed-by: Rhys Perry <pendingchaos02 at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16039>

---

 src/amd/compiler/aco_ir.h                  |  3 +--
 src/amd/compiler/aco_live_var_analysis.cpp | 16 +++++++---------
 2 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 327acbf0e6a..f74a2e93c07 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -2052,8 +2052,6 @@ public:
    std::vector<Block> blocks;
    std::vector<RegClass> temp_rc = {s1};
    RegisterDemand max_reg_demand = RegisterDemand();
-   uint16_t num_waves = 0;
-   uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
    ac_shader_config* config;
    const struct radv_shader_info* info;
    enum chip_class chip_class;
@@ -2069,6 +2067,7 @@ public:
    Temp private_segment_buffer;
    Temp scratch_offset;
 
+   uint16_t num_waves = 0;
    uint16_t min_waves = 0;
    unsigned workgroup_size; /* if known; otherwise UINT_MAX */
    bool wgp_mode;
diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp
index 58a4023d787..18c9053db94 100644
--- a/src/amd/compiler/aco_live_var_analysis.cpp
+++ b/src/amd/compiler/aco_live_var_analysis.cpp
@@ -368,7 +368,6 @@ calc_min_waves(Program* program)
 void
 update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
 {
-   unsigned max_waves_per_simd = program->dev.max_wave64_per_simd * (64 / program->wave_size);
    unsigned simd_per_cu_wgp = program->dev.simd_per_cu * (program->wgp_mode ? 2 : 1);
    unsigned lds_limit = program->wgp_mode ? program->dev.lds_limit * 2 : program->dev.lds_limit;
    unsigned max_workgroups_per_cu_wgp = program->wgp_mode ? 32 : 16;
@@ -387,11 +386,12 @@ update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
          get_vgpr_alloc(program, new_demand.vgpr) + program->config->num_shared_vgprs / 2;
       program->num_waves =
          std::min<uint16_t>(program->num_waves, program->dev.physical_vgprs / vgpr_demand);
-      program->max_waves = max_waves_per_simd;
+      uint16_t max_waves = program->dev.max_wave64_per_simd * (64 / program->wave_size);
+      program->num_waves = std::min(program->num_waves, max_waves);
 
-      /* adjust max_waves for workgroup and LDS limits */
+      /* adjust num_waves for workgroup and LDS limits */
       unsigned waves_per_workgroup = calc_waves_per_workgroup(program);
-      unsigned workgroups_per_cu_wgp = max_waves_per_simd * simd_per_cu_wgp / waves_per_workgroup;
+      unsigned workgroups_per_cu_wgp = program->num_waves * simd_per_cu_wgp / waves_per_workgroup;
 
       unsigned lds_per_workgroup =
          align(program->config->lds_size * program->dev.lds_encoding_granule,
@@ -416,12 +416,10 @@ update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
       /* in cases like waves_per_workgroup=3 or lds=65536 and
        * waves_per_workgroup=1, we want the maximum possible number of waves per
        * SIMD and not the minimum. so DIV_ROUND_UP is used */
-      program->max_waves = std::min<uint16_t>(
-         program->max_waves,
-         DIV_ROUND_UP(workgroups_per_cu_wgp * waves_per_workgroup, simd_per_cu_wgp));
+      program->num_waves =
+         DIV_ROUND_UP(workgroups_per_cu_wgp * waves_per_workgroup, simd_per_cu_wgp);
 
-      /* incorporate max_waves and calculate max_reg_demand */
-      program->num_waves = std::min<uint16_t>(program->num_waves, program->max_waves);
+      /* calculate max_reg_demand */
       program->max_reg_demand.vgpr = get_addr_vgpr_from_waves(program, program->num_waves);
       program->max_reg_demand.sgpr = get_addr_sgpr_from_waves(program, program->num_waves);
    }