Mesa (main): aco: split num_waves adjustment into separate function
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Fri Apr 29 15:56:51 UTC 2022
Module: Mesa
Branch: main
Commit: 8d8c59b4cd962012342855cf91997fa968ad5890
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=8d8c59b4cd962012342855cf91997fa968ad5890
Author: Daniel Schürmann <daniel at schuermann.dev>
Date: Tue Apr 19 11:32:56 2022 +0200
aco: split num_waves adjustment into separate function
Reviewed-by: Rhys Perry <pendingchaos02 at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16039>
---
src/amd/compiler/aco_ir.h | 3 ++
src/amd/compiler/aco_live_var_analysis.cpp | 74 ++++++++++++++++--------------
2 files changed, 42 insertions(+), 35 deletions(-)
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index f74a2e93c07..3016a753a13 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -2245,6 +2245,9 @@ RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& in
/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
uint16_t get_extra_sgprs(Program* program);
+/* adjust num_waves for workgroup size and LDS limits */
+uint16_t max_suitable_waves(Program* program, uint16_t waves);
+
/* get the number of sgprs/vgprs that must be allocated in order to address a given number of sgprs/vgprs */
uint16_t get_sgpr_alloc(Program* program, uint16_t addressable_sgprs);
uint16_t get_vgpr_alloc(Program* program, uint16_t addressable_vgprs);
diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp
index 18c9053db94..d579736cb85 100644
--- a/src/amd/compiler/aco_live_var_analysis.cpp
+++ b/src/amd/compiler/aco_live_var_analysis.cpp
@@ -365,13 +365,46 @@ calc_min_waves(Program* program)
program->min_waves = DIV_ROUND_UP(waves_per_workgroup, simd_per_cu_wgp);
}
-void
-update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
+uint16_t
+max_suitable_waves(Program* program, uint16_t waves)
{
- unsigned simd_per_cu_wgp = program->dev.simd_per_cu * (program->wgp_mode ? 2 : 1);
+ unsigned num_simd = program->dev.simd_per_cu * (program->wgp_mode ? 2 : 1);
+ unsigned waves_per_workgroup = calc_waves_per_workgroup(program);
+ unsigned num_workgroups = waves * num_simd / waves_per_workgroup;
+
+ /* Adjust #workgroups for LDS */
+ unsigned lds_per_workgroup = align(program->config->lds_size * program->dev.lds_encoding_granule,
+ program->dev.lds_alloc_granule);
+
+ if (program->stage == fragment_fs) {
+ /* PS inputs are moved from PC (parameter cache) to LDS before PS waves are launched.
+ * Each PS input occupies 3x vec4 of LDS space. See Figure 10.3 in GCN3 ISA manual.
+ * These limit occupancy the same way as other stages' LDS usage does.
+ */
+ unsigned lds_bytes_per_interp = 3 * 16;
+ unsigned lds_param_bytes = lds_bytes_per_interp * program->info->ps.num_interp;
+ lds_per_workgroup += align(lds_param_bytes, program->dev.lds_alloc_granule);
+ }
unsigned lds_limit = program->wgp_mode ? program->dev.lds_limit * 2 : program->dev.lds_limit;
- unsigned max_workgroups_per_cu_wgp = program->wgp_mode ? 32 : 16;
+ if (lds_per_workgroup)
+ num_workgroups = std::min(num_workgroups, lds_limit / lds_per_workgroup);
+
+ /* Hardware limitation */
+ if (waves_per_workgroup > 1)
+ num_workgroups = std::min(num_workgroups, program->wgp_mode ? 32u : 16u);
+
+ /* Adjust #waves for workgroup multiples:
+ * In cases like waves_per_workgroup=3 or lds=65536 and
+ * waves_per_workgroup=1, we want the maximum possible number of waves per
+ * SIMD and not the minimum, so DIV_ROUND_UP is used
+ */
+ unsigned workgroup_waves = num_workgroups * waves_per_workgroup;
+ return DIV_ROUND_UP(workgroup_waves, num_simd);
+}
+void
+update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
+{
assert(program->min_waves >= 1);
uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
uint16_t vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);
@@ -389,37 +422,8 @@ update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
uint16_t max_waves = program->dev.max_wave64_per_simd * (64 / program->wave_size);
program->num_waves = std::min(program->num_waves, max_waves);
- /* adjust num_waves for workgroup and LDS limits */
- unsigned waves_per_workgroup = calc_waves_per_workgroup(program);
- unsigned workgroups_per_cu_wgp = program->num_waves * simd_per_cu_wgp / waves_per_workgroup;
-
- unsigned lds_per_workgroup =
- align(program->config->lds_size * program->dev.lds_encoding_granule,
- program->dev.lds_alloc_granule);
-
- if (program->stage == fragment_fs) {
- /* PS inputs are moved from PC (parameter cache) to LDS before PS waves are launched.
- * Each PS input occupies 3x vec4 of LDS space. See Figure 10.3 in GCN3 ISA manual.
- * These limit occupancy the same way as other stages' LDS usage does.
- */
- unsigned lds_bytes_per_interp = 3 * 16;
- unsigned lds_param_bytes = lds_bytes_per_interp * program->info->ps.num_interp;
- lds_per_workgroup += align(lds_param_bytes, program->dev.lds_alloc_granule);
- }
-
- if (lds_per_workgroup)
- workgroups_per_cu_wgp = std::min(workgroups_per_cu_wgp, lds_limit / lds_per_workgroup);
-
- if (waves_per_workgroup > 1)
- workgroups_per_cu_wgp = std::min(workgroups_per_cu_wgp, max_workgroups_per_cu_wgp);
-
- /* in cases like waves_per_workgroup=3 or lds=65536 and
- * waves_per_workgroup=1, we want the maximum possible number of waves per
- * SIMD and not the minimum. so DIV_ROUND_UP is used */
- program->num_waves =
- DIV_ROUND_UP(workgroups_per_cu_wgp * waves_per_workgroup, simd_per_cu_wgp);
-
- /* calculate max_reg_demand */
+ /* Adjust for LDS and workgroup multiples and calculate max_reg_demand */
+ program->num_waves = max_suitable_waves(program, program->num_waves);
program->max_reg_demand.vgpr = get_addr_vgpr_from_waves(program, program->num_waves);
program->max_reg_demand.sgpr = get_addr_sgpr_from_waves(program, program->num_waves);
}
More information about the mesa-commit
mailing list