Mesa (master): aco: limit register usage for large work groups
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Fri Jan 10 12:28:39 UTC 2020
Module: Mesa
Branch: master
Commit: b5c9688516d00b00184e6fa6868de826916f5fc3
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=b5c9688516d00b00184e6fa6868de826916f5fc3
Author: Rhys Perry <pendingchaos02 at gmail.com>
Date: Wed Dec 18 16:18:35 2019 +0000
aco: limit register usage for large work groups
Signed-off-by: Rhys Perry <pendingchaos02 at gmail.com>
Reviewed-by: Daniel Schürmann <daniel at schuermann.dev>
---
.../compiler/aco_instruction_selection_setup.cpp | 5 ++++
src/amd/compiler/aco_ir.h | 2 ++
src/amd/compiler/aco_live_var_analysis.cpp | 32 +++++++++++++++++-----
src/amd/compiler/aco_scheduler.cpp | 1 +
4 files changed, 33 insertions(+), 7 deletions(-)
diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp
index d55d4105d39..8a461e7e929 100644
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -819,9 +819,14 @@ setup_isel_context(Program* program,
program->sgpr_alloc_granule = 7;
program->sgpr_limit = 104;
}
+
/* TODO: we don't have to allocate VCC if we don't need it */
program->needs_vcc = true;
+ calc_min_waves(program);
+ program->vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);
+ program->sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
+
isel_context ctx = {};
ctx.program = program;
ctx.args = args;
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 82309ba8e38..04647981b68 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -1155,6 +1155,7 @@ public:
Temp private_segment_buffer;
Temp scratch_offset;
+ uint16_t min_waves = 0;
uint16_t lds_alloc_granule;
uint32_t lds_limit; /* in bytes */
uint16_t vgpr_limit;
@@ -1216,6 +1217,7 @@ void select_program(Program *program,
void lower_wqm(Program* program, live& live_vars,
const struct radv_nir_compiler_options *options);
void lower_bool_phis(Program* program);
+void calc_min_waves(Program* program);
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
std::vector<uint16_t> dead_code_analysis(Program *program);
diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp
index 2841ba208f6..44a3ea53430 100644
--- a/src/amd/compiler/aco_live_var_analysis.cpp
+++ b/src/amd/compiler/aco_live_var_analysis.cpp
@@ -228,6 +228,16 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
assert(block->index != 0 || new_demand == RegisterDemand());
}
+
+unsigned calc_waves_per_workgroup(Program *program)
+{
+ unsigned workgroup_size = program->wave_size;
+ if (program->stage == compute_cs) {
+ unsigned* bsize = program->info->cs.block_size;
+ workgroup_size = bsize[0] * bsize[1] * bsize[2];
+ }
+ return align(workgroup_size, program->wave_size) / program->wave_size;
+}
} /* end namespace */
uint16_t get_extra_sgprs(Program *program)
@@ -284,6 +294,20 @@ uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves)
return std::min(vgprs, program->vgpr_limit);
}
+void calc_min_waves(Program* program)
+{
+ unsigned waves_per_workgroup = calc_waves_per_workgroup(program);
+ /* currently min_waves is in wave64 waves */
+ if (program->wave_size == 32)
+ waves_per_workgroup = DIV_ROUND_UP(waves_per_workgroup, 2);
+
+ unsigned simd_per_cu = 4; /* TODO: different on Navi */
+ bool wgp = program->chip_class >= GFX10; /* assume WGP is used on Navi */
+ unsigned simd_per_cu_wgp = wgp ? simd_per_cu * 2 : simd_per_cu;
+
+ program->min_waves = DIV_ROUND_UP(waves_per_workgroup, simd_per_cu_wgp);
+}
+
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
{
/* TODO: max_waves_per_simd, simd_per_cu and the number of physical vgprs for Navi */
@@ -304,13 +328,7 @@ void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
program->max_waves = max_waves_per_simd;
/* adjust max_waves for workgroup and LDS limits */
- unsigned workgroup_size = program->wave_size;
- if (program->stage == compute_cs) {
- unsigned* bsize = program->info->cs.block_size;
- workgroup_size = bsize[0] * bsize[1] * bsize[2];
- }
- unsigned waves_per_workgroup = align(workgroup_size, program->wave_size) / program->wave_size;
-
+ unsigned waves_per_workgroup = calc_waves_per_workgroup(program);
unsigned workgroups_per_cu_wgp = max_waves_per_simd * simd_per_cu_wgp / waves_per_workgroup;
if (program->config->lds_size) {
unsigned lds = program->config->lds_size * program->lds_alloc_granule;
diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp
index 665fcb2db6f..0a8d5af8c78 100644
--- a/src/amd/compiler/aco_scheduler.cpp
+++ b/src/amd/compiler/aco_scheduler.cpp
@@ -932,6 +932,7 @@ void schedule_program(Program *program, live& live_vars)
ctx.num_waves = 7;
else
ctx.num_waves = 8;
+ ctx.num_waves = std::max<uint16_t>(ctx.num_waves, program->min_waves);
assert(ctx.num_waves > 0 && ctx.num_waves <= program->num_waves);
ctx.max_registers = { int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves) - 2),
More information about the mesa-commit
mailing list