Mesa (master): aco: change gpr_alloc_granule to full alignment

Fri Feb 12 19:10:51 UTC 2021

Module: Mesa
Branch: master
Commit: eaf681724e992209f1ca97e2c37837170b8087aa
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=eaf681724e992209f1ca97e2c37837170b8087aa

Author: Daniel Schürmann <daniel at schuermann.dev>
Date:   Tue Feb  2 17:33:09 2021 +0100

aco: change gpr_alloc_granule to full alignment

This also switches the alloc_granule of Tonga and Iceland
to 96, so that the calculation is consistent.
Also changes the granularity for RDNA to 16 to keep
better stats with the upcoming patch.

Reviewed-by: Rhys Perry <pendingchaos02 at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8921>

---

 src/amd/compiler/aco_ir.cpp                | 19 ++++++++++---------
 src/amd/compiler/aco_ir.h                  |  4 ++--
 src/amd/compiler/aco_live_var_analysis.cpp | 18 +++++++++---------
 3 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp
index 90d0f5e9ab7..a156d109b31 100644
--- a/src/amd/compiler/aco_ir.cpp
+++ b/src/amd/compiler/aco_ir.cpp
@@ -100,27 +100,28 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info,
 
    program->vgpr_limit = 256;
    program->physical_vgprs = 256;
-   program->vgpr_alloc_granule = 3;
+   program->vgpr_alloc_granule = 4;
 
    if (chip_class >= GFX10) {
       program->physical_sgprs = 2560; /* doesn't matter as long as it's at least 128 * 20 */
       program->physical_vgprs = 512;
-      program->sgpr_alloc_granule = 127;
+      program->sgpr_alloc_granule = 128;
       program->sgpr_limit = 106;
       if (chip_class >= GFX10_3)
-         program->vgpr_alloc_granule = program->wave_size == 32 ? 15 : 7;
+         program->vgpr_alloc_granule = program->wave_size == 32 ? 16 : 8;
       else
-         program->vgpr_alloc_granule = program->wave_size == 32 ? 7 : 3;
+         program->vgpr_alloc_granule = program->wave_size == 32 ? 8 : 4;
    } else if (program->chip_class >= GFX8) {
       program->physical_sgprs = 800;
-      program->sgpr_alloc_granule = 15;
-      if (family == CHIP_TONGA || family == CHIP_ICELAND)
+      program->sgpr_alloc_granule = 16;
+      program->sgpr_limit = 102;
+      if (family == CHIP_TONGA || family == CHIP_ICELAND) {
+         program->sgpr_alloc_granule = 96;
          program->sgpr_limit = 94; /* workaround hardware bug */
-      else
-         program->sgpr_limit = 102;
+      }
    } else {
       program->physical_sgprs = 512;
-      program->sgpr_alloc_granule = 7;
+      program->sgpr_alloc_granule = 8;
       program->sgpr_limit = 104;
    }
 
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 8f0e2f13f98..d2fdf380117 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -1825,8 +1825,8 @@ public:
    uint16_t sgpr_limit;
    uint16_t physical_sgprs;
    uint16_t physical_vgprs;
-   uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
-   uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
+   uint16_t sgpr_alloc_granule;
+   uint16_t vgpr_alloc_granule; /* must be power of two */
    unsigned workgroup_size; /* if known; otherwise UINT_MAX */
 
    bool xnack_enabled = false;
diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp
index 61a1c626c5c..5f6c5b00a29 100644
--- a/src/amd/compiler/aco_live_var_analysis.cpp
+++ b/src/amd/compiler/aco_live_var_analysis.cpp
@@ -273,30 +273,30 @@ uint16_t get_extra_sgprs(Program *program)
 
 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs)
 {
-   assert(addressable_sgprs <= program->sgpr_limit);
    uint16_t sgprs = addressable_sgprs + get_extra_sgprs(program);
-   uint16_t granule = program->sgpr_alloc_granule + 1;
-   return align(std::max(sgprs, granule), granule);
+   uint16_t granule = program->sgpr_alloc_granule;
+   return ALIGN_NPOT(std::max(sgprs, granule), granule);
 }
 
 uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs)
 {
    assert(addressable_vgprs <= program->vgpr_limit);
-   uint16_t granule = program->vgpr_alloc_granule + 1;
+   uint16_t granule = program->vgpr_alloc_granule;
    return align(std::max(addressable_vgprs, granule), granule);
 }
 
 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves)
 {
-    uint16_t sgprs = program->physical_sgprs / max_waves & ~program->sgpr_alloc_granule;
-    sgprs -= get_extra_sgprs(program);
-    return std::min(sgprs, program->sgpr_limit);
+   uint16_t sgprs = (program->physical_sgprs / max_waves) - program->sgpr_alloc_granule + 1;
+   sgprs = get_sgpr_alloc(program, sgprs);
+   sgprs -= get_extra_sgprs(program);
+   return std::min(sgprs, program->sgpr_limit);
 }
 
 uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves)
 {
-    uint16_t vgprs = program->physical_vgprs / max_waves & ~program->vgpr_alloc_granule;
-    return std::min(vgprs, program->vgpr_limit);
+   uint16_t vgprs = program->physical_vgprs / max_waves & ~(program->vgpr_alloc_granule - 1);
+   return std::min(vgprs, program->vgpr_limit);
 }
 
 void calc_min_waves(Program* program)