Mesa (main): ac,radeonsi: move late alloc computation into common code and shader states

Thu Jul 8 18:55:48 UTC 2021

Module: Mesa
Branch: main
Commit: b2397c394d82c69456c1e8b2194dbec52f7b0a71
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=b2397c394d82c69456c1e8b2194dbec52f7b0a71

Author: Marek Olšák <marek.olsak at amd.com>
Date:   Wed Jul  7 16:37:30 2021 -0400

ac,radeonsi: move late alloc computation into common code and shader states

This also fixes a rare deadlock when a scratch buffer is used.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer at amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11754>

---

 src/amd/common/ac_gpu_info.h                    |  2 +-
 src/amd/common/ac_shader_util.c                 | 67 +++++++++++++++++++++++++
 src/amd/common/ac_shader_util.h                 |  3 ++
 src/gallium/drivers/radeonsi/si_state.c         | 57 ---------------------
 src/gallium/drivers/radeonsi/si_state_shaders.c | 53 +++++++++----------
 5 files changed, 98 insertions(+), 84 deletions(-)

diff --git a/src/amd/common/ac_gpu_info.h b/src/amd/common/ac_gpu_info.h
index 68a4bf182da..e487ff040c1 100644
--- a/src/amd/common/ac_gpu_info.h
+++ b/src/amd/common/ac_gpu_info.h
@@ -206,7 +206,7 @@ struct radeon_info {
    uint32_t min_wave64_vgpr_alloc;
    uint32_t max_vgpr_alloc;
    uint32_t wave64_vgpr_alloc_granularity;
-   bool use_late_alloc; /* VS and GS: late pos/param allocation */
+   bool use_late_alloc; /* deprecated: remove this after radv switches to ac_compute_late_alloc */
 
    /* Render backends (color + depth blocks). */
    uint32_t r300_num_gb_pipes;
diff --git a/src/amd/common/ac_shader_util.c b/src/amd/common/ac_shader_util.c
index ad6d026a0f3..645d0d36178 100644
--- a/src/amd/common/ac_shader_util.c
+++ b/src/amd/common/ac_shader_util.c
@@ -22,6 +22,7 @@
  */
 
 #include "ac_shader_util.h"
+#include "ac_gpu_info.h"
 
 #include "sid.h"
 
@@ -444,3 +445,69 @@ void ac_choose_spi_color_formats(unsigned format, unsigned swap, unsigned ntype,
    formats->blend = blend;
    formats->blend_alpha = blend_alpha;
 }
+
+void ac_compute_late_alloc(const struct radeon_info *info, bool ngg, bool ngg_culling,
+                           bool uses_scratch, unsigned *late_alloc_wave64, unsigned *cu_mask)
+{
+   *late_alloc_wave64 = 0; /* The limit is per SA. */
+   *cu_mask = 0xffff;
+
+   /* CU masking can decrease performance and cause a hang with <= 2 CUs per SA. */
+   if (info->min_good_cu_per_sa <= 2)
+      return;
+
+   /* If scratch is used with late alloc, the GPU could deadlock if PS uses scratch too. A more
+    * complicated computation is needed to enable late alloc with scratch (see PAL).
+    */
+   if (uses_scratch)
+      return;
+
+   /* Late alloc is not used for NGG on Navi14 due to a hw bug. */
+   if (ngg && info->family == CHIP_NAVI14)
+      return;
+
+   if (info->chip_class >= GFX10) {
+      /* For Wave32, the hw will launch twice the number of late alloc waves, so 1 == 2x wave32.
+       * These limits are estimated because they are all safe but they vary in performance.
+       */
+      if (ngg_culling)
+         *late_alloc_wave64 = info->min_good_cu_per_sa * 10;
+      else
+         *late_alloc_wave64 = info->min_good_cu_per_sa * 4;
+
+      /* Limit LATE_ALLOC_GS to prevent a hang (hw bug) on gfx10. */
+      if (info->chip_class == GFX10 && ngg)
+         *late_alloc_wave64 = MIN2(*late_alloc_wave64, 64);
+
+      /* Gfx10: CU2 & CU3 must be disabled to prevent a hw deadlock.
+       * Others: CU1 must be disabled to prevent a hw deadlock.
+       *
+       * The deadlock is caused by late alloc, which usually increases performance.
+       */
+      *cu_mask &= info->chip_class == GFX10 ? ~BITFIELD_RANGE(2, 2) :
+                                              ~BITFIELD_RANGE(1, 1);
+   } else {
+      if (info->min_good_cu_per_sa <= 4) {
+         /* Too few available compute units per SA. Disallowing VS to run on one CU could hurt us
+          * more than late VS allocation would help.
+          *
+          * 2 is the highest safe number that allows us to keep all CUs enabled.
+          */
+         *late_alloc_wave64 = 2;
+      } else {
+         /* This is a good initial value, allowing 1 late_alloc wave per SIMD on num_cu - 2.
+          */
+         *late_alloc_wave64 = (info->min_good_cu_per_sa - 2) * 4;
+      }
+
+      /* VS can't execute on one CU if the limit is > 2. */
+      if (*late_alloc_wave64 > 2)
+         *cu_mask = 0xfffe; /* 1 CU disabled */
+   }
+
+   /* Max number that fits into the register field. */
+   if (ngg) /* GS */
+      *late_alloc_wave64 = MIN2(*late_alloc_wave64, G_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(~0u));
+   else /* VS */
+      *late_alloc_wave64 = MIN2(*late_alloc_wave64, G_00B11C_LIMIT(~0u));
+}
diff --git a/src/amd/common/ac_shader_util.h b/src/amd/common/ac_shader_util.h
index 5f52eb9b58a..f9020125f47 100644
--- a/src/amd/common/ac_shader_util.h
+++ b/src/amd/common/ac_shader_util.h
@@ -101,6 +101,9 @@ void ac_choose_spi_color_formats(unsigned format, unsigned swap, unsigned ntype,
                                  bool is_depth, bool use_rbplus,
                                  struct ac_spi_color_formats *formats);
 
+void ac_compute_late_alloc(const struct radeon_info *info, bool ngg, bool ngg_culling,
+                           bool uses_scratch, unsigned *late_alloc_wave64, unsigned *cu_mask);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 84485af080d..bbac84819d7 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -5291,63 +5291,6 @@ void si_init_cs_preamble_state(struct si_context *sctx, bool uses_reg_shadowing)
       cu_mask_ps = u_bit_consecutive(0, sscreen->info.min_good_cu_per_sa);
 
    if (sctx->chip_class >= GFX7) {
-      /* Compute LATE_ALLOC_VS.LIMIT. */
-      unsigned num_cu_per_sh = sscreen->info.min_good_cu_per_sa;
-      unsigned late_alloc_wave64 = 0; /* The limit is per SA. */
-      unsigned cu_mask_vs = 0xffff;
-      unsigned cu_mask_gs = 0xffff;
-
-      if (sctx->chip_class >= GFX10) {
-         /* For Wave32, the hw will launch twice the number of late
-          * alloc waves, so 1 == 2x wave32.
-          */
-         if (!sscreen->info.use_late_alloc) {
-            late_alloc_wave64 = 0;
-         } else {
-            late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
-
-            /* Gfx10: CU2 & CU3 must be disabled to prevent a hw deadlock.
-             * Others: CU1 must be disabled to prevent a hw deadlock.
-             *
-             * The deadlock is caused by late alloc, which usually increases
-             * performance.
-             */
-            cu_mask_vs &= sctx->chip_class == GFX10 ? ~BITFIELD_RANGE(2, 2) :
-                                                      ~BITFIELD_RANGE(1, 1);
-
-            /* Late alloc is not used for NGG on Navi14 due to a hw bug. */
-            if (sscreen->use_ngg && sctx->family != CHIP_NAVI14)
-               cu_mask_gs = cu_mask_vs;
-         }
-      } else {
-         if (!sscreen->info.use_late_alloc) {
-            late_alloc_wave64 = 0;
-         } else if (num_cu_per_sh <= 4) {
-            /* Too few available compute units per SA. Disallowing
-             * VS to run on one CU could hurt us more than late VS
-             * allocation would help.
-             *
-             * 2 is the highest safe number that allows us to keep
-             * all CUs enabled.
-             */
-            late_alloc_wave64 = 2;
-         } else {
-            /* This is a good initial value, allowing 1 late_alloc
-             * wave per SIMD on num_cu - 2.
-             */
-            late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
-         }
-
-         /* VS can't execute on one CU if the limit is > 2. */
-         if (late_alloc_wave64 > 2)
-            cu_mask_vs = 0xfffe; /* 1 CU disabled */
-      }
-
-      si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
-                     S_00B118_CU_EN(cu_mask_vs) | S_00B118_WAVE_LIMIT(0x3F));
-      si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64));
-      si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
-                     S_00B21C_CU_EN(cu_mask_gs) | S_00B21C_WAVE_LIMIT(0x3F));
       si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS,
                      S_00B01C_CU_EN(cu_mask_ps) | S_00B01C_WAVE_LIMIT(0x3F));
    }
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 087622c8a20..5ef033ed6a5 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -936,6 +936,8 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
 
       si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, rsrc1);
       si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2);
+      si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
+                     S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F));
 
       if (sscreen->info.chip_class >= GFX10) {
          si_pm4_set_reg(pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
@@ -955,6 +957,10 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
 
       polaris_set_vgt_vertex_reuse(sscreen, shader->key.part.gs.es, NULL, pm4);
    } else {
+      if (sscreen->info.chip_class >= GFX7) {
+         si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
+                        S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F));
+      }
       si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8);
       si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, S_00B224_MEM_BASE(va >> 40));
 
@@ -1193,6 +1199,11 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
       gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
 
    unsigned wave_size = si_get_shader_wave_size(shader);
+   unsigned late_alloc_wave64, cu_mask;
+
+   ac_compute_late_alloc(&sscreen->info, true, shader->key.opt.ngg_culling,
+                         shader->config.scratch_bytes_per_wave > 0,
+                         &late_alloc_wave64, &cu_mask);
 
    si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
    si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, S_00B324_MEM_BASE(va >> 40));
@@ -1212,29 +1223,8 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
                      S_00B22C_USER_SGPR_MSB_GFX10(num_user_sgprs >> 5) |
                      S_00B22C_OC_LDS_EN(es_stage == MESA_SHADER_TESS_EVAL) |
                      S_00B22C_LDS_SIZE(shader->config.lds_size));
-
-   /* Determine LATE_ALLOC_GS. */
-   unsigned num_cu_per_sh = sscreen->info.min_good_cu_per_sa;
-   unsigned late_alloc_wave64; /* The limit is per SA. */
-
-   /* For Wave32, the hw will launch twice the number of late
-    * alloc waves, so 1 == 2x wave32.
-    *
-    * Don't use late alloc for NGG on Navi14 due to a hw bug.
-    */
-   if (sscreen->info.family == CHIP_NAVI14 || !sscreen->info.use_late_alloc)
-      late_alloc_wave64 = 0;
-   else if (shader->key.opt.ngg_culling)
-      late_alloc_wave64 = num_cu_per_sh * 10;
-   else
-      late_alloc_wave64 = num_cu_per_sh * 4;
-
-   /* Limit LATE_ALLOC_GS for prevent a hang (hw bug). */
-   if (sscreen->info.chip_class == GFX10)
-      late_alloc_wave64 = MIN2(late_alloc_wave64, 64);
-
-   /* Max number that fits into the register field. */
-   late_alloc_wave64 = MIN2(late_alloc_wave64, 127);
+   si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
+                  S_00B21C_CU_EN(cu_mask) | S_00B21C_WAVE_LIMIT(0x3F));
 
    si_pm4_set_reg(
       pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
@@ -1307,8 +1297,8 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
          oversub_pc_factor = 0.5;
    }
 
-   unsigned oversub_pc_lines = sscreen->info.pc_lines * oversub_pc_factor;
-   shader->ctx_reg.ngg.ge_pc_alloc = S_030980_OVERSUB_EN(sscreen->info.use_late_alloc) |
+   unsigned oversub_pc_lines = late_alloc_wave64 ? sscreen->info.pc_lines * oversub_pc_factor : 0;
+   shader->ctx_reg.ngg.ge_pc_alloc = S_030980_OVERSUB_EN(oversub_pc_lines > 0) |
                                      S_030980_NUM_PC_LINES(oversub_pc_lines - 1);
 
    if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST ||
@@ -1495,12 +1485,23 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
                                                                   : V_02870C_SPI_SHADER_NONE) |
       S_02870C_POS3_EXPORT_FORMAT(shader->info.nr_pos_exports > 3 ? V_02870C_SPI_SHADER_4COMP
                                                                   : V_02870C_SPI_SHADER_NONE);
-   shader->ctx_reg.vs.ge_pc_alloc = S_030980_OVERSUB_EN(sscreen->info.use_late_alloc) |
+   unsigned late_alloc_wave64, cu_mask;
+   ac_compute_late_alloc(&sscreen->info, false, false,
+                         shader->config.scratch_bytes_per_wave > 0,
+                         &late_alloc_wave64, &cu_mask);
+
+   shader->ctx_reg.vs.ge_pc_alloc = S_030980_OVERSUB_EN(late_alloc_wave64 > 0) |
                                     S_030980_NUM_PC_LINES(sscreen->info.pc_lines / 4 - 1);
    shader->pa_cl_vs_out_cntl = si_get_vs_out_cntl(shader->selector, shader, false);
 
    oc_lds_en = shader->selector->info.stage == MESA_SHADER_TESS_EVAL ? 1 : 0;
 
+   if (sscreen->info.chip_class >= GFX7) {
+      si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
+                     S_00B118_CU_EN(cu_mask) | S_00B118_WAVE_LIMIT(0x3F));
+      si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64));
+   }
+
    si_pm4_set_reg(pm4, R_00B120_SPI_SHADER_PGM_LO_VS, va >> 8);
    si_pm4_set_reg(pm4, R_00B124_SPI_SHADER_PGM_HI_VS, S_00B124_MEM_BASE(va >> 40));