Mesa (main): radv: Use ac_compute_late_alloc in radv_pipeline.

Wed Aug 4 15:59:40 UTC 2021

Module: Mesa
Branch: main
Commit: d911259c3a47fd35a7afdaa45182ba87e9b501b2
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=d911259c3a47fd35a7afdaa45182ba87e9b501b2

Author: Timur Kristóf <timur.kristof at gmail.com>
Date:   Thu Jul 15 11:48:25 2021 +0200

radv: Use ac_compute_late_alloc in radv_pipeline.

This aligns RADV with RadeonSI in how it handles late alloc,
making it easier for us to deal with deadlocks and such.

Also move setting the RSRC3 registers for VS, GS and NGG
into radv_pipeline.

Signed-off-by: Timur Kristóf <timur.kristof at gmail.com>
Reviewed-by: Marek Olšák <marek.olsak at amd.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11905>

---

 src/amd/vulkan/radv_pipeline.c | 57 ++++++++++++++++++++++++++
 src/amd/vulkan/si_cmd_buffer.c | 91 ------------------------------------------
 2 files changed, 57 insertions(+), 91 deletions(-)

diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 0c1ddc1561b..bf78a8cceb9 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -1983,6 +1983,20 @@ radv_get_num_input_vertices(nir_shader **nir)
    return 3;
 }
 
+static void
+gfx10_emit_ge_pc_alloc(struct radeon_cmdbuf *cs, enum chip_class chip_class, uint32_t oversub_pc_lines)
+{
+   if (chip_class == GFX10) {
+      /* SQ_NON_EVENT must be emitted before GE_PC_ALLOC is written. */
+      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+      radeon_emit(cs, EVENT_TYPE(V_028A90_SQ_NON_EVENT) | EVENT_INDEX(0));
+   }
+
+   radeon_set_uconfig_reg(
+      cs, R_030980_GE_PC_ALLOC,
+      S_030980_OVERSUB_EN(oversub_pc_lines > 0) | S_030980_NUM_PC_LINES(oversub_pc_lines - 1));
+}
+
 static void
 gfx10_get_ngg_info(const struct radv_pipeline_key *key, struct radv_pipeline *pipeline,
                    nir_shader **nir, struct radv_shader_info *infos, struct gfx10_ngg_info *ngg)
@@ -4379,6 +4393,20 @@ radv_pipeline_generate_hw_vs(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf
 
    if (pipeline->device->physical_device->rad_info.chip_class <= GFX8)
       radeon_set_context_reg(ctx_cs, R_028AB4_VGT_REUSE_OFF, outinfo->writes_viewport_index);
+
+   unsigned late_alloc_wave64, cu_mask;
+   ac_compute_late_alloc(&pipeline->device->physical_device->rad_info, false, false,
+                         shader->config.scratch_bytes_per_wave > 0, &late_alloc_wave64, &cu_mask);
+
+   if (pipeline->device->physical_device->rad_info.chip_class >= GFX7) {
+      radeon_set_sh_reg_idx(pipeline->device->physical_device, cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS, 3,
+                            S_00B118_CU_EN(cu_mask) | S_00B118_WAVE_LIMIT(0x3F));
+      radeon_set_sh_reg(cs, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64));
+   }
+   if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
+      uint32_t oversub_pc_lines = late_alloc_wave64 ? pipeline->device->physical_device->rad_info.pc_lines / 4 : 0;
+      gfx10_emit_ge_pc_alloc(cs, pipeline->device->physical_device->rad_info.chip_class, oversub_pc_lines);
+   }
 }
 
 static void
@@ -4547,6 +4575,23 @@ radv_pipeline_generate_hw_ngg(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf
    }
 
    radeon_set_uconfig_reg(ctx_cs, R_03096C_GE_CNTL, ge_cntl);
+
+   unsigned late_alloc_wave64, cu_mask;
+   ac_compute_late_alloc(&pipeline->device->physical_device->rad_info, true, shader->info.has_ngg_culling,
+                         shader->config.scratch_bytes_per_wave > 0, &late_alloc_wave64, &cu_mask);
+
+   radeon_set_sh_reg_idx(
+      pipeline->device->physical_device, cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 3,
+      S_00B21C_CU_EN(cu_mask) | S_00B21C_WAVE_LIMIT(0x3F));
+   radeon_set_sh_reg_idx(
+      pipeline->device->physical_device, cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3,
+      S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64));
+
+   uint32_t oversub_pc_lines = late_alloc_wave64 ? pipeline->device->physical_device->rad_info.pc_lines / 4 : 0;
+   if (shader->info.has_ngg_culling)
+      oversub_pc_lines *= 3;
+
+   gfx10_emit_ge_pc_alloc(cs, pipeline->device->physical_device->rad_info.chip_class, oversub_pc_lines);
 }
 
 static void
@@ -4782,6 +4827,18 @@ radv_pipeline_generate_hw_gs(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf
       radeon_emit(cs, gs->config.rsrc2);
    }
 
+   if (pipeline->device->physical_device->rad_info.chip_class >= GFX7) {
+      radeon_set_sh_reg_idx(
+         pipeline->device->physical_device, cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 3,
+         S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F));
+
+      if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
+         radeon_set_sh_reg_idx(
+            pipeline->device->physical_device, cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3,
+            S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0));
+      }
+   }
+
    radv_pipeline_generate_hw_vs(ctx_cs, cs, pipeline, pipeline->gs_copy_shader);
 }
 
diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c
index 9d8825b57b7..a4471d87910 100644
--- a/src/amd/vulkan/si_cmd_buffer.c
+++ b/src/amd/vulkan/si_cmd_buffer.c
@@ -332,86 +332,6 @@ si_emit_graphics(struct radv_device *device, struct radeon_cmdbuf *cs)
                                 S_028A44_ES_VERTS_PER_SUBGRP(64) | S_028A44_GS_PRIMS_PER_SUBGRP(4));
       }
 
-      /* Compute LATE_ALLOC_VS.LIMIT. */
-      unsigned num_cu_per_sh = physical_device->rad_info.min_good_cu_per_sa;
-      unsigned late_alloc_wave64 = 0; /* The limit is per SA. */
-      unsigned late_alloc_wave64_gs = 0;
-      unsigned cu_mask_vs = 0xffff;
-      unsigned cu_mask_gs = 0xffff;
-
-      if (physical_device->rad_info.chip_class >= GFX10) {
-         /* For Wave32, the hw will launch twice the number of late
-          * alloc waves, so 1 == 2x wave32.
-          */
-         if (!physical_device->rad_info.use_late_alloc) {
-            late_alloc_wave64 = 0;
-         } else {
-            late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
-
-            /* Gfx10: CU2 & CU3 must be disabled to
-             * prevent a hw deadlock.  Others: CU1 must be
-             * disabled to prevent a hw deadlock.
-             *
-             * The deadlock is caused by late alloc, which
-             * usually increases performance.
-             */
-            cu_mask_vs &= physical_device->rad_info.chip_class == GFX10 ? ~BITFIELD_RANGE(2, 2)
-                                                                        : ~BITFIELD_RANGE(1, 1);
-
-            if (physical_device->use_ngg) {
-               cu_mask_gs = cu_mask_vs;
-            }
-         }
-
-         late_alloc_wave64_gs = late_alloc_wave64;
-
-         /* Don't use late alloc for NGG on Navi14 due to a hw
-          * bug. If NGG is never used, enable all CUs.
-          */
-         if (!physical_device->use_ngg || physical_device->rad_info.family == CHIP_NAVI14) {
-            late_alloc_wave64_gs = 0;
-            cu_mask_gs = 0xffff;
-         }
-
-         /* Limit LATE_ALLOC_GS for prevent a hang (hw bug). */
-         if (physical_device->rad_info.chip_class == GFX10)
-            late_alloc_wave64_gs = MIN2(late_alloc_wave64_gs, 64);
-      } else {
-         if (!physical_device->rad_info.use_late_alloc) {
-            late_alloc_wave64 = 0;
-         } else if (num_cu_per_sh <= 4) {
-            /* Too few available compute units per SA.
-             * Disallowing VS to run on one CU could hurt
-             * us more than late VS allocation would help.
-             *
-             * 2 is the highest safe number that allows us
-             * to keep all CUs enabled.
-             */
-            late_alloc_wave64 = 2;
-         } else {
-            /* This is a good initial value, allowing 1
-             * late_alloc wave per SIMD on num_cu - 2.
-             */
-            late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
-         }
-
-         if (late_alloc_wave64 > 2)
-            cu_mask_vs = 0xfffe; /* 1 CU disabled */
-      }
-
-      radeon_set_sh_reg_idx(physical_device, cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS, 3,
-                            S_00B118_CU_EN(cu_mask_vs) | S_00B118_WAVE_LIMIT(0x3F));
-      radeon_set_sh_reg(cs, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64));
-
-      radeon_set_sh_reg_idx(physical_device, cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 3,
-                            S_00B21C_CU_EN(cu_mask_gs) | S_00B21C_WAVE_LIMIT(0x3F));
-
-      if (physical_device->rad_info.chip_class >= GFX10) {
-         radeon_set_sh_reg_idx(
-            physical_device, cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3,
-            S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64_gs));
-      }
-
       radeon_set_sh_reg_idx(physical_device, cs, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, 3,
                             S_00B01C_CU_EN(cu_mask_ps) | S_00B01C_WAVE_LIMIT(0x3F));
    }
@@ -487,17 +407,6 @@ si_emit_graphics(struct radv_device *device, struct radeon_cmdbuf *cs)
             cs, R_028848_PA_CL_VRS_CNTL,
             S_028848_SAMPLE_ITER_COMBINER_MODE(V_028848_VRS_COMB_MODE_OVERRIDE));
       }
-
-      if (physical_device->rad_info.chip_class == GFX10) {
-         /* SQ_NON_EVENT must be emitted before GE_PC_ALLOC is written. */
-         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
-         radeon_emit(cs, EVENT_TYPE(V_028A90_SQ_NON_EVENT) | EVENT_INDEX(0));
-      }
-
-      /* When culling is enabled, this will be overwritten accordingly */
-      radeon_set_uconfig_reg(cs, R_030980_GE_PC_ALLOC,
-                             S_030980_OVERSUB_EN(physical_device->rad_info.use_late_alloc) |
-                             S_030980_NUM_PC_LINES(physical_device->rad_info.pc_lines / 4 - 1));
    }
 
    if (physical_device->rad_info.chip_class >= GFX9) {