Mesa (staging/19.2): radeonsi/gfx10: fix wave occupancy computations

Tue Sep 10 16:54:20 UTC 2019

Module: Mesa
Branch: staging/19.2
Commit: 481d82b65b20c7c769a5dff6d9a553c6869db978
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=481d82b65b20c7c769a5dff6d9a553c6869db978

Author: Marek Olšák <marek.olsak at amd.com>
Date:   Wed Aug 28 17:38:50 2019 -0400

radeonsi/gfx10: fix wave occupancy computations

Cc: 19.2 <mesa-stable at lists.freedesktop.org>
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer at amd.com>
(cherry picked from commit d95afd8b9e7f9b3880813203292257bf0ed7babf)

---

 src/amd/common/ac_gpu_info.h             | 22 +++++++++++++++---
 src/amd/vulkan/radv_device.c             |  2 +-
 src/amd/vulkan/radv_shader.c             |  7 +++---
 src/gallium/drivers/radeonsi/si_shader.c | 39 ++++++++++++++++++++------------
 4 files changed, 49 insertions(+), 21 deletions(-)

diff --git a/src/amd/common/ac_gpu_info.h b/src/amd/common/ac_gpu_info.h
index 8418a62e387..a3a187e2245 100644
--- a/src/amd/common/ac_gpu_info.h
+++ b/src/amd/common/ac_gpu_info.h
@@ -173,7 +173,7 @@ unsigned ac_get_compute_resource_limits(struct radeon_info *info,
 					unsigned max_waves_per_sh,
 					unsigned threadgroups_per_cu);
 
-static inline unsigned ac_get_max_simd_waves(enum radeon_family family)
+static inline unsigned ac_get_max_wave64_per_simd(enum radeon_family family)
 {
 
 	switch (family) {
@@ -188,10 +188,26 @@ static inline unsigned ac_get_max_simd_waves(enum radeon_family family)
 	}
 }
 
+static inline unsigned ac_get_num_physical_vgprs(enum chip_class chip_class,
+						 unsigned wave_size)
+{
+	/* The number is per SIMD. */
+	if (chip_class >= GFX10)
+		return wave_size == 32 ? 1024 : 512;
+	else
+		return 256;
+}
+
 static inline uint32_t
-ac_get_num_physical_sgprs(enum chip_class chip_class)
+ac_get_num_physical_sgprs(const struct radeon_info *info)
 {
-	return chip_class >= GFX8 ? 800 : 512;
+	/* The number is per SIMD. There is enough SGPRs for the maximum number
+	 * of Wave32, which is double the number for Wave64.
+	 */
+	if (info->chip_class >= GFX10)
+		return 128 * ac_get_max_wave64_per_simd(info->family) * 2;
+
+	return info->chip_class >= GFX8 ? 800 : 512;
 }
 
 #ifdef __cplusplus
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index c02b5c5afb9..3a10117f68c 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -1307,7 +1307,7 @@ void radv_GetPhysicalDeviceProperties2(
 
 			/* SGPR. */
 			properties->sgprsPerSimd =
-				ac_get_num_physical_sgprs(pdevice->rad_info.chip_class);
+				ac_get_num_physical_sgprs(&pdevice->rad_info);
 			properties->minSgprAllocation =
 				pdevice->rad_info.chip_class >= GFX8 ? 16 : 8;
 			properties->maxSgprAllocation =
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index 41bc73a882f..4a845624c4d 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -1343,7 +1343,7 @@ radv_get_max_waves(struct radv_device *device,
 	unsigned max_simd_waves;
 	unsigned lds_per_wave = 0;
 
-	max_simd_waves = ac_get_max_simd_waves(device->physical_device->rad_info.family);
+	max_simd_waves = ac_get_max_wave64_per_simd(device->physical_device->rad_info.family);
 
 	if (stage == MESA_SHADER_FRAGMENT) {
 		lds_per_wave = conf->lds_size * lds_increment +
@@ -1359,7 +1359,8 @@ radv_get_max_waves(struct radv_device *device,
 	if (conf->num_sgprs)
 		max_simd_waves =
 			MIN2(max_simd_waves,
-			     ac_get_num_physical_sgprs(chip_class) / conf->num_sgprs);
+			     ac_get_num_physical_sgprs(&device->physical_device->rad_info) /
+			     conf->num_sgprs);
 
 	if (conf->num_vgprs)
 		max_simd_waves =
@@ -1456,7 +1457,7 @@ radv_GetShaderInfoAMD(VkDevice _device,
 			VkShaderStatisticsInfoAMD statistics = {};
 			statistics.shaderStageMask = shaderStage;
 			statistics.numPhysicalVgprs = RADV_NUM_PHYSICAL_VGPRS;
-			statistics.numPhysicalSgprs = ac_get_num_physical_sgprs(device->physical_device->rad_info.chip_class);
+			statistics.numPhysicalSgprs = ac_get_num_physical_sgprs(&device->physical_device->rad_info);
 			statistics.numAvailableSgprs = statistics.numPhysicalSgprs;
 
 			if (stage == MESA_SHADER_COMPUTE) {
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index f4b714af79f..78b27686aad 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -5430,7 +5430,7 @@ static void si_calculate_max_simd_waves(struct si_shader *shader)
 	unsigned lds_per_wave = 0;
 	unsigned max_simd_waves;
 
-	max_simd_waves = ac_get_max_simd_waves(sscreen->info.family);
+	max_simd_waves = ac_get_max_wave64_per_simd(sscreen->info.family);
 
 	/* Compute LDS usage for PS. */
 	switch (shader->selector->type) {
@@ -5464,16 +5464,25 @@ static void si_calculate_max_simd_waves(struct si_shader *shader)
 	if (conf->num_sgprs) {
 		max_simd_waves =
 			MIN2(max_simd_waves,
-			     ac_get_num_physical_sgprs(sscreen->info.chip_class) / conf->num_sgprs);
+			     ac_get_num_physical_sgprs(&sscreen->info) / conf->num_sgprs);
 	}
 
-	if (conf->num_vgprs)
-		max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);
+	if (conf->num_vgprs) {
+		/* Always print wave limits as Wave64, so that we can compare
+		 * Wave32 and Wave64 with shader-db fairly. */
+		unsigned max_vgprs = ac_get_num_physical_vgprs(sscreen->info.chip_class, 64);
+		max_simd_waves = MIN2(max_simd_waves, max_vgprs / conf->num_vgprs);
+	}
 
-	/* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above
-	 * 16KB makes some SIMDs unoccupied). */
+	/* LDS is 64KB per CU (4 SIMDs) on GFX6-9, which is 16KB per SIMD (usage above
+	 * 16KB makes some SIMDs unoccupied).
+	 *
+	 * LDS is 128KB in WGP mode and 64KB in CU mode. Assume the WGP mode is used.
+	 */
+	unsigned max_lds_size = sscreen->info.chip_class >= GFX10 ? 128*1024 : 64*1024;
+	unsigned max_lds_per_simd = max_lds_size / 4;
 	if (lds_per_wave)
-		max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
+		max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave);
 
 	shader->info.max_simd_waves = max_simd_waves;
 }
@@ -7178,15 +7187,17 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
 	 */
 	if (sel->type == PIPE_SHADER_COMPUTE) {
 		unsigned wave_size = sscreen->compute_wave_size;
-		unsigned max_vgprs = 256;
-		unsigned max_sgprs = sscreen->info.chip_class >= GFX8 ? 800 : 512;
+		unsigned max_vgprs = ac_get_num_physical_vgprs(sscreen->info.chip_class,
+							       wave_size);
+		unsigned max_sgprs = ac_get_num_physical_sgprs(&sscreen->info);
 		unsigned max_sgprs_per_wave = 128;
-		unsigned max_block_threads = si_get_max_workgroup_size(shader);
-		unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size);
-		unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);
+		unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */
+		unsigned threads_per_tg = si_get_max_workgroup_size(shader);
+		unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, wave_size);
+		unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg);
 
-		max_vgprs = max_vgprs / min_waves_per_simd;
-		max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);
+		max_vgprs = max_vgprs / waves_per_simd;
+		max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave);
 
 		if (shader->config.num_sgprs > max_sgprs ||
 		    shader->config.num_vgprs > max_vgprs) {