[Mesa-dev] [PATCH 01/10] radeonsi: take LDS into account for compute shader occupancy stats

Marek Olšák maraeo at gmail.com
Fri Dec 2 20:39:20 UTC 2016


From: Marek Olšák <marek.olsak at amd.com>

---
 src/gallium/drivers/radeonsi/si_shader.c | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 44a4dd2..145de9f 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -5943,61 +5943,71 @@ static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary
 		fprintf(file, "Shader %s binary:\n", name);
 		for (i = 0; i < binary->code_size; i += 4) {
 			fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i,
 				binary->code[i + 3], binary->code[i + 2],
 				binary->code[i + 1], binary->code[i]);
 		}
 	}
 }
 
 static void si_shader_dump_stats(struct si_screen *sscreen,
-			         struct si_shader_config *conf,
-				 unsigned num_inputs,
-				 unsigned code_size,
+				 struct si_shader *shader,
 			         struct pipe_debug_callback *debug,
 			         unsigned processor,
 				 FILE *file)
 {
+	struct si_shader_config *conf = &shader->config;
+	unsigned num_inputs = shader->selector ? shader->selector->info.num_inputs : 0;
+	unsigned code_size = si_get_shader_binary_size(shader);
 	unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
 	unsigned lds_per_wave = 0;
 	unsigned max_simd_waves = 10;
 
 	/* Compute LDS usage for PS. */
-	if (processor == PIPE_SHADER_FRAGMENT) {
+	switch (processor) {
+	case PIPE_SHADER_FRAGMENT:
 		/* The minimum usage per wave is (num_inputs * 48). The maximum
 		 * usage is (num_inputs * 48 * 16).
 		 * We can get anything in between and it varies between waves.
 		 *
 		 * The 48 bytes per input for a single primitive is equal to
 		 * 4 bytes/component * 4 components/input * 3 points.
 		 *
 		 * Other stages don't know the size at compile time or don't
 		 * allocate LDS per wave, but instead they do it per thread group.
 		 */
 		lds_per_wave = conf->lds_size * lds_increment +
 			       align(num_inputs * 48, lds_increment);
+		break;
+	case PIPE_SHADER_COMPUTE:
+		if (shader->selector) {
+			unsigned max_workgroup_size =
+				si_get_max_workgroup_size(shader);
+			lds_per_wave = (conf->lds_size * lds_increment) /
+				       DIV_ROUND_UP(max_workgroup_size, 64);
+		}
+		break;
 	}
 
 	/* Compute the per-SIMD wave counts. */
 	if (conf->num_sgprs) {
 		if (sscreen->b.chip_class >= VI)
 			max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
 		else
 			max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
 	}
 
 	if (conf->num_vgprs)
 		max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);
 
-	/* LDS is 64KB per CU (4 SIMDs), divided into 16KB blocks per SIMD
-	 * that PS can use.
-	 */
+	/* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above
+	 * 16KB makes some SIMDs unoccupied). */
 	if (lds_per_wave)
 		max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
 
 	if (file != stderr ||
 	    r600_can_dump_shader(&sscreen->b, processor)) {
 		if (processor == PIPE_SHADER_FRAGMENT) {
 			fprintf(file, "*** SHADER CONFIG ***\n"
 				"SPI_PS_INPUT_ADDR = 0x%04x\n"
 				"SPI_PS_INPUT_ENA  = 0x%04x\n",
 				conf->spi_ps_input_addr, conf->spi_ps_input_ena);
@@ -6087,24 +6097,21 @@ void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
 						   debug, "prolog", file);
 
 		si_shader_dump_disassembly(&shader->binary, debug, "main", file);
 
 		if (shader->epilog)
 			si_shader_dump_disassembly(&shader->epilog->binary,
 						   debug, "epilog", file);
 		fprintf(file, "\n");
 	}
 
-	si_shader_dump_stats(sscreen, &shader->config,
-			     shader->selector ? shader->selector->info.num_inputs : 0,
-			     si_get_shader_binary_size(shader), debug, processor,
-			     file);
+	si_shader_dump_stats(sscreen, shader, debug, processor, file);
 }
 
 int si_compile_llvm(struct si_screen *sscreen,
 		    struct radeon_shader_binary *binary,
 		    struct si_shader_config *conf,
 		    LLVMTargetMachineRef tm,
 		    LLVMModuleRef mod,
 		    struct pipe_debug_callback *debug,
 		    unsigned processor,
 		    const char *name)
-- 
2.7.4



More information about the mesa-dev mailing list