[Mesa-dev] [PATCH 2/2] radeonsi: write shader asm annotated with wave info into GPU hang reports

Marek Olšák maraeo at gmail.com
Sat Feb 4 22:44:29 UTC 2017


From: Marek Olšák <marek.olsak at amd.com>

Note that the disassembly is written twice - first the unmodified compiler
output and then the wave-annotated output only if there are waves executing
the shader.

Sample output from a real GPU hang most likely caused by image_sample:

The number of active waves = 28

Pixel Shader - annotated disassembly:
    s_mov_b64 s[6:7], exec                                        ; BE86017E [PC=0x10f3e3800, off=0, size=4]
    s_wqm_b64 exec, exec                                          ; BEFE077E [PC=0x10f3e3804, off=4, size=4]
...
    image_sample v[7:9], v[0:1], s[12:19], s[20:23] dmask:0x7     ; F0800700 00A30700 [PC=0x10f3e3a94, off=660, size=8]
    s_buffer_load_dword s20, s[0:3], 0x50                         ; C0220500 00000050 [PC=0x10f3e3a9c, off=668, size=8]
    s_load_dwordx4 s[24:27], s[4:5], 0x170                        ; C00A0602 00000170 [PC=0x10f3e3aa4, off=676, size=8]
    s_load_dwordx8 s[12:19], s[4:5], 0x140                        ; C00E0302 00000140 [PC=0x10f3e3aac, off=684, size=8]
    s_buffer_load_dword s11, s[0:3], 0x5c                         ; C02202C0 0000005C [PC=0x10f3e3ab4, off=692, size=8]
    s_buffer_load_dword s21, s[0:3], 0x54                         ; C0220540 00000054 [PC=0x10f3e3abc, off=700, size=8]
    s_buffer_load_dword s22, s[0:3], 0x58                         ; C0220580 00000058 [PC=0x10f3e3ac4, off=708, size=8]
    s_waitcnt vmcnt(0)                                            ; BF8C0F70 [PC=0x10f3e3acc, off=716, size=4]
          ^ SE0 SH0 CU1 SIMD1 WAVE0  EXEC=aaaaaaa555aaaaaa  INST32=BF8C0F70
          ^ SE0 SH0 CU1 SIMD2 WAVE0  EXEC=aaaa85555555552a  INST32=BF8C0F70
          ^ SE0 SH0 CU1 SIMD3 WAVE0  EXEC=000000000000000a  INST32=BF8C0F70
          ^ SE0 SH0 CU6 SIMD1 WAVE0  EXEC=25a5a5aa82aaaaaa  INST32=BF8C0F70
          ^ SE0 SH0 CU6 SIMD3 WAVE0  EXEC=50aaaa8fffa55555  INST32=BF8C0F70
          ^ SE0 SH0 CU7 SIMD0 WAVE0  EXEC=5554aaaaaaa1a555  INST32=BF8C0F70
          ^ SE0 SH0 CU7 SIMD0 WAVE1  EXEC=aaaa5555ffffffff  INST32=BF8C0F70
          ^ SE0 SH0 CU7 SIMD1 WAVE0  EXEC=555557aaaaaaaaa5  INST32=BF8C0F70
          ^ SE0 SH0 CU7 SIMD3 WAVE0  EXEC=5555aaaaaaaaaa85  INST32=BF8C0F70
          ^ SE1 SH0 CU3 SIMD1 WAVE0  EXEC=aaaaaaaaaaaaaaaa  INST32=BF8C0F70
          ^ SE1 SH0 CU4 SIMD0 WAVE0  EXEC=aaaaaaaa5a5a5a5a  INST32=BF8C0F70
          ^ SE1 SH0 CU4 SIMD1 WAVE0  EXEC=aaaaaaa5a5a5a4a5  INST32=BF8C0F70
          ^ SE1 SH0 CU4 SIMD2 WAVE0  EXEC=5555555000000000  INST32=BF8C0F70
          ^ SE1 SH0 CU4 SIMD3 WAVE0  EXEC=aa555554155aaaaa  INST32=BF8C0F70
          ^ SE1 SH0 CU5 SIMD0 WAVE0  EXEC=55ffff55555555aa  INST32=BF8C0F70
          ^ SE1 SH0 CU5 SIMD1 WAVE0  EXEC=555555555aaaaaaa  INST32=BF8C0F70
          ^ SE1 SH0 CU5 SIMD2 WAVE0  EXEC=a0aaaaaaa8555555  INST32=BF8C0F70
          ^ SE1 SH0 CU5 SIMD3 WAVE0  EXEC=8aaaaaaaaaaaa555  INST32=BF8C0F70
          ^ SE1 SH0 CU6 SIMD0 WAVE0  EXEC=000000002aaaaaaa  INST32=BF8C0F70
          ^ SE2 SH0 CU1 SIMD0 WAVE0  EXEC=5aaaa5400aaaa15a  INST32=BF8C0F70
          ^ SE2 SH0 CU1 SIMD1 WAVE0  EXEC=00aaaaaaaa5555aa  INST32=BF8C0F70
          ^ SE2 SH0 CU1 SIMD2 WAVE0  EXEC=aa00005555554555  INST32=BF8C0F70
          ^ SE2 SH0 CU1 SIMD3 WAVE0  EXEC=aaaaaaa000000000  INST32=BF8C0F70
          ^ SE3 SH0 CU4 SIMD0 WAVE0  EXEC=5555aaaaaaaaaaaa  INST32=BF8C0F70
          ^ SE3 SH0 CU4 SIMD2 WAVE0  EXEC=ffaaaaaaaaaa5555  INST32=BF8C0F70
          ^ SE3 SH0 CU4 SIMD3 WAVE0  EXEC=aaaa55555555aa00  INST32=BF8C0F70
          ^ SE3 SH0 CU5 SIMD0 WAVE0  EXEC=00aaaaaaaaaaaa5a  INST32=BF8C0F70
          ^ SE3 SH0 CU5 SIMD1 WAVE0  EXEC=5a555555005555ff  INST32=BF8C0F70
    v_mul_f32_e32 v7, s6, v7                                      ; 0A0E0E06 [PC=0x10f3e3ad0, off=720, size=4]
...

Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>
---
 src/gallium/drivers/radeonsi/si_debug.c  | 251 ++++++++++++++++++++++++++++++-
 src/gallium/drivers/radeonsi/si_shader.c |   3 +-
 src/gallium/drivers/radeonsi/si_shader.h |   1 +
 3 files changed, 252 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_debug.c b/src/gallium/drivers/radeonsi/si_debug.c
index e4a350f..1f154c2 100644
--- a/src/gallium/drivers/radeonsi/si_debug.c
+++ b/src/gallium/drivers/radeonsi/si_debug.c
@@ -461,20 +461,268 @@ static void si_dump_descriptors(struct si_context *sctx,
 	if (type == PIPE_SHADER_VERTEX) {
 		si_dump_descriptor_list(&sctx->vertex_buffers, shader_name[type],
 					" - Vertex buffer", info->num_inputs, f);
 	}
 
 	for (unsigned i = 0; i < SI_NUM_SHADER_DESCS; ++i, ++descs)
 		si_dump_descriptor_list(descs, shader_name[type], elem_name[i],
 					num_elements[i], f);
 }
 
+struct si_shader_inst {
+	char text[160];  /* one disasm line */
+	unsigned offset; /* instruction offset */
+	unsigned size;   /* instruction size = 4 or 8 */
+};
+
+/* Split a disassembly string into lines and add them to the array pointed
+ * to by "instructions". */
+static void si_add_split_disasm(const char *disasm,
+				uint64_t start_addr,
+				unsigned *num,
+				struct si_shader_inst *instructions)
+{
+	struct si_shader_inst *last_inst = *num ? &instructions[*num - 1] : NULL;
+	char *next;
+
+	while ((next = strchr(disasm, '\n'))) {
+		struct si_shader_inst *inst = &instructions[*num];
+		unsigned len = next - disasm;
+
+		assert(len < ARRAY_SIZE(inst->text));
+		memcpy(inst->text, disasm, len);
+		inst->text[len] = 0;
+		inst->offset = last_inst ? last_inst->offset + last_inst->size : 0;
+
+		const char *semicolon = strchr(disasm, ';');
+		assert(semicolon);
+		/* More than 16 chars after ";" means the instruction is 8 bytes long. */
+		inst->size = next - semicolon > 16 ? 8 : 4;
+
+		snprintf(inst->text + len, ARRAY_SIZE(inst->text) - len,
+			" [PC=0x%"PRIx64", off=%u, size=%u]",
+			start_addr + inst->offset, inst->offset, inst->size);
+
+		last_inst = inst;
+		(*num)++;
+		disasm = next + 1;
+	}
+}
+
+#define MAX_WAVES_PER_CHIP (64 * 40)
+
+struct si_wave_info {
+	unsigned se; /* shader engine */
+	unsigned sh; /* shader array */
+	unsigned cu; /* compute unit */
+	unsigned simd;
+	unsigned wave;
+	uint32_t status;
+	uint64_t pc; /* program counter */
+	uint32_t inst_dw0;
+	uint32_t inst_dw1;
+	uint64_t exec;
+	bool matched; /* whether the wave is used by a currently-bound shader */
+};
+
+static int compare_wave(const void *p1, const void *p2)
+{
+	struct si_wave_info *w1 = (struct si_wave_info *)p1;
+	struct si_wave_info *w2 = (struct si_wave_info *)p2;
+
+	/* Sort waves according to PC and then SE, SH, CU, etc. */
+	if (w1->pc < w2->pc)
+		return -1;
+	if (w1->pc > w2->pc)
+		return 1;
+	if (w1->se < w2->se)
+		return -1;
+	if (w1->se > w2->se)
+		return 1;
+	if (w1->sh < w2->sh)
+		return -1;
+	if (w1->sh > w2->sh)
+		return 1;
+	if (w1->cu < w2->cu)
+		return -1;
+	if (w1->cu > w2->cu)
+		return 1;
+	if (w1->simd < w2->simd)
+		return -1;
+	if (w1->simd > w2->simd)
+		return 1;
+	if (w1->wave < w2->wave)
+		return -1;
+	if (w1->wave > w2->wave)
+		return 1;
+
+	return 0;
+}
+
+/* Return wave information. "waves" should be a large enough array. */
+static unsigned si_get_wave_info(struct si_wave_info waves[MAX_WAVES_PER_CHIP])
+{
+	char line[2000];
+	unsigned num_waves = 0;
+
+	FILE *p = popen("umr -wa", "r");
+	if (!p)
+		return 0;
+
+	if (!fgets(line, sizeof(line), p) ||
+	    strncmp(line, "SE", 2) != 0) {
+		pclose(p);
+		return 0;
+	}
+
+	while (fgets(line, sizeof(line), p)) {
+		struct si_wave_info *w;
+		uint32_t pc_hi, pc_lo, exec_hi, exec_lo;
+
+		assert(num_waves < MAX_WAVES_PER_CHIP);
+		w = &waves[num_waves];
+
+		if (sscanf(line, "%u %u %u %u %u %x %x %x %x %x %x %x",
+			   &w->se, &w->sh, &w->cu, &w->simd, &w->wave,
+			   &w->status, &pc_hi, &pc_lo, &w->inst_dw0,
+			   &w->inst_dw1, &exec_hi, &exec_lo) == 12) {
+			w->pc = ((uint64_t)pc_hi << 32) | pc_lo;
+			w->exec = ((uint64_t)exec_hi << 32) | exec_lo;
+			w->matched = false;
+			num_waves++;
+		}
+	}
+
+	qsort(waves, num_waves, sizeof(struct si_wave_info), compare_wave);
+
+	pclose(p);
+	return num_waves;
+}
+
+/* If the shader is being executed, print its asm instructions, and annotate
+ * those that are being executed right now with information about waves that
+ * execute them. This is most useful during a GPU hang.
+ */
+static void si_print_annotated_shader(struct si_shader *shader,
+				      struct si_wave_info *waves,
+				      unsigned num_waves,
+				      FILE *f)
+{
+	if (!shader || !shader->binary.disasm_string)
+		return;
+
+	uint64_t start_addr = shader->bo->gpu_address;
+	uint64_t end_addr = start_addr + shader->bo->b.b.width0;
+	unsigned i;
+
+	/* See if any wave executes the shader. */
+	for (i = 0; i < num_waves; i++) {
+		if (start_addr <= waves[i].pc && waves[i].pc <= end_addr)
+			break;
+	}
+	if (i == num_waves)
+		return; /* the shader is not being executed */
+
+	/* Remember the first found wave. The waves are sorted according to PC. */
+	waves = &waves[i];
+	num_waves -= i;
+
+	/* Get the list of instructions.
+	 * Buffer size / 4 is the upper bound of the instruction count.
+	 */
+	unsigned num_inst = 0;
+	struct si_shader_inst *instructions =
+		calloc(shader->bo->b.b.width0 / 4, sizeof(struct si_shader_inst));
+
+	if (shader->prolog) {
+		si_add_split_disasm(shader->prolog->binary.disasm_string,
+				    start_addr, &num_inst, instructions);
+	}
+	si_add_split_disasm(shader->binary.disasm_string,
+			    start_addr, &num_inst, instructions);
+	if (shader->epilog) {
+		si_add_split_disasm(shader->epilog->binary.disasm_string,
+				    start_addr, &num_inst, instructions);
+	}
+
+	fprintf(f, COLOR_YELLOW "%s - annotated disassembly:" COLOR_RESET "\n",
+		si_get_shader_name(shader, shader->selector->type));
+
+	/* Print instructions with annotations. */
+	for (i = 0; i < num_inst; i++) {
+		struct si_shader_inst *inst = &instructions[i];
+
+		fprintf(f, "%s\n", inst->text);
+
+		/* Print which waves execute the instruction right now. */
+		while (num_waves && start_addr + inst->offset == waves->pc) {
+			fprintf(f,
+				"          " COLOR_GREEN "^ SE%u SH%u CU%u "
+				"SIMD%u WAVE%u  EXEC=%016"PRIx64 "  ",
+				waves->se, waves->sh, waves->cu, waves->simd,
+				waves->wave, waves->exec);
+
+			if (inst->size == 4) {
+				fprintf(f, "INST32=%08X" COLOR_RESET "\n",
+					waves->inst_dw0);
+			} else {
+				fprintf(f, "INST64=%08X %08X" COLOR_RESET "\n",
+					waves->inst_dw0, waves->inst_dw1);
+			}
+
+			waves->matched = true;
+			waves = &waves[1];
+			num_waves--;
+		}
+	}
+
+	fprintf(f, "\n\n");
+	free(instructions);
+}
+
+static void si_dump_annotated_shaders(struct si_context *sctx, FILE *f)
+{
+	struct si_wave_info waves[MAX_WAVES_PER_CHIP];
+	unsigned num_waves = si_get_wave_info(waves);
+
+	fprintf(f, COLOR_CYAN "The number of active waves = %u" COLOR_RESET
+		"\n\n", num_waves);
+
+	si_print_annotated_shader(sctx->vs_shader.current, waves, num_waves, f);
+	si_print_annotated_shader(sctx->tcs_shader.current, waves, num_waves, f);
+	si_print_annotated_shader(sctx->tes_shader.current, waves, num_waves, f);
+	si_print_annotated_shader(sctx->gs_shader.current, waves, num_waves, f);
+	si_print_annotated_shader(sctx->ps_shader.current, waves, num_waves, f);
+
+	/* Print waves executing shaders that are not currently bound. */
+	unsigned i;
+	bool found = false;
+	for (i = 0; i < num_waves; i++) {
+		if (waves[i].matched)
+			continue;
+
+		if (!found) {
+			fprintf(f, COLOR_CYAN
+				"Waves not executing currently-bound shaders:"
+				COLOR_RESET "\n");
+			found = true;
+		}
+		fprintf(f, "    SE%u SH%u CU%u SIMD%u WAVE%u  EXEC=%016"PRIx64
+			"  INST=%08X %08X  PC=%"PRIx64"\n",
+			waves[i].se, waves[i].sh, waves[i].cu, waves[i].simd,
+			waves[i].wave, waves[i].exec, waves[i].inst_dw0,
+			waves[i].inst_dw1, waves[i].pc);
+	}
+	if (found)
+		fprintf(f, "\n\n");
+}
+
 static void si_dump_command(const char *title, const char *command, FILE *f)
 {
 	char line[2000];
 
 	FILE *p = popen(command, "r");
 	if (!p)
 		return;
 
 	fprintf(f, COLOR_YELLOW "%s: " COLOR_RESET "\n", title);
 	while (fgets(line, sizeof(line), p))
@@ -495,21 +743,22 @@ static void si_dump_debug_state(struct pipe_context *ctx, FILE *f,
 		si_dump_framebuffer(sctx, f);
 
 	if (flags & PIPE_DUMP_CURRENT_SHADERS) {
 		si_dump_shader(sctx->screen, &sctx->vs_shader, f);
 		si_dump_shader(sctx->screen, &sctx->tcs_shader, f);
 		si_dump_shader(sctx->screen, &sctx->tes_shader, f);
 		si_dump_shader(sctx->screen, &sctx->gs_shader, f);
 		si_dump_shader(sctx->screen, &sctx->ps_shader, f);
 
 		if (flags & PIPE_DUMP_DEVICE_STATUS_REGISTERS) {
-			si_dump_command("Active waves", "umr -wa | column -t", f);
+			si_dump_annotated_shaders(sctx, f);
+			si_dump_command("Active waves (raw data)", "umr -wa | column -t", f);
 			si_dump_command("Wave information", "umr -O bits -wa", f);
 		}
 
 		si_dump_descriptor_list(&sctx->descriptors[SI_DESCS_RW_BUFFERS],
 					"", "RW buffers", SI_NUM_RW_BUFFERS, f);
 		si_dump_descriptors(sctx, &sctx->vs_shader, f);
 		si_dump_descriptors(sctx, &sctx->tcs_shader, f);
 		si_dump_descriptors(sctx, &sctx->tes_shader, f);
 		si_dump_descriptors(sctx, &sctx->gs_shader, f);
 		si_dump_descriptors(sctx, &sctx->ps_shader, f);
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 89229ee..d898afd 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -5911,22 +5911,21 @@ static void si_shader_dump_stats(struct si_screen *sscreen,
 	pipe_debug_message(debug, SHADER_INFO,
 			   "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
 			   "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
 			   "Spilled VGPRs: %d PrivMem VGPRs: %d",
 			   conf->num_sgprs, conf->num_vgprs, code_size,
 			   conf->lds_size, conf->scratch_bytes_per_wave,
 			   max_simd_waves, conf->spilled_sgprs,
 			   conf->spilled_vgprs, conf->private_mem_vgprs);
 }
 
-static const char *si_get_shader_name(struct si_shader *shader,
-				      unsigned processor)
+const char *si_get_shader_name(struct si_shader *shader, unsigned processor)
 {
 	switch (processor) {
 	case PIPE_SHADER_VERTEX:
 		if (shader->key.as_es)
 			return "Vertex Shader as ES";
 		else if (shader->key.as_ls)
 			return "Vertex Shader as LS";
 		else
 			return "Vertex Shader as VS";
 	case PIPE_SHADER_TESS_CTRL:
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index cf4da85..5464d67 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -569,12 +569,13 @@ void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
 				      unsigned *lds_size);
 void si_shader_apply_scratch_relocs(struct si_context *sctx,
 			struct si_shader *shader,
 			struct si_shader_config *config,
 			uint64_t scratch_va);
 void si_shader_binary_read_config(struct radeon_shader_binary *binary,
 				  struct si_shader_config *conf,
 				  unsigned symbol_offset);
 unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
 				    bool writes_samplemask);
+const char *si_get_shader_name(struct si_shader *shader, unsigned processor);
 
 #endif
-- 
2.7.4



More information about the mesa-dev mailing list