[Mesa-dev] [PATCH 2/2] radeonsi: write shader asm annotated with wave info into GPU hang reports
Marek Olšák
maraeo at gmail.com
Sat Feb 4 22:44:29 UTC 2017
From: Marek Olšák <marek.olsak at amd.com>
Note that the disassembly is written twice - first the unmodified compiler
output and then the wave-annotated output only if there are waves executing
the shader.
Sample output from a real GPU hang most likely caused by image_sample:
The number of active waves = 28
Pixel Shader - annotated disassembly:
s_mov_b64 s[6:7], exec ; BE86017E [PC=0x10f3e3800, off=0, size=4]
s_wqm_b64 exec, exec ; BEFE077E [PC=0x10f3e3804, off=4, size=4]
...
image_sample v[7:9], v[0:1], s[12:19], s[20:23] dmask:0x7 ; F0800700 00A30700 [PC=0x10f3e3a94, off=660, size=8]
s_buffer_load_dword s20, s[0:3], 0x50 ; C0220500 00000050 [PC=0x10f3e3a9c, off=668, size=8]
s_load_dwordx4 s[24:27], s[4:5], 0x170 ; C00A0602 00000170 [PC=0x10f3e3aa4, off=676, size=8]
s_load_dwordx8 s[12:19], s[4:5], 0x140 ; C00E0302 00000140 [PC=0x10f3e3aac, off=684, size=8]
s_buffer_load_dword s11, s[0:3], 0x5c ; C02202C0 0000005C [PC=0x10f3e3ab4, off=692, size=8]
s_buffer_load_dword s21, s[0:3], 0x54 ; C0220540 00000054 [PC=0x10f3e3abc, off=700, size=8]
s_buffer_load_dword s22, s[0:3], 0x58 ; C0220580 00000058 [PC=0x10f3e3ac4, off=708, size=8]
s_waitcnt vmcnt(0) ; BF8C0F70 [PC=0x10f3e3acc, off=716, size=4]
^ SE0 SH0 CU1 SIMD1 WAVE0 EXEC=aaaaaaa555aaaaaa INST32=BF8C0F70
^ SE0 SH0 CU1 SIMD2 WAVE0 EXEC=aaaa85555555552a INST32=BF8C0F70
^ SE0 SH0 CU1 SIMD3 WAVE0 EXEC=000000000000000a INST32=BF8C0F70
^ SE0 SH0 CU6 SIMD1 WAVE0 EXEC=25a5a5aa82aaaaaa INST32=BF8C0F70
^ SE0 SH0 CU6 SIMD3 WAVE0 EXEC=50aaaa8fffa55555 INST32=BF8C0F70
^ SE0 SH0 CU7 SIMD0 WAVE0 EXEC=5554aaaaaaa1a555 INST32=BF8C0F70
^ SE0 SH0 CU7 SIMD0 WAVE1 EXEC=aaaa5555ffffffff INST32=BF8C0F70
^ SE0 SH0 CU7 SIMD1 WAVE0 EXEC=555557aaaaaaaaa5 INST32=BF8C0F70
^ SE0 SH0 CU7 SIMD3 WAVE0 EXEC=5555aaaaaaaaaa85 INST32=BF8C0F70
^ SE1 SH0 CU3 SIMD1 WAVE0 EXEC=aaaaaaaaaaaaaaaa INST32=BF8C0F70
^ SE1 SH0 CU4 SIMD0 WAVE0 EXEC=aaaaaaaa5a5a5a5a INST32=BF8C0F70
^ SE1 SH0 CU4 SIMD1 WAVE0 EXEC=aaaaaaa5a5a5a4a5 INST32=BF8C0F70
^ SE1 SH0 CU4 SIMD2 WAVE0 EXEC=5555555000000000 INST32=BF8C0F70
^ SE1 SH0 CU4 SIMD3 WAVE0 EXEC=aa555554155aaaaa INST32=BF8C0F70
^ SE1 SH0 CU5 SIMD0 WAVE0 EXEC=55ffff55555555aa INST32=BF8C0F70
^ SE1 SH0 CU5 SIMD1 WAVE0 EXEC=555555555aaaaaaa INST32=BF8C0F70
^ SE1 SH0 CU5 SIMD2 WAVE0 EXEC=a0aaaaaaa8555555 INST32=BF8C0F70
^ SE1 SH0 CU5 SIMD3 WAVE0 EXEC=8aaaaaaaaaaaa555 INST32=BF8C0F70
^ SE1 SH0 CU6 SIMD0 WAVE0 EXEC=000000002aaaaaaa INST32=BF8C0F70
^ SE2 SH0 CU1 SIMD0 WAVE0 EXEC=5aaaa5400aaaa15a INST32=BF8C0F70
^ SE2 SH0 CU1 SIMD1 WAVE0 EXEC=00aaaaaaaa5555aa INST32=BF8C0F70
^ SE2 SH0 CU1 SIMD2 WAVE0 EXEC=aa00005555554555 INST32=BF8C0F70
^ SE2 SH0 CU1 SIMD3 WAVE0 EXEC=aaaaaaa000000000 INST32=BF8C0F70
^ SE3 SH0 CU4 SIMD0 WAVE0 EXEC=5555aaaaaaaaaaaa INST32=BF8C0F70
^ SE3 SH0 CU4 SIMD2 WAVE0 EXEC=ffaaaaaaaaaa5555 INST32=BF8C0F70
^ SE3 SH0 CU4 SIMD3 WAVE0 EXEC=aaaa55555555aa00 INST32=BF8C0F70
^ SE3 SH0 CU5 SIMD0 WAVE0 EXEC=00aaaaaaaaaaaa5a INST32=BF8C0F70
^ SE3 SH0 CU5 SIMD1 WAVE0 EXEC=5a555555005555ff INST32=BF8C0F70
v_mul_f32_e32 v7, s6, v7 ; 0A0E0E06 [PC=0x10f3e3ad0, off=720, size=4]
...
Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>
---
src/gallium/drivers/radeonsi/si_debug.c | 251 ++++++++++++++++++++++++++++++-
src/gallium/drivers/radeonsi/si_shader.c | 3 +-
src/gallium/drivers/radeonsi/si_shader.h | 1 +
3 files changed, 252 insertions(+), 3 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_debug.c b/src/gallium/drivers/radeonsi/si_debug.c
index e4a350f..1f154c2 100644
--- a/src/gallium/drivers/radeonsi/si_debug.c
+++ b/src/gallium/drivers/radeonsi/si_debug.c
@@ -461,20 +461,268 @@ static void si_dump_descriptors(struct si_context *sctx,
if (type == PIPE_SHADER_VERTEX) {
si_dump_descriptor_list(&sctx->vertex_buffers, shader_name[type],
" - Vertex buffer", info->num_inputs, f);
}
for (unsigned i = 0; i < SI_NUM_SHADER_DESCS; ++i, ++descs)
si_dump_descriptor_list(descs, shader_name[type], elem_name[i],
num_elements[i], f);
}
+struct si_shader_inst {
+ char text[160]; /* one disasm line */
+ unsigned offset; /* instruction offset */
+ unsigned size; /* instruction size = 4 or 8 */
+};
+
+/* Split a disassembly string into lines and add them to the array pointed
+ * to by "instructions". */
+static void si_add_split_disasm(const char *disasm,
+ uint64_t start_addr,
+ unsigned *num,
+ struct si_shader_inst *instructions)
+{
+ struct si_shader_inst *last_inst = *num ? &instructions[*num - 1] : NULL;
+ char *next;
+
+ while ((next = strchr(disasm, '\n'))) {
+ struct si_shader_inst *inst = &instructions[*num];
+ unsigned len = next - disasm;
+
+ assert(len < ARRAY_SIZE(inst->text));
+ memcpy(inst->text, disasm, len);
+ inst->text[len] = 0;
+ inst->offset = last_inst ? last_inst->offset + last_inst->size : 0;
+
+ const char *semicolon = strchr(disasm, ';');
+ assert(semicolon);
+ /* More than 16 chars after ";" means the instruction is 8 bytes long. */
+ inst->size = next - semicolon > 16 ? 8 : 4;
+
+ snprintf(inst->text + len, ARRAY_SIZE(inst->text) - len,
+ " [PC=0x%"PRIx64", off=%u, size=%u]",
+ start_addr + inst->offset, inst->offset, inst->size);
+
+ last_inst = inst;
+ (*num)++;
+ disasm = next + 1;
+ }
+}
+
+#define MAX_WAVES_PER_CHIP (64 * 40)
+
+struct si_wave_info {
+ unsigned se; /* shader engine */
+ unsigned sh; /* shader array */
+ unsigned cu; /* compute unit */
+ unsigned simd;
+ unsigned wave;
+ uint32_t status;
+ uint64_t pc; /* program counter */
+ uint32_t inst_dw0;
+ uint32_t inst_dw1;
+ uint64_t exec;
+ bool matched; /* whether the wave is used by a currently-bound shader */
+};
+
+static int compare_wave(const void *p1, const void *p2)
+{
+ struct si_wave_info *w1 = (struct si_wave_info *)p1;
+ struct si_wave_info *w2 = (struct si_wave_info *)p2;
+
+ /* Sort waves according to PC and then SE, SH, CU, etc. */
+ if (w1->pc < w2->pc)
+ return -1;
+ if (w1->pc > w2->pc)
+ return 1;
+ if (w1->se < w2->se)
+ return -1;
+ if (w1->se > w2->se)
+ return 1;
+ if (w1->sh < w2->sh)
+ return -1;
+ if (w1->sh > w2->sh)
+ return 1;
+ if (w1->cu < w2->cu)
+ return -1;
+ if (w1->cu > w2->cu)
+ return 1;
+ if (w1->simd < w2->simd)
+ return -1;
+ if (w1->simd > w2->simd)
+ return 1;
+ if (w1->wave < w2->wave)
+ return -1;
+ if (w1->wave > w2->wave)
+ return 1;
+
+ return 0;
+}
+
+/* Return wave information. "waves" should be a large enough array. */
+static unsigned si_get_wave_info(struct si_wave_info waves[MAX_WAVES_PER_CHIP])
+{
+ char line[2000];
+ unsigned num_waves = 0;
+
+ FILE *p = popen("umr -wa", "r");
+ if (!p)
+ return 0;
+
+ if (!fgets(line, sizeof(line), p) ||
+ strncmp(line, "SE", 2) != 0) {
+ pclose(p);
+ return 0;
+ }
+
+ while (fgets(line, sizeof(line), p)) {
+ struct si_wave_info *w;
+ uint32_t pc_hi, pc_lo, exec_hi, exec_lo;
+
+ assert(num_waves < MAX_WAVES_PER_CHIP);
+ w = &waves[num_waves];
+
+ if (sscanf(line, "%u %u %u %u %u %x %x %x %x %x %x %x",
+ &w->se, &w->sh, &w->cu, &w->simd, &w->wave,
+ &w->status, &pc_hi, &pc_lo, &w->inst_dw0,
+ &w->inst_dw1, &exec_hi, &exec_lo) == 12) {
+ w->pc = ((uint64_t)pc_hi << 32) | pc_lo;
+ w->exec = ((uint64_t)exec_hi << 32) | exec_lo;
+ w->matched = false;
+ num_waves++;
+ }
+ }
+
+ qsort(waves, num_waves, sizeof(struct si_wave_info), compare_wave);
+
+ pclose(p);
+ return num_waves;
+}
+
+/* If the shader is being executed, print its asm instructions, and annotate
+ * those that are being executed right now with information about waves that
+ * execute them. This is most useful during a GPU hang.
+ */
+static void si_print_annotated_shader(struct si_shader *shader,
+ struct si_wave_info *waves,
+ unsigned num_waves,
+ FILE *f)
+{
+ if (!shader || !shader->binary.disasm_string)
+ return;
+
+ uint64_t start_addr = shader->bo->gpu_address;
+ uint64_t end_addr = start_addr + shader->bo->b.b.width0;
+ unsigned i;
+
+ /* See if any wave executes the shader. */
+ for (i = 0; i < num_waves; i++) {
+ if (start_addr <= waves[i].pc && waves[i].pc <= end_addr)
+ break;
+ }
+ if (i == num_waves)
+ return; /* the shader is not being executed */
+
+ /* Remember the first found wave. The waves are sorted according to PC. */
+ waves = &waves[i];
+ num_waves -= i;
+
+ /* Get the list of instructions.
+ * Buffer size / 4 is the upper bound of the instruction count.
+ */
+ unsigned num_inst = 0;
+ struct si_shader_inst *instructions =
+ calloc(shader->bo->b.b.width0 / 4, sizeof(struct si_shader_inst));
+
+ if (shader->prolog) {
+ si_add_split_disasm(shader->prolog->binary.disasm_string,
+ start_addr, &num_inst, instructions);
+ }
+ si_add_split_disasm(shader->binary.disasm_string,
+ start_addr, &num_inst, instructions);
+ if (shader->epilog) {
+ si_add_split_disasm(shader->epilog->binary.disasm_string,
+ start_addr, &num_inst, instructions);
+ }
+
+ fprintf(f, COLOR_YELLOW "%s - annotated disassembly:" COLOR_RESET "\n",
+ si_get_shader_name(shader, shader->selector->type));
+
+ /* Print instructions with annotations. */
+ for (i = 0; i < num_inst; i++) {
+ struct si_shader_inst *inst = &instructions[i];
+
+ fprintf(f, "%s\n", inst->text);
+
+ /* Print which waves execute the instruction right now. */
+ while (num_waves && start_addr + inst->offset == waves->pc) {
+ fprintf(f,
+ " " COLOR_GREEN "^ SE%u SH%u CU%u "
+ "SIMD%u WAVE%u EXEC=%016"PRIx64 " ",
+ waves->se, waves->sh, waves->cu, waves->simd,
+ waves->wave, waves->exec);
+
+ if (inst->size == 4) {
+ fprintf(f, "INST32=%08X" COLOR_RESET "\n",
+ waves->inst_dw0);
+ } else {
+ fprintf(f, "INST64=%08X %08X" COLOR_RESET "\n",
+ waves->inst_dw0, waves->inst_dw1);
+ }
+
+ waves->matched = true;
+ waves = &waves[1];
+ num_waves--;
+ }
+ }
+
+ fprintf(f, "\n\n");
+ free(instructions);
+}
+
+static void si_dump_annotated_shaders(struct si_context *sctx, FILE *f)
+{
+ struct si_wave_info waves[MAX_WAVES_PER_CHIP];
+ unsigned num_waves = si_get_wave_info(waves);
+
+ fprintf(f, COLOR_CYAN "The number of active waves = %u" COLOR_RESET
+ "\n\n", num_waves);
+
+ si_print_annotated_shader(sctx->vs_shader.current, waves, num_waves, f);
+ si_print_annotated_shader(sctx->tcs_shader.current, waves, num_waves, f);
+ si_print_annotated_shader(sctx->tes_shader.current, waves, num_waves, f);
+ si_print_annotated_shader(sctx->gs_shader.current, waves, num_waves, f);
+ si_print_annotated_shader(sctx->ps_shader.current, waves, num_waves, f);
+
+ /* Print waves executing shaders that are not currently bound. */
+ unsigned i;
+ bool found = false;
+ for (i = 0; i < num_waves; i++) {
+ if (waves[i].matched)
+ continue;
+
+ if (!found) {
+ fprintf(f, COLOR_CYAN
+ "Waves not executing currently-bound shaders:"
+ COLOR_RESET "\n");
+ found = true;
+ }
+ fprintf(f, " SE%u SH%u CU%u SIMD%u WAVE%u EXEC=%016"PRIx64
+ " INST=%08X %08X PC=%"PRIx64"\n",
+ waves[i].se, waves[i].sh, waves[i].cu, waves[i].simd,
+ waves[i].wave, waves[i].exec, waves[i].inst_dw0,
+ waves[i].inst_dw1, waves[i].pc);
+ }
+ if (found)
+ fprintf(f, "\n\n");
+}
+
static void si_dump_command(const char *title, const char *command, FILE *f)
{
char line[2000];
FILE *p = popen(command, "r");
if (!p)
return;
fprintf(f, COLOR_YELLOW "%s: " COLOR_RESET "\n", title);
while (fgets(line, sizeof(line), p))
@@ -495,21 +743,22 @@ static void si_dump_debug_state(struct pipe_context *ctx, FILE *f,
si_dump_framebuffer(sctx, f);
if (flags & PIPE_DUMP_CURRENT_SHADERS) {
si_dump_shader(sctx->screen, &sctx->vs_shader, f);
si_dump_shader(sctx->screen, &sctx->tcs_shader, f);
si_dump_shader(sctx->screen, &sctx->tes_shader, f);
si_dump_shader(sctx->screen, &sctx->gs_shader, f);
si_dump_shader(sctx->screen, &sctx->ps_shader, f);
if (flags & PIPE_DUMP_DEVICE_STATUS_REGISTERS) {
- si_dump_command("Active waves", "umr -wa | column -t", f);
+ si_dump_annotated_shaders(sctx, f);
+ si_dump_command("Active waves (raw data)", "umr -wa | column -t", f);
si_dump_command("Wave information", "umr -O bits -wa", f);
}
si_dump_descriptor_list(&sctx->descriptors[SI_DESCS_RW_BUFFERS],
"", "RW buffers", SI_NUM_RW_BUFFERS, f);
si_dump_descriptors(sctx, &sctx->vs_shader, f);
si_dump_descriptors(sctx, &sctx->tcs_shader, f);
si_dump_descriptors(sctx, &sctx->tes_shader, f);
si_dump_descriptors(sctx, &sctx->gs_shader, f);
si_dump_descriptors(sctx, &sctx->ps_shader, f);
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 89229ee..d898afd 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -5911,22 +5911,21 @@ static void si_shader_dump_stats(struct si_screen *sscreen,
pipe_debug_message(debug, SHADER_INFO,
"Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
"LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
"Spilled VGPRs: %d PrivMem VGPRs: %d",
conf->num_sgprs, conf->num_vgprs, code_size,
conf->lds_size, conf->scratch_bytes_per_wave,
max_simd_waves, conf->spilled_sgprs,
conf->spilled_vgprs, conf->private_mem_vgprs);
}
-static const char *si_get_shader_name(struct si_shader *shader,
- unsigned processor)
+const char *si_get_shader_name(struct si_shader *shader, unsigned processor)
{
switch (processor) {
case PIPE_SHADER_VERTEX:
if (shader->key.as_es)
return "Vertex Shader as ES";
else if (shader->key.as_ls)
return "Vertex Shader as LS";
else
return "Vertex Shader as VS";
case PIPE_SHADER_TESS_CTRL:
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index cf4da85..5464d67 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -569,12 +569,13 @@ void si_multiwave_lds_size_workaround(struct si_screen *sscreen,
unsigned *lds_size);
void si_shader_apply_scratch_relocs(struct si_context *sctx,
struct si_shader *shader,
struct si_shader_config *config,
uint64_t scratch_va);
void si_shader_binary_read_config(struct radeon_shader_binary *binary,
struct si_shader_config *conf,
unsigned symbol_offset);
unsigned si_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
bool writes_samplemask);
+const char *si_get_shader_name(struct si_shader *shader, unsigned processor);
#endif
--
2.7.4
More information about the mesa-dev
mailing list