[PATCH umr] Add profiler using scanned wave data arrays.
Tom St Denis
tom.stdenis at amd.com
Thu Apr 26 16:37:01 UTC 2018
Signed-off-by: Tom St Denis <tom.stdenis at amd.com>
---
doc/sphinx/source/index.rst | 1 +
doc/sphinx/source/profiler.rst | 36 ++++++++++++
doc/umr.1 | 4 ++
src/app/CMakeLists.txt | 1 +
src/app/main.c | 15 ++++-
src/app/print_waves.c | 4 +-
src/app/profile.c | 128 +++++++++++++++++++++++++++++++++++++++++
src/app/ring_read.c | 12 +++-
src/lib/dump_ib.c | 4 +-
src/lib/umr_llvm_disasm.c | 48 +++++++++++++++-
src/umr.h | 3 +-
src/umrapp.h | 4 +-
12 files changed, 246 insertions(+), 14 deletions(-)
create mode 100644 doc/sphinx/source/profiler.rst
create mode 100644 src/app/profile.c
diff --git a/doc/sphinx/source/index.rst b/doc/sphinx/source/index.rst
index fd8b2561e570..fec89140db70 100644
--- a/doc/sphinx/source/index.rst
+++ b/doc/sphinx/source/index.rst
@@ -15,6 +15,7 @@ UMR: User Mode Register Debugger
basic
register_access
wave_status
+ profiler
vm_decoding
ring
top
diff --git a/doc/sphinx/source/profiler.rst b/doc/sphinx/source/profiler.rst
new file mode 100644
index 000000000000..0e44cfd2825d
--- /dev/null
+++ b/doc/sphinx/source/profiler.rst
@@ -0,0 +1,36 @@
+=========
+Profiling
+=========
+
+When testing a shader compiler and/or a shader, a profile of where
+the GPU tends to spend its time can be generated with the umr
+"--profiler" command:
+
+::
+
+ --profiler <nsamples> <usec_delay>
+
+This captures 'nsamples' wave samples with a delay of at least
+'usec_delay' microseconds between them. The output lists the sampled
+addresses and opcodes sorted by hit count in descending order.
+For example,
+
+::
+
+ 2865 hits (13 %) 2 at 0x100009c68 0xc4001c0f 0x00000100 exp mrt0 v0, v0, v1, v1 done compr vm
+ 1199 hits ( 5 %) 2 at 0x1055e9724 0xc40008cf 0x0f090706 exp pos0 v6, v7, v9, v15 done
+ 1155 hits ( 5 %) 2 at 0x100009c48 0xbf8c0f70 0x16000080 s_waitcnt vmcnt(0)
+ 710 hits ( 3 %) 2 at 0x10000acf0 0xc4001c0f 0x00000100 exp mrt0 v0, v0, v1, v1 done compr vm
+ 633 hits ( 3 %) 2 at 0x1023f14c4 0xc400040f 0x00000100 exp mrt0 v0, v0, v1, v1 compr
+ 633 hits ( 3 %) 2 at 0x100008d64 0xbf8c0f70 0x0a161b12 s_waitcnt vmcnt(0)
+ 617 hits ( 2 %) 2 at 0x10000a238 0xf0800700 0x00020400 image_sample v[4:6], v0, s[8:15], s[0:3] dmask:0x7
+ ...<snip>...
+
+This indicates that the opcode at VMID 2, address 0x100009c68, had waves
+halted there 2865 times (13% of all captured wave data). The next two
+columns are the raw opcode dwords and the last column is the LLVM
+disassembly of the opcode.
+
+When testing a known shader this can be used to determine where
+the bulk of the processing time is spent.
+
diff --git a/doc/umr.1 b/doc/umr.1
index f1f5fec55946..a777d9312054 100644
--- a/doc/umr.1
+++ b/doc/umr.1
@@ -118,6 +118,10 @@ from stdin.
Disassemble 'size' bytes (in hex) from a given address (in hex). The size can be
specified as zero to have umr try and compute the shader size.
+.IP "--profiler, -prof <nsamples> <usec_delay>"
+Capture 'nsamples' samples of wave data with at least usec_delay microseconds
+between captures.
+
.IP "--update, -u" <filename>
Specify update file to add, change, or delete registers from the register
database. Useful for adding registers that are not including in the kernel headers.
diff --git a/src/app/CMakeLists.txt b/src/app/CMakeLists.txt
index 4dceebb00e0d..7512a54f68bf 100644
--- a/src/app/CMakeLists.txt
+++ b/src/app/CMakeLists.txt
@@ -6,6 +6,7 @@ project(umr)
add_library(umrapp
print.c
print_config.c
+ profile.c
ring_read.c
scan.c
scan_log.c
diff --git a/src/app/main.c b/src/app/main.c
index 600f3ca02988..d6571e77b74d 100644
--- a/src/app/main.c
+++ b/src/app/main.c
@@ -495,13 +495,23 @@ int main(int argc, char **argv)
shader.addr = address;
size = umr_compute_shader_size(asic, &shader);
}
- umr_vm_disasm(asic, vmid, address, 0, size);
+ umr_vm_disasm(asic, vmid, address, 0, size, NULL);
i += 2;
} else {
printf("--vm-disasm requires two parameters\n");
return EXIT_FAILURE;
}
+ } else if (!strcmp(argv[i], "-prof") || !strcmp(argv[i], "--profiler")) {
+ if (i + 2 < argc) {
+ if (!asic)
+ asic = get_asic();
+ umr_profiler(asic, atoi(argv[i+1]), atoi(argv[i+2]));
+ i += 2;
+ } else {
+ printf("--profiler requires two parameters\n");
+ return EXIT_FAILURE;
+ }
} else if (!strcmp(argv[i], "--option") || !strcmp(argv[i], "-O")) {
if (i + 1 < argc) {
parse_options(argv[i+1]);
@@ -581,6 +591,9 @@ int main(int argc, char **argv)
"\n\t--vm-disasm, -vdis [<vmid>@]<address> <size>"
"\n\t\tDisassemble 'size' bytes (in hex) from a given address (in hex). The size can"
"\n\t\tbe specified as zero to have umr try and compute the shader size.\n"
+"\n\t--profiler, -prof <nsamples> <usec_delay>"
+ "\n\t\tCapture 'nsamples' samples of wave data with at least usec_delay"
+ "\n\t\tmicroseconds between captures.\n"
"\n\t--option -O <string>[,<string>,...]\n\t\tEnable various flags: bits, bitsfull, empty_log, follow, no_follow_ib, named, many,"
"\n\t\tuse_pci, use_colour, read_smc, quiet, no_kernel, verbose, halt_waves, disasm_early_term.\n"
"\n\n", UMR_BUILD_VER, UMR_BUILD_REV);
diff --git a/src/app/print_waves.c b/src/app/print_waves.c
index d901bc902ff3..6965f7f31854 100644
--- a/src/app/print_waves.c
+++ b/src/app/print_waves.c
@@ -100,7 +100,7 @@ void umr_print_waves(struct umr_asic *asic)
}
pgm_addr = (((uint64_t)wd->ws.pc_hi << 32) | wd->ws.pc_lo) - (NUM_OPCODE_WORDS*4)/2;
- umr_vm_disasm(asic, wd->ws.hw_id.vm_id, pgm_addr, (((uint64_t)wd->ws.pc_hi << 32) | wd->ws.pc_lo), NUM_OPCODE_WORDS*4);
+ umr_vm_disasm(asic, wd->ws.hw_id.vm_id, pgm_addr, (((uint64_t)wd->ws.pc_hi << 32) | wd->ws.pc_lo), NUM_OPCODE_WORDS*4, NULL);
} else {
first = 0;
printf("\n------------------------------------------------------\nse%u.sh%u.cu%u.simd%u.wave%u\n",
@@ -222,7 +222,7 @@ void umr_print_waves(struct umr_asic *asic)
printf("\n\nPGM_MEM:\n");
pgm_addr = (((uint64_t)wd->ws.pc_hi << 32) | wd->ws.pc_lo) - (NUM_OPCODE_WORDS*4)/2;
- umr_vm_disasm(asic, wd->ws.hw_id.vm_id, pgm_addr, (((uint64_t)wd->ws.pc_hi << 32) | wd->ws.pc_lo), NUM_OPCODE_WORDS*4);
+ umr_vm_disasm(asic, wd->ws.hw_id.vm_id, pgm_addr, (((uint64_t)wd->ws.pc_hi << 32) | wd->ws.pc_lo), NUM_OPCODE_WORDS*4, NULL);
Hv("LDS_ALLOC", wd->ws.lds_alloc.value);
PP(lds_alloc, lds_base);
diff --git a/src/app/profile.c b/src/app/profile.c
new file mode 100644
index 000000000000..3ba3b36efe64
--- /dev/null
+++ b/src/app/profile.c
@@ -0,0 +1,128 @@
+/*
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: Tom St Denis <tom.stdenis at amd.com>
+ *
+ */
+#include "umrapp.h"
+
+/*
+ * A single profiler sample: the VMID and program counter a wave was found
+ * at, plus the two instruction dwords captured with it.
+ */
+struct umr_profiler_hit {
+	uint32_t vmid;
+	uint32_t inst_dw0;
+	uint32_t inst_dw1;
+	uint64_t pc;
+};
+
+/* Run-length entry: one distinct hit and how many samples matched it. */
+struct umr_profiler_rle {
+ struct umr_profiler_hit data; /* the hit shared by every sample in the run */
+ uint32_t cnt; /* number of samples in the run */
+};
+
+/*
+ * qsort() comparator for struct umr_profiler_hit.
+ *
+ * Compares member by member rather than with memcmp() over the whole struct:
+ * the struct can contain padding between inst_dw1 and pc, and those bytes are
+ * not guaranteed to be equal for otherwise identical hits.  The resulting
+ * order only has to place identical hits next to each other.
+ */
+static int comp_hits(const void *A, const void *B)
+{
+	const struct umr_profiler_hit *a = A, *b = B;
+
+	if (a->vmid != b->vmid)
+		return a->vmid < b->vmid ? -1 : 1;
+	if (a->pc != b->pc)
+		return a->pc < b->pc ? -1 : 1;
+	if (a->inst_dw0 != b->inst_dw0)
+		return a->inst_dw0 < b->inst_dw0 ? -1 : 1;
+	if (a->inst_dw1 != b->inst_dw1)
+		return a->inst_dw1 < b->inst_dw1 ? -1 : 1;
+	return 0;
+}
+
+/*
+ * qsort() comparator for struct umr_profiler_rle: highest hit count first.
+ *
+ * The counts are compared rather than subtracted because the unsigned
+ * difference does not fit an int once the counts are far enough apart.
+ */
+static int comp_rle(const void *A, const void *B)
+{
+	const struct umr_profiler_rle *a = A, *b = B;
+
+	return (a->cnt < b->cnt) - (a->cnt > b->cnt);
+}
+
+/**
+ * umr_profiler - sample where waves are halted and print a hit histogram
+ * @asic: device to profile
+ * @samples: number of halt/scan rounds to capture; <= 0 captures nothing
+ * @delay: microseconds the waves are left running before each round
+ *         (0 skips the sleep)
+ *
+ * Every wave returned by a scan contributes one hit made of its VMID, PC and
+ * the two instruction dwords.  Identical hits are merged and one line per
+ * distinct hit is printed to stdout, most frequent first.  Progress goes to
+ * stderr.  The waves are resumed before the results are printed.
+ */
+void umr_profiler(struct umr_asic *asic, int samples, int delay)
+{
+	struct umr_profiler_hit *phit, *grown;
+	struct umr_profiler_rle *prle;
+	struct umr_wave_data *wd, *next;
+	unsigned nitems = 0, nmax, x, y, z = 0;
+
+	/* never ask calloc() for zero (or a negative-turned-huge) element count */
+	nmax = samples > 0 ? (unsigned)samples : 1;
+	phit = calloc(nmax, sizeof *phit);
+	if (!phit) {
+		fprintf(stderr, "[ERROR]: Out of memory\n");
+		return;
+	}
+
+	while (samples-- > 0) {
+		fprintf(stderr, "%5d samples left\r", samples);
+		fflush(stderr);
+
+		/* let the waves run for a while, halt them, retry until a scan finds some */
+		do {
+			umr_sq_cmd_halt_waves(asic, UMR_SQ_CMD_RESUME);
+			if (delay)
+				usleep(delay);
+			umr_sq_cmd_halt_waves(asic, UMR_SQ_CMD_HALT);
+			wd = umr_scan_wave_data(asic);
+		} while (!wd);
+
+		/* record one hit per wave; the list node is released as we go */
+		for (; wd; wd = next) {
+			next = wd->next;
+
+			if (nitems == nmax) {
+				grown = realloc(phit, ((size_t)nmax + 1000) * sizeof *phit);
+				if (grown) {
+					/*
+					 * Zero the new tail so that no indeterminate
+					 * bytes (struct padding included) are ever
+					 * compared or copied later on.
+					 */
+					memset(grown + nmax, 0, 1000 * sizeof *phit);
+					phit = grown;
+					nmax += 1000;
+				} else {
+					fprintf(stderr, "[ERROR]: Out of memory\n");
+					samples = 0; /* stop sampling, report what we have */
+				}
+			}
+
+			if (nitems < nmax) {
+				phit[nitems].vmid = wd->ws.hw_id.vm_id;
+				phit[nitems].inst_dw0 = wd->ws.wave_inst_dw0;
+				phit[nitems].inst_dw1 = wd->ws.wave_inst_dw1;
+				phit[nitems++].pc = ((uint64_t)wd->ws.pc_hi << 32) | wd->ws.pc_lo;
+			}
+			free(wd);
+		}
+	}
+	umr_sq_cmd_halt_waves(asic, UMR_SQ_CMD_RESUME);
+
+	/* sort so identical hits are adjacent, then collapse each run into one entry */
+	qsort(phit, nitems, sizeof(*phit), comp_hits);
+	prle = calloc(nitems ? nitems : 1, sizeof *prle);
+	if (!prle)
+		fprintf(stderr, "[ERROR]: Out of memory\n");
+
+	/* x runs one past the end so that the final run is flushed as well */
+	for (y = 0, x = 1; prle && x <= nitems; x++) {
+		if (x == nitems || comp_hits(&phit[x], &phit[y])) {
+			prle[z].data = phit[y];
+			prle[z++].cnt = x - y;
+			y = x;
+		}
+	}
+
+	if (z)
+		qsort(prle, z, sizeof(*prle), comp_rle);
+
+	for (x = 0; x < z; x++) {
+		unsigned char buf[8];
+		char *str = NULL;
+
+		/* hand both dwords to the disassembler as one 8-byte buffer */
+		memcpy(buf, &prle[x].data.inst_dw0, 4);
+		memcpy(buf + 4, &prle[x].data.inst_dw1, 4);
+		umr_llvm_disasm(asic, buf, 8, 0, &str);
+
+		/* 64-bit multiply keeps the percentage exact for very large counts */
+		printf("%5u hits (%2u %%)\t%u at 0x%llx\t 0x%08lx 0x%08lx\t%s\n",
+			prle[x].cnt,
+			(unsigned)(((uint64_t)prle[x].cnt * 100) / nitems),
+			(unsigned)prle[x].data.vmid,
+			(unsigned long long)prle[x].data.pc,
+			(unsigned long)prle[x].data.inst_dw0,
+			(unsigned long)prle[x].data.inst_dw1,
+			str ? str : "(null)"); /* passing NULL to %s is undefined */
+		free(str);
+	}
+
+	free(prle);
+	free(phit);
+}
diff --git a/src/app/ring_read.c b/src/app/ring_read.c
index 3ccec1be6d90..112e9f0414ad 100644
--- a/src/app/ring_read.c
+++ b/src/app/ring_read.c
@@ -32,6 +32,7 @@ void umr_read_ring(struct umr_asic *asic, char *ringpath)
uint32_t wptr, rptr, drv_wptr, ringsize, start, end, value,
*ring_data;
struct umr_ring_decoder decoder, *pdecoder, *ppdecoder;
+ struct umr_wave_data *wd;
memset(ringname, 0, sizeof ringname);
memset(from, 0, sizeof from);
@@ -146,18 +147,25 @@ void umr_read_ring(struct umr_asic *asic, char *ringpath)
free(ring_data);
printf("\n");
- umr_dump_shaders(asic, &decoder);
+ wd = umr_scan_wave_data(asic);
+ umr_dump_shaders(asic, &decoder, wd);
pdecoder = decoder.next_ib;
while (pdecoder) {
if (asic->options.follow_ib) {
umr_dump_ib(asic, pdecoder);
- umr_dump_shaders(asic, pdecoder);
+ umr_dump_shaders(asic, pdecoder, wd);
}
ppdecoder = pdecoder->next_ib;
free(pdecoder);
pdecoder = ppdecoder;
}
+ while (wd) {
+ struct umr_wave_data *pwd = wd->next;
+ free(wd);
+ wd = pwd;
+ }
+
end:
if (asic->options.halt_waves)
umr_sq_cmd_halt_waves(asic, UMR_SQ_CMD_RESUME);
diff --git a/src/lib/dump_ib.c b/src/lib/dump_ib.c
index cdcbb8a70edd..d5e68d6981a0 100644
--- a/src/lib/dump_ib.c
+++ b/src/lib/dump_ib.c
@@ -67,7 +67,7 @@ void umr_dump_ib(struct umr_asic *asic, struct umr_ring_decoder *decoder)
printf("End of IB\n\n");
}
-void umr_dump_shaders(struct umr_asic *asic, struct umr_ring_decoder *decoder)
+void umr_dump_shaders(struct umr_asic *asic, struct umr_ring_decoder *decoder, struct umr_wave_data *wd)
{
struct umr_shaders_pgm *pshader, *shader;
@@ -79,7 +79,7 @@ void umr_dump_shaders(struct umr_asic *asic, struct umr_ring_decoder *decoder)
BLUE, (unsigned)shader->vmid, RST,
YELLOW, (unsigned long long)shader->src.ib_base, RST,
YELLOW, (unsigned)shader->src.ib_offset * 4, RST);
- umr_vm_disasm(asic, shader->vmid, shader->addr, 0, shader->size);
+ umr_vm_disasm(asic, shader->vmid, shader->addr, 0, shader->size, wd);
printf("\n");
pshader = shader->next;
free(shader);
diff --git a/src/lib/umr_llvm_disasm.c b/src/lib/umr_llvm_disasm.c
index 68f23f990fd2..5e1adf39a262 100644
--- a/src/lib/umr_llvm_disasm.c
+++ b/src/lib/umr_llvm_disasm.c
@@ -85,10 +85,31 @@ int umr_llvm_disasm(struct umr_asic *asic,
return 0;
}
-void umr_vm_disasm(struct umr_asic *asic, unsigned vmid, uint64_t addr, uint64_t PC, uint32_t size)
+static struct umr_wave_data *find_wave(struct umr_wave_data *wd, unsigned vmid, uint64_t addr)
{
- uint32_t *opcodes, x;
+ while (wd) {
+ uint64_t PC;
+ PC = ((uint64_t)wd->ws.pc_hi << 32) | wd->ws.pc_lo;
+ if (wd->ws.hw_id.vm_id == vmid && addr == PC)
+ break;
+ wd = wd->next;
+ }
+ return wd;
+}
+
+
+void umr_vm_disasm(struct umr_asic *asic, unsigned vmid, uint64_t addr, uint64_t PC, uint32_t size, struct umr_wave_data *wd)
+{
+ uint32_t *opcodes, x, nwave, wavehits;
char **opcode_strs = NULL;
+ struct umr_wave_data *pwd;
+
+ wavehits = nwave = 0;
+ pwd = wd;
+ while (pwd) {
+ ++nwave;
+ pwd = pwd->next;
+ }
opcodes = calloc(size/4, sizeof(*opcodes));
if (!opcodes)
@@ -106,14 +127,35 @@ void umr_vm_disasm(struct umr_asic *asic, unsigned vmid, uint64_t addr, uint64_t
printf(" * ");
else
printf(" ");
- printf("pgm[%s%lu%s@%s0x%llx%s + %s0x%-4x%s] = %s0x%08lx%s\t%s%s%s\n",
+ printf("pgm[%s%lu%s@%s0x%llx%s + %s0x%-4x%s] = %s0x%08lx%s\t%s%-60s%s\t",
BLUE, (unsigned long)vmid, RST,
YELLOW, (unsigned long long)addr, RST,
YELLOW, (unsigned)x * 4, RST,
BLUE, (unsigned long)opcodes[x], RST,
GREEN, opcode_strs[x], RST);
free(opcode_strs[x]);
+
+ if (wd) {
+ unsigned n;
+ pwd = find_wave(wd, vmid, addr + x * 4);
+ n = 0;
+ while (pwd) {
+ ++n;
+ ++wavehits;
+ if (asic->options.bitfields)
+ printf("[se%u.sh%u.cu%u.simd%u.wave%u] ",
+ (unsigned)pwd->se, (unsigned)pwd->sh, (unsigned)pwd->cu, (unsigned)pwd->ws.hw_id.simd_id, (unsigned)pwd->ws.hw_id.wave_id);
+ pwd = find_wave(pwd->next, vmid, addr + x * 4);
+ }
+ if (n)
+ printf("[%3u waves (%3u %%)]", n, (n * 100) / nwave);
+ }
+ printf("\n");
}
+ printf("End of disassembly.\n");
+
+ if (wd && wavehits)
+ printf("\t%u waves in this shader (out of %u active waves)\n", wavehits, nwave);
free(opcode_strs);
free(opcodes);
diff --git a/src/umr.h b/src/umr.h
index e99ee965527e..f026e82be98e 100644
--- a/src/umr.h
+++ b/src/umr.h
@@ -621,12 +621,13 @@ int umr_sq_cmd_halt_waves(struct umr_asic *asic, enum umr_sq_cmd_halt_resume mod
/* IB/ring decoding/dumping/etc */
void umr_print_decode(struct umr_asic *asic, struct umr_ring_decoder *decoder, uint32_t ib);
void umr_dump_ib(struct umr_asic *asic, struct umr_ring_decoder *decoder);
-void umr_dump_shaders(struct umr_asic *asic, struct umr_ring_decoder *decoder);
+void umr_dump_shaders(struct umr_asic *asic, struct umr_ring_decoder *decoder, struct umr_wave_data *wd);
int umr_llvm_disasm(struct umr_asic *asic,
uint8_t *inst, unsigned inst_bytes,
uint64_t PC,
char **disasm_text);
+void umr_vm_disasm(struct umr_asic *asic, unsigned vmid, uint64_t addr, uint64_t PC, uint32_t size, struct umr_wave_data *wd);
uint32_t umr_compute_shader_size(struct umr_asic *asic,
struct umr_shaders_pgm *shader);
diff --git a/src/umrapp.h b/src/umrapp.h
index 2f52d3093abe..e11a7d6e53f5 100644
--- a/src/umrapp.h
+++ b/src/umrapp.h
@@ -48,6 +48,4 @@ void umr_top(struct umr_asic *asic);
void umr_print_config(struct umr_asic *asic);
void umr_print_waves(struct umr_asic *asic);
-
-void umr_app_disasm(struct umr_asic *asic);
-void umr_vm_disasm(struct umr_asic *asic, unsigned vmid, uint64_t addr, uint64_t PC, uint32_t size);
+void umr_profiler(struct umr_asic *asic, int samples, int delay);
--
2.14.3
More information about the amd-gfx
mailing list