[PATCH umr] Add profiler using scanned wave data arrays.

Tom St Denis tom.stdenis at amd.com
Thu Apr 26 16:37:01 UTC 2018


Signed-off-by: Tom St Denis <tom.stdenis at amd.com>
---
 doc/sphinx/source/index.rst    |   1 +
 doc/sphinx/source/profiler.rst |  36 ++++++++++++
 doc/umr.1                      |   4 ++
 src/app/CMakeLists.txt         |   1 +
 src/app/main.c                 |  15 ++++-
 src/app/print_waves.c          |   4 +-
 src/app/profile.c              | 128 +++++++++++++++++++++++++++++++++++++++++
 src/app/ring_read.c            |  12 +++-
 src/lib/dump_ib.c              |   4 +-
 src/lib/umr_llvm_disasm.c      |  48 +++++++++++++++-
 src/umr.h                      |   3 +-
 src/umrapp.h                   |   4 +-
 12 files changed, 246 insertions(+), 14 deletions(-)
 create mode 100644 doc/sphinx/source/profiler.rst
 create mode 100644 src/app/profile.c

diff --git a/doc/sphinx/source/index.rst b/doc/sphinx/source/index.rst
index fd8b2561e570..fec89140db70 100644
--- a/doc/sphinx/source/index.rst
+++ b/doc/sphinx/source/index.rst
@@ -15,6 +15,7 @@ UMR: User Mode Register Debugger
    basic
    register_access
    wave_status
+   profiler
    vm_decoding
    ring
    top
diff --git a/doc/sphinx/source/profiler.rst b/doc/sphinx/source/profiler.rst
new file mode 100644
index 000000000000..0e44cfd2825d
--- /dev/null
+++ b/doc/sphinx/source/profiler.rst
@@ -0,0 +1,36 @@
+=========
+Profiling
+=========
+
+When testing a shader compiler and/or a specific shader,
+a profile of where the GPU tends to spend time can be generated with
+the umr "--profiler" command:
+
+::
+
+	--profiler <nsamples> <usec_delay>
+
+This captures 'nsamples' wave samples with a delay of at
+least 'usec_delay' microseconds between them.  The output then lists
+the sampled addresses and opcodes in descending order of hit count.
+For example,
+
+::
+
+	 2865 hits (13 %)       2 at 0x100009c68    0xc4001c0f 0x00000100          exp mrt0 v0, v0, v1, v1 done compr vm
+	 1199 hits ( 5 %)       2 at 0x1055e9724    0xc40008cf 0x0f090706          exp pos0 v6, v7, v9, v15 done
+	 1155 hits ( 5 %)       2 at 0x100009c48    0xbf8c0f70 0x16000080          s_waitcnt vmcnt(0)
+	  710 hits ( 3 %)       2 at 0x10000acf0    0xc4001c0f 0x00000100          exp mrt0 v0, v0, v1, v1 done compr vm
+	  633 hits ( 3 %)       2 at 0x1023f14c4    0xc400040f 0x00000100          exp mrt0 v0, v0, v1, v1 compr
+	  633 hits ( 3 %)       2 at 0x100008d64    0xbf8c0f70 0x0a161b12          s_waitcnt vmcnt(0)
+	  617 hits ( 2 %)       2 at 0x10000a238    0xf0800700 0x00020400          image_sample v[4:6], v0, s[8:15], s[0:3] dmask:0x7
+	...<snip>...
+
+The first row indicates that the opcode at VMID 2, address 0x100009C68, had
+waves halted there 2865 times (13% of all captured wave data).  The next
+columns show the raw opcode data and the last column is the LLVM disassembly
+of the opcode.
+
+When testing a known shader this can be used to determine where
+the bulk of the processing time is spent.
+
diff --git a/doc/umr.1 b/doc/umr.1
index f1f5fec55946..a777d9312054 100644
--- a/doc/umr.1
+++ b/doc/umr.1
@@ -118,6 +118,10 @@ from stdin.
 Disassemble 'size' bytes (in hex) from a given address (in hex).  The size can be
 specified as zero to have umr try and compute the shader size.
 
+.IP "--profiler, -prof <nsamples> <usec_delay>"
+Capture 'nsamples' samples of wave data with at least usec_delay microseconds
+between captures.
+
 .IP "--update, -u" <filename>
 Specify update file to add, change, or delete registers from the register
 database.  Useful for adding registers that are not including in the kernel headers.
diff --git a/src/app/CMakeLists.txt b/src/app/CMakeLists.txt
index 4dceebb00e0d..7512a54f68bf 100644
--- a/src/app/CMakeLists.txt
+++ b/src/app/CMakeLists.txt
@@ -6,6 +6,7 @@ project(umr)
 add_library(umrapp
   print.c
   print_config.c
+  profile.c
   ring_read.c
   scan.c
   scan_log.c
diff --git a/src/app/main.c b/src/app/main.c
index 600f3ca02988..d6571e77b74d 100644
--- a/src/app/main.c
+++ b/src/app/main.c
@@ -495,13 +495,23 @@ int main(int argc, char **argv)
 					shader.addr = address;
 					size = umr_compute_shader_size(asic, &shader);
 				}
-				umr_vm_disasm(asic, vmid, address, 0, size);
+				umr_vm_disasm(asic, vmid, address, 0, size, NULL);
 
 				i += 2;
 			} else {
 				printf("--vm-disasm requires two parameters\n");
 				return EXIT_FAILURE;
 			}
+		} else if (!strcmp(argv[i], "-prof") || !strcmp(argv[i], "--profiler")) {
+			if (i + 2 < argc) {
+				if (!asic)
+					asic = get_asic();
+				umr_profiler(asic, atoi(argv[i+1]), atoi(argv[i+2]));
+				i += 2;
+			} else {
+				printf("--profiler requires two parameters\n");
+				return EXIT_FAILURE;
+			}
 		} else if (!strcmp(argv[i], "--option") || !strcmp(argv[i], "-O")) {
 			if (i + 1 < argc) {
 				parse_options(argv[i+1]);
@@ -581,6 +591,9 @@ int main(int argc, char **argv)
 "\n\t--vm-disasm, -vdis [<vmid>@]<address> <size>"
 	"\n\t\tDisassemble 'size' bytes (in hex) from a given address (in hex).  The size can"
 	"\n\t\tbe specified as zero to have umr try and compute the shader size.\n"
+"\n\t--profiler, -prof <nsamples> <usec_delay>"
+	"\n\t\tCapture 'nsamples' samples of wave data with at least usec_delay"
+	"\n\t\tmicroseconds between captures.\n"
 "\n\t--option -O <string>[,<string>,...]\n\t\tEnable various flags: bits, bitsfull, empty_log, follow, no_follow_ib, named, many,"
 	"\n\t\tuse_pci, use_colour, read_smc, quiet, no_kernel, verbose, halt_waves, disasm_early_term.\n"
 "\n\n", UMR_BUILD_VER, UMR_BUILD_REV);
diff --git a/src/app/print_waves.c b/src/app/print_waves.c
index d901bc902ff3..6965f7f31854 100644
--- a/src/app/print_waves.c
+++ b/src/app/print_waves.c
@@ -100,7 +100,7 @@ void umr_print_waves(struct umr_asic *asic)
 			}
 
 			pgm_addr = (((uint64_t)wd->ws.pc_hi << 32) | wd->ws.pc_lo) - (NUM_OPCODE_WORDS*4)/2;
-			umr_vm_disasm(asic, wd->ws.hw_id.vm_id, pgm_addr, (((uint64_t)wd->ws.pc_hi << 32) | wd->ws.pc_lo), NUM_OPCODE_WORDS*4);
+			umr_vm_disasm(asic, wd->ws.hw_id.vm_id, pgm_addr, (((uint64_t)wd->ws.pc_hi << 32) | wd->ws.pc_lo), NUM_OPCODE_WORDS*4, NULL);
 		} else {
 			first = 0;
 			printf("\n------------------------------------------------------\nse%u.sh%u.cu%u.simd%u.wave%u\n",
@@ -222,7 +222,7 @@ void umr_print_waves(struct umr_asic *asic)
 
 			printf("\n\nPGM_MEM:\n");
 			pgm_addr = (((uint64_t)wd->ws.pc_hi << 32) | wd->ws.pc_lo) - (NUM_OPCODE_WORDS*4)/2;
-			umr_vm_disasm(asic, wd->ws.hw_id.vm_id, pgm_addr, (((uint64_t)wd->ws.pc_hi << 32) | wd->ws.pc_lo), NUM_OPCODE_WORDS*4);
+			umr_vm_disasm(asic, wd->ws.hw_id.vm_id, pgm_addr, (((uint64_t)wd->ws.pc_hi << 32) | wd->ws.pc_lo), NUM_OPCODE_WORDS*4, NULL);
 
 			Hv("LDS_ALLOC", wd->ws.lds_alloc.value);
 			PP(lds_alloc, lds_base);
diff --git a/src/app/profile.c b/src/app/profile.c
new file mode 100644
index 000000000000..3ba3b36efe64
--- /dev/null
+++ b/src/app/profile.c
@@ -0,0 +1,128 @@
+/*
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: Tom St Denis <tom.stdenis at amd.com>
+ *
+ */
+#include "umrapp.h"
+
+/*
+ * One captured wave sample: the VMID it ran under, the two instruction
+ * dwords taken from the wave status, and the 64-bit program counter.
+ * NOTE(review): three 32-bit members ahead of a 64-bit one usually leave
+ * padding before 'pc'; byte-wise comparisons of this struct include it.
+ */
+struct umr_profiler_hit {
+	uint32_t vmid;
+	uint32_t inst_dw0;
+	uint32_t inst_dw1;
+	uint64_t pc;
+};
+
+/* Run-length record built after sorting: a distinct hit plus its count. */
+struct umr_profiler_rle {
+	/* the sample shared by every member of the run */
+	struct umr_profiler_hit data;
+	/* how many captured samples matched 'data' */
+	uint32_t cnt;
+};
+
+/*
+ * qsort() comparator ordering hits by vmid, then pc, then instruction dwords.
+ * Returns <0, 0 or >0 in the usual way.
+ *
+ * Members are compared one by one rather than with memcmp() over the whole
+ * struct: padding bytes hold unspecified values, so a byte-wise comparison
+ * can report two identical samples as different.
+ */
+static int comp_hits(const void *A, const void *B)
+{
+	const struct umr_profiler_hit *a = A, *b = B;
+
+	if (a->vmid != b->vmid)
+		return a->vmid < b->vmid ? -1 : 1;
+	if (a->pc != b->pc)
+		return a->pc < b->pc ? -1 : 1;
+	if (a->inst_dw0 != b->inst_dw0)
+		return a->inst_dw0 < b->inst_dw0 ? -1 : 1;
+	if (a->inst_dw1 != b->inst_dw1)
+		return a->inst_dw1 < b->inst_dw1 ? -1 : 1;
+	return 0;
+}
+
+/*
+ * qsort() comparator that places larger hit counts first (descending order).
+ *
+ * The counts are compared explicitly instead of returning b->cnt - a->cnt:
+ * that unsigned difference is converted to int and carries the wrong sign
+ * once the two counts differ by more than INT_MAX.
+ */
+static int comp_rle(const void *A, const void *B)
+{
+	const struct umr_profiler_rle *a = A, *b = B;
+
+	if (a->cnt == b->cnt)
+		return 0;
+	return a->cnt > b->cnt ? -1 : 1;
+}
+
+/*
+ * umr_profiler - sample halted waves and print a histogram of where they were.
+ *
+ * @asic:    device whose waves are sampled
+ * @samples: number of captures to take; values <= 0 capture nothing
+ * @delay:   microseconds the waves are left running before each capture
+ *
+ * Each capture resumes the waves, sleeps, halts them and records the VMID,
+ * program counter and instruction dwords of every wave returned by
+ * umr_scan_wave_data().  A capture that returns no waves is retried, so this
+ * blocks until 'samples' non-empty captures were taken (or memory runs out,
+ * in which case the profile is truncated).  Waves are resumed, and then the
+ * distinct hits are printed, most frequent first, with their disassembly.
+ */
+void umr_profiler(struct umr_asic *asic, int samples, int delay)
+{
+	struct umr_profiler_hit *ophit, *phit;
+	struct umr_profiler_rle *prle = NULL;
+	struct umr_wave_data *owd, *wd;
+	unsigned nitems = 0, nmax, x, y, z;
+	int truncated = 0;
+
+	if (samples < 0)
+		samples = 0;
+
+	/* always allocate at least one slot so a zero-size calloc() is never
+	 * mistaken for an allocation failure */
+	nmax = samples ? (unsigned)samples : 1;
+	phit = calloc(nmax, sizeof *phit);
+	if (!phit) {
+		fprintf(stderr, "umr_profiler: out of memory\n");
+		return;
+	}
+
+	while (samples-- > 0) {
+		fprintf(stderr, "%5u samples left\r", (unsigned)samples);
+		fflush(stderr);
+		do {
+			umr_sq_cmd_halt_waves(asic, UMR_SQ_CMD_RESUME);
+			if (delay > 0)
+				usleep(delay);
+			umr_sq_cmd_halt_waves(asic, UMR_SQ_CMD_HALT);
+			wd = umr_scan_wave_data(asic);
+		} while (!wd);
+
+		/* record every wave of this capture, freeing the list as we go */
+		for (; wd; wd = owd) {
+			owd = wd->next;
+
+			if (nitems == nmax && !truncated) {
+				/* grow through a temporary so the samples gathered so
+				 * far survive a failed realloc() */
+				ophit = realloc(phit, ((size_t)nmax + 1000) * sizeof(*phit));
+				if (ophit) {
+					phit = ophit;
+					nmax += 1000;
+				} else {
+					fprintf(stderr, "\numr_profiler: out of memory, profile truncated\n");
+					truncated = 1;
+					samples = 0; /* finish this list, then stop sampling */
+				}
+			}
+
+			if (nitems < nmax) {
+				/* zero the slot first: storage added by realloc() is
+				 * uninitialised and hits may be compared byte-wise */
+				memset(&phit[nitems], 0, sizeof(*phit));
+				phit[nitems].vmid = wd->ws.hw_id.vm_id;
+				phit[nitems].inst_dw0 = wd->ws.wave_inst_dw0;
+				phit[nitems].inst_dw1 = wd->ws.wave_inst_dw1;
+				phit[nitems++].pc = ((uint64_t)wd->ws.pc_hi << 32) | wd->ws.pc_lo;
+			}
+			free(wd);
+		}
+	}
+	umr_sq_cmd_halt_waves(asic, UMR_SQ_CMD_RESUME);
+
+	if (nitems)
+		prle = calloc(nitems, sizeof *prle);
+	if (!prle) {
+		if (nitems)
+			fprintf(stderr, "umr_profiler: out of memory\n");
+		goto out;
+	}
+
+	/* sort so identical hits are adjacent, then run-length encode them;
+	 * the x == nitems pass flushes the final run, which would otherwise
+	 * be lost */
+	qsort(phit, nitems, sizeof(*phit), comp_hits);
+	for (z = y = 0, x = 1; x <= nitems; x++) {
+		if (x == nitems || comp_hits(&phit[x], &phit[y])) {
+			prle[z].data = phit[y];
+			prle[z++].cnt = x - y;
+			y = x;
+		}
+	}
+
+	qsort(prle, z, sizeof(*prle), comp_rle);
+	for (x = 0; x < z; x++) {
+		char *str[2];
+		unsigned char buf[8];
+
+		/* two slots: the 8 bytes handed to the disassembler span two dwords */
+		memset(str, 0, sizeof(str));
+		memcpy(buf, &prle[x].data.inst_dw0, 4);
+		memcpy(buf + 4, &prle[x].data.inst_dw1, 4);
+		umr_llvm_disasm(asic, buf, 8, 0, &str[0]);
+
+		/* 64-bit multiply keeps the percentage exact for very large counts;
+		 * the "" fallback guards against the disassembler producing no text */
+		printf("%5u hits (%2u %%)\t%u at 0x%llx\t 0x%08lx 0x%08lx\t%s\n",
+			prle[x].cnt,
+			(unsigned)(((uint64_t)prle[x].cnt * 100) / nitems),
+			(unsigned)prle[x].data.vmid,
+			(unsigned long long)prle[x].data.pc,
+			(unsigned long)prle[x].data.inst_dw0,
+			(unsigned long)prle[x].data.inst_dw1,
+			str[0] ? str[0] : "");
+		free(str[0]);
+		free(str[1]);
+	}
+
+out:
+	free(prle);
+	free(phit);
+}
diff --git a/src/app/ring_read.c b/src/app/ring_read.c
index 3ccec1be6d90..112e9f0414ad 100644
--- a/src/app/ring_read.c
+++ b/src/app/ring_read.c
@@ -32,6 +32,7 @@ void umr_read_ring(struct umr_asic *asic, char *ringpath)
 	uint32_t wptr, rptr, drv_wptr, ringsize, start, end, value,
 		 *ring_data;
 	struct umr_ring_decoder decoder, *pdecoder, *ppdecoder;
+	struct umr_wave_data *wd;
 
 	memset(ringname, 0, sizeof ringname);
 	memset(from, 0, sizeof from);
@@ -146,18 +147,25 @@ void umr_read_ring(struct umr_asic *asic, char *ringpath)
 	free(ring_data);
 	printf("\n");
 
-	umr_dump_shaders(asic, &decoder);
+	wd = umr_scan_wave_data(asic);
+	umr_dump_shaders(asic, &decoder, wd);
 	pdecoder = decoder.next_ib;
 	while (pdecoder) {
 		if (asic->options.follow_ib) {
 			umr_dump_ib(asic, pdecoder);
-			umr_dump_shaders(asic, pdecoder);
+			umr_dump_shaders(asic, pdecoder, wd);
 		}
 		ppdecoder = pdecoder->next_ib;
 		free(pdecoder);
 		pdecoder = ppdecoder;
 	}
 
+	while (wd) {
+		struct umr_wave_data *pwd = wd->next;
+		free(wd);
+		wd = pwd;
+	}
+
 end:
 	if (asic->options.halt_waves)
 		umr_sq_cmd_halt_waves(asic, UMR_SQ_CMD_RESUME);
diff --git a/src/lib/dump_ib.c b/src/lib/dump_ib.c
index cdcbb8a70edd..d5e68d6981a0 100644
--- a/src/lib/dump_ib.c
+++ b/src/lib/dump_ib.c
@@ -67,7 +67,7 @@ void umr_dump_ib(struct umr_asic *asic, struct umr_ring_decoder *decoder)
 	printf("End of IB\n\n");
 }
 
-void umr_dump_shaders(struct umr_asic *asic, struct umr_ring_decoder *decoder)
+void umr_dump_shaders(struct umr_asic *asic, struct umr_ring_decoder *decoder, struct umr_wave_data *wd)
 {
 	struct umr_shaders_pgm *pshader, *shader;
 
@@ -79,7 +79,7 @@ void umr_dump_shaders(struct umr_asic *asic, struct umr_ring_decoder *decoder)
 				BLUE, (unsigned)shader->vmid, RST,
 				YELLOW, (unsigned long long)shader->src.ib_base, RST,
 				YELLOW, (unsigned)shader->src.ib_offset * 4, RST);
-		umr_vm_disasm(asic, shader->vmid, shader->addr, 0, shader->size);
+		umr_vm_disasm(asic, shader->vmid, shader->addr, 0, shader->size, wd);
 		printf("\n");
 		pshader = shader->next;
 		free(shader);
diff --git a/src/lib/umr_llvm_disasm.c b/src/lib/umr_llvm_disasm.c
index 68f23f990fd2..5e1adf39a262 100644
--- a/src/lib/umr_llvm_disasm.c
+++ b/src/lib/umr_llvm_disasm.c
@@ -85,10 +85,31 @@ int umr_llvm_disasm(struct umr_asic *asic,
 	return 0;
 }
 
-void umr_vm_disasm(struct umr_asic *asic, unsigned vmid, uint64_t addr, uint64_t PC, uint32_t size)
+/* Return the first wave at or after 'wd' whose VMID and PC match, else NULL. */
+static struct umr_wave_data *find_wave(struct umr_wave_data *wd, unsigned vmid, uint64_t addr)
 {
-	uint32_t *opcodes, x;
+	for (; wd; wd = wd->next) {
+		uint64_t wave_pc = ((uint64_t)wd->ws.pc_hi << 32) | wd->ws.pc_lo;
+
+		if (wd->ws.hw_id.vm_id == vmid && wave_pc == addr)
+			return wd;
+	}
+	return NULL;
+}
+
+
+void umr_vm_disasm(struct umr_asic *asic, unsigned vmid, uint64_t addr, uint64_t PC, uint32_t size, struct umr_wave_data *wd)
+{
+	uint32_t *opcodes, x, nwave, wavehits;
 	char **opcode_strs = NULL;
+	struct umr_wave_data *pwd;
+
+	wavehits = nwave = 0;
+	pwd = wd;
+	while (pwd) {
+		++nwave;
+		pwd = pwd->next;
+	}
 
 	opcodes = calloc(size/4, sizeof(*opcodes));
 	if (!opcodes)
@@ -106,14 +127,35 @@ void umr_vm_disasm(struct umr_asic *asic, unsigned vmid, uint64_t addr, uint64_t
 			printf(" * ");
 		else
 			printf("   ");
-		printf("pgm[%s%lu%s@%s0x%llx%s + %s0x%-4x%s] = %s0x%08lx%s\t%s%s%s\n",
+		printf("pgm[%s%lu%s@%s0x%llx%s + %s0x%-4x%s] = %s0x%08lx%s\t%s%-60s%s\t",
 			BLUE, (unsigned long)vmid, RST,
 			YELLOW, (unsigned long long)addr, RST,
 			YELLOW, (unsigned)x * 4, RST,
 			BLUE, (unsigned long)opcodes[x], RST,
 			GREEN, opcode_strs[x], RST);
 		free(opcode_strs[x]);
+
+		if (wd) {
+			unsigned n;
+			pwd = find_wave(wd, vmid, addr + x * 4);
+			n = 0;
+			while (pwd) {
+				++n;
+				++wavehits;
+				if (asic->options.bitfields)
+					printf("[se%u.sh%u.cu%u.simd%u.wave%u] ",
+						(unsigned)pwd->se, (unsigned)pwd->sh, (unsigned)pwd->cu, (unsigned)pwd->ws.hw_id.simd_id, (unsigned)pwd->ws.hw_id.wave_id);
+				pwd = find_wave(pwd->next, vmid, addr + x * 4);
+			}
+			if (n)
+				printf("[%3u waves (%3u %%)]", n, (n * 100) / nwave);
+		}
+		printf("\n");
 	}
+	printf("End of disassembly.\n");
+
+	if (wd && wavehits)
+		printf("\t%u waves in this shader (out of %u active waves)\n", wavehits, nwave);
 
 	free(opcode_strs);
 	free(opcodes);
diff --git a/src/umr.h b/src/umr.h
index e99ee965527e..f026e82be98e 100644
--- a/src/umr.h
+++ b/src/umr.h
@@ -621,12 +621,13 @@ int umr_sq_cmd_halt_waves(struct umr_asic *asic, enum umr_sq_cmd_halt_resume mod
 /* IB/ring decoding/dumping/etc */
 void umr_print_decode(struct umr_asic *asic, struct umr_ring_decoder *decoder, uint32_t ib);
 void umr_dump_ib(struct umr_asic *asic, struct umr_ring_decoder *decoder);
-void umr_dump_shaders(struct umr_asic *asic, struct umr_ring_decoder *decoder);
+void umr_dump_shaders(struct umr_asic *asic, struct umr_ring_decoder *decoder, struct umr_wave_data *wd);
 
 int umr_llvm_disasm(struct umr_asic *asic,
 					uint8_t *inst, unsigned inst_bytes,
 					uint64_t PC,
 					char **disasm_text);
+void umr_vm_disasm(struct umr_asic *asic, unsigned vmid, uint64_t addr, uint64_t PC, uint32_t size, struct umr_wave_data *wd);
 uint32_t umr_compute_shader_size(struct umr_asic *asic,
 								 struct umr_shaders_pgm *shader);
 
diff --git a/src/umrapp.h b/src/umrapp.h
index 2f52d3093abe..e11a7d6e53f5 100644
--- a/src/umrapp.h
+++ b/src/umrapp.h
@@ -48,6 +48,4 @@ void umr_top(struct umr_asic *asic);
 
 void umr_print_config(struct umr_asic *asic);
 void umr_print_waves(struct umr_asic *asic);
-
-void umr_app_disasm(struct umr_asic *asic);
-void umr_vm_disasm(struct umr_asic *asic, unsigned vmid, uint64_t addr, uint64_t PC, uint32_t size);
+void umr_profiler(struct umr_asic *asic, int samples, int delay);
-- 
2.14.3



More information about the amd-gfx mailing list