[Intel-gfx] [RFC i-g-t] intel-gpu-top: Rewrite the tool to be safe to use

Petri Latvala petri.latvala at intel.com
Thu Mar 29 08:20:31 UTC 2018


Eero, can you give this a try and provide some comments?



-- 
Petri Latvala



On 03/28/2018 09:29 PM, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>
> intel-gpu-top is a dangerous tool which can hang machines due to unsafe mmio
> register access. This patch rewrites it to use only PMU.
>
> Only overall command streamer busyness and GPU global data such as power
> and frequencies are included in this new version.
>
> For access to more GPU functional unit level data, an OA metric based tool
> like gpu-top should be used instead.
>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> Cc: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Lionel Landwerlin <lionel.g.landwerlin at intel.com>
> Cc: Petri Latvala <petri.latvala at intel.com>
> ---
>   tools/Makefile.am     |   2 +
>   tools/intel_gpu_top.c | 982 +++++++++++++++++++++-----------------------------
>   tools/meson.build     |   6 +-
>   3 files changed, 413 insertions(+), 577 deletions(-)
>
> diff --git a/tools/Makefile.am b/tools/Makefile.am
> index 09b6dbcc3ece..a0b016ddd7ff 100644
> --- a/tools/Makefile.am
> +++ b/tools/Makefile.am
> @@ -28,6 +28,8 @@ intel_aubdump_la_LDFLAGS = -module -avoid-version -no-undefined
>   intel_aubdump_la_SOURCES = aubdump.c
>   intel_aubdump_la_LIBADD = $(top_builddir)/lib/libintel_tools.la -ldl
>   
> +intel_gpu_top_LDADD = $(top_builddir)/lib/libigt_perf.la
> +
>   bin_SCRIPTS = intel_aubdump
>   CLEANFILES = $(bin_SCRIPTS)
>   
> diff --git a/tools/intel_gpu_top.c b/tools/intel_gpu_top.c
> index 098e6ce3ff86..4eef634eb436 100644
> --- a/tools/intel_gpu_top.c
> +++ b/tools/intel_gpu_top.c
> @@ -1,6 +1,5 @@
>   /*
> - * Copyright © 2007 Intel Corporation
> - * Copyright © 2011 Intel Corporation
> + * Copyright © 2018 Intel Corporation
>    *
>    * Permission is hereby granted, free of charge, to any person obtaining a
>    * copy of this software and associated documentation files (the "Software"),
> @@ -18,701 +17,532 @@
>    * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
>    * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
>    * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> - * DEALINGS IN THE SOFTWARE.
> - *
> - * Authors:
> - *    Eric Anholt <eric at anholt.net>
> - *    Eugeni Dodonov <eugeni.dodonov at intel.com>
> - *
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
>    */
>   
> -#include "config.h"
> -
> -#include <inttypes.h>
> -#include <unistd.h>
> -#include <stdlib.h>
>   #include <stdio.h>
> -#include <err.h>
> -#include <sys/ioctl.h>
> -#include <sys/time.h>
> -#include <sys/wait.h>
> +#include <sys/types.h>
> +#include <dirent.h>
> +#include <stdint.h>
> +#include <assert.h>
>   #include <string.h>
> -#ifdef HAVE_TERMIOS_H
> -#include <termios.h>
> -#endif
> -#include "intel_io.h"
> -#include "instdone.h"
> -#include "intel_reg.h"
> -#include "intel_chipset.h"
> -#include "drmtest.h"
> -
> -#define  FORCEWAKE	    0xA18C
> -#define  FORCEWAKE_ACK	    0x130090
> -
> -#define SAMPLES_PER_SEC             10000
> -#define SAMPLES_TO_PERCENT_RATIO    (SAMPLES_PER_SEC / 100)
> -
> -#define MAX_NUM_TOP_BITS            100
> -
> -#define HAS_STATS_REGS(devid)		IS_965(devid)
> -
> -struct top_bit {
> -	struct instdone_bit *bit;
> -	int count;
> -} top_bits[MAX_NUM_TOP_BITS];
> -struct top_bit *top_bits_sorted[MAX_NUM_TOP_BITS];
> -
> -static uint32_t instdone, instdone1;
> -
> -static const char *bars[] = {
> -	" ",
> -	"▏",
> -	"▎",
> -	"▍",
> -	"▌",
> -	"▋",
> -	"▊",
> -	"▉",
> -	"█"
> -};
> +#include <ctype.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <sys/stat.h>
> +#include <fcntl.h>
> +#include <inttypes.h>
> +#include <sys/ioctl.h>
> +#include <errno.h>
> +#include <math.h>
> +#include <locale.h>
> +
> +#include "igt_perf.h"
>   
> -enum stats_counts {
> -	IA_VERTICES,
> -	IA_PRIMITIVES,
> -	VS_INVOCATION,
> -	GS_INVOCATION,
> -	GS_PRIMITIVES,
> -	CL_INVOCATION,
> -	CL_PRIMITIVES,
> -	PS_INVOCATION,
> -	PS_DEPTH,
> -	STATS_COUNT
> +struct pmu_pair {
> +	uint64_t cur;
> +	uint64_t prev;
>   };
>   
> -const uint32_t stats_regs[STATS_COUNT] = {
> -	IA_VERTICES_COUNT_QW,
> -	IA_PRIMITIVES_COUNT_QW,
> -	VS_INVOCATION_COUNT_QW,
> -	GS_INVOCATION_COUNT_QW,
> -	GS_PRIMITIVES_COUNT_QW,
> -	CL_INVOCATION_COUNT_QW,
> -	CL_PRIMITIVES_COUNT_QW,
> -	PS_INVOCATION_COUNT_QW,
> -	PS_DEPTH_COUNT_QW,
> +struct pmu_counter {
> +	uint64_t config;
> +	unsigned int idx;
> +	struct pmu_pair val;
>   };
>   
> -const char *stats_reg_names[STATS_COUNT] = {
> -	"vert fetch",
> -	"prim fetch",
> -	"VS invocations",
> -	"GS invocations",
> -	"GS prims",
> -	"CL invocations",
> -	"CL prims",
> -	"PS invocations",
> -	"PS depth pass",
> +struct engine {
> +	const char *name;
> +	struct pmu_counter busy;
> +	struct pmu_counter wait;
> +	struct pmu_counter sema;
>   };
>   
> -uint64_t stats[STATS_COUNT];
> -uint64_t last_stats[STATS_COUNT];
> +struct engines {
> +	unsigned int num_engines;
> +	unsigned int num_counters;
> +	DIR *root;
> +	int fd;
> +	struct pmu_pair ts;
>   
> -static unsigned long
> -gettime(void)
> -{
> -    struct timeval t;
> -    gettimeofday(&t, NULL);
> -    return (t.tv_usec + (t.tv_sec * 1000000));
> -}
> +	int rapl_fd;
> +	double rapl_scale;
>   
> -static int
> -top_bits_sort(const void *a, const void *b)
> +	struct pmu_counter freq_req;
> +	struct pmu_counter freq_act;
> +	struct pmu_counter irq;
> +	struct pmu_counter rc6;
> +	struct pmu_counter rapl;
> +
> +	struct engine engine;
> +};
> +
> +static uint64_t
> +get_pmu_config(int dirfd, const char *name, const char *counter)
>   {
> -	struct top_bit * const *bit_a = a;
> -	struct top_bit * const *bit_b = b;
> -	int a_count = (*bit_a)->count;
> -	int b_count = (*bit_b)->count;
> +	char buf[128], *p;
> +	int fd, ret;
>   
> -	if (a_count < b_count)
> -		return 1;
> -	else if (a_count == b_count)
> -		return 0;
> -	else
> +	ret = snprintf(buf, sizeof(buf), "%s-%s", name, counter);
> +	if (ret < 0 || ret == sizeof(buf))
>   		return -1;
> -}
>   
> -static void
> -update_idle_bit(struct top_bit *top_bit)
> -{
> -	uint32_t reg_val;
> +	fd = openat(dirfd, buf, O_RDONLY);
> +	if (fd < 0)
> +		return -1;
>   
> -	if (top_bit->bit->reg == INSTDONE_1)
> -		reg_val = instdone1;
> -	else
> -		reg_val = instdone;
> +	ret = read(fd, buf, sizeof(buf));
> +	close(fd);
> +	if (ret <= 0)
> +		return -1;
>   
> -	if ((reg_val & top_bit->bit->bit) == 0)
> -		top_bit->count++;
> -}
> +	p = index(buf, '0');
> +	if (!p)
> +		return -1;
>   
> -static void
> -print_clock(const char *name, int clock) {
> -	if (clock == -1)
> -		printf("%s clock: unknown", name);
> -	else
> -		printf("%s clock: %d Mhz", name, clock);
> +	return strtoul(p, NULL, 0);
>   }
>   
> -static int
> -print_clock_info(struct pci_device *pci_dev)
> +#define engine_ptr(engines, n) \
> +	((struct engine *)((unsigned char *)(&engines->engine) + (n) * sizeof(struct engine)))
> +
> +static struct engines *discover_engines(void)
>   {
> -	uint32_t devid = pci_dev->device_id;
> -	uint16_t gcfgc;
> +	const char *sysfs_root = "/sys/devices/i915/events";
> +	struct engines *engines;
> +	struct dirent *dent;
> +	int ret = 0;
> +	DIR *d;
>   
> -	if (IS_GM45(devid)) {
> -		int core_clock = -1;
> +	engines = malloc(sizeof(struct engines));
> +	if (!engines)
> +		return NULL;
>   
> -		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +	memset(engines, 0, sizeof(*engines));
>   
> -		switch (gcfgc & 0xf) {
> -		case 8:
> -			core_clock = 266;
> -			break;
> -		case 9:
> -			core_clock = 320;
> -			break;
> -		case 11:
> -			core_clock = 400;
> -			break;
> -		case 13:
> -			core_clock = 533;
> -			break;
> -		}
> -		print_clock("core", core_clock);
> -	} else if (IS_965(devid) && IS_MOBILE(devid)) {
> -		int render_clock = -1, sampler_clock = -1;
> +	engines->num_engines = 0;
>   
> -		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +	d = opendir(sysfs_root);
> +	if (!d)
> +		return NULL;
>   
> -		switch (gcfgc & 0xf) {
> -		case 2:
> -			render_clock = 250; sampler_clock = 267;
> -			break;
> -		case 3:
> -			render_clock = 320; sampler_clock = 333;
> -			break;
> -		case 4:
> -			render_clock = 400; sampler_clock = 444;
> -			break;
> -		case 5:
> -			render_clock = 500; sampler_clock = 533;
> +	while ((dent = readdir(d)) != NULL) {
> +		const char *endswith = "-busy";
> +		const unsigned int endlen = strlen(endswith);
> +		struct engine *engine =
> +				engine_ptr(engines, engines->num_engines);
> +		char buf[256];
> +
> +		if (dent->d_type != DT_REG)
> +			continue;
> +
> +		if (strlen(dent->d_name) >= sizeof(buf)) {
> +			ret = -1;
>   			break;
>   		}
>   
> -		print_clock("render", render_clock);
> -		printf("  ");
> -		print_clock("sampler", sampler_clock);
> -	} else if (IS_945(devid) && IS_MOBILE(devid)) {
> -		int render_clock = -1, display_clock = -1;
> +		strcpy(buf, dent->d_name);
>   
> -		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +		/* xxxN-busy */
> +		if (strlen(buf) < (endlen + 4))
> +			continue;
> +		if (strcmp(&buf[strlen(buf) - endlen], endswith))
> +			continue;
>   
> -		switch (gcfgc & 0x7) {
> -		case 0:
> -			render_clock = 166;
> -			break;
> -		case 1:
> -			render_clock = 200;
> -			break;
> -		case 3:
> -			render_clock = 250;
> -			break;
> -		case 5:
> -			render_clock = 400;
> +		memset(engine, 0, sizeof(*engine));
> +
> +		buf[strlen(buf) - endlen] = 0;
> +		engine->name = strdup(buf);
> +		if (!engine->name) {
> +			ret = -1;
>   			break;
>   		}
>   
> -		switch (gcfgc & 0x70) {
> -		case 0:
> -			display_clock = 200;
> +		engine->busy.config = get_pmu_config(dirfd(d), engine->name,
> +						     "busy");
> +		if (engine->busy.config == -1) {
> +			ret = -1;
>   			break;
> -		case 4:
> -			display_clock = 320;
> +		}
> +
> +		engines->num_engines++;
> +		engines = realloc(engines, sizeof(struct engines) +
> +				  engines->num_engines * sizeof(struct engine));
> +		if (!engines) {
> +			ret = -ENOMEM;
>   			break;
>   		}
> -		if (gcfgc & (1 << 7))
> -		    display_clock = 133;
> +	}
>   
> -		print_clock("render", render_clock);
> -		printf("  ");
> -		print_clock("display", display_clock);
> -	} else if (IS_915(devid) && IS_MOBILE(devid)) {
> -		int render_clock = -1, display_clock = -1;
> +	if (ret)
> +		free(engines);
> +	else
> +		engines->root = d;
>   
> -		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);
> +	return ret == 0 ? engines : NULL;
> +}
>   
> -		switch (gcfgc & 0x7) {
> -		case 0:
> -			render_clock = 160;
> -			break;
> -		case 1:
> -			render_clock = 190;
> -			break;
> -		case 4:
> -			render_clock = 333;
> -			break;
> -		}
> -		if (gcfgc & (1 << 13))
> -		    render_clock = 133;
> +static int
> +filename_to_buf(const char *filename, char *buf, unsigned int bufsize)
> +{
> +	int fd;
> +	ssize_t ret;
>   
> -		switch (gcfgc & 0x70) {
> -		case 0:
> -			display_clock = 190;
> -			break;
> -		case 4:
> -			display_clock = 333;
> -			break;
> -		}
> -		if (gcfgc & (1 << 7))
> -		    display_clock = 133;
> +	fd = open(filename, O_RDONLY);
> +	if (fd < 0)
> +		return -1;
>   
> -		print_clock("render", render_clock);
> -		printf("  ");
> -		print_clock("display", display_clock);
> -	}
> +	ret = read(fd, buf, bufsize - 1);
> +	close(fd);
> +	if (ret < 1)
> +		return -1;
>   
> +	buf[ret] = '\0';
>   
> -	printf("\n");
> -	return -1;
> +	return 0;
>   }
>   
> -#define STATS_LEN (20)
> -#define PERCENTAGE_BAR_END	(79 - STATS_LEN)
> +static uint64_t filename_to_u64(const char *filename, int base)
> +{
> +	char buf[64], *b;
>   
> -static void
> -print_percentage_bar(float percent, int cur_line_len)
> +	if (filename_to_buf(filename, buf, sizeof(buf)))
> +		return 0;
> +
> +	/*
> +	 * Handle both single integer and key=value formats by skipping
> +	 * leading non-digits.
> +	 */
> +	b = buf;
> +	while (*b && !isdigit(*b))
> +		b++;
> +
> +	return strtoull(b, NULL, base);
> +}
> +
> +static uint64_t rapl_type_id(void)
>   {
> -	int bar_avail_len = (PERCENTAGE_BAR_END - cur_line_len - 1) * 8;
> -	int bar_len = bar_avail_len * (percent + .5) / 100.0;
> -	int i;
> +	return filename_to_u64("/sys/devices/power/type", 10);
> +}
>   
> -	for (i = bar_len; i >= 8; i -= 8) {
> -		printf("%s", bars[8]);
> -		cur_line_len++;
> -	}
> -	if (i) {
> -		printf("%s", bars[i]);
> -		cur_line_len++;
> -	}
> +static uint64_t rapl_gpu_power(void)
> +{
> +	return filename_to_u64("/sys/devices/power/events/energy-gpu", 0);
> +}
> +
> +static double filename_to_double(const char *filename)
> +{
> +	char *oldlocale;
> +	char buf[80];
> +	double v;
> +
> +	if (filename_to_buf(filename, buf, sizeof(buf)))
> +		return 0;
>   
> -	/* NB: We can't use a field width with utf8 so we manually
> -	* guarantee a field with of 45 chars for any bar. */
> -	printf("%*s", PERCENTAGE_BAR_END - cur_line_len, "");
> +	oldlocale = setlocale(LC_ALL, "C");
> +	v = strtod(buf, NULL);
> +	setlocale(LC_ALL, oldlocale);
> +
> +	return v;
>   }
>   
> -struct ring {
> -	const char *name;
> -	uint32_t mmio;
> -	int head, tail, size;
> -	uint64_t full;
> -	int idle;
> -};
> +static double rapl_gpu_power_scale(void)
> +{
> +	return filename_to_double("/sys/devices/power/events/energy-gpu.scale");
> +}
>   
> -static uint32_t ring_read(struct ring *ring, uint32_t reg)
> +#define __open_pmu(engines, pmu, idx) \
> +({ \
> +	int fd__; \
> +\
> +	fd__ = perf_i915_open_group((pmu)->config, (engines)->fd); \
> +	if (fd__ >= 0) { \
> +		if ((engines)->fd == -1) \
> +			(engines)->fd = fd__; \
> +		(pmu)->idx = (idx)++; \
> +		(engines)->num_counters++; \
> +	} \
> +\
> +	fd__; \
> +})
> +
> +static int pmu_init(struct engines *engines)
>   {
> -	return INREG(ring->mmio + reg);
> +	unsigned int idx = 0;
> +	unsigned int i;
> +	int fd;
> +
> +	engines->fd = -1;
> +	engines->num_counters = 0;
> +
> +	engines->freq_req.config = I915_PMU_REQUESTED_FREQUENCY;
> +	fd = __open_pmu(engines, &engines->freq_req, idx);
> +	if (fd < 0)
> +		return -1;
> +
> +	engines->freq_act.config = I915_PMU_ACTUAL_FREQUENCY;
> +	fd = __open_pmu(engines, &engines->freq_act, idx);
> +	if (fd < 0)
> +		return -1;
> +
> +	engines->irq.config = I915_PMU_INTERRUPTS;
> +	fd = __open_pmu(engines, &engines->irq, idx);
> +	if (fd < 0)
> +		return -1;
> +
> +	engines->rc6.config = I915_PMU_RC6_RESIDENCY;
> +	fd = __open_pmu(engines, &engines->rc6, idx);
> +	if (fd < 0)
> +		return -1;
> +
> +	for (i = 0; i < engines->num_engines; i++) {
> +		struct engine *engine = engine_ptr(engines, i);
> +		struct {
> +			struct pmu_counter *pmu;
> +			const char *counter;
> +		} *cnt, counters[] = {
> +			{ .pmu = &engine->busy, .counter = "busy" },
> +			{ .pmu = &engine->wait, .counter = "wait" },
> +			{ .pmu = &engine->sema, .counter = "sema" },
> +			{ .pmu = NULL, .counter = NULL },
> +		};
> +
> +		for (cnt = counters; cnt->pmu; cnt++) {
> +			if (!cnt->pmu->config)
> +				cnt->pmu->config =
> +					get_pmu_config(dirfd(engines->root),
> +						       engine->name,
> +						       cnt->counter);
> +			fd = __open_pmu(engines, cnt->pmu, idx);
> +			if (fd < 0)
> +				return -1;
> +		}
> +	}
> +
> +	engines->rapl_scale = rapl_gpu_power_scale();
> +	if (engines->rapl_scale != NAN)
> +		engines->rapl_scale *= 1e3; /* from nano to micro */
> +	engines->rapl.config = rapl_gpu_power();
> +	engines->rapl_fd = igt_perf_open(rapl_type_id(), engines->rapl.config);
> +	if (engines->rapl_fd < 0)
> +		return -1;
> +
> +	return 0;
>   }
>   
> -static void ring_init(struct ring *ring)
> +static uint64_t pmu_read_multi(int fd, unsigned int num, uint64_t *val)
>   {
> -	ring->size = (((ring_read(ring, RING_LEN) & RING_NR_PAGES) >> 12) + 1) * 4096;
> +	uint64_t buf[2 + num];
> +	unsigned int i;
> +
> +	assert(read(fd, buf, sizeof(buf)) == sizeof(buf));
> +
> +	for (i = 0; i < num; i++)
> +		val[i] = buf[2 + i];
> +
> +	return buf[1];
>   }
>   
> -static void ring_reset(struct ring *ring)
> +static double pmu_calc(struct pmu_pair *p, double d, double t, double s)
>   {
> -	ring->idle = ring->full = 0;
> +	double pct;
> +
> +	pct = p->cur - p->prev;
> +	pct /= d;
> +	pct /= t;
> +	pct *= s;
> +
> +	if (s == 100.0 && pct > 100.0)
> +		pct = 100.0;
> +
> +	return pct;
>   }
>   
> -static void ring_sample(struct ring *ring)
> +static uint64_t __pmu_read_single(int fd, uint64_t *ts)
>   {
> -	int full;
> +	uint64_t data[2];
>   
> -	if (!ring->size)
> -		return;
> +	assert(read(fd, data, sizeof(data)) == sizeof(data));
>   
> -	ring->head = ring_read(ring, RING_HEAD) & HEAD_ADDR;
> -	ring->tail = ring_read(ring, RING_TAIL) & TAIL_ADDR;
> +	if (ts)
> +		*ts = data[1];
>   
> -	if (ring->tail == ring->head)
> -		ring->idle++;
> +	return data[0];
> +}
> +
> +static uint64_t pmu_read_single(int fd)
> +{
> +	return __pmu_read_single(fd, NULL);
> +}
>   
> -	full = ring->tail - ring->head;
> -	if (full < 0)
> -		full += ring->size;
> -	ring->full += full;
> +static void __update_sample(struct pmu_counter *counter, uint64_t val)
> +{
> +	counter->val.prev = counter->val.cur;
> +	counter->val.cur = val;
>   }
>   
> -static void ring_print_header(FILE *out, struct ring *ring)
> +static void update_sample(struct pmu_counter *counter, uint64_t *val)
>   {
> -    fprintf(out, "%.6s%%\tops\t",
> -            ring->name
> -          );
> +	__update_sample(counter, val[counter->idx]);
>   }
>   
> -static void ring_print(struct ring *ring, unsigned long samples_per_sec)
> +static void pmu_sample(struct engines *engines)
>   {
> -	int percent_busy, len;
> +	const int num_val = engines->num_counters;
> +	uint64_t val[num_val];
> +	unsigned int i;
> +
> +	engines->ts.prev = engines->ts.cur;
> +	engines->ts.cur = pmu_read_multi(engines->fd, num_val, val);
> +
> +	__update_sample(&engines->rapl, pmu_read_single(engines->rapl_fd));
>   
> -	if (!ring->size)
> -		return;
> +	update_sample(&engines->freq_req, val);
> +	update_sample(&engines->freq_act, val);
> +	update_sample(&engines->irq, val);
> +	update_sample(&engines->rc6, val);
>   
> -	percent_busy = 100 - 100 * ring->idle / samples_per_sec;
> +	for (i = 0; i < engines->num_engines; i++) {
> +		struct engine *engine = engine_ptr(engines, i);
>   
> -	len = printf("%25s busy: %3d%%: ", ring->name, percent_busy);
> -	print_percentage_bar (percent_busy, len);
> -	printf("%24s space: %d/%d\n",
> -		   ring->name,
> -		   (int)(ring->full / samples_per_sec),
> -		   ring->size);
> +		update_sample(&engine->busy, val);
> +		update_sample(&engine->sema, val);
> +		update_sample(&engine->wait, val);
> +	}
>   }
>   
> -static void ring_log(struct ring *ring, unsigned long samples_per_sec,
> -		FILE *output)
> +static const char *bars[] = { " ", "▏", "▎", "▍", "▌", "▋", "▊", "▉", "█" };
> +
> +static void
> +print_percentage_bar(double percent, int max_len)
>   {
> -	if (ring->size)
> -		fprintf(output, "%3d\t%d\t",
> -			(int)(100 - 100 * ring->idle / samples_per_sec),
> -			(int)(ring->full / samples_per_sec));
> -	else
> -		fprintf(output, "-1\t-1\t");
> +	int bar_len = percent * (8 * (max_len - 2)) / 100.0;
> +	int i;
> +
> +	putchar('|');
> +
> +	for (i = bar_len; i >= 8; i -= 8)
> +		printf("%s", bars[8]);
> +	if (i)
> +		printf("%s", bars[i]);
> +
> +	for (i = 0; i < (max_len - 2 - (bar_len + 7) / 8); i++)
> +		putchar(' ');
> +
> +	putchar('|');
>   }
>   
> +#define DEFAULT_PERIOD_MS (1000)
> +
>   static void
>   usage(const char *appname)
>   {
>   	printf("intel_gpu_top - Display a top-like summary of Intel GPU usage\n"
> -			"\n"
> -			"usage: %s [parameters]\n"
> -			"\n"
> -			"The following parameters apply:\n"
> -			"[-s <samples>]       samples per seconds (default %d)\n"
> -			"[-e <command>]       command to profile\n"
> -			"[-o <file>]          output statistics to file. If file is '-',"
> -			"                     run in batch mode and output statistics to stdio only \n"
> -			"[-h]                 show this help screen\n"
> -			"\n",
> -			appname,
> -			SAMPLES_PER_SEC
> -		  );
> -	return;
> +		"\n"
> +		"Usage: %s [parameters]\n"
> +		"\n"
> +		"\tThe following parameters are optional:\n"
> +		"\t[-s <samples>]       refresh period in ms (default %ums)\n"
> +		"\t[-h]                 show this help text\n"
> +		"\n",
> +		appname, DEFAULT_PERIOD_MS);
>   }
>   
>   int main(int argc, char **argv)
>   {
> -	uint32_t devid;
> -	struct pci_device *pci_dev;
> -	struct ring render_ring = {
> -		.name = "render",
> -		.mmio = 0x2030,
> -	}, bsd_ring = {
> -		.name = "bitstream",
> -		.mmio = 0x4030,
> -	}, bsd6_ring = {
> -		.name = "bitstream",
> -		.mmio = 0x12030,
> -	}, blt_ring = {
> -		.name = "blitter",
> -		.mmio = 0x22030,
> -	};
> -	int i, ch;
> -	int samples_per_sec = SAMPLES_PER_SEC;
> -	FILE *output = NULL;
> -	double elapsed_time=0;
> -	int print_headers=1;
> -	pid_t child_pid=-1;
> -	int child_stat;
> -	char *cmd=NULL;
> -	int interactive=1;
> -
> -	/* Parse options? */
> -	while ((ch = getopt(argc, argv, "s:o:e:h")) != -1) {
> +	unsigned int period_us = DEFAULT_PERIOD_MS * 1000;
> +	int con_w = -1, con_h = -1;
> +	struct engines *engines;
> +	struct winsize ws;
> +	unsigned int i;
> +	int ret, ch;
> +
> +	/* Parse options */
> +	while ((ch = getopt(argc, argv, "s:h")) != -1) {
>   		switch (ch) {
> -		case 'e': cmd = strdup(optarg);
> -			break;
> -		case 's': samples_per_sec = atoi(optarg);
> -			if (samples_per_sec < 100) {
> -				fprintf(stderr, "Error: samples per second must be >= 100\n");
> -				exit(1);
> -			}
> -			break;
> -		case 'o':
> -			if (!strcmp(optarg, "-")) {
> -				/* Running in non-interactive mode */
> -				interactive = 0;
> -				output = stdout;
> -			}
> -			else
> -				output = fopen(optarg, "w");
> -			if (!output)
> -			{
> -				perror("fopen");
> -				exit(1);
> -			}
> +		case 's':
> +			period_us = atoi(optarg) * 1000;
>   			break;
>   		case 'h':
>   			usage(argv[0]);
>   			exit(0);
> -			break;
>   		default:
> -			fprintf(stderr, "Invalid flag %c!\n", (char)optopt);
> +			fprintf(stderr, "Invalid option %c!\n", (char)optopt);
>   			usage(argv[0]);
>   			exit(1);
> -			break;
>   		}
>   	}
>   
> -	pci_dev = intel_get_pci_device();
> -	devid = pci_dev->device_id;
> -	intel_mmio_use_pci_bar(pci_dev);
> -	init_instdone_definitions(devid);
> +	/* Get terminal size. */
> +	if (ioctl(0, TIOCGWINSZ, &ws) != -1) {
> +		con_w = ws.ws_col;
> +		con_h = ws.ws_row;
> +	}
>   
> -	/* Do we have a command to run? */
> -	if (cmd != NULL) {
> -		if (output) {
> -			fprintf(output, "# Profiling: %s\n", cmd);
> -			fflush(output);
> -		}
> -		child_pid = fork();
> -		if (child_pid < 0) {
> -			perror("fork");
> -			exit(1);
> -		}
> -		else if (child_pid == 0) {
> -			int res;
> -			res = system(cmd);
> -			if (res < 0)
> -				perror("running command");
> -			if (output) {
> -				fflush(output);
> -				fprintf(output, "# %s exited with status %d\n", cmd, res);
> -				fflush(output);
> -			}
> -			free(cmd);
> -			exit(0);
> -		} else {
> -			free(cmd);
> -		}
> +	engines = discover_engines();
> +	if (!engines) {
> +		fprintf(stderr, "Failed to detect engines!\n");
> +		return 1;
>   	}
>   
> -	for (i = 0; i < num_instdone_bits; i++) {
> -		top_bits[i].bit = &instdone_bits[i];
> -		top_bits[i].count = 0;
> -		top_bits_sorted[i] = &top_bits[i];
> +	ret = pmu_init(engines);
> +	if (ret) {
> +		fprintf(stderr, "Failed to initialize PMU!\n");
> +		return 1;
>   	}
>   
> -	/* Grab access to the registers */
> -	intel_register_access_init(pci_dev, 0, -1);
> +	pmu_sample(engines);
>   
> -	ring_init(&render_ring);
> -	if (IS_GEN4(devid) || IS_GEN5(devid))
> -		ring_init(&bsd_ring);
> -	if (IS_GEN6(devid) || IS_GEN7(devid)) {
> -		ring_init(&bsd6_ring);
> -		ring_init(&blt_ring);
> -	}
> +	for (;;) {
> +		double t, freq[2], irq, rc6, power;
> +		int lines = 0;
>   
> -	/* Initialize GPU stats */
> -	if (HAS_STATS_REGS(devid)) {
> -		for (i = 0; i < STATS_COUNT; i++) {
> -			uint32_t stats_high, stats_low, stats_high_2;
> +		usleep(period_us);
>   
> -			do {
> -				stats_high = INREG(stats_regs[i] + 4);
> -				stats_low = INREG(stats_regs[i]);
> -				stats_high_2 = INREG(stats_regs[i] + 4);
> -			} while (stats_high != stats_high_2);
> +		pmu_sample(engines);
> +		t = (double)(engines->ts.cur - engines->ts.prev) / 1e9;
>   
> -			last_stats[i] = (uint64_t)stats_high << 32 |
> -				stats_low;
> -		}
> -	}
> +		printf("\033[H\033[J");
>   
> -	for (;;) {
> -		int j;
> -		unsigned long long t1, ti, tf, t2;
> -		unsigned long long def_sleep = 1000000 / samples_per_sec;
> -		unsigned long long last_samples_per_sec = samples_per_sec;
> -		unsigned short int max_lines;
> -		struct winsize ws;
> -		char clear_screen[] = {0x1b, '[', 'H',
> -				       0x1b, '[', 'J',
> -				       0x0};
> -		int percent;
> -		int len;
> -
> -		t1 = gettime();
> -
> -		ring_reset(&render_ring);
> -		ring_reset(&bsd_ring);
> -		ring_reset(&bsd6_ring);
> -		ring_reset(&blt_ring);
> -
> -		for (i = 0; i < samples_per_sec; i++) {
> -			long long interval;
> -			ti = gettime();
> -			if (IS_965(devid)) {
> -				instdone = INREG(INSTDONE_I965);
> -				instdone1 = INREG(INSTDONE_1);
> -			} else
> -				instdone = INREG(INSTDONE);
> -
> -			for (j = 0; j < num_instdone_bits; j++)
> -				update_idle_bit(&top_bits[j]);
> -
> -			ring_sample(&render_ring);
> -			ring_sample(&bsd_ring);
> -			ring_sample(&bsd6_ring);
> -			ring_sample(&blt_ring);
> -
> -			tf = gettime();
> -			if (tf - t1 >= 1000000) {
> -				/* We are out of sync, bail out */
> -				last_samples_per_sec = i+1;
> -				break;
> -			}
> -			interval = def_sleep - (tf - ti);
> -			if (interval > 0)
> -				usleep(interval);
> -		}
> +		freq[0] = pmu_calc(&engines->freq_req.val, 1.0, t, 1);
> +		freq[1] = pmu_calc(&engines->freq_act.val, 1.0, t, 1);
> +		irq = pmu_calc(&engines->irq.val, 1.0, t, 1);
> +		rc6 = pmu_calc(&engines->rc6.val, 1e9, t, 100);
> +		power = pmu_calc(&engines->rapl.val, 1.0, t,
> +				 engines->rapl_scale);
>   
> -		if (HAS_STATS_REGS(devid)) {
> -			for (i = 0; i < STATS_COUNT; i++) {
> -				uint32_t stats_high, stats_low, stats_high_2;
> +		printf("intel-gpu-top - %4.0f/%4.0f MHz;  %3.0f%% RC6; %6.0fmW; %8.0f irqs/s\n",
> +		       freq[0], freq[1], rc6, power, irq);
> +		lines++;
>   
> -				do {
> -					stats_high = INREG(stats_regs[i] + 4);
> -					stats_low = INREG(stats_regs[i]);
> -					stats_high_2 = INREG(stats_regs[i] + 4);
> -				} while (stats_high != stats_high_2);
> +		printf("\n");
> +		lines++;
>   
> -				stats[i] = (uint64_t)stats_high << 32 |
> -					stats_low;
> -			}
> -		}
> +		for (i = 0; i < engines->num_engines && lines < con_h; i++) {
> +			struct engine *engine = engine_ptr(engines, i);
> +			unsigned int max_w = con_w - 1;
> +			unsigned int len;
> +			double val[2];
> +			char buf[128];
>   
> -		qsort(top_bits_sorted, num_instdone_bits,
> -		      sizeof(struct top_bit *), top_bits_sort);
> -
> -		/* Limit the number of lines printed to the terminal height so the
> -		 * most important info (at the top) will stay on screen. */
> -		max_lines = -1;
> -		if (ioctl(0, TIOCGWINSZ, &ws) != -1)
> -			max_lines = ws.ws_row - 6; /* exclude header lines */
> -		if (max_lines >= num_instdone_bits)
> -			max_lines = num_instdone_bits;
> -
> -		t2 = gettime();
> -		elapsed_time += (t2 - t1) / 1000000.0;
> -
> -		if (interactive) {
> -			printf("%s", clear_screen);
> -			print_clock_info(pci_dev);
> -
> -			ring_print(&render_ring, last_samples_per_sec);
> -			ring_print(&bsd_ring, last_samples_per_sec);
> -			ring_print(&bsd6_ring, last_samples_per_sec);
> -			ring_print(&blt_ring, last_samples_per_sec);
> -
> -			printf("\n%30s  %s\n", "task", "percent busy");
> -			for (i = 0; i < max_lines; i++) {
> -				if (top_bits_sorted[i]->count > 0) {
> -					percent = (top_bits_sorted[i]->count * 100) /
> -						last_samples_per_sec;
> -					len = printf("%30s: %3d%%: ",
> -							 top_bits_sorted[i]->bit->name,
> -							 percent);
> -					print_percentage_bar (percent, len);
> -				} else {
> -					printf("%*s", PERCENTAGE_BAR_END, "");
> -				}
> -
> -				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
> -					printf("%13s: %llu (%lld/sec)",
> -						   stats_reg_names[i],
> -						   (long long)stats[i],
> -						   (long long)(stats[i] - last_stats[i]));
> -					last_stats[i] = stats[i];
> -				} else {
> -					if (!top_bits_sorted[i]->count)
> -						break;
> -				}
> -				printf("\n");
> -			}
> -		}
> -		if (output) {
> -			/* Print headers for columns at first run */
> -			if (print_headers) {
> -				fprintf(output, "# time\t");
> -				ring_print_header(output, &render_ring);
> -				ring_print_header(output, &bsd_ring);
> -				ring_print_header(output, &bsd6_ring);
> -				ring_print_header(output, &blt_ring);
> -				for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
> -					if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
> -						fprintf(output, "%.6s\t",
> -							   stats_reg_names[i]
> -							   );
> -					}
> -					if (!top_bits[i].count)
> -						continue;
> -				}
> -				fprintf(output, "\n");
> -				print_headers = 0;
> -			}
> -
> -			/* Print statistics */
> -			fprintf(output, "%.2f\t", elapsed_time);
> -			ring_log(&render_ring, last_samples_per_sec, output);
> -			ring_log(&bsd_ring, last_samples_per_sec, output);
> -			ring_log(&bsd6_ring, last_samples_per_sec, output);
> -			ring_log(&blt_ring, last_samples_per_sec, output);
> -
> -			for (i = 0; i < MAX_NUM_TOP_BITS; i++) {
> -				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
> -					fprintf(output, "%"PRIu64"\t",
> -						   stats[i] - last_stats[i]);
> -					last_stats[i] = stats[i];
> -				}
> -					if (!top_bits[i].count)
> -						continue;
> -			}
> -			fprintf(output, "\n");
> -			fflush(output);
> -		}
> +			val[0] = pmu_calc(&engine->wait.val, 1e9, t, 100);
> +			val[1] = pmu_calc(&engine->sema.val, 1e9, t, 100);
> +			len = snprintf(buf, sizeof(buf),
> +				       "%6.2f%% wait, %6.2f%% sema",
> +				       val[0], val[1]);
>   
> -		for (i = 0; i < num_instdone_bits; i++) {
> -			top_bits_sorted[i]->count = 0;
> +			val[0] = pmu_calc(&engine->busy.val, 1e9, t, 100);
> +			len += printf("%8s %6.2f%% ",
> +				      engine->name, val[0]);
> +			print_percentage_bar(val[0], max_w - len);
>   
> -			if (i < STATS_COUNT)
> -				last_stats[i] = stats[i];
> -		}
> +			printf("%s\n", buf);
>   
> -		/* Check if child has gone */
> -		if (child_pid > 0) {
> -			int res;
> -			if ((res = waitpid(child_pid, &child_stat, WNOHANG)) == -1) {
> -				perror("waitpid");
> -				exit(1);
> -			}
> -			if (res == 0)
> -				continue;
> -			if (WIFEXITED(child_stat))
> -				break;
> +			lines++;
>   		}
> -	}
>   
> -	fclose(output);
> +		printf("\n");
> +	}
>   
> -	intel_register_access_fini();
>   	return 0;
>   }
> diff --git a/tools/meson.build b/tools/meson.build
> index bd2d313d5156..a918eeb0bef1 100644
> --- a/tools/meson.build
> +++ b/tools/meson.build
> @@ -23,7 +23,6 @@ tools_progs = [
>   	'intel_gpu_frequency',
>   	'intel_firmware_decode',
>   	'intel_gpu_time',
> -	'intel_gpu_top',
>   	'intel_gtt',
>   	'intel_guc_logger',
>   	'intel_infoframes',
> @@ -117,6 +116,11 @@ shared_library('intel_aubdump', 'aubdump.c',
>   	       name_prefix : '',
>   	       install : true)
>   
> +executable('intel_gpu_top', 'intel_gpu_top.c',
> +	   install : true,
> +	   install_rpath : rpathdir,
> +	   dependencies : tool_deps + [ lib_igt_perf ])
> +
>   conf_data = configuration_data()
>   conf_data.set('prefix', prefix)
>   conf_data.set('exec_prefix', '${prefix}')



More information about the Intel-gfx mailing list