[Intel-gfx] [RFC i-g-t] tests/perf_pmu: test i915 RFC PMU

Thu Aug 24 15:05:56 UTC 2017

i915 RFC PMU:
* https://patchwork.freedesktop.org/series/27488/
* https://patchwork.freedesktop.org/series/28842/

Tests:
* init: try to initialize all possible metrics exposed in i915 PMU
  (limit to 0-instance of engines)
* invalid_init: verify that i915 PMU correctly errors out on invalid
  initialization
* single: verify that BUSY metrics work for each engine
* parallel: verify that parallel requests for metrics do not conflict

Signed-off-by: Dmitry Rogozhkin <dmitry.v.rogozhkin at intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
Cc: Chris Wilson <chris at chris-wilson.co.uk>
---
 tests/Makefile.sources |   1 +
 tests/perf_pmu.c       | 546 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 547 insertions(+)
 create mode 100644 tests/perf_pmu.c

diff --git a/tests/Makefile.sources b/tests/Makefile.sources
index bb013c7..51b684b 100644
--- a/tests/Makefile.sources
+++ b/tests/Makefile.sources
@@ -215,6 +215,7 @@ TESTS_progs = \
 	kms_vblank \
 	meta_test \
 	perf \
+	perf_pmu \
 	pm_backlight \
 	pm_lpsp \
 	pm_rc6_residency \
diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
new file mode 100644
index 0000000..0d025a6
--- /dev/null
+++ b/tests/perf_pmu.c
@@ -0,0 +1,546 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include "igt.h"
+#include "igt_sysfs.h"
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/poll.h>
+#include <sys/time.h>
+#include <time.h>
+#include "drm.h"
+
+#define LOCAL_I915_EXEC_NO_RELOC (1<<11)
+#define LOCAL_I915_EXEC_HANDLE_LUT (1<<12)
+
+////////////////////////////////////////////////////////////////////////
+// This is a copy of perf.h from intel-gpu-tools/overlay,
+// because I was too lazy to move it to some common library
+////////////////////////////////////////////////////////////////////////
+
+#include <linux/perf_event.h>
+
+enum drm_i915_gem_engine_class {
+	I915_ENGINE_CLASS_OTHER = 0,
+	I915_ENGINE_CLASS_RENDER = 1,
+	I915_ENGINE_CLASS_COPY = 2,
+	I915_ENGINE_CLASS_VIDEO = 3,
+	I915_ENGINE_CLASS_VIDEO_ENHANCE = 4,
+	I915_ENGINE_CLASS_MAX /* non-ABI */
+};
+
+enum drm_i915_pmu_engine_sample {
+	I915_SAMPLE_QUEUED = 0,
+	I915_SAMPLE_BUSY = 1,
+	I915_SAMPLE_WAIT = 2,
+	I915_SAMPLE_SEMA = 3
+};
+
+#define I915_PMU_SAMPLE_BITS (4)
+#define I915_PMU_SAMPLE_MASK (0xf)
+#define I915_PMU_SAMPLE_INSTANCE_BITS (8)
+#define I915_PMU_CLASS_SHIFT \
+	(I915_PMU_SAMPLE_BITS + I915_PMU_SAMPLE_INSTANCE_BITS)
+
+#define __I915_PMU_ENGINE(class, instance, sample) \
+	((class) << I915_PMU_CLASS_SHIFT | \
+	(instance) << I915_PMU_SAMPLE_BITS | \
+	(sample))
+
+#define I915_PMU_ENGINE_QUEUED(class, instance) \
+	__I915_PMU_ENGINE(class, instance, I915_SAMPLE_QUEUED)
+
+#define I915_PMU_ENGINE_BUSY(class, instance) \
+	__I915_PMU_ENGINE(class, instance, I915_SAMPLE_BUSY)
+
+#define I915_PMU_ENGINE_WAIT(class, instance) \
+	__I915_PMU_ENGINE(class, instance, I915_SAMPLE_WAIT)
+
+#define I915_PMU_ENGINE_SEMA(class, instance) \
+	__I915_PMU_ENGINE(class, instance, I915_SAMPLE_SEMA)
+
+#define __I915_PMU_OTHER(x) (__I915_PMU_ENGINE(0xff, 0xff, 0xf) + 1 + (x))
+
+#define I915_PMU_ACTUAL_FREQUENCY 	__I915_PMU_OTHER(0)
+#define I915_PMU_REQUESTED_FREQUENCY	__I915_PMU_OTHER(1)
+#define I915_PMU_ENERGY			__I915_PMU_OTHER(2)
+#define I915_PMU_INTERRUPTS		__I915_PMU_OTHER(3)
+
+#define I915_PMU_RC6_RESIDENCY		__I915_PMU_OTHER(4)
+#define I915_PMU_RC6p_RESIDENCY		__I915_PMU_OTHER(5)
+#define I915_PMU_RC6pp_RESIDENCY	__I915_PMU_OTHER(6)
+
+static inline int
+perf_event_open(struct perf_event_attr *attr,
+		pid_t pid,
+		int cpu,
+		int group_fd,
+		unsigned long flags)
+{
+#ifndef __NR_perf_event_open
+#if defined(__i386__)
+#define __NR_perf_event_open 336
+#elif defined(__x86_64__)
+#define __NR_perf_event_open 298
+#else
+#define __NR_perf_event_open 0
+#endif
+#endif
+    attr->size = sizeof(*attr);
+    return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
+}
+
+static uint64_t i915_type_id(void)
+{
+	char buf[1024];
+	int fd, n;
+
+	fd = open("/sys/bus/event_source/devices/i915/type", 0);
+	if (fd < 0) {
+		n = -1;
+	} else {
+		n = read(fd, buf, sizeof(buf)-1);
+		close(fd);
+	}
+	if (n < 0)
+		return 0;
+
+	buf[n] = '\0';
+	return strtoull(buf, 0, 0);
+}
+
+////////////////////////////////////////////////////////////////////////
+
+static double elapsed(const struct timespec *start, const struct timespec *end)
+{
+	return ((end->tv_sec - start->tv_sec) +
+		(end->tv_nsec - start->tv_nsec)*1e-9);
+}
+
+static uint64_t elapsed_ns(const struct timespec *start, const struct timespec *end)
+{
+	return ((end->tv_sec - start->tv_sec)*1e9 +
+		(end->tv_nsec - start->tv_nsec));
+}
+
+static void nop_on_ring(int fd, uint32_t handle, unsigned ring_id, int timeout)
+{
+	struct drm_i915_gem_execbuffer2 execbuf;
+	struct drm_i915_gem_exec_object2 obj;
+	struct timespec start, now;
+
+	gem_require_ring(fd, ring_id);
+
+	memset(&obj, 0, sizeof(obj));
+	obj.handle = handle;
+
+	memset(&execbuf, 0, sizeof(execbuf));
+	execbuf.buffers_ptr = to_user_pointer(&obj);
+	execbuf.buffer_count = 1;
+	execbuf.flags = ring_id;
+	execbuf.flags |= LOCAL_I915_EXEC_HANDLE_LUT;
+	execbuf.flags |= LOCAL_I915_EXEC_NO_RELOC;
+
+	clock_gettime(CLOCK_MONOTONIC, &start);
+	if (__gem_execbuf(fd, &execbuf)) {
+		execbuf.flags = ring_id;
+		gem_execbuf(fd, &execbuf);
+	}
+
+	do {
+		for (int loop = 0; loop < 1024; loop++) {
+			gem_execbuf(fd, &execbuf);
+		}
+		clock_gettime(CLOCK_MONOTONIC, &now);
+	} while (elapsed(&start, &now) < timeout);
+	gem_sync(fd, handle);
+}
+
+static int perf_i915_open(int config, int group, int read_format)
+{
+	struct perf_event_attr attr;
+
+	memset(&attr, 0, sizeof (attr));
+
+	attr.type = i915_type_id();
+	if (attr.type == 0)
+		return -ENOENT;
+	attr.config = config;
+
+	attr.read_format = read_format;
+	if (group != -1)
+		attr.read_format &= ~PERF_FORMAT_GROUP;
+
+	return perf_event_open(&attr, -1, 0, group, 0);
+}
+
+struct metric {
+	int config;
+	uint64_t old_value;
+	uint64_t value;
+};
+
+struct pmu_metrics {
+	int fd;
+	int read_format;
+	int num_metrics;
+	struct metric* metrics;
+};
+
+static int perf_init(struct pmu_metrics *pm, int num_configs, int* configs)
+{
+	int i, res;
+	/* Reset *pm: no group leader yet (fd == -1) and no metrics enabled. */
+	memset(pm, 0, sizeof(struct pmu_metrics));
+	pm->fd = -1;
+	pm->read_format =
+		PERF_FORMAT_TOTAL_TIME_ENABLED |
+		PERF_FORMAT_GROUP;
+	pm->metrics = (struct metric*)calloc(num_configs, sizeof(struct metric));
+	if (!pm->metrics)
+		return -1;
+	/* First event to open becomes the group leader (pm->fd); the rest join its group. */
+	for (i = 0; i < num_configs; ++i) {
+		if (pm->fd < 0)
+			res = pm->fd = perf_i915_open(configs[i], -1, pm->read_format);
+		else
+			res = perf_i915_open(configs[i], pm->fd, pm->read_format); /* NOTE(review): this fd is not stored; perf_close() only closes pm->fd */
+		if (res >= 0) {
+			pm->metrics[pm->num_metrics++].config = configs[i];
+		}
+	}
+
+	igt_info("perf_init: enabled %d metrics from %d requested\n",
+		pm->num_metrics, num_configs);
+	/* Configs that fail to open are skipped rather than reported: the result is still 0. */
+	return 0;
+}
+
+static void perf_close(struct pmu_metrics *pm)
+{
+	if (pm->fd != -1 ) { close(pm->fd); pm->fd = -1; }
+	if (pm->metrics) { free(pm->metrics); pm->metrics= NULL; }
+}
+
+/* see 'man 2 perf_event_open' */
+struct perf_read_format {
+	uint64_t nr_values;     /* The number of events */
+	uint64_t time_enabled;  /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
+	struct {
+		uint64_t value;     /* The value of the event */
+	} values[1024];
+};
+
+/* Read all counters of the group behind pm->fd; each metric's previous value
+ * moves to old_value first. Returns 0 on success, -1 on any failure. */
+static int perf_read(struct pmu_metrics *pm)
+{
+	const int read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_GROUP;
+	struct perf_read_format data;
+	ssize_t len;
+	int i;
+
+	if (pm->fd < 0)
+		return -1;
+	/* struct perf_read_format only describes this exact read_format. */
+	if (pm->read_format != read_format)
+		return -1;
+
+	len = read(pm->fd, &data, sizeof(data));
+	/* Error, EOF (0) or short read: nr_values/time_enabled in data are not valid. */
+	if (len < (ssize_t)(2 * sizeof(uint64_t)))
+		return -1;
+
+	if (data.nr_values != (uint64_t)pm->num_metrics)
+		return -1;
+
+	for (i = 0; i < pm->num_metrics; ++i) {
+		pm->metrics[i].old_value = pm->metrics[i].value;
+		pm->metrics[i].value = data.values[i].value;
+	}
+
+	return 0;
+}
+
+static const char* perf_get_metric_name(int config)
+{
+	switch (config) {
+		case I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_RENDER, 0):
+			return "i915/rcs0-busy/";
+		case I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_VIDEO, 0):
+			return "i915/vcs0-busy/";
+		case I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_VIDEO, 1):
+			return "i915/vcs1-busy/";
+		case I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_COPY, 0):
+			return "i915/bcs0-busy/";
+		case I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_VIDEO_ENHANCE, 0):
+			return "i915/vecs0-busy/";
+		default:
+			return "i915/unknown/";
+	}
+}
+
+static uint64_t perf_elapsed(struct metric* m)
+{
+	return m->value - m->old_value;
+}
+
+/* "init" subtest: open all known i915 PMU configs (engine instance 0) and read them once. */
+static void test_init(void)
+{
+	struct pmu_metrics pm;
+	unsigned int class[] = {
+		I915_ENGINE_CLASS_RENDER,
+		I915_ENGINE_CLASS_VIDEO,
+		I915_ENGINE_CLASS_VIDEO,
+		I915_ENGINE_CLASS_COPY,
+		I915_ENGINE_CLASS_VIDEO_ENHANCE,
+	};
+	int* configs = malloc(1024 * sizeof(int));
+	int num_configs = 0;
+
+	igt_assert(configs != NULL);
+
+	for (size_t i = 0; i < sizeof(class)/sizeof(class[0]); ++i) {
+		/* TODO Adding metrics for 0-instances only. Would be nice
+		 * to get everything, but for that we either need to add
+		 * check for different platforms here or use upcoming
+		 * engines discover API.
+		 */
+		configs[num_configs++] = I915_PMU_ENGINE_BUSY(class[i], 0);
+		configs[num_configs++] = I915_PMU_ENGINE_QUEUED(class[i], 0);
+		configs[num_configs++] = I915_PMU_ENGINE_WAIT(class[i], 0);
+		configs[num_configs++] = I915_PMU_ENGINE_SEMA(class[i], 0);
+	}
+	configs[num_configs++] = I915_PMU_ACTUAL_FREQUENCY;
+	configs[num_configs++] = I915_PMU_REQUESTED_FREQUENCY;
+	configs[num_configs++] = I915_PMU_ENERGY;
+	configs[num_configs++] = I915_PMU_RC6_RESIDENCY;
+	configs[num_configs++] = I915_PMU_RC6p_RESIDENCY;
+	configs[num_configs++] = I915_PMU_RC6pp_RESIDENCY;
+
+	igt_assert_eq(perf_init(&pm, num_configs, configs), 0);
+	igt_assert_eq(perf_read(&pm), 0);
+	igt_assert_eq(pm.num_metrics, num_configs);
+	perf_close(&pm);
+	free(configs);	/* the scratch array is owned by this function only */
+}
+
+/* Tests that i915 PMU correctly errors out on invalid initialization.
+ * i915 PMU is uncore PMU, thus:
+ *  - sampling period is not supported
+ *  - pid >= 0 is not supported since we can't count per-process (we count
+ *    per whole system); the test below uses pid 0
+ *  - cpu != 0 is not supported since i915 PMU exposes cpumask for CPU0
+ */
+static void test_invalid_init(void)
+{
+	struct perf_event_attr attr;
+	int pid, cpu;
+
+#define ATTR_INIT() \
+	do { \
+		memset(&attr, 0, sizeof (attr)); \
+		attr.config = I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_RENDER, 0); \
+		attr.type = i915_type_id(); \
+		igt_assert(attr.type != 0); \
+	} while(0)
+
+	ATTR_INIT();
+	attr.sample_period = 100;	/* case 1: sampling requested, EINVAL expected */
+	pid = -1;
+	cpu = 0;
+	igt_assert_eq(perf_event_open(&attr, pid, cpu, -1, 0), -1);
+	igt_assert_eq(errno, EINVAL);
+	
+	ATTR_INIT();
+	pid = 0;	/* case 2: pid 0 instead of -1, EINVAL expected */
+	cpu = 0;
+	igt_assert_eq(perf_event_open(&attr, pid, cpu, -1, 0), -1);
+	igt_assert_eq(errno, EINVAL);
+
+	ATTR_INIT();
+	pid = -1;
+	cpu = 1;	/* case 3: a CPU other than 0, ENODEV expected */
+	igt_assert_eq(perf_event_open(&attr, pid, cpu, -1, 0), -1);
+	igt_assert_eq(errno, ENODEV);
+}
+
+/* Load the engines one at a time with nop_on_ring() and compare every BUSY
+ * counter delta with the wall-clock time. Returns how many engines were tried. */
+static int test_single(int fd, uint32_t handle)
+{
+	struct {
+		const char* engine_name;
+		unsigned int class;
+		unsigned int instance;
+		unsigned int ring_id;
+	} engines[] = {
+		{ "rcs0", I915_ENGINE_CLASS_RENDER, 0, I915_EXEC_RENDER },
+		{ "vcs0", I915_ENGINE_CLASS_VIDEO, 0, I915_EXEC_BSD | I915_EXEC_BSD_RING1 },
+		{ "vcs1", I915_ENGINE_CLASS_VIDEO, 1, I915_EXEC_BSD | I915_EXEC_BSD_RING2 },
+		{ "bcs0", I915_ENGINE_CLASS_COPY, 0, I915_EXEC_BLT },
+		{ "vecs0", I915_ENGINE_CLASS_VIDEO_ENHANCE, 0, I915_EXEC_VEBOX },
+	};
+	struct pmu_metrics pm;
+	int configs[] = {
+		I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_RENDER, 0),
+		I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_VIDEO, 0),
+		I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_VIDEO, 1),
+		I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_COPY, 0),
+		I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_VIDEO_ENHANCE, 0),
+	};
+	int num_configs = sizeof(configs)/sizeof(configs[0]);
+	struct timespec start, now;
+
+	igt_assert_eq(perf_init(&pm, num_configs, configs), 0);
+	igt_assert_eq(pm.num_metrics, num_configs);
+
+	for (size_t i = 0; i < sizeof(engines)/sizeof(engines[0]); ++i) {
+		clock_gettime(CLOCK_MONOTONIC, &start);
+		igt_assert_eq(perf_read(&pm), 0);
+
+		/* Create almost 100% load on the examined engine for specified time. */
+		nop_on_ring(fd, handle, engines[i].ring_id, 20);
+
+		igt_assert_eq(perf_read(&pm), 0);
+		clock_gettime(CLOCK_MONOTONIC, &now);
+		const uint64_t wall_ns = elapsed_ns(&start, &now);
+		igt_info("Executed on %s for %" PRIu64 "ns\n", engines[i].engine_name, wall_ns);
+		for (int j = 0; j < num_configs; ++j) {
+			const uint64_t busy_ns = perf_elapsed(&pm.metrics[j]);
+			igt_info("  %s: %" PRIu64 "ns\n", perf_get_metric_name(pm.metrics[j].config), busy_ns);
+			igt_assert(busy_ns < wall_ns);
+
+			if (configs[j] == I915_PMU_ENGINE_BUSY(engines[i].class, engines[i].instance)) {
+				/* Check that the loaded engine had almost 100% load, with 1% tolerance. */
+				igt_assert(busy_ns > 0.99 * wall_ns);
+			} else if (configs[j] == I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_COPY, 0)) {
+				/* Check that BCS engine had just tiny load, with 1% tolerance.
+				 * NOTE Some load on BCS is unavoidable under any graphical server, so we can't check for zero.
+				 */
+				igt_assert(busy_ns < 0.01 * wall_ns);
+			} else {
+				/* Check that other engines did not have any load.
+				 * NOTE This may fail if any other workload runs in parallel to this test. */
+				igt_assert_eq(busy_ns, 0);
+			}
+		}
+	}
+	perf_close(&pm);
+
+	/* Return how many engines we have tried. */
+	return sizeof(engines)/sizeof(engines[0]);
+}
+
+/* Keep one PMU group open while test_single() opens and checks its own, then verify
+ * each BUSY counter of the outer group is about 1/num_engines of the elapsed time. */
+static void test_parallel(int fd, uint32_t handle)
+{
+	struct pmu_metrics pm;
+	int configs[] = {
+		I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_RENDER, 0),
+		I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_VIDEO, 0),
+		I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_VIDEO, 1),
+		I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_COPY, 0),
+		I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_VIDEO_ENHANCE, 0),
+	};
+	int num_configs = sizeof(configs)/sizeof(configs[0]);
+	int num_engines;
+	struct timespec start, now;
+
+	igt_assert_eq(perf_init(&pm, num_configs, configs), 0);
+	igt_assert_eq(pm.num_metrics, num_configs);
+
+	clock_gettime(CLOCK_MONOTONIC, &start);
+	igt_assert_eq(perf_read(&pm), 0);
+
+	/* Create almost 100% load on the engines one by one, we will get back
+	 * how many engines were tried. */
+	num_engines = test_single(fd, handle);
+
+	igt_assert_eq(perf_read(&pm), 0);
+	clock_gettime(CLOCK_MONOTONIC, &now);
+	const uint64_t wall_ns = elapsed_ns(&start, &now);
+	igt_info("Executed on %d engines for %" PRIu64 "ns\n", num_engines, wall_ns);
+	for (int j = 0; j < num_configs; ++j) {
+		igt_info("  %s: %" PRIu64 "ns\n", perf_get_metric_name(pm.metrics[j].config), perf_elapsed(&pm.metrics[j]));
+
+		/* Since the engines were loaded in turn, one by one, for roughly the same
+		 * time, each should show roughly the same load: 1/num_engines of the total.
+		 */
+		igt_assert(perf_elapsed(&pm.metrics[j]) * num_engines > 0.99 * wall_ns);
+		igt_assert(perf_elapsed(&pm.metrics[j]) * num_engines < 1.01 * wall_ns);
+	}
+	perf_close(&pm);
+}
+
+igt_main
+{
+	uint32_t handle = 0;
+	int device = -1;
+
+	igt_fixture {
+		const uint32_t bbe = MI_BATCH_BUFFER_END;
+
+		device = drm_open_driver(DRIVER_INTEL);
+		igt_require_gem(device);
+
+		handle = gem_create(device, 4096);
+		gem_write(device, handle, 0, &bbe, sizeof(bbe)); /* batch holds only MI_BATCH_BUFFER_END */
+
+		igt_fork_hang_detector(device);
+	}
+
+	/* Test that we can initialize all the metrics. */
+	igt_subtest_f("init")
+		test_init();
+
+	/* Test that invalid initialization attempts are rejected. */
+	igt_subtest_f("invalid_init")
+		test_invalid_init();
+
+	/* Test a single metrics consumer. */
+	igt_subtest_f("single")
+		test_single(device, handle);
+
+	/* Test parallel metrics consumers. */
+	igt_subtest_f("parallel")
+		test_parallel(device, handle);
+
+	igt_fixture {
+		igt_stop_hang_detector();
+		gem_close(device, handle);
+		close(device);
+	}
+}
-- 
1.8.3.1