[Intel-gfx] [RFC i-g-t] tests/perf_pmu: test i915 RFC PMU
Dmitry Rogozhkin
dmitry.v.rogozhkin at intel.com
Thu Aug 24 15:05:56 UTC 2017
i915 RFC PMU:
* https://patchwork.freedesktop.org/series/27488/
* https://patchwork.freedesktop.org/series/28842/
Tests:
* init: try to initialize all possible metrics exposed in i915 PMU
(limit to 0-instance of engines)
* invalid_init: verify that i915 PMU correctly errors out on invalid
initialization
* single: verify that BUSY metrics work for each engine
* parallel: verify that parallel requests for metrics do not conflict
Signed-off-by: Dmitry Rogozhkin <dmitry.v.rogozhkin at intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
Cc: Chris Wilson <chris at chris-wilson.co.uk>
---
tests/Makefile.sources | 1 +
tests/perf_pmu.c | 546 +++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 547 insertions(+)
create mode 100644 tests/perf_pmu.c
diff --git a/tests/Makefile.sources b/tests/Makefile.sources
index bb013c7..51b684b 100644
--- a/tests/Makefile.sources
+++ b/tests/Makefile.sources
@@ -215,6 +215,7 @@ TESTS_progs = \
kms_vblank \
meta_test \
perf \
+ perf_pmu \
pm_backlight \
pm_lpsp \
pm_rc6_residency \
diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
new file mode 100644
index 0000000..0d025a6
--- /dev/null
+++ b/tests/perf_pmu.c
@@ -0,0 +1,546 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include "igt.h"
+#include "igt_sysfs.h"
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/poll.h>
+#include <sys/time.h>
+#include <time.h>
+#include "drm.h"
+
+/*
+ * Execbuffer flag bits OR'ed into execbuf.flags by nop_on_ring().
+ * NOTE(review): presumably local copies of the i915 uAPI flags
+ * I915_EXEC_NO_RELOC / I915_EXEC_HANDLE_LUT, kept so the test builds
+ * against older headers - confirm against i915_drm.h.
+ */
+#define LOCAL_I915_EXEC_NO_RELOC (1<<11)
+#define LOCAL_I915_EXEC_HANDLE_LUT (1<<12)
+
+////////////////////////////////////////////////////////////////////////
+// This is a copy of perf.h from intel-gpu-tools/overlay
+// because I am too lazy to move it to some common library
+////////////////////////////////////////////////////////////////////////
+
+#include <linux/perf_event.h>
+
+/*
+ * Engine class identifiers. In this file they are passed as the "class"
+ * argument of the I915_PMU_ENGINE_*() config macros.
+ * The numeric values are part of the PMU config encoding and must not be
+ * changed; I915_ENGINE_CLASS_MAX is only a counting sentinel.
+ */
+enum drm_i915_gem_engine_class {
+ I915_ENGINE_CLASS_OTHER = 0,
+ I915_ENGINE_CLASS_RENDER = 1,
+ I915_ENGINE_CLASS_COPY = 2,
+ I915_ENGINE_CLASS_VIDEO = 3,
+ I915_ENGINE_CLASS_VIDEO_ENHANCE = 4,
+ I915_ENGINE_CLASS_MAX /* non-ABI */
+};
+
+/*
+ * Per-engine sample types. The value is placed in the "sample" field
+ * (the lowest bits) of an engine PMU config by __I915_PMU_ENGINE().
+ */
+enum drm_i915_pmu_engine_sample {
+ I915_SAMPLE_QUEUED = 0,
+ I915_SAMPLE_BUSY = 1,
+ I915_SAMPLE_WAIT = 2,
+ I915_SAMPLE_SEMA = 3
+};
+
+/*
+ * Layout of an engine PMU config value, as built by __I915_PMU_ENGINE():
+ *   bits 0..3   sample type (I915_PMU_SAMPLE_BITS wide)
+ *   bits 4..11  engine instance (I915_PMU_SAMPLE_INSTANCE_BITS wide)
+ *   bits 12..   engine class (shifted by I915_PMU_CLASS_SHIFT)
+ */
+#define I915_PMU_SAMPLE_BITS (4)
+#define I915_PMU_SAMPLE_MASK (0xf)
+#define I915_PMU_SAMPLE_INSTANCE_BITS (8)
+#define I915_PMU_CLASS_SHIFT \
+ (I915_PMU_SAMPLE_BITS + I915_PMU_SAMPLE_INSTANCE_BITS)
+
+/* Pack class, instance and sample type into a single config value. */
+#define __I915_PMU_ENGINE(class, instance, sample) \
+ ((class) << I915_PMU_CLASS_SHIFT | \
+ (instance) << I915_PMU_SAMPLE_BITS | \
+ (sample))
+
+/* Convenience wrappers: one per sample type of enum drm_i915_pmu_engine_sample. */
+#define I915_PMU_ENGINE_QUEUED(class, instance) \
+ __I915_PMU_ENGINE(class, instance, I915_SAMPLE_QUEUED)
+
+#define I915_PMU_ENGINE_BUSY(class, instance) \
+ __I915_PMU_ENGINE(class, instance, I915_SAMPLE_BUSY)
+
+#define I915_PMU_ENGINE_WAIT(class, instance) \
+ __I915_PMU_ENGINE(class, instance, I915_SAMPLE_WAIT)
+
+#define I915_PMU_ENGINE_SEMA(class, instance) \
+ __I915_PMU_ENGINE(class, instance, I915_SAMPLE_SEMA)
+
+/*
+ * Non-engine metrics are numbered starting right after the largest value
+ * __I915_PMU_ENGINE() can produce for 8-bit class/instance and 4-bit sample
+ * (0xfffff), i.e. __I915_PMU_OTHER(0) == 0x100000.
+ */
+#define __I915_PMU_OTHER(x) (__I915_PMU_ENGINE(0xff, 0xff, 0xf) + 1 + (x))
+
+#define I915_PMU_ACTUAL_FREQUENCY __I915_PMU_OTHER(0)
+#define I915_PMU_REQUESTED_FREQUENCY __I915_PMU_OTHER(1)
+#define I915_PMU_ENERGY __I915_PMU_OTHER(2)
+#define I915_PMU_INTERRUPTS __I915_PMU_OTHER(3)
+
+#define I915_PMU_RC6_RESIDENCY __I915_PMU_OTHER(4)
+#define I915_PMU_RC6p_RESIDENCY __I915_PMU_OTHER(5)
+#define I915_PMU_RC6pp_RESIDENCY __I915_PMU_OTHER(6)
+
+/*
+ * Fallback syscall number for builds whose headers do not define
+ * __NR_perf_event_open. Only the x86 numbers are known here; every other
+ * architecture ends up with 0.
+ */
+#ifndef __NR_perf_event_open
+#if defined(__i386__)
+#define __NR_perf_event_open 336
+#elif defined(__x86_64__)
+#define __NR_perf_event_open 298
+#else
+#define __NR_perf_event_open 0
+#endif
+#endif
+
+/*
+ * Thin wrapper around the perf_event_open system call.
+ *
+ * Fills in attr->size with the size of the structure this file was compiled
+ * against, then forwards all arguments unchanged.
+ *
+ * Returns whatever syscall() returns, narrowed to int.
+ */
+static inline int
+perf_event_open(struct perf_event_attr *attr,
+		pid_t pid,
+		int cpu,
+		int group_fd,
+		unsigned long flags)
+{
+	long ret;
+
+	attr->size = sizeof(*attr);
+	ret = syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
+
+	return ret;
+}
+
+/*
+ * Read the perf event source type id of the i915 PMU from sysfs.
+ *
+ * Returns the number parsed from the sysfs "type" file, or 0 when the file
+ * cannot be opened or read.
+ */
+static uint64_t i915_type_id(void)
+{
+	char text[1024];
+	ssize_t len;
+	int sysfs;
+
+	sysfs = open("/sys/bus/event_source/devices/i915/type", O_RDONLY);
+	if (sysfs < 0)
+		return 0;
+
+	/* Leave room for the terminator added below. */
+	len = read(sysfs, text, sizeof(text) - 1);
+	close(sysfs);
+	if (len < 0)
+		return 0;
+
+	text[len] = '\0';
+	return strtoull(text, NULL, 0);
+}
+
+////////////////////////////////////////////////////////////////////////
+
+/* Time from *start to *end, in seconds, as a floating point value. */
+static double elapsed(const struct timespec *start, const struct timespec *end)
+{
+	double whole_seconds = end->tv_sec - start->tv_sec;
+	double fraction = (end->tv_nsec - start->tv_nsec) * 1e-9;
+
+	return whole_seconds + fraction;
+}
+
+/*
+ * Time from *start to *end, in nanoseconds.
+ *
+ * Computed entirely in integer arithmetic so that the result is exact and no
+ * floating point value has to be converted back to an unsigned integer.
+ * *end is expected not to be earlier than *start.
+ */
+static uint64_t elapsed_ns(const struct timespec *start, const struct timespec *end)
+{
+	int64_t sec = (int64_t)end->tv_sec - (int64_t)start->tv_sec;
+	int64_t nsec = (int64_t)end->tv_nsec - (int64_t)start->tv_nsec;
+
+	return (uint64_t)(sec * INT64_C(1000000000) + nsec);
+}
+
+/*
+ * Keep one engine busy by repeatedly submitting the same batch.
+ *
+ * fd       DRM device file descriptor
+ * handle   GEM handle of the batch buffer to execute
+ * ring_id  execbuffer ring selector, placed in execbuf.flags
+ * timeout  how long to keep submitting, in seconds
+ *
+ * The batch is submitted in bursts of 1024 until at least timeout seconds
+ * have passed since the first submission, then the function waits for the
+ * buffer with gem_sync().
+ */
+static void nop_on_ring(int fd, uint32_t handle, unsigned ring_id, int timeout)
+{
+	struct drm_i915_gem_exec_object2 batch;
+	struct drm_i915_gem_execbuffer2 eb;
+	struct timespec begin, current;
+	int remaining;
+
+	gem_require_ring(fd, ring_id);
+
+	memset(&batch, 0, sizeof(batch));
+	batch.handle = handle;
+
+	memset(&eb, 0, sizeof(eb));
+	eb.buffers_ptr = to_user_pointer(&batch);
+	eb.buffer_count = 1;
+	eb.flags = ring_id;
+	eb.flags |= LOCAL_I915_EXEC_HANDLE_LUT | LOCAL_I915_EXEC_NO_RELOC;
+
+	clock_gettime(CLOCK_MONOTONIC, &begin);
+
+	/* If the first submission with the extra flags fails, retry with the
+	 * bare ring selector and keep using it for the rest of the run. */
+	if (__gem_execbuf(fd, &eb) != 0) {
+		eb.flags = ring_id;
+		gem_execbuf(fd, &eb);
+	}
+
+	for (;;) {
+		remaining = 1024;
+		while (remaining-- > 0)
+			gem_execbuf(fd, &eb);
+
+		clock_gettime(CLOCK_MONOTONIC, &current);
+		if (!(elapsed(&begin, &current) < timeout))
+			break;
+	}
+
+	gem_sync(fd, handle);
+}
+
+/*
+ * Open one i915 PMU event on CPU 0 for all processes (pid -1).
+ *
+ * config       PMU config value selecting the metric
+ * group        fd of the group leader to attach to, or -1 to open a leader
+ * read_format  PERF_FORMAT_* bits; PERF_FORMAT_GROUP is dropped for events
+ *              that join an existing group
+ *
+ * Returns the event fd, -ENOENT when the i915 PMU type id cannot be
+ * determined, or the (negative) result of perf_event_open().
+ */
+static int perf_i915_open(int config, int group, int read_format)
+{
+	struct perf_event_attr attr;
+	const int joins_group = (group != -1);
+
+	memset(&attr, 0, sizeof(attr));
+
+	attr.type = i915_type_id();
+	if (!attr.type)
+		return -ENOENT;
+
+	attr.config = config;
+	attr.read_format = joins_group ?
+		read_format & ~PERF_FORMAT_GROUP : read_format;
+
+	return perf_event_open(&attr, -1, 0, group, 0);
+}
+
+/*
+ * One opened PMU counter: the config it was opened with, its two most recent
+ * readings (so a delta can be computed) and the perf event fd backing it.
+ */
+struct metric {
+	int config;
+	uint64_t old_value;
+	uint64_t value;
+	int fd;		/* owned by the enclosing pmu_metrics, closed by perf_close() */
+};
+
+/*
+ * A perf event group of i915 PMU counters.
+ *
+ * fd           group leader fd (-1 when no event is open); this is the fd
+ *              that gets read to fetch all counters at once
+ * read_format  PERF_FORMAT_* bits the leader was opened with
+ * num_metrics  number of valid entries in metrics[]
+ * metrics      array allocated by perf_init(), released by perf_close()
+ */
+struct pmu_metrics {
+	int fd;
+	int read_format;
+	int num_metrics;
+	struct metric *metrics;
+};
+
+/*
+ * Open the requested configs as a single perf event group.
+ *
+ * The first config that opens successfully becomes the group leader, every
+ * later one is attached to it. Configs that fail to open are skipped, so the
+ * caller has to compare pm->num_metrics with num_configs to learn whether
+ * everything was enabled.
+ *
+ * Every event fd is remembered in its struct metric so that perf_close() can
+ * release all of them, not just the leader.
+ *
+ * Returns 0 on success (even when no event could be opened) and -1 when the
+ * metrics array cannot be allocated.
+ */
+static int perf_init(struct pmu_metrics *pm, int num_configs, int *configs)
+{
+	int i, res;
+
+	memset(pm, 0, sizeof(*pm));
+	pm->fd = -1;
+	pm->read_format =
+		PERF_FORMAT_TOTAL_TIME_ENABLED |
+		PERF_FORMAT_GROUP;
+	pm->metrics = calloc(num_configs, sizeof(*pm->metrics));
+	if (!pm->metrics)
+		return -1;
+
+	for (i = 0; i < num_configs; ++i) {
+		/* pm->fd stays -1 until a leader exists, which is exactly the
+		 * "open a new leader" value perf_i915_open() expects. */
+		res = perf_i915_open(configs[i], pm->fd, pm->read_format);
+		if (res < 0)
+			continue;
+
+		if (pm->fd < 0)
+			pm->fd = res;
+
+		pm->metrics[pm->num_metrics].config = configs[i];
+		pm->metrics[pm->num_metrics].fd = res;
+		pm->num_metrics++;
+	}
+
+	igt_info("perf_init: enabled %d metrics from %d requested\n",
+		 pm->num_metrics, num_configs);
+
+	return 0;
+}
+
+/*
+ * Release everything perf_init() acquired: all event fds (the leader is one
+ * of the entries in metrics[]) and the metrics array. Safe to call twice.
+ */
+static void perf_close(struct pmu_metrics *pm)
+{
+	int i;
+
+	if (pm->metrics) {
+		for (i = 0; i < pm->num_metrics; ++i)
+			close(pm->metrics[i].fd);
+		free(pm->metrics);
+		pm->metrics = NULL;
+	}
+
+	pm->num_metrics = 0;
+	pm->fd = -1;
+}
+
+/* see 'man 2 perf_event_open' */
+/*
+ * Buffer layout used by perf_read() when reading the group leader opened
+ * with PERF_FORMAT_GROUP | PERF_FORMAT_TOTAL_TIME_ENABLED. The field order
+ * must match what the kernel writes, so do not reorder or pad this struct.
+ * values[] is sized for at most 1024 events in one group.
+ */
+struct perf_read_format {
+ uint64_t nr_values; /* The number of events */
+ uint64_t time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
+ struct {
+ uint64_t value; /* The value of the event */
+ } values[1024];
+};
+
+/*
+ * Take a new snapshot of every counter in the group.
+ *
+ * For each metric the previous reading is moved to old_value and the fresh
+ * one stored in value, so perf_elapsed() yields the delta between the two
+ * most recent calls.
+ *
+ * Returns 0 on success and -1 when the group is not open, was opened with a
+ * read format other than the one struct perf_read_format describes, the
+ * read fails or is too short, or the kernel reports a different number of
+ * events than were opened.
+ */
+static int perf_read(struct pmu_metrics *pm)
+{
+	const int expected_format =
+		PERF_FORMAT_TOTAL_TIME_ENABLED |
+		PERF_FORMAT_GROUP;
+	struct perf_read_format data;
+	/* Size of the fixed part (nr_values + time_enabled). */
+	const size_t header_len = sizeof(data) - sizeof(data.values);
+	ssize_t len;
+	int i;
+
+	if (pm->fd < 0)
+		return -1;
+
+	if (pm->read_format != expected_format)
+		return -1;
+
+	len = read(pm->fd, &data, sizeof(data));
+	if (len < 0)
+		return -1;
+
+	/* Do not look at nr_values unless the kernel actually wrote it. */
+	if ((size_t)len < header_len)
+		return -1;
+
+	if (data.nr_values != (uint64_t)pm->num_metrics)
+		return -1;
+
+	/* Make sure every value consumed below was really returned. */
+	if ((size_t)len < header_len + pm->num_metrics * sizeof(data.values[0]))
+		return -1;
+
+	for (i = 0; i < pm->num_metrics; ++i) {
+		pm->metrics[i].old_value = pm->metrics[i].value;
+		pm->metrics[i].value = data.values[i].value;
+	}
+
+	return 0;
+}
+
+/*
+ * Map a PMU config value to a printable metric name.
+ *
+ * Only the engine-busy configs used by the tests in this file are known;
+ * anything else yields "i915/unknown/". The returned string is a literal
+ * and must not be freed.
+ */
+static const char* perf_get_metric_name(int config)
+{
+	static const struct {
+		int config;
+		const char *name;
+	} known[] = {
+		{ I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_RENDER, 0), "i915/rcs0-busy/" },
+		{ I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_VIDEO, 0), "i915/vcs0-busy/" },
+		{ I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_VIDEO, 1), "i915/vcs1-busy/" },
+		{ I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_COPY, 0), "i915/bcs0-busy/" },
+		{ I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_VIDEO_ENHANCE, 0), "i915/vecs0-busy/" },
+	};
+	size_t idx;
+
+	for (idx = 0; idx < sizeof(known) / sizeof(known[0]); idx++) {
+		if (known[idx].config == config)
+			return known[idx].name;
+	}
+
+	return "i915/unknown/";
+}
+
+/* Change of a counter between the two most recent perf_read() snapshots. */
+static uint64_t perf_elapsed(struct metric* m)
+{
+	const uint64_t previous = m->old_value;
+	const uint64_t latest = m->value;
+
+	return latest - previous;
+}
+
+/*
+ * Subtest "init": every metric requested below must open successfully and
+ * the resulting group must be readable.
+ *
+ * For each listed engine class the BUSY, QUEUED, WAIT and SEMA samples of
+ * instance 0 are requested, followed by the frequency, energy and RC6
+ * residency metrics.
+ */
+static void test_init(void)
+{
+	struct pmu_metrics pm;
+	/* NOTE(review): I915_ENGINE_CLASS_VIDEO is listed twice, so the same
+	 * vcs0 configs are requested twice - presumably the second entry was
+	 * meant to stand for instance 1; confirm before changing. */
+	unsigned int class[] =
+	{
+		I915_ENGINE_CLASS_RENDER,
+		I915_ENGINE_CLASS_VIDEO,
+		I915_ENGINE_CLASS_VIDEO,
+		I915_ENGINE_CLASS_COPY,
+		I915_ENGINE_CLASS_VIDEO_ENHANCE,
+	};
+	int *configs = malloc(1024 * sizeof(*configs));
+	int num_configs = 0;
+	int res;
+
+	igt_assert(configs != NULL);
+
+	for (int i = 0; i < sizeof(class)/sizeof(class[0]); ++i) {
+		/* TODO Adding metrics for 0-instances only. Would be nice
+		 * to get everything, but for that we either need to add
+		 * check for different platforms here or use upcoming
+		 * engines discover API.
+		 */
+		configs[num_configs++] = I915_PMU_ENGINE_BUSY(class[i], 0);
+		configs[num_configs++] = I915_PMU_ENGINE_QUEUED(class[i], 0);
+		configs[num_configs++] = I915_PMU_ENGINE_WAIT(class[i], 0);
+		configs[num_configs++] = I915_PMU_ENGINE_SEMA(class[i], 0);
+	}
+	configs[num_configs++] = I915_PMU_ACTUAL_FREQUENCY;
+	configs[num_configs++] = I915_PMU_REQUESTED_FREQUENCY;
+	configs[num_configs++] = I915_PMU_ENERGY;
+	configs[num_configs++] = I915_PMU_RC6_RESIDENCY;
+	configs[num_configs++] = I915_PMU_RC6p_RESIDENCY;
+	configs[num_configs++] = I915_PMU_RC6pp_RESIDENCY;
+
+	/* perf_init() copies the config values it needs, so the scratch
+	 * array can be released before any assertion gets a chance to bail
+	 * out of the subtest. */
+	res = perf_init(&pm, num_configs, configs);
+	free(configs);
+
+	igt_assert_eq(res, 0);
+	igt_assert_eq(perf_read(&pm), 0);
+	igt_assert_eq(pm.num_metrics, num_configs);
+
+	perf_close(&pm);
+}
+
+/* Tests that i915 PMU correctly errors out on invalid initialization.
+ * i915 PMU is an uncore PMU, thus:
+ * - sampling period is not supported
+ * - pid > 0 is not supported since we can't count per-process (we count
+ *   for the whole system)
+ * - cpu != 0 is not supported since i915 PMU exposes cpumask for CPU0
+ *
+ * Each case starts from a freshly initialised attr for the rcs0 BUSY metric
+ * and changes exactly one thing, then checks both the -1 return value and
+ * the errno left behind by perf_event_open().
+ */
+static void test_invalid_init(void)
+{
+	struct perf_event_attr attr;
+	int pid, cpu;
+
+#define ATTR_INIT() \
+	do { \
+		memset(&attr, 0, sizeof (attr)); \
+		attr.config = I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_RENDER, 0); \
+		attr.type = i915_type_id(); \
+		igt_assert(attr.type != 0); \
+	} while(0)
+
+	/* A sampling period is rejected. */
+	ATTR_INIT();
+	attr.sample_period = 100;
+	pid = -1;
+	cpu = 0;
+	igt_assert_eq(perf_event_open(&attr, pid, cpu, -1, 0), -1);
+	igt_assert_eq(errno, EINVAL);
+
+	/* Counting for a specific process (here: the caller, pid 0) is rejected. */
+	ATTR_INIT();
+	pid = 0;
+	cpu = 0;
+	igt_assert_eq(perf_event_open(&attr, pid, cpu, -1, 0), -1);
+	igt_assert_eq(errno, EINVAL);
+
+	/* Any CPU other than 0 is rejected. */
+	ATTR_INIT();
+	pid = -1;
+	cpu = 1;
+	igt_assert_eq(perf_event_open(&attr, pid, cpu, -1, 0), -1);
+	igt_assert_eq(errno, ENODEV);
+
+	/* The helper is only meaningful inside this function. */
+#undef ATTR_INIT
+}
+
+/*
+ * Subtest "single": load each engine in turn and check the BUSY counters.
+ *
+ * For every engine in the table the function snapshots the counters, keeps
+ * that engine busy with nop_on_ring() for 20 seconds, snapshots again and
+ * compares every counter delta with the wall-clock time of the interval.
+ * The comparison treats the counter deltas as nanoseconds.
+ *
+ * fd      DRM device file descriptor
+ * handle  GEM handle of the batch buffer to submit
+ *
+ * Returns the number of engines that were exercised.
+ */
+static int test_single(int fd, uint32_t handle)
+{
+	struct {
+		const char *engine_name;
+		unsigned int class;
+		unsigned int instance;
+		unsigned int ring_id;
+	} engines[] = {
+		{ "rcs0", I915_ENGINE_CLASS_RENDER, 0, I915_EXEC_RENDER },
+		{ "vcs0", I915_ENGINE_CLASS_VIDEO, 0, I915_EXEC_BSD | I915_EXEC_BSD_RING1 },
+		{ "vcs1", I915_ENGINE_CLASS_VIDEO, 1, I915_EXEC_BSD | I915_EXEC_BSD_RING2 },
+		{ "bcs0", I915_ENGINE_CLASS_COPY, 0, I915_EXEC_BLT },
+		{ "vecs0", I915_ENGINE_CLASS_VIDEO_ENHANCE, 0, I915_EXEC_VEBOX },
+	};
+	struct pmu_metrics pm;
+	int configs[] = {
+		I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_RENDER, 0),
+		I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_VIDEO, 0),
+		I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_VIDEO, 1),
+		I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_COPY, 0),
+		I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_VIDEO_ENHANCE, 0),
+	};
+	int num_configs = sizeof(configs)/sizeof(configs[0]);
+	struct timespec start, now;
+
+	igt_assert_eq(perf_init(&pm, num_configs, configs), 0);
+	/* All configs opened, so pm.metrics[j] corresponds to configs[j]. */
+	igt_assert_eq(pm.num_metrics, num_configs);
+
+	for (int i = 0; i < sizeof(engines)/sizeof(engines[0]); ++i) {
+		uint64_t wall_ns;
+
+		clock_gettime(CLOCK_MONOTONIC, &start);
+		igt_assert_eq(perf_read(&pm), 0);
+
+		/* Create almost 100% load on the examined engine for specified time. */
+		nop_on_ring(fd, handle, engines[i].ring_id, 20);
+
+		igt_assert_eq(perf_read(&pm), 0);
+		clock_gettime(CLOCK_MONOTONIC, &now);
+
+		/* The wall-clock interval brackets both perf_read() calls, so
+		 * it is an upper bound for every counter delta. */
+		wall_ns = elapsed_ns(&start, &now);
+
+		igt_info("Executed on %s for %" PRIu64 "ns\n", engines[i].engine_name, wall_ns);
+		for (int j = 0; j < num_configs; ++j) {
+			uint64_t busy_ns = perf_elapsed(&pm.metrics[j]);
+
+			igt_info(" %s: %" PRIu64 "ns\n", perf_get_metric_name(pm.metrics[j].config), busy_ns);
+
+			igt_assert(busy_ns < wall_ns);
+
+			if (configs[j] == I915_PMU_ENGINE_BUSY(engines[i].class, engines[i].instance)) {
+				/* Check that the loaded engine had almost 100% load, we will have 1% tolerance. */
+				igt_assert(busy_ns > 0.99 * wall_ns);
+			} else if (configs[j] == I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_COPY, 0)) {
+				/* Check that BCS engine had just tiny load, we will have 1% tolerance.
+				 * NOTE Some load on BCS is non-avoidable if you run under any graphical server,
+				 * so we can't check for zero.
+				 */
+				igt_assert(busy_ns < 0.01 * wall_ns);
+			} else {
+				/* Check that other engines did not have any load.
+				 * NOTE This may fail if you have any other workload running in parallel to this test.
+				 * Compared as 64-bit so that a large delta cannot be
+				 * truncated to a passing int value.
+				 */
+				igt_assert_eq_u64(busy_ns, 0);
+			}
+		}
+	}
+	perf_close(&pm);
+
+	/* Return how many engines we have tried. */
+	return sizeof(engines)/sizeof(engines[0]);
+}
+
+/*
+ * Subtest "parallel": two independent consumers of the same metrics.
+ *
+ * An outer event group is opened and snapshotted, then test_single() runs
+ * with its own group over the same configs, then the outer group is
+ * snapshotted again. Every engine was loaded for roughly the same share of
+ * the total time, so each outer BUSY delta multiplied by the number of
+ * engines must be within 1% of the total wall-clock time.
+ *
+ * fd      DRM device file descriptor
+ * handle  GEM handle of the batch buffer to submit
+ */
+static void test_parallel(int fd, uint32_t handle)
+{
+	struct pmu_metrics pm;
+	int configs[] = {
+		I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_RENDER, 0),
+		I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_VIDEO, 0),
+		I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_VIDEO, 1),
+		I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_COPY, 0),
+		I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_VIDEO_ENHANCE, 0),
+	};
+	int num_configs = sizeof(configs)/sizeof(configs[0]);
+	int num_engines;
+	uint64_t wall_ns;
+	struct timespec start, now;
+
+	igt_assert_eq(perf_init(&pm, num_configs, configs), 0);
+	igt_assert_eq(pm.num_metrics, num_configs);
+
+	clock_gettime(CLOCK_MONOTONIC, &start);
+	igt_assert_eq(perf_read(&pm), 0);
+
+	/* Create almost 100% load on the engines one by one, we will get back
+	 * how many engines were tried.
+	 */
+	num_engines = test_single(fd, handle);
+
+	igt_assert_eq(perf_read(&pm), 0);
+	clock_gettime(CLOCK_MONOTONIC, &now);
+
+	wall_ns = elapsed_ns(&start, &now);
+
+	igt_info("Executed on %d engines for %" PRIu64 "ns\n", num_engines, wall_ns);
+	for (int j = 0; j < num_configs; ++j) {
+		uint64_t busy_ns = perf_elapsed(&pm.metrics[j]);
+
+		igt_info(" %s: %" PRIu64 "ns\n", perf_get_metric_name(pm.metrics[j].config), busy_ns);
+
+		/* Since the engines were loaded in turns, one by one, for roughly the
+		 * same time, each should have produced roughly the same load,
+		 * proportional to the number of engines.
+		 */
+		igt_assert(busy_ns * num_engines > 0.99 * wall_ns);
+		igt_assert(busy_ns * num_engines < 1.01 * wall_ns);
+	}
+	perf_close(&pm);
+}
+
+/*
+ * Test entry point. The first fixture opens the Intel DRM device, creates a
+ * 4096-byte buffer holding a single MI_BATCH_BUFFER_END and starts the hang
+ * detector; the last fixture undoes all three. The subtests in between share
+ * that device fd and batch handle.
+ */
+igt_main
+{
+ uint32_t handle = 0;
+ int device = -1;
+
+ igt_fixture {
+ const uint32_t bbe = MI_BATCH_BUFFER_END;
+
+ device = drm_open_driver(DRIVER_INTEL);
+ igt_require_gem(device);
+
+ handle = gem_create(device, 4096);
+ gem_write(device, handle, 0, &bbe, sizeof(bbe));
+
+ igt_fork_hang_detector(device);
+ }
+
+ /* Test that we can initialize all the metrics. */
+ igt_subtest_f("init")
+ test_init();
+
+ /* Test that invalid initialization requests are rejected. */
+ igt_subtest_f("invalid_init")
+ test_invalid_init();
+
+ /* Test a single metrics consumer. */
+ igt_subtest_f("single")
+ test_single(device, handle);
+
+ /* Test parallel metrics consumers. */
+ igt_subtest_f("parallel")
+ test_parallel(device, handle);
+
+ igt_fixture {
+ igt_stop_hang_detector();
+ gem_close(device, handle);
+ close(device);
+ }
+}
--
1.8.3.1
More information about the Intel-gfx
mailing list