[Intel-gfx] [PATCH i-g-t 5/7] tests/perf_pmu: Tests for i915 PMU API

Chris Wilson chris at chris-wilson.co.uk
Mon Sep 25 16:21:30 UTC 2017


Quoting Tvrtko Ursulin (2017-09-25 16:15:00)
> From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> 
> A bunch of tests for the new i915 PMU feature.
> 
> Parts of the code were initially sketched by Dmitry Rogozhkin.
> 
> v2: (Most suggestions by Chris Wilson)
>  * Add new class/instance based engine list.
>  * Add gem_has_engine/gem_require_engine to work with class/instance.
>  * Use the above two throughout the test.
>  * Shorten tests to 100ms busy batches, seems enough.
>  * Add queued counter sanity checks.
>  * Use igt_nsec_elapsed.
>  * Skip on perf -ENODEV in some tests instead of embedding knowledge locally.
>  * Fix multi ordering for busy accounting.
>  * Use new guaranteed_usleep when sleep time is asserted on.
>  * Check for no queued when idle/busy.
>  * Add queued counter init test.
>  * Add queued tests.
>  * Consolidate and increase multiple busy engines tests to most-busy and
>    all-busy tests.
>  * Guarantee interrupts by using fences.
>  * Test RC6 via forcewake.
> 
> v3:
>  * Tweak assert in interrupts subtest.
>  * Sprinkle of comments.
>  * Fix multi-client test which got broken in v2.
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> Cc: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Dmitry Rogozhkin <dmitry.v.rogozhkin at intel.com>
> ---
>  lib/igt_gt.c           |  50 +++
>  lib/igt_gt.h           |  38 +++
>  lib/igt_perf.h         |   9 +-
>  tests/Makefile.sources |   1 +
>  tests/perf_pmu.c       | 869 +++++++++++++++++++++++++++++++++++++++++++++++++
>  5 files changed, 959 insertions(+), 8 deletions(-)
>  create mode 100644 tests/perf_pmu.c
> 
> diff --git a/lib/igt_gt.c b/lib/igt_gt.c
> index b3f3b3809eee..4c75811fb1b3 100644
> --- a/lib/igt_gt.c
> +++ b/lib/igt_gt.c
> @@ -568,3 +568,53 @@ bool gem_can_store_dword(int fd, unsigned int engine)
>  
>         return true;
>  }
> +

> +const struct intel_execution_engine2 intel_execution_engines2[] = {

Do you want to do a quick
	s/intel_execution_engine/intel_execution_legacy_ring/
now, or wait until class/inst is the natural execbuf interface?

> +const double tolerance = 0.02f;
> +const unsigned long batch_duration_ns = 100 * 1000 * 1000;
> +
> +static int open_pmu(uint64_t config)
> +{
> +       int fd;
> +
> +       fd = perf_i915_open(config);
> +       igt_require(fd >= 0 || (fd < 0 && errno != ENODEV));
> +       igt_assert(fd >= 0);

So why are we not allowed to disable perf counters?

> +       return fd;
> +}
> +
> +static int open_group(uint64_t config, int group)
> +{
> +       int fd;
> +
> +       fd = perf_i915_open_group(config, group);
> +       igt_require(fd >= 0 || (fd < 0 && errno != ENODEV));
> +       igt_assert(fd >= 0);
> +
> +       return fd;
> +}
> +
> +static void
> +init(int gem_fd, const struct intel_execution_engine2 *e, uint8_t sample)
> +{
> +       int fd;
> +
> +       fd = open_pmu(__I915_PMU_ENGINE(e->class, e->instance, sample));
> +
> +       close(fd);
> +}
> +
> +static uint64_t pmu_read_single(int fd)
> +{
> +       uint64_t data[2];
> +       ssize_t len;
> +
> +       len = read(fd, data, sizeof(data));
> +       igt_assert_eq(len, sizeof(data));

read() isn't a complicated macro, so won't this have a reasonable
string as igt_assert_eq(read(fd, data, sizeof(data)), sizeof(data)) ?

> +       return data[0];
> +}
> +
> +static void pmu_read_multi(int fd, unsigned int num, uint64_t *val)
> +{
> +       uint64_t buf[2 + num];
> +       unsigned int i;
> +       ssize_t len;
> +
> +       len = read(fd, buf, sizeof(buf));
> +       igt_assert_eq(len, sizeof(buf));
> +       for (i = 0; i < num; i++)
> +               val[i] = buf[2 + i];
> +}
> +
> +#define assert_within_epsilon(x, ref, tolerance) \
> +       igt_assert_f((double)(x) <= (1.0 + tolerance) * (double)ref && \
> +                    (double)(x) >= (1.0 - tolerance) * (double)ref, \
> +                    "'%s' != '%s' (%f not within %f%% tolerance of %f)\n",\
> +                    #x, #ref, (double)x, tolerance * 100.0, (double)ref)
> +
> +/*
> + * Helper for cases where we assert on time spent sleeping (directly or
> + * indirectly), so make it more robust by ensuring the system sleep time
> + * is within test tolerance to start with.
> + */

Looking at the callers, I don't see where we need a guaranteed sleep vs
a known sleep. We are comparing elapsed/wall time with a perf counter, so
we only care that they match. By asserting we just cause random fails if
the system decides not to wake up for 1s (outside the purview of the
test).
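
A measured sleep rather than a guaranteed one would do, e.g. (sketch only,
measured_usleep() is just a suggested name):

/* Sketch: sleep and return the time actually slept, in nanoseconds, so the
 * caller can compare the PMU delta against the same wall-clock interval.
 */
static uint64_t measured_usleep(unsigned int usec)
{
	struct timespec ts = { };

	igt_nsec_elapsed(&ts);
	usleep(usec);

	return igt_nsec_elapsed(&ts);
}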

> +static void guaranteed_usleep(unsigned int usec)
> +{
> +       uint64_t slept = 0, to_sleep = usec;
> +
> +       while (usec > 0) {
> +               struct timespec start = { };
> +               uint64_t this_sleep;
> +
> +               igt_nsec_elapsed(&start);
> +               usleep(usec);
> +               this_sleep = igt_nsec_elapsed(&start) / 1000;
> +               slept += this_sleep;
> +               if (this_sleep > usec)
> +                       break;
> +               usec -= this_sleep;
> +       }
> +
> +       assert_within_epsilon(slept, to_sleep, tolerance);
> +}
> +
> +static unsigned int e2ring(int gem_fd, const struct intel_execution_engine2 *e)
> +{
> +       return gem_class_instance_to_eb_flags(gem_fd, e->class, e->instance);
> +}
> +
> +static void
> +single(int gem_fd, const struct intel_execution_engine2 *e, uint8_t sample,
> +       bool busy)
> +{
> +       double ref = busy && sample == I915_SAMPLE_BUSY ?
> +                    batch_duration_ns : 0.0f;
> +       igt_spin_t *spin;
> +       uint64_t val;
> +       int fd;
> +
> +       fd = open_pmu(__I915_PMU_ENGINE(e->class, e->instance, sample));
> +
> +       if (busy) {
> +               spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
> +               igt_spin_batch_set_timeout(spin, batch_duration_ns);
> +       } else {
> +               guaranteed_usleep(batch_duration_ns / 1000);
> +       }
> +
> +       if (busy)
> +               gem_sync(gem_fd, spin->handle);
> +
> +       val = pmu_read_single(fd);
> +
> +       assert_within_epsilon(val, ref, tolerance);
> +
> +       if (busy)
> +               igt_spin_batch_free(gem_fd, spin);
> +       close(fd);
> +}
> +
> +static void
> +queued(int gem_fd, const struct intel_execution_engine2 *e)
> +{
> +       igt_spin_t *spin[2];
> +       uint64_t val;
> +       int fd;
> +
> +       fd = open_pmu(I915_PMU_ENGINE_QUEUED(e->class, e->instance));
> +
> +       /*
> +        * First spin batch will be immediately executing.
> +        */
> +       spin[0] = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
> +       igt_spin_batch_set_timeout(spin[0], batch_duration_ns);
> +
> +       /*
> +        * Second spin batch will sit in the execution queue behind the
> +        * first one so must cause the PMU to correctly report the queued
> +        * counter.
> +        */

Hmm, the second to the same ctx should be executed. We will see that
port[1] is empty and so ask if we can fit the batch into port[1]. In
fact, it turns out that we can coalesce the batch with port[0].
Once coalesced, we can't distinguish the two requests since they are a
single context in the ELSP.

All this is presuming execlists...

iirc execbuf-ioctl() -> [untracked]
  \- submit_request -> QUEUED
      \- execlists_dequeue -> BUSY


> +       spin[1] = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
> +       igt_spin_batch_set_timeout(spin[1], batch_duration_ns);
> +
> +       gem_sync(gem_fd, spin[0]->handle);
> +
> +       val = pmu_read_single(fd);
> +       assert_within_epsilon(val, batch_duration_ns, tolerance);
> +
> +       gem_sync(gem_fd, spin[1]->handle);
> +
> +       igt_spin_batch_free(gem_fd, spin[0]);
> +       igt_spin_batch_free(gem_fd, spin[1]);
> +       close(fd);
> +}
> +
> +static void
> +busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
> +              const unsigned int num_engines)
> +{
> +       const struct intel_execution_engine2 *e_;
> +       uint64_t val[num_engines];
> +       int fd[num_engines];
> +       igt_spin_t *spin;
> +       unsigned int busy_idx, i;
> +
> +       i = 0;
> +       fd[0] = -1;
> +       for_each_engine_class_instance(fd, e_) {
> +               if (!gem_has_engine(gem_fd, e_->class, e_->instance))
> +                       continue;
> +               else if (e == e_)
> +                       busy_idx = i;
> +
> +               fd[i++] = open_group(I915_PMU_ENGINE_BUSY(e_->class,
> +                                                         e_->instance),
> +                                    fd[0]);
> +       }
> +
> +       spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
> +       igt_spin_batch_set_timeout(spin, batch_duration_ns);
> +
> +       gem_sync(gem_fd, spin->handle);
> +
> +       pmu_read_multi(fd[0], num_engines, val);
> +
> +       assert_within_epsilon(val[busy_idx], batch_duration_ns, tolerance);
> +       for (i = 0; i < num_engines; i++) {

double expected;

if (i == busy_idx)
	expected = batch_duration_ns;
else
	expected = 0;
assert_within_epsilon(val[i], expected, tolerance);

Probably worth a preceding

char buf[1024];
int len = 0;

for (i = 0; i < num_engines; i++)
	len += sprintf(buf + len, "%s%s=%f, ", i == busy_idx ? "*" : "", name[i], (double)val[i]);
buf[len - 2] = '\0';
igt_info("%s\n", buf);

How about batches executing from a different context, or submitted from
a different fd?
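
E.g. (sketch, just to show the idea with gem_context_create()):

	uint32_t ctx = gem_context_create(gem_fd);
	igt_spin_t *spin = igt_spin_batch_new(gem_fd, ctx, e2ring(gem_fd, e), 0);

and assert the engine still reports full busyness for the foreign context
before gem_context_destroy(gem_fd, ctx).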

> +               if (i == busy_idx)
> +                       continue;
> +               assert_within_epsilon(val[i], 0.0f, tolerance);
> +       }
> +
> +       igt_spin_batch_free(gem_fd, spin);
> +       close(fd[0]);
> +}
> +
> +static void
> +most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
> +                   const unsigned int num_engines)
> +{
> +       const struct intel_execution_engine2 *e_;
> +       uint64_t val[num_engines];
> +       int fd[num_engines];
> +       igt_spin_t *spin[num_engines];
> +       unsigned int idle_idx, i;
> +
> +       gem_require_engine(gem_fd, e->class, e->instance);
> +
> +       i = 0;
> +       fd[0] = -1;
> +       for_each_engine_class_instance(fd, e_) {
> +               if (!gem_has_engine(gem_fd, e_->class, e_->instance))
> +                       continue;
> +
> +               fd[i] = open_group(I915_PMU_ENGINE_BUSY(e_->class,
> +                                                       e_->instance),
> +                                  fd[0]);
> +
> +               if (e == e_) {
> +                       idle_idx = i;
> +               } else {
> +                       spin[i] = igt_spin_batch_new(gem_fd, 0,
> +                                                    e2ring(gem_fd, e_), 0);
> +                       igt_spin_batch_set_timeout(spin[i], batch_duration_ns);
> +               }
> +
> +               i++;
> +       }
> +
> +       for (i = 0; i < num_engines; i++) {
> +               if (i != idle_idx)
> +                       gem_sync(gem_fd, spin[i]->handle);
> +       }
> +
> +       pmu_read_multi(fd[0], num_engines, val);
> +
> +       for (i = 0; i < num_engines; i++) {
> +               if (i == idle_idx)
> +                       assert_within_epsilon(val[i], 0.0f, tolerance);
> +               else
> +                       assert_within_epsilon(val[i], batch_duration_ns,
> +                                             tolerance);
> +       }
> +
> +       for (i = 0; i < num_engines; i++) {
> +               if (i != idle_idx)
> +                       igt_spin_batch_free(gem_fd, spin[i]);
> +       }
> +       close(fd[0]);
> +}
> +
> +static void
> +all_busy_check_all(int gem_fd, const unsigned int num_engines)
> +{
> +       const struct intel_execution_engine2 *e;
> +       uint64_t val[num_engines];
> +       int fd[num_engines];
> +       igt_spin_t *spin[num_engines];
> +       unsigned int i;
> +
> +       i = 0;
> +       fd[0] = -1;
> +       for_each_engine_class_instance(fd, e) {
> +               if (!gem_has_engine(gem_fd, e->class, e->instance))
> +                       continue;
> +
> +               fd[i] = open_group(I915_PMU_ENGINE_BUSY(e->class, e->instance),
> +                                  fd[0]);
> +
> +               spin[i] = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
> +               igt_spin_batch_set_timeout(spin[i], batch_duration_ns);
> +
> +               i++;
> +       }
> +
> +       for (i = 0; i < num_engines; i++)
> +               gem_sync(gem_fd, spin[i]->handle);
> +
> +       pmu_read_multi(fd[0], num_engines, val);
> +
> +       for (i = 0; i < num_engines; i++)
> +               assert_within_epsilon(val[i], batch_duration_ns, tolerance);
> +
> +       for (i = 0; i < num_engines; i++)
> +               igt_spin_batch_free(gem_fd, spin[i]);
> +       close(fd[0]);
> +}
> +
> +static void
> +no_sema(int gem_fd, const struct intel_execution_engine2 *e, bool busy)
> +{
> +       igt_spin_t *spin;
> +       uint64_t val[2];
> +       int fd;
> +
> +       fd = open_group(I915_PMU_ENGINE_SEMA(e->class, e->instance), -1);
> +       open_group(I915_PMU_ENGINE_WAIT(e->class, e->instance), fd);
> +
> +       if (busy) {
> +               spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
> +               igt_spin_batch_set_timeout(spin, batch_duration_ns);

These busy paths are not serialised to the batch, it is quite possible
for the pmu_read to happen before the spin is even scheduled for
execution on the hw.
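
One way to serialise (sketch, reusing open_pmu()/pmu_read_single() from this
test plus an extra BUSY event) would be to poll until the engine actually
starts executing before sampling:

	int busy_fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));

	/* Wait for the spinner to reach the hw before reading SEMA/WAIT. */
	while (pmu_read_single(busy_fd) == 0)
		usleep(100);

	close(busy_fd);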

> +       } else {
> +               usleep(batch_duration_ns / 1000);
> +       }
> +
> +       pmu_read_multi(fd, 2, val);
> +
> +       assert_within_epsilon(val[0], 0.0f, tolerance);
> +       assert_within_epsilon(val[1], 0.0f, tolerance);
> +
> +       if (busy)
> +               igt_spin_batch_free(gem_fd, spin);
> +       close(fd);

I'm still at a loss as to why this test is interesting. You want to say
that given an idle engine with a single batch there is no time spent
waiting for an MI event, nor waiting on a semaphore. Where is that
guaranteed? What happens if we do need to do such a pre-batch + sync in
the future. If you wanted to say that, whilst the batch is executing,
no time is spent processing requests the batch isn't making, then maybe.
But without a test to demonstrate the opposite, we still have no idea if
the counters work.

> +}
> +
> +static void
> +multi_client(int gem_fd, const struct intel_execution_engine2 *e)
> +{
> +       uint64_t config = I915_PMU_ENGINE_BUSY(e->class, e->instance);
> +       igt_spin_t *spin;
> +       uint64_t val[2];
> +       int fd[2];
> +
> +       fd[0] = open_pmu(config);
> +
> +       spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
> +       igt_spin_batch_set_timeout(spin, batch_duration_ns);
> +
> +       guaranteed_usleep(batch_duration_ns / 3000);
> +
> +       /*
> +        * Second PMU client which is initialized after the first one,
> +        * and exits before it, should not affect accounting as reported
> +        * in the first client.
> +        */
> +       fd[1] = open_pmu(config);
> +       guaranteed_usleep(batch_duration_ns / 3000);
> +       val[1] = pmu_read_single(fd[1]);
> +       close(fd[1]);
> +
> +       gem_sync(gem_fd, spin->handle);
> +
> +       val[0] = pmu_read_single(fd[0]);
> +
> +       assert_within_epsilon(val[0], batch_duration_ns, tolerance);
> +       assert_within_epsilon(val[1], batch_duration_ns / 3, tolerance);
> +
> +       igt_spin_batch_free(gem_fd, spin);
> +       close(fd[0]);
> +}
> +
> +/**
> + * Tests that i915 PMU correctly errors out on invalid initialization.
> + * i915 PMU is uncore PMU, thus:
> + *  - sampling period is not supported
> + *  - pid > 0 is not supported since we can't count per-process (we count
> + *    per whole system)
> + *  - cpu != 0 is not supported since i915 PMU exposes cpumask for CPU0
> + */
> +static void invalid_init(void)
> +{
> +       struct perf_event_attr attr;
> +       int pid, cpu;
> +
> +#define ATTR_INIT() \
> +do { \
> +       memset(&attr, 0, sizeof (attr)); \
> +       attr.config = I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_RENDER, 0); \
> +       attr.type = i915_type_id(); \
> +       igt_assert(attr.type != 0); \
> +} while(0)
> +
> +       ATTR_INIT();
> +       attr.sample_period = 100;
> +       pid = -1;
> +       cpu = 0;
> +       igt_assert_eq(perf_event_open(&attr, pid, cpu, -1, 0), -1);
> +       igt_assert_eq(errno, EINVAL);
> +
> +       ATTR_INIT();
> +       pid = 0;
> +       cpu = 0;
> +       igt_assert_eq(perf_event_open(&attr, pid, cpu, -1, 0), -1);
> +       igt_assert_eq(errno, EINVAL);
> +
> +       ATTR_INIT();
> +       pid = -1;
> +       cpu = 1;
> +       igt_assert_eq(perf_event_open(&attr, pid, cpu, -1, 0), -1);
> +       igt_assert_eq(errno, ENODEV);
> +}
> +
> +static void init_other(unsigned int i, bool valid)
> +{
> +       int fd;
> +
> +       fd = perf_i915_open(__I915_PMU_OTHER(i));
> +       igt_require(!(fd < 0 && errno == ENODEV));

Make perf_i915_open return -errno. It simplifies checks so much.
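
e.g. init_other() then collapses to something like (sketch, assuming
perf_i915_open() is changed to return -errno on failure):

static void init_other(unsigned int i, bool valid)
{
	int fd;

	fd = perf_i915_open(__I915_PMU_OTHER(i));
	igt_require(fd != -ENODEV);	/* skip if the i915 PMU is absent */

	if (!valid) {
		igt_assert(fd < 0);
		return;
	}

	igt_assert(fd >= 0);
	close(fd);
}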

> +       if (valid) {
> +               igt_assert(fd >= 0);
> +       } else {
> +               igt_assert(fd < 0);
> +               return;
> +       }
> +
> +       close(fd);
> +}
> +
> +static void read_other(unsigned int i, bool valid)
> +{
> +       int fd;
> +
> +       fd = perf_i915_open(__I915_PMU_OTHER(i));
> +       igt_require(!(fd < 0 && errno == ENODEV));
> +       if (valid) {
> +               igt_assert(fd >= 0);
> +       } else {
> +               igt_assert(fd < 0);
> +               return;
> +       }
> +
> +       (void)pmu_read_single(fd);
> +
> +       close(fd);
> +}
> +
> +static bool cpu0_hotplug_support(void)
> +{
> +       int fd = open("/sys/devices/system/cpu/cpu0/online", O_WRONLY);

access("..", W_OK);
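
i.e. roughly:

static bool cpu0_hotplug_support(void)
{
	return access("/sys/devices/system/cpu/cpu0/online", W_OK) == 0;
}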

> +
> +       close(fd);
> +
> +       return fd > 0;
> +}
> +
> +static void cpu_hotplug(int gem_fd)
> +{
> +       struct timespec start = { };
> +       igt_spin_t *spin;
> +       uint64_t val, ref;
> +       int fd;
> +
> +       igt_require(cpu0_hotplug_support());
> +
> +       spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
> +       fd = perf_i915_open(I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_RENDER, 0));
> +       igt_assert(fd >= 0);
> +
> +       igt_nsec_elapsed(&start);
> +
> +       /*
> +        * Toggle online status of all the CPUs in a child process and ensure
> +        * this has not affected busyness stats in the parent.
> +        */
> +       igt_fork(child, 1) {
> +               int cpu = 0;
> +
> +               for (;;) {
> +                       char name[128];
> +                       int cpufd;
> +
> +                       sprintf(name, "/sys/devices/system/cpu/cpu%d/online",
> +                               cpu);
> +                       cpufd = open(name, O_WRONLY);
> +                       if (cpufd == -1) {
> +                               igt_assert(cpu > 0);
> +                               break;
> +                       }
> +                       igt_assert_eq(write(cpufd, "0", 2), 2);
> +
> +                       usleep(1000 * 1000);
> +
> +                       igt_assert_eq(write(cpufd, "1", 2), 2);
> +
> +                       close(cpufd);
> +                       cpu++;
> +               }
> +       }
> +
> +       igt_waitchildren();
> +
> +       igt_spin_batch_end(spin);
> +       gem_sync(gem_fd, spin->handle);
> +
> +       ref = igt_nsec_elapsed(&start);
> +       val = pmu_read_single(fd);
> +
> +       assert_within_epsilon(val, ref, tolerance);
> +
> +       igt_spin_batch_free(gem_fd, spin);
> +       close(fd);
> +}
> +
> +static int chain_nop(int gem_fd, int in_fence, bool sync)
> +{
> +       struct drm_i915_gem_exec_object2 obj = {};
> +       struct drm_i915_gem_execbuffer2 eb =
> +               { .buffer_count = 1, .buffers_ptr = (uintptr_t)&obj};
> +       const uint32_t bbe = 0xa << 23;
> +
> +       obj.handle = gem_create(gem_fd, sizeof(bbe));
> +       gem_write(gem_fd, obj.handle, 0, &bbe, sizeof(bbe));
> +
> +       eb.flags = I915_EXEC_RENDER | I915_EXEC_FENCE_OUT;
> +
> +       if (in_fence >= 0) {
> +               eb.flags |= I915_EXEC_FENCE_IN;
> +               eb.rsvd2 = in_fence;
> +       }
> +
> +       gem_execbuf_wr(gem_fd, &eb);
> +
> +       if (sync)
> +               gem_sync(gem_fd, obj.handle);
> +
> +       gem_close(gem_fd, obj.handle);
> +       if (in_fence >= 0)
> +               close(in_fence);
> +
> +       return eb.rsvd2 >> 32;
> +}
> +
> +static void
> +test_interrupts(int gem_fd)
> +{
> +       uint64_t idle, busy, prev;
> +       int fd, fence = -1;
> +       const unsigned int count = 1000;
> +       unsigned int i;
> +
> +       fd = open_pmu(I915_PMU_INTERRUPTS);
> +
> +       gem_quiescent_gpu(gem_fd);
> +
> +       /* Wait for idle state. */
> +       prev = pmu_read_single(fd);
> +       idle = prev + 1;
> +       while (idle != prev) {
> +               usleep(batch_duration_ns / 1000);
> +               prev = idle;
> +               idle = pmu_read_single(fd);
> +       }
> +
> +       igt_assert_eq(idle - prev, 0);
> +
> +       /* Send some no-op batches with chained fences to ensure interrupts. */

Not all kernels would emit interrupts for this. Remember we used to spin
on the in-fence to avoid the interrupt setup overhead. And there's no
reason to assume we wouldn't again if there was a demonstrable
advantage.

You've assumed execlists elsewhere, so why not generate context-switch
interrupts instead? I presume you did try MI_USER_INTERRUPT from a user
batch? iirc it should still work.
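
e.g. (untested sketch; MI_USER_INTERRUPT is 0x2 << 23, and whether the
command parser lets it through from a user batch on every platform is an
assumption to verify):

	const uint32_t batch[] = {
		0x2 << 23,	/* MI_USER_INTERRUPT */
		0xa << 23,	/* MI_BATCH_BUFFER_END, as per bbe above */
	};
	uint32_t handle = gem_create(gem_fd, 4096);

	gem_write(gem_fd, handle, 0, batch, sizeof(batch));

and execute that instead of the plain nop in chain_nop().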

> +       for (i = 1; i <= count; i++)
> +               fence = chain_nop(gem_fd, fence, i < count ? false : true);
> +       close(fence);
> +
> +       /* Check at least as many interrupts have been generated. */
> +       busy = pmu_read_single(fd);
> +       igt_assert(busy > count / 20);
> +
> +       close(fd);
> +}
> +
> +static void
> +test_frequency(int gem_fd)
> +{
> +       igt_spin_t *spin;
> +       uint64_t idle[2], busy[2];
> +       int fd;
> +
> +       fd = open_group(I915_PMU_REQUESTED_FREQUENCY, -1);
> +       open_group(I915_PMU_ACTUAL_FREQUENCY, fd);
> +
> +       gem_quiescent_gpu(gem_fd);
> +       usleep(batch_duration_ns / 1000);
> +
> +       pmu_read_multi(fd, 2, idle);
> +
> +       /*
> +        * Put some load on the render engine and check that the requested
> +        * and actual frequencies have gone up.
> +        *
> +        * FIXME This might not be guaranteed to work on all platforms.
> +        * How to detect those?
> +        */

I don't think we can make any guarantees about freq, except the
soft limits imposed via sysfs.

So use that.

Create a spin batch, set min/max to low. Measure. Set min/max to high, sample
again. The results should follow the values set via sysfs.
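
Sketch (assumes card0; a real test would resolve the right sysfs node for
gem_fd):

static void set_freq_mhz(const char *node, int mhz)
{
	char path[128], buf[16];
	int fd, len;

	snprintf(path, sizeof(path), "/sys/class/drm/card0/%s", node);
	fd = open(path, O_WRONLY);
	igt_assert(fd >= 0);

	len = snprintf(buf, sizeof(buf), "%d", mhz);
	igt_assert_eq(write(fd, buf, len), len);
	close(fd);
}

i.e. clamp gt_min_freq_mhz/gt_max_freq_mhz low, sample under the spinner,
then clamp high and sample again, checking the two readings track the sysfs
limits (and restoring the original values afterwards).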

> +       spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
> +       igt_spin_batch_set_timeout(spin, batch_duration_ns);
> +       gem_sync(gem_fd, spin->handle);
> +
> +       pmu_read_multi(fd, 2, busy);
> +
> +       igt_assert(busy[0] > idle[0]);
> +       igt_assert(busy[1] > idle[1]);
> +
> +       igt_spin_batch_free(gem_fd, spin);
> +       close(fd);
> +}
> +
> +static void
> +test_rc6(int gem_fd)
> +{
> +       int64_t duration_ns = 2 * 1000 * 1000 * 1000;
> +       uint64_t idle, busy, prev;
> +       int fd, fw;
> +
> +       fd = open_pmu(I915_PMU_RC6_RESIDENCY);
> +
> +       gem_quiescent_gpu(gem_fd);
> +
> +       /* Go idle and check full RC6. */
> +       prev = pmu_read_single(fd);
> +       guaranteed_usleep(duration_ns / 1000);
> +       idle = pmu_read_single(fd);
> +
> +       assert_within_epsilon(idle - prev, duration_ns, tolerance);
> +
> +       /* Wake up device and check no RC6. */
> +       fw = igt_open_forcewake_handle(gem_fd);
> +       igt_assert(fw >= 0);
> +
> +       prev = pmu_read_single(fd);
> +       guaranteed_usleep(duration_ns / 1000);
> +       busy = pmu_read_single(fd);
> +
> +       assert_within_epsilon(busy - prev, 0.0, tolerance);
> +
> +       close(fw);
> +       close(fd);

Ok (other than the guaranteed vs known sleep).

> +}
> +
> +static void
> +test_rc6p(int gem_fd)
> +{
> +       int64_t duration_ns = 2 * 1000 * 1000 * 1000;
> +       unsigned int num_pmu = 1;
> +       uint64_t idle[3], busy[3], prev[3];
> +       unsigned int i;
> +       int fd, ret, fw;
> +
> +       fd = open_group(I915_PMU_RC6_RESIDENCY, -1);
> +       ret = perf_i915_open_group(I915_PMU_RC6p_RESIDENCY, fd);
> +       if (ret > 0) {
> +               num_pmu++;
> +               ret = perf_i915_open_group(I915_PMU_RC6pp_RESIDENCY, fd);
> +               if (ret > 0)
> +                       num_pmu++;
> +       }
> +
> +       igt_require(num_pmu == 3);

Do we expose all RC6 counters, even on hw that doesn't support them all?
We pretty much only expect to have an rc6 counter (from snb+).

> +
> +       gem_quiescent_gpu(gem_fd);
> +
> +       /* Go idle and check full RC6. */
> +       pmu_read_multi(fd, num_pmu, prev);
> +       guaranteed_usleep(duration_ns / 1000);
> +       pmu_read_multi(fd, num_pmu, idle);
> +
> +       for (i = 0; i < num_pmu; i++)
> +               assert_within_epsilon(idle[i] - prev[i], duration_ns,
> +                                     tolerance);
> +
> +       /* Wake up device and check no RC6. */
> +       fw = igt_open_forcewake_handle(gem_fd);
> +       igt_assert(fw >= 0);
> +
> +       pmu_read_multi(fd, num_pmu, prev);
> +       guaranteed_usleep(duration_ns / 1000);
> +       pmu_read_multi(fd, num_pmu, busy);
> +
> +       for (i = 0; i < num_pmu; i++)
> +               assert_within_epsilon(busy[i] - prev[i], 0.0, tolerance);
> +
> +       close(fw);
> +       close(fd);
> +}
> +
> +igt_main
> +{
> +       const unsigned int num_other_metrics =
> +                               I915_PMU_LAST - __I915_PMU_OTHER(0) + 1;
> +       unsigned int num_engines = 0;
> +       int fd = -1;
> +       const struct intel_execution_engine2 *e;
> +       unsigned int i;
> +
> +       igt_fixture {
> +               fd = drm_open_driver_master(DRIVER_INTEL);

master? For what? I didn't see any display interactions, or secure
dispatch. Does this mean that we only have perf counters for the master?
Shouldn't we also be checking that we can capture work submitted via render
nodes as well as via an authed fd?
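
Something like this (sketch, reusing the helpers from the test) would cover
submissions from an unprivileged render node:

static void render_node_busy(const struct intel_execution_engine2 *e)
{
	int render_fd = drm_open_driver_render(DRIVER_INTEL);
	int fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
	igt_spin_t *spin;

	spin = igt_spin_batch_new(render_fd, 0, e2ring(render_fd, e), 0);
	igt_spin_batch_set_timeout(spin, batch_duration_ns);
	gem_sync(render_fd, spin->handle);

	assert_within_epsilon(pmu_read_single(fd), batch_duration_ns, tolerance);

	igt_spin_batch_free(render_fd, spin);
	close(fd);
	close(render_fd);
}
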
-Chris

