[igt-dev] [RFC i-g-t] tests/perf_pmu: Verify engine busyness accuracy

Tue Jan 30 21:38:34 UTC 2018

Quoting Tvrtko Ursulin (2018-01-30 18:11:04)
> From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> 
> A subtest to verify that the engine busyness is reported with expected
> accuracy on platforms where the feature is available.
> 
> We test three patterns: 2%, 50% and 98% load per engine.
> 
> v2:
>  * Use spin batch instead of nop calibration.
>  * Various tweaks.
> 
> v3:
>  * Change loops to be time based.
>  * Use __igt_spin_batch_new inside timing sensitive loops.
>  * Fixed PWM sleep handling.
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
>  tests/perf_pmu.c | 145 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 145 insertions(+)
> 
> diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
> index 2f7d33414a53..c6083d7102bd 100644
> --- a/tests/perf_pmu.c
> +++ b/tests/perf_pmu.c
> @@ -35,6 +35,7 @@
>  #include <dirent.h>
>  #include <time.h>
>  #include <poll.h>
> +#include <sched.h>
>  
>  #include "igt.h"
>  #include "igt_core.h"
> @@ -1169,6 +1170,140 @@ test_rc6(int gem_fd)
>         assert_within_epsilon(busy - prev, 0.0, tolerance);
>  }
>  
> +static uint64_t __pmu_read_single(int fd, uint64_t *ts)
> +{
> +       uint64_t data[2];
> +
> +       igt_assert_eq(read(fd, data, sizeof(data)), sizeof(data));
> +
> +       *ts = data[1];
> +
> +       return data[0];
> +}
> +
> +static double __error(double val, double ref)
> +{
> +       return (100.0 * val / ref) - 100.0;
> +}
> +
> +static void debug_error(const char *str, double val, double ref)
> +{
> +       igt_debug("%s=%.2f%% (%.2f/%.2f)\n", str, __error(val, ref), val, ref);
> +}
> +
> +static void log_error(const char *str, double val, double ref)
> +{
> +       debug_error(str, val, ref);
> +       igt_info("%s=%.2f%%\n", str, __error(val, ref));

Not convinced you want both log levels, is the raw value in () too
confusing to show by default?

> +}
> +
> +#define div_round_up(a, b) (((a) + (b) - 1) / (b))
> +
> +static void
> +accuracy(int gem_fd, const struct intel_execution_engine2 *e,
> +        unsigned long target_busy_pct)
> +{
> +       const unsigned int test_us = 1e6;
> +       const unsigned int pwm_calibration_us = 500e3;
> +       unsigned long busy_us = 2500;
> +       unsigned long idle_us = 100 * (busy_us - target_busy_pct *
> +                               busy_us / 100) / target_busy_pct;
> +       double busy_r;
> +       uint64_t val[2];
> +       uint64_t ts[2];
> +       int fd;
> +
> +       /* Sampling platforms cannot reach the high accuracy criteria. */
> +       igt_require(intel_gen(intel_get_drm_devid(gem_fd)) >= 8);
> +
> +       while (idle_us < 2500) {
> +               busy_us *= 2;
> +               idle_us *= 2;
> +       }
> +
> +       assert_within_epsilon((double)busy_us / (busy_us + idle_us),
> +                               (double)target_busy_pct / 100.0, tolerance);
> +
> +       /* Emit PWM pattern on the engine from a child. */
> +       igt_fork(child, 1) {
> +               struct sched_param rt = { .sched_priority = 99 };
> +               struct timespec test_start = { };
> +               unsigned long overhead_ns = 0;
> +               unsigned long loops = 0;
> +
> +               /* We need the best sleep accuracy we can get. */
> +               igt_require(sched_setscheduler(0,
> +                                              SCHED_FIFO | SCHED_RESET_ON_FORK,
> +                                              &rt) == 0);

(RESET_ON_FORK may be overkill here ;)

> +
> +               /* Measure setup overhead. */
> +               igt_nsec_elapsed(&test_start);
> +               do {
> +                       struct timespec start = { };
> +                       igt_spin_t *spin;
> +                       unsigned int ns;
> +
> +                       igt_nsec_elapsed(&start);
> +                       spin = __igt_spin_batch_new(gem_fd, 0,
> +                                                   e2ring(gem_fd, e), 0);
> +                       igt_spin_batch_end(spin);
> +                       ns = igt_nsec_elapsed(&start);
> +                       gem_sync(gem_fd, spin->handle);
> +                       igt_spin_batch_free(gem_fd, spin);
> +                       overhead_ns += ns;
> +                       loops++;
> +                       usleep(1000);

Hmm, first request is for whitespace! If we require consistent setup
overhead, I think you will want to keep the single spinner around and
re-execbuf it. (Restoring the first dword back to MI_NOOP/MI_ARB before
use.) That way we won't keep creating new bo with the irregularities
that may cause.

> +               } while (igt_nsec_elapsed(&test_start) <
> +                        pwm_calibration_us * 1000);
> +
> +               overhead_ns = div_round_up(overhead_ns, loops);
> +               igt_debug("spin setup overhead = %luus\n", overhead_ns / 1000);
> +               igt_assert(overhead_ns < busy_us * 1000);
> +
> +               /* Emit PWM busy signal. */
> +               busy_us -= overhead_ns / 1000;
> +               memset(&test_start, 0, sizeof(test_start));
> +               igt_nsec_elapsed(&test_start);
> +               do {
> +                       struct timespec start = { };
> +                       igt_spin_t *spin;
> +                       unsigned int ns, sleep;
> +
> +                       spin = __igt_spin_batch_new(gem_fd, 0,
> +                                                   e2ring(gem_fd, e), 0);
> +                       ns = measured_usleep(busy_us);
> +                       igt_spin_batch_end(spin);
> +                       debug_error("busy sleep error", ns, busy_us * 1000);
> +                       gem_sync(gem_fd, spin->handle);
> +                       igt_nsec_elapsed(&start);
> +                       igt_spin_batch_free(gem_fd, spin);
> +                       ns = igt_nsec_elapsed(&start);
> +                       igt_assert_lt(ns, idle_us * 1000);
> +                       sleep = idle_us * 1000 - ns;
> +                       ns = measured_usleep(sleep / 1000);
> +                       debug_error("idle sleep error", ns, sleep);

We definitely would benefit from whitespace here! There's at least two
different phases in each loop, and measurement on top.

> +               } while (igt_nsec_elapsed(&test_start) < test_us * 2 * 1000);
> +       }
> +
> +       /* Let the child run. */
> +       usleep(pwm_calibration_us * 2);
> +
> +       /* Collect engine busyness for a subset of child runtime. */
> +       fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
> +       val[0] = __pmu_read_single(fd, &ts[0]);
> +       usleep(test_us / 2);
> +       val[1] = __pmu_read_single(fd, &ts[1]);
> +       close(fd);

Includes its own timestamp, that should eradicate one whole class of
problem!

> +
> +       igt_waitchildren();
> +
> +       busy_r = (double)(val[1] - val[0]) / (ts[1] - ts[0]);
> +
> +       log_error("error", busy_r, target_busy_pct / 100.0);
> +
> +       assert_within_epsilon(busy_r, (double)target_busy_pct / 100.0, 0.15);
> +}
> +
>  igt_main
>  {
>         const unsigned int num_other_metrics =
> @@ -1197,6 +1332,8 @@ igt_main
>                 invalid_init();
>  
>         for_each_engine_class_instance(fd, e) {
> +               const unsigned int pct[] = { 2, 50, 98 };
> +
>                 /**
>                  * Test that a single engine metric can be initialized.
>                  */
> @@ -1277,6 +1414,14 @@ igt_main
>                  */
>                 igt_subtest_f("busy-double-start-%s", e->name)
>                         busy_double_start(fd, e);
> +
> +               /**
> +                * Check engine busyness accuracy is as expected.
> +                */
> +               for (i = 0; i < ARRAY_SIZE(pct); i++) {
> +                       igt_subtest_f("busy-accuracy-%u-%s", pct[i], e->name)
> +                               accuracy(fd, e, pct[i]);
> +               }

We cover accuracy of one engine, let's do them all concurrently as well!
That should stress our accuracy claims when running flat out? The spin
batch should come close to being the finest pwm userspace can use, so
should be a reasonable estimate of the worst-case loading across all
engines we can expect. (Although we might run into problems trying to
use more RT threads than cores.)
-Chris