[igt-dev] [Intel-gfx] [PATCH i-g-t] tests/perf_pmu: Verify engine busyness accuracy
Chris Wilson
chris at chris-wilson.co.uk
Thu Feb 15 12:43:15 UTC 2018
Quoting Tvrtko Ursulin (2018-02-15 11:53:44)
> From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>
> A subtest to verify that the engine busyness is reported with expected
> accuracy on platforms where the feature is available.
>
> We test three patterns: 2%, 50% and 98% load per engine.
>
> v2:
> * Use spin batch instead of nop calibration.
> * Various tweaks.
>
> v3:
> * Change loops to be time based.
> * Use __igt_spin_batch_new inside timing sensitive loops.
> * Fixed PWM sleep handling.
>
> v4:
> * Use restarting spin batch.
> * Calibrate more carefully by looking at the real PWM loop.
>
> v5:
> * Made standalone.
> * Better info messages.
> * Tweak sleep compensation.
>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
> tests/perf_pmu.c | 192 +++++++++++++++++++++++++++++++++++++++++++++++++------
> 1 file changed, 174 insertions(+), 18 deletions(-)
>
> diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
> index a7501ca5f7a4..fa9b54793a52 100644
> --- a/tests/perf_pmu.c
> +++ b/tests/perf_pmu.c
> @@ -35,6 +35,7 @@
> #include <dirent.h>
> #include <time.h>
> #include <poll.h>
> +#include <sched.h>
>
> #include "igt.h"
> #include "igt_core.h"
> @@ -385,6 +386,22 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
> gem_quiescent_gpu(gem_fd);
> }
>
> +static void
> +__submit_spin_batch(int gem_fd, igt_spin_t *spin,
> + const struct intel_execution_engine2 *e)
> +{
> + struct drm_i915_gem_exec_object2 obj = {
> + .handle = spin->handle
> + };
> + struct drm_i915_gem_execbuffer2 eb = {
> + .buffer_count = 1,
> + .buffers_ptr = to_user_pointer(&obj),
> + .flags = e2ring(gem_fd, e),
> + };
> +
> + gem_execbuf(gem_fd, &eb);
> +}
> +
> static void
> most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
> const unsigned int num_engines, unsigned int flags)
> @@ -405,15 +422,7 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e,
> if (e == e_) {
> idle_idx = i;
> } else if (spin) {
> - struct drm_i915_gem_exec_object2 obj = {
> - .handle = spin->handle
> - };
> - struct drm_i915_gem_execbuffer2 eb = {
> - .buffer_count = 1,
> - .buffers_ptr = to_user_pointer(&obj),
> - .flags = e2ring(gem_fd, e_),
> - };
> - gem_execbuf(gem_fd, &eb);
> + __submit_spin_batch(gem_fd, spin, e_);
> } else {
> spin = igt_spin_batch_new(gem_fd, 0,
> e2ring(gem_fd, e_), 0);
> @@ -469,15 +478,7 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines,
> continue;
>
> if (spin) {
> - struct drm_i915_gem_exec_object2 obj = {
> - .handle = spin->handle
> - };
> - struct drm_i915_gem_execbuffer2 eb = {
> - .buffer_count = 1,
> - .buffers_ptr = to_user_pointer(&obj),
> - .flags = e2ring(gem_fd, e),
> - };
> - gem_execbuf(gem_fd, &eb);
> + __submit_spin_batch(gem_fd, spin, e);
> } else {
> spin = igt_spin_batch_new(gem_fd, 0,
> e2ring(gem_fd, e), 0);
> @@ -1390,6 +1391,150 @@ test_enable_race(int gem_fd, const struct intel_execution_engine2 *e)
> gem_quiescent_gpu(gem_fd);
> }
>
> +static double __error(double val, double ref)
> +{
> + igt_assert(ref != 0.0);
igt_assert(ref > 1e-5 /* smallval */) ?
Pretty sure a negative ref is also cause for confusion :)
> + return (100.0 * val / ref) - 100.0;
> +}
> +
> +static void __rearm_spin_batch(igt_spin_t *spin)
> +{
> + const uint32_t mi_arb_chk = 0x5 << 23;
> +
> + *spin->batch = mi_arb_chk;
> + __sync_synchronize();
> +}
> +
> +#define div_round_up(a, b) (((a) + (b) - 1) / (b))
> +
> +static void
> +accuracy(int gem_fd, const struct intel_execution_engine2 *e,
> + unsigned long target_busy_pct)
> +{
> + const unsigned int min_test_loops = 7;
> + const unsigned long min_test_us = 1e6;
> + unsigned long busy_us = 2500;
> + unsigned long idle_us = 100 * (busy_us - target_busy_pct *
> + busy_us / 100) / target_busy_pct;
> + unsigned long pwm_calibration_us;
> + unsigned long test_us;
> + double busy_r;
> + uint64_t val[2];
> + uint64_t ts[2];
> + int fd;
> +
> + /* Sampling platforms cannot reach the high accuracy criteria. */
> + igt_require(intel_gen(intel_get_drm_devid(gem_fd)) >= 8);
igt_require(gem_has_execlists(gem_fd));
> +
> + while (idle_us < 2500) {
> + busy_us *= 2;
> + idle_us *= 2;
> + }
> +
> + pwm_calibration_us = min_test_loops * (busy_us + idle_us);
> + while (pwm_calibration_us < min_test_us)
> + pwm_calibration_us += busy_us + idle_us;
> + test_us = min_test_loops * (idle_us + busy_us);
> + while (test_us < min_test_us)
> + test_us += busy_us + idle_us;
> +
> + igt_info("calibration=%luus, test=%luus; busy=%luus, idle=%luus\n",
> + pwm_calibration_us, test_us, busy_us, idle_us);
Would also be nice to show the adjusted %%.
> + assert_within_epsilon((double)busy_us / (busy_us + idle_us),
> + (double)target_busy_pct / 100.0, tolerance);
> +
> + /* Emit PWM pattern on the engine from a child. */
> + igt_fork(child, 1) {
> + struct sched_param rt = { .sched_priority = 99 };
> + const unsigned long timeout[] = { pwm_calibration_us * 1000,
> + test_us * 2 * 1000 };
> + unsigned long sleep_busy = busy_us;
> + unsigned long sleep_idle = idle_us;
> + igt_spin_t *spin;
> +
> + /* We need the best sleep accuracy we can get. */
> + igt_require(sched_setscheduler(0,
> + SCHED_FIFO | SCHED_RESET_ON_FORK,
> + &rt) == 0);
Can't use igt_require() or igt_assert() from children. So just igt_warn
if not applied.
Just SCHED_FIFO is enough as the child doesn't/shouldn't fork.
> +
> + /* Allocate our spin batch and idle it. */
> + spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
> + igt_spin_batch_end(spin);
> + gem_sync(gem_fd, spin->handle);
> +
> + /* 1st pass is calibration, second pass is the test. */
> + for (int pass = 0; pass < ARRAY_SIZE(timeout); pass++) {
> + unsigned long busy_ns = 0, idle_ns = 0;
> + struct timespec test_start = { };
> + unsigned long loops = 0;
> + double err_busy, err_idle;
> +
> + igt_nsec_elapsed(&test_start);
> + do {
> + struct timespec t_busy = { };
> +
> + igt_nsec_elapsed(&t_busy);
> +
> + /* Restart the spinbatch. */
> + __rearm_spin_batch(spin);
> + __submit_spin_batch(gem_fd, spin, e);
> + measured_usleep(sleep_busy);
> + igt_spin_batch_end(spin);
> + gem_sync(gem_fd, spin->handle);
> +
> + busy_ns += igt_nsec_elapsed(&t_busy);
> +
> + idle_ns += measured_usleep(sleep_idle);
> +
> + loops++;
> + } while (igt_nsec_elapsed(&test_start) < timeout[pass]);
> +
> + busy_ns = div_round_up(busy_ns, loops);
> + idle_ns = div_round_up(idle_ns, loops);
> +
> + err_busy = __error(busy_ns / 1000, busy_us);
> + err_idle = __error(idle_ns / 1000, idle_us);
> +
> + igt_info("%u: busy %lu/%lu %.2f%%, idle %lu/%lu %.2f%%\n",
> + pass,
> + busy_ns / 1000, busy_us, err_busy,
> + idle_ns / 1000, idle_us, err_idle);
Ok, makes sense.
> +
> + if (pass == 0) {
> + sleep_busy = (double)busy_us -
> + (double)busy_us * err_busy / 100.0;
> + sleep_idle = (double)idle_us -
> + (double)idle_us * err_idle / 100.0;
> + igt_info("calibrated sleeps: busy=%lu, idle=%lu\n",
> + sleep_busy, sleep_idle);
> + }
> + }
> +
> + igt_spin_batch_free(gem_fd, spin);
> + }
> +
> + /* Let the child run. */
> + usleep(pwm_calibration_us * 2);
> +
> + /* Collect engine busyness for an interesting part of child runtime. */
> + fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance));
> + val[0] = __pmu_read_single(fd, &ts[0]);
> + usleep(test_us / 2);
> + val[1] = __pmu_read_single(fd, &ts[1]);
> + close(fd);
> +
> + igt_waitchildren();
> +
> + busy_r = (double)(val[1] - val[0]) / (ts[1] - ts[0]);
> +
> + igt_info("error=%.2f%% (%.2f%% vs %lu%%)\n",
> + __error(busy_r, target_busy_pct / 100.0),
> + busy_r * 100.0, target_busy_pct);
> +
> + assert_within_epsilon(busy_r, (double)target_busy_pct / 100.0, 0.15);
> +}
A fine compromise! :)
Reviewed-by: Chris Wilson <chris at chris-wilson.co.uk>
-Chris
More information about the igt-dev
mailing list