[Intel-gfx] [PATCH i-g-t v2] tests/perf_pmu: Avoid RT thread for accuracy test

Tue Apr 3 13:10:49 UTC 2018

Quoting Tvrtko Ursulin (2018-04-03 13:38:25)
> From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> 
> Realtime scheduling interferes with execlists submission (tasklet) so try
> to simplify the PWM loop in a few ways:
> 
>  * Drop RT.
>  * Longer batches for smaller systematic error.
>  * More truthful test duration calculation.
>  * Less clock queries.
>  * No self-adjust - instead just report the achieved cycle and let the
>    parent check against it.
>  * Report absolute cycle error.
> 
> v2:
>  * Bring back self-adjust. (Chris Wilson)
>    (But slightly fixed version with no overflow.)
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
>  tests/perf_pmu.c | 97 +++++++++++++++++++++++++-------------------------------
>  1 file changed, 43 insertions(+), 54 deletions(-)
> 
> diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
> index f27b7ec7d2c2..0cfacd4a8fbe 100644
> --- a/tests/perf_pmu.c
> +++ b/tests/perf_pmu.c
> @@ -1504,12 +1504,6 @@ test_enable_race(int gem_fd, const struct intel_execution_engine2 *e)
>         gem_quiescent_gpu(gem_fd);
>  }
>  
> -static double __error(double val, double ref)
> -{
> -       igt_assert(ref > 1e-5 /* smallval */);
> -       return (100.0 * val / ref) - 100.0;
> -}
> -
>  static void __rearm_spin_batch(igt_spin_t *spin)
>  {
>         const uint32_t mi_arb_chk = 0x5 << 23;
> @@ -1532,13 +1526,12 @@ static void
>  accuracy(int gem_fd, const struct intel_execution_engine2 *e,
>          unsigned long target_busy_pct)
>  {
> -       const unsigned int min_test_loops = 7;
> -       const unsigned long min_test_us = 1e6;
> -       unsigned long busy_us = 2500;
> +       unsigned long busy_us = 10000 - 100 * (1 + abs(50 - target_busy_pct));
>         unsigned long idle_us = 100 * (busy_us - target_busy_pct *
>                                 busy_us / 100) / target_busy_pct;
> -       unsigned long pwm_calibration_us;
> -       unsigned long test_us;
> +       const unsigned long min_test_us = 1e6;
> +       const unsigned long pwm_calibration_us = min_test_us;
> +       const unsigned long test_us = min_test_us;
>         double busy_r, expected;
>         uint64_t val[2];
>         uint64_t ts[2];
> @@ -1553,13 +1546,6 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
>                 idle_us *= 2;
>         }
>  
> -       pwm_calibration_us = min_test_loops * (busy_us + idle_us);
> -       while (pwm_calibration_us < min_test_us)
> -               pwm_calibration_us += busy_us + idle_us;
> -       test_us = min_test_loops * (idle_us + busy_us);
> -       while (test_us < min_test_us)
> -               test_us += busy_us + idle_us;
> -
>         igt_info("calibration=%lums, test=%lums; ratio=%.2f%% (%luus/%luus)\n",
>                  pwm_calibration_us / 1000, test_us / 1000,
>                  (double)busy_us / (busy_us + idle_us) * 100.0,
> @@ -1572,20 +1558,11 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
>  
>         /* Emit PWM pattern on the engine from a child. */
>         igt_fork(child, 1) {
> -               struct sched_param rt = { .sched_priority = 99 };
>                 const unsigned long timeout[] = {
>                         pwm_calibration_us * 1000, test_us * 1000
>                 };
> -               uint64_t total_busy_ns = 0, total_idle_ns = 0;
> +               uint64_t total_busy_ns = 0, total_ns = 0;
>                 igt_spin_t *spin;
> -               int ret;
> -
> -               /* We need the best sleep accuracy we can get. */
> -               ret = sched_setscheduler(0,
> -                                        SCHED_FIFO | SCHED_RESET_ON_FORK,
> -                                        &rt);
> -               if (ret)
> -                       igt_warn("Failed to set scheduling policy!\n");
>  
>                 /* Allocate our spin batch and idle it. */
>                 spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0);
> @@ -1594,39 +1571,51 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
>  
>                 /* 1st pass is calibration, second pass is the test. */
>                 for (int pass = 0; pass < ARRAY_SIZE(timeout); pass++) {
> -                       uint64_t busy_ns = -total_busy_ns;
> -                       uint64_t idle_ns = -total_idle_ns;
> -                       struct timespec test_start = { };
> +                       unsigned int target_idle_us = idle_us;
> +                       uint64_t busy_ns = 0, idle_ns = 0;
> +                       struct timespec start = { };
> +                       unsigned long pass_ns = 0;
> +
> +                       igt_nsec_elapsed(&start);
>  
> -                       igt_nsec_elapsed(&test_start);
>                         do {
> -                               unsigned int target_idle_us, t_busy;
> +                               unsigned long loop_ns, loop_busy;
> +                               struct timespec _ts = { };
> +                               double err;
> +
> +                               /* PWM idle sleep. */
> +                               _ts.tv_nsec = target_idle_us * 1000;
> +                               nanosleep(&_ts, NULL);
>  
>                                 /* Restart the spinbatch. */
>                                 __rearm_spin_batch(spin);
>                                 __submit_spin_batch(gem_fd, spin, e, 0);
>  
> -                               /*
> -                                * Note that the submission may be delayed to a
> -                                * tasklet (ksoftirqd) which cannot run until we
> -                                * sleep as we hog the cpu (we are RT).
> -                                */
> -
> -                               t_busy = measured_usleep(busy_us);
> +                               /* PWM busy sleep. */
> +                               loop_busy = igt_nsec_elapsed(&start);
> +                               _ts.tv_nsec = busy_us * 1000;
> +                               nanosleep(&_ts, NULL);
>                                 igt_spin_batch_end(spin);
> -                               gem_sync(gem_fd, spin->handle);
> -
> -                               total_busy_ns += t_busy;
> -
> -                               target_idle_us =
> -                                       (100 * total_busy_ns / target_busy_pct - (total_busy_ns + total_idle_ns)) / 1000;
> -                               total_idle_ns += measured_usleep(target_idle_us);
> -                       } while (igt_nsec_elapsed(&test_start) < timeout[pass]);
> -
> -                       busy_ns += total_busy_ns;
> -                       idle_ns += total_idle_ns;
>  
> -                       expected = (double)busy_ns / (busy_ns + idle_ns);
> +                               /* Time accounting. */
> +                               loop_ns = igt_nsec_elapsed(&start);
> +                               loop_busy = loop_ns - loop_busy;
> +                               loop_ns -= pass_ns;
> +
> +                               busy_ns += loop_busy;
> +                               total_busy_ns += loop_busy;
> +                               idle_ns += loop_ns - loop_busy;
> +                               pass_ns += loop_ns;
> +                               total_ns += loop_ns;
> +
> +                               /* Re-calibrate. */
> +                               err = (double)total_busy_ns / total_ns -
> +                                     (double)target_busy_pct / 100.0;
> +                               target_idle_us = (double)target_idle_us *
> +                                                (1.0 + err);

Previously, the question we answered was: how long should I sleep for
the busy:idle ratio to hit the target?

expected_total_ns = 100.0 * total_busy_ns / target_busy_pct;
target_idle_us = (expected_total_ns - current_total_ns) / 1000;

	unsigned long loop_ns, loop_busy;
	struct timespec _ts = { };
	double err;

	/* PWM idle sleep. */
	_ts.tv_nsec = target_idle_us * 1000;
	nanosleep(&_ts, NULL);

Assuming no >1s sleeps (only tv_nsec is set, so it must stay below 1e9).
(Ok, so the sleep after recalc is still here.)

	/* Restart the spinbatch. */
	__rearm_spin_batch(spin);
	__submit_spin_batch(gem_fd, spin, e, 0);

	/* PWM busy sleep. */
	loop_busy = igt_nsec_elapsed(&start);
	_ts.tv_nsec = busy_us * 1000;
	nanosleep(&_ts, NULL);
	igt_spin_batch_end(spin);

	/* Time accounting. */
	loop_ns = igt_nsec_elapsed(&start);
	loop_busy = loop_ns - loop_busy;
	loop_ns -= pass_ns;

So pass_ns is time from start of calibration, loop_ns is time for this
loop.

	busy_ns += loop_busy;
	total_busy_ns += loop_busy;

So busy_ns will be for the calibration pass, and total_busy_ns for all
passes?

	idle_ns += loop_ns - loop_busy;

And idle is the residual between the time up to this point, and what has
been busy.

	pass_ns += loop_ns;
	total_ns += loop_ns;

	/* Re-calibrate. */
	err = (double)total_busy_ns / total_ns -
	      (double)target_busy_pct / 100.0;

Hmm, I thought you didn't like the run-on calculations, and wanted to
reset between passes? (Have I got total_busy_ns and busy_ns confused?)

	target_idle_us = (double)target_idle_us * (1.0 + err);

Ok, I'm tired, but... So, if busy is 10% larger than expected, sleep 10%
longer to try and compensate, would be the gist.

And this is because you always sleep and spin together and so cannot
just sleep to compensate for the earlier inaccuracy. Which means we
never truly try to correct the error in the same pass, but apply a
correction factor for the next.

To me it seems like the closed system, with each loop being "spin then
adjusted sleep", will autocorrect and is more likely to finish correct
(as we are less reliant on the next loop for the accuracy). It's pretty
much immaterial, as we expect the pmu to match the measurements (and not
our expectations), but I find the "one pass does all" approach much
simpler to follow.
-Chris