[Intel-gfx] [igt-dev] [PATCH i-g-t 2/3] tests/gem_eio: Speed up test execution

Chris Wilson chris at chris-wilson.co.uk
Thu Mar 22 11:39:15 UTC 2018


Quoting Tvrtko Ursulin (2018-03-22 11:17:11)
> From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> 
> If we stop relying on regular GPU hangs to be detected, but trigger them
> manually as soon as we know our batch of interest is actually executing
> on the GPU, we can dramatically speed up various subtests.
> 
> This is enabled by the pollable spin batch added in the previous patch.
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> Suggested-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Antonio Argenziano <antonio.argenziano at intel.com>
> ---
> Note that the 'wait' subtest is mysteriously hanging for me in the no-op
> batch sent by gem_test_engines, but only on the RCS engine. TBD while I am
> getting some CI results.
> ---
>  lib.tar         | Bin 0 -> 102400 bytes
>  tests/gem_eio.c |  97 ++++++++++++++++++++++++++++++++++++++++----------------
>  2 files changed, 70 insertions(+), 27 deletions(-)
>  create mode 100644 lib.tar
> 
> diff --git a/lib.tar b/lib.tar
> new file mode 100644
> index 0000000000000000000000000000000000000000..ea04fad219a87f2e975852989526f8da4c9b7d6d
> GIT binary patch
> literal 102400
> zcmeHw>vkJQlBWMsPf=!{kwJ=gUAkMAJc3A2!kPrQASAWM<5LF&3M57#fW}3X+V;H9

Looks correct.

> diff --git a/tests/gem_eio.c b/tests/gem_eio.c
> index 4bcc5937db39..93400056124b 100644
> --- a/tests/gem_eio.c
> +++ b/tests/gem_eio.c
> @@ -71,26 +71,23 @@ static void trigger_reset(int fd)
>         gem_quiescent_gpu(fd);
>  }
>  
> -static void wedge_gpu(int fd)
> +static void manual_hang(int drm_fd)
>  {
> -       /* First idle the GPU then disable GPU resets before injecting a hang */
> -       gem_quiescent_gpu(fd);
> -
> -       igt_require(i915_reset_control(false));
> +       int dir = igt_debugfs_dir(drm_fd);
>  
> -       igt_debug("Wedging GPU by injecting hang\n");
> -       igt_post_hang_ring(fd, igt_hang_ring(fd, I915_EXEC_DEFAULT));
> +       igt_sysfs_set(dir, "i915_wedged", "-1");
>  
> -       igt_assert(i915_reset_control(true));
> +       close(dir);
>  }

Ok.

> -static void wedgeme(int drm_fd)
> +static void wedge_gpu(int fd)
>  {
> -       int dir = igt_debugfs_dir(drm_fd);
> -
> -       igt_sysfs_set(dir, "i915_wedged", "-1");
> +       /* First idle the GPU then disable GPU resets before injecting a hang */
> +       gem_quiescent_gpu(fd);
>  
> -       close(dir);
> +       igt_require(i915_reset_control(false));
> +       manual_hang(fd);
> +       igt_assert(i915_reset_control(true));
>  }

Ok.

>  
>  static int __gem_throttle(int fd)
> @@ -149,29 +146,66 @@ static int __gem_wait(int fd, uint32_t handle, int64_t timeout)
>         return err;
>  }
>  
> +static igt_spin_t * __spin_poll(int fd, uint32_t ctx, unsigned long flags)
> +{
> +       if (gem_can_store_dword(fd, flags))
> +               return __igt_spin_batch_new_poll(fd, ctx, flags);
> +       else
> +               return __igt_spin_batch_new(fd, ctx, flags, 0);
> +}
> +
> +static void __spin_wait(int fd, igt_spin_t *spin)
> +{
> +       if (spin->running) {
> +               while (!*((volatile bool *)spin->running))
> +                       ;
> +       } else {
> +               igt_debug("__spin_wait - usleep mode\n");
> +               usleep(500e3); /* Better than nothing! */
> +       }
> +}
> +
> +/*
> + * Wedge the GPU when we know our batch is running.
> + */
> +static void wedge_after_running(int fd, igt_spin_t *spin)
> +{
> +       __spin_wait(fd, spin);
> +       manual_hang(fd);
> +}
> +
>  static void test_wait(int fd)
>  {
> -       igt_hang_t hang;
> +       struct timespec ts = { };
> +       igt_spin_t *hang;
>  
>         igt_require_gem(fd);
>  
> +       igt_nsec_elapsed(&ts);
> +
>         /* If the request we wait on completes due to a hang (even for
>          * that request), the user expects the return value to 0 (success).
>          */
> -       hang = igt_hang_ring(fd, I915_EXEC_DEFAULT);
> -       igt_assert_eq(__gem_wait(fd, hang.handle, -1), 0);
> -       igt_post_hang_ring(fd, hang);
> +       igt_require(i915_reset_control(true));
> +       hang = __spin_poll(fd, 0, I915_EXEC_DEFAULT);
> +       wedge_after_running(fd, hang);
> +       igt_assert_eq(__gem_wait(fd, hang->handle, -1), 0);
> +       igt_spin_batch_free(fd, hang);

>  
>         /* If the GPU is wedged during the wait, again we expect the return
>          * value to be 0 (success).
>          */
>         igt_require(i915_reset_control(false));
> -       hang = igt_hang_ring(fd, I915_EXEC_DEFAULT);
> -       igt_assert_eq(__gem_wait(fd, hang.handle, -1), 0);
> -       igt_post_hang_ring(fd, hang);
> +       hang = __spin_poll(fd, 0, I915_EXEC_DEFAULT);
> +       wedge_after_running(fd, hang);
> +       igt_assert_eq(__gem_wait(fd, hang->handle, -1), 0);
> +       igt_spin_batch_free(fd, hang);
>         igt_require(i915_reset_control(true));

Hmm. These are not equivalent to the original test. The test requires
hangcheck to kick in while it is blocked on igt_wait. To do a fast
equivalent, we need to kick off a timer. (Here we are just asking
whether a wait on an already completed request returns without blocking,
not how we handle a reset in the middle of a wait. Seems a reasonable
addition, though.)

I think that's a general pattern worth repeating for the rest of the
tests: don't inject the hang immediately, but leave it a few
milliseconds so that we actually block on the subsequent wait. I would
even repeat the tests a few times with different timeouts: 0, 1us, 10ms
(thinking of the different phases of i915_request_wait).

>         trigger_reset(fd);
> +
> +       /* HACK for CI */
> +       igt_assert(igt_nsec_elapsed(&ts) < 5e9);

igt_seconds_elapsed(): the approximation is worth it for the readability.

In this case you might like to try igt_set_timeout(); I think the
per-subtest exit handlers are in place to make the tests robust against
premature failures.
-Chris

