[Intel-gfx] [igt-dev] [PATCH i-g-t] igt/gem_eio: Measure reset delay from thread

Fri Aug 3 14:03:05 UTC 2018

On 27/07/2018 12:13, Chris Wilson wrote:
> We assert that we complete a wedge within 250ms. However, when we use a
> thread to delay the wedging until after we start waiting, that thread
> itself is delayed longer than our wait timeout. This results in a false
> positive error where we fail the test before we even trigger the reset.
> 
> Reorder the test so that we only ever measure the delay from triggering
> the reset until we wakeup, and assert that is in a timely fashion
> (less than 250ms).
> 
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=105954
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
>   tests/gem_eio.c | 45 ++++++++++++++++++++++++---------------------
>   1 file changed, 24 insertions(+), 21 deletions(-)
> 
> diff --git a/tests/gem_eio.c b/tests/gem_eio.c
> index 3162a3170..b26b4b895 100644
> --- a/tests/gem_eio.c
> +++ b/tests/gem_eio.c
> @@ -190,7 +190,8 @@ static igt_spin_t * spin_sync(int fd, uint32_t ctx, unsigned long flags)
>   
>   struct hang_ctx {
>   	int debugfs;
> -	struct timespec ts;
> +	struct timespec delay;
> +	struct timespec *ts;
>   	timer_t timer;
>   };
>   
> @@ -198,8 +199,10 @@ static void hang_handler(union sigval arg)
>   {
>   	struct hang_ctx *ctx = arg.sival_ptr;
>   
> -	igt_debug("hang delay = %.2fus\n", igt_nsec_elapsed(&ctx->ts) / 1000.0);
> +	igt_debug("hang delay = %.2fus\n",
> +		  igt_nsec_elapsed(&ctx->delay) / 1000.0);
>   
> +	igt_nsec_elapsed(ctx->ts);
>   	igt_assert(igt_sysfs_set(ctx->debugfs, "i915_wedged", "-1"));
>   
>   	igt_assert_eq(timer_delete(ctx->timer), 0);
> @@ -207,7 +210,7 @@ static void hang_handler(union sigval arg)
>   	free(ctx);
>   }
>   
> -static void hang_after(int fd, unsigned int us)
> +static void hang_after(int fd, unsigned int us, struct timespec *ts)
>   {
>   	struct sigevent sev = {
>   		.sigev_notify = SIGEV_THREAD,
> @@ -229,30 +232,30 @@ static void hang_after(int fd, unsigned int us)
>   
>   	igt_assert_eq(timer_create(CLOCK_MONOTONIC, &sev, &ctx->timer), 0);
>   
> -	igt_nsec_elapsed(&ctx->ts);
> +	ctx->ts = ts;
> +	igt_nsec_elapsed(&ctx->delay);
>   
>   	igt_assert_eq(timer_settime(ctx->timer, 0, &its, NULL), 0);
>   }
>   
> -static int __check_wait(int fd, uint32_t bo, unsigned int wait)
> +static void __check_wait(int fd, uint32_t bo, unsigned int wait)
>   {
> -	unsigned long wait_timeout = 250e6; /* Some margin for actual reset. */
> -	int ret;
> +	struct timespec ts = {};
> +	uint64_t elapsed;
>   
>   	if (wait) {
> -		/*
> -		 * Double the wait plus some fixed margin to ensure gem_wait
> -		 * can never time out before the async hang runs.
> -		 */
> -		wait_timeout += wait * 2000 + 250e6;
> -		hang_after(fd, wait);
> +		hang_after(fd, wait, &ts);
>   	} else {
> +		igt_nsec_elapsed(&ts);
>   		manual_hang(fd);
>   	}
>   
> -	ret = __gem_wait(fd, bo, wait_timeout);
> +	gem_sync(fd, bo);

One drawback I can see is if reset fails to work then we hang until IGT 
runner timeout. Alternative would be a smaller patch to bump the wait 
delay which should work given how rare are the failures. It was one of 
the goals of the gem_eio rewrite I think, but whatever, I suppose 
runtime of test when reset is broken is not that important.

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>

Regards,

Tvrtko

>   
> -	return ret;
> +	elapsed = igt_nsec_elapsed(&ts);
> +	igt_assert_f(elapsed < 250e6,
> +		     "Wake up following reset+wedge took %.3fms\n",
> +		     elapsed*1e-6);
>   }
>   
>   static void __test_banned(int fd)
> @@ -322,7 +325,7 @@ static void test_wait(int fd, unsigned int flags, unsigned int wait)
>   
>   	hang = spin_sync(fd, 0, I915_EXEC_DEFAULT);
>   
> -	igt_assert_eq(__check_wait(fd, hang->handle, wait), 0);
> +	__check_wait(fd, hang->handle, wait);
>   
>   	igt_spin_batch_free(fd, hang);
>   
> @@ -392,7 +395,7 @@ static void test_inflight(int fd, unsigned int wait)
>   			igt_assert(fence[n] != -1);
>   		}
>   
> -		igt_assert_eq(__check_wait(fd, obj[1].handle, wait), 0);
> +		__check_wait(fd, obj[1].handle, wait);
>   
>   		for (unsigned int n = 0; n < ARRAY_SIZE(fence); n++) {
>   			igt_assert_eq(sync_fence_status(fence[n]), -EIO);
> @@ -443,7 +446,7 @@ static void test_inflight_suspend(int fd)
>   	igt_set_autoresume_delay(30);
>   	igt_system_suspend_autoresume(SUSPEND_STATE_MEM, SUSPEND_TEST_NONE);
>   
> -	igt_assert_eq(__check_wait(fd, obj[1].handle, 10), 0);
> +	__check_wait(fd, obj[1].handle, 10);
>   
>   	for (unsigned int n = 0; n < ARRAY_SIZE(fence); n++) {
>   		igt_assert_eq(sync_fence_status(fence[n]), -EIO);
> @@ -521,7 +524,7 @@ static void test_inflight_contexts(int fd, unsigned int wait)
>   			igt_assert(fence[n] != -1);
>   		}
>   
> -		igt_assert_eq(__check_wait(fd, obj[1].handle, wait), 0);
> +		__check_wait(fd, obj[1].handle, wait);
>   
>   		for (unsigned int n = 0; n < ARRAY_SIZE(fence); n++) {
>   			igt_assert_eq(sync_fence_status(fence[n]), -EIO);
> @@ -630,7 +633,7 @@ static void test_inflight_internal(int fd, unsigned int wait)
>   		nfence++;
>   	}
>   
> -	igt_assert_eq(__check_wait(fd, obj[1].handle, wait), 0);
> +	__check_wait(fd, obj[1].handle, wait);
>   
>   	while (nfence--) {
>   		igt_assert_eq(sync_fence_status(fences[nfence]), -EIO);
> @@ -682,7 +685,7 @@ static void test_reset_stress(int fd, unsigned int flags)
>   			gem_execbuf(fd, &execbuf);
>   
>   		/* Wedge after a small delay. */
> -		igt_assert_eq(__check_wait(fd, obj.handle, 100e3), 0);
> +		__check_wait(fd, obj.handle, 100e3);
>   
>   		/* Unwedge by forcing a reset. */
>   		igt_assert(i915_reset_control(true));
>