[igt-dev] [RFC] tests/gem_watchdog: Initial set of tests for GPU watchdog
Antonio Argenziano
antonio.argenziano at intel.com
Mon Apr 15 21:50:50 UTC 2019
On 15/04/19 11:22, Carlos Santa wrote:
> This test adds basic set of tests to reset the different
> GPU engines through the watchdog timer.
>
> Credits to Antonio for the original codebase this is based on.
>
> Cc: Ursulin Tvrtko <tvrtko.ursulin at intel.com>
> Cc: Antonio Argenziano <antonio.argenziano at intel.com>
> Cc: Chris Wilson <chris at chris-wilson.co.uk
nit: missing '>' at the end of address ^
> Signed-off-by: Carlos Santa <carlos.santa at intel.com>
> ---
> tests/Makefile.sources | 3 +
> tests/i915/gem_watchdog.c | 439 ++++++++++++++++++++++++++++++++++++++++++++++
> tests/meson.build | 1 +
> 3 files changed, 443 insertions(+)
> create mode 100644 tests/i915/gem_watchdog.c
>
> diff --git a/tests/Makefile.sources b/tests/Makefile.sources
> index 214698d..7f17f20 100644
> --- a/tests/Makefile.sources
> +++ b/tests/Makefile.sources
> @@ -444,6 +444,9 @@ gem_userptr_blits_SOURCES = i915/gem_userptr_blits.c
> TESTS_progs += gem_wait
> gem_wait_SOURCES = i915/gem_wait.c
>
> +TESTS_progs += gem_watchdog
> +gem_watchdog_SOURCES = i915/gem_watchdog.c
> +
> TESTS_progs += gem_workarounds
> gem_workarounds_SOURCES = i915/gem_workarounds.c
>
> diff --git a/tests/i915/gem_watchdog.c b/tests/i915/gem_watchdog.c
> new file mode 100644
> index 0000000..65e2f5b
> --- /dev/null
> +++ b/tests/i915/gem_watchdog.c
> @@ -0,0 +1,439 @@
> +/*
> + * Copyright © 2016 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + */
> +#include "igt.h"
> +#include "igt_sysfs.h"
> +#include "sw_sync.h"
> +
> +#include <pthread.h>
> +#include <fcntl.h>
> +
> +#include <sys/ioctl.h>
> +#include <sys/poll.h>
> +#include <sys/signal.h>
> +#include "i915/gem_ring.h"
> +
> +#define MAX_PRIO LOCAL_I915_CONTEXT_MAX_USER_PRIORITY
> +#define MIN_PRIO LOCAL_I915_CONTEXT_MIN_USER_PRIORITY
> +#define HIGH 1
> +#define LOW 0
> +#define LOCAL_EXEC_FENCE_OUT (1 << 17)
> +#define WATCHDOG_THRESHOLD (100)
> +#define MAX_ENGINES 5
> +#define RENDER_CLASS 0
> +#define VIDEO_DECODE_CLASS 1
> +#define VIDEO_ENHANCEMENT_CLASS 2
> +#define COPY_ENGINE_CLASS 3
> +#define LOCAL_I915_CONTEXT_PARAM_WATCHDOG 0x10
> +
> +const uint64_t timeout_100ms = 100000000LL;
> +static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
> +
> +struct drm_i915_gem_watchdog_timeout {
> + union {
> + struct {
> + /*
> + * Engine class & instance to be configured or queried.
> + */
> + __u16 engine_class;
> + __u16 engine_instance;
> + };
> + /* Index based addressing mode */
> + __u32 index;
> + };
> + /* GPU Engine watchdog resets timeout in us */
> + __u32 timeout_us;
> +};
> +
> +static void clear_error_state(int fd)
> +{
> + int dir;
> +
> + dir = igt_sysfs_open(fd);
> +
> + if (dir < 0)
> + return;
> +
> + /* Any write to the error state clears it */
> + igt_sysfs_set(dir, "error", "");
> + close(dir);
> +}
> +
> +static bool check_error_state(int fd)
> +{
I don't think this function is used anywhere in the test.
> + char *error, *str;
> + bool found = false;
> + int dir;
> +
> + dir = igt_sysfs_open(fd);
> +
> + error = igt_sysfs_get(dir, "error");
> + igt_sysfs_set(dir, "error", "Begone!");
> +
> + igt_assert(error);
> + igt_debug("Error: %s\n", error);
> +
> + if ((str = strstr(error, "GPU HANG"))) {
> + igt_debug("Found error state! GPU hang triggered! %s\n", str);
> + found = true;
> + }
> +
> + close(dir);
> +
> + return found;
> +}
> +
> +static void context_set_watchdog(int fd, int engine_id,
> + unsigned ctx_id, unsigned threshold)
> +{
> + struct drm_i915_gem_watchdog_timeout engines_threshold[MAX_ENGINES];
> + struct drm_i915_gem_context_param arg = {
> + .param = LOCAL_I915_CONTEXT_PARAM_WATCHDOG,
> + .ctx_id = ctx_id,
> + .size = sizeof(engines_threshold),
> + .value = (uint64_t)&engines_threshold
> + };
> +
> + memset(&engines_threshold, 0, sizeof(engines_threshold));
> +
> + /* read existing values */
> + gem_context_get_param(fd, &arg);
Why do we need to read the existing values?
> +
> + switch (engine_id & I915_EXEC_RING_MASK) {
If I read this correctly we cannot set the threshold for more than one
engine at a time. Maybe use a mask?
> + case I915_EXEC_RENDER:
> + engines_threshold[RENDER_CLASS].timeout_us = threshold;
> + engines_threshold[VIDEO_DECODE_CLASS].timeout_us = 0;
> + engines_threshold[VIDEO_ENHANCEMENT_CLASS].timeout_us = 0;
> + engines_threshold[COPY_ENGINE_CLASS].timeout_us = 0;
> + break;
> + case I915_EXEC_BSD:
> + engines_threshold[RENDER_CLASS].timeout_us = 0;
> + engines_threshold[VIDEO_DECODE_CLASS].timeout_us = threshold;
> + engines_threshold[VIDEO_ENHANCEMENT_CLASS].timeout_us = 0;
> + engines_threshold[COPY_ENGINE_CLASS].timeout_us = 0;
> + break;
> + case I915_EXEC_VEBOX:
> + engines_threshold[RENDER_CLASS].timeout_us = 0;
> + engines_threshold[VIDEO_DECODE_CLASS].timeout_us = 0;
> + engines_threshold[VIDEO_ENHANCEMENT_CLASS].timeout_us = threshold;
> + engines_threshold[COPY_ENGINE_CLASS].timeout_us = 0;
> + break;
> + default:
> + engines_threshold[RENDER_CLASS].timeout_us = 0;
> + engines_threshold[VIDEO_DECODE_CLASS].timeout_us = 0;
> + engines_threshold[VIDEO_ENHANCEMENT_CLASS].timeout_us = 0;
> + engines_threshold[COPY_ENGINE_CLASS].timeout_us = 0;
Not sure about clearing everything here, we should at least log
something to say we got in an unwanted situation.
nit: bit of a weird indent in this block.
> + break;
> + }
> + gem_context_set_param(fd, &arg);
> +}
> +
> +static void send_canary(uint32_t fd, uint32_t ctx_id, unsigned exec_id, uint32_t target, uint32_t offset, uint32_t *handle, uint64_t timeout)
Should be possible to use gem_test_engine();
> +{
> + struct drm_i915_gem_exec_object2 obj[2];
> + struct drm_i915_gem_relocation_entry reloc;
> + struct drm_i915_gem_execbuffer2 execbuf;
> + igt_spin_t *spin = NULL;
> +
> + const uint32_t bbe = MI_BATCH_BUFFER_END;
> + int i = 0;
> + int fence;
> +
> + gem_quiescent_gpu(fd);
> +
> + memset(&execbuf, 0, sizeof(execbuf));
> + memset(&obj, 0, sizeof(obj));
> + memset(&reloc, 0, sizeof(reloc));
> +
> + execbuf.buffers_ptr = to_user_pointer(obj);
> +
> + execbuf.buffer_count = 2;
> + execbuf.flags = exec_id | LOCAL_EXEC_FENCE_OUT;
> +
> + obj[0].handle = target;
> + obj[1].handle = gem_create(fd, 4096);
> +
> + obj[1].relocation_count = 1;
> + obj[1].relocs_ptr = to_user_pointer(&reloc);
> +
> + reloc.target_handle = obj[0].handle;
> + reloc.read_domains = I915_GEM_DOMAIN_COMMAND;
> + reloc.write_domain = I915_GEM_DOMAIN_COMMAND;
> + reloc.delta = offset * sizeof(uint32_t);
> +
> + reloc.offset = i * sizeof(uint32_t);
> + gem_write(fd, obj[1].handle, 0, &bbe, sizeof(bbe));
> +
> + __sync_synchronize();
> +
> + if (handle) {
> + *handle = obj[1].handle;
> + return;
> + }
> + gem_sync(fd, obj[1].handle);
> + execbuf.rsvd1 = ctx_id;
> + execbuf.rsvd2 = -1;
> +
> + spin = igt_spin_batch_new(fd, .dependency = obj[0].handle);
> + igt_spin_batch_set_timeout(spin, timeout);
> + igt_assert(gem_bo_busy(fd, obj[0].handle));
> +
> + gem_execbuf_wr(fd, &execbuf);
> + igt_spin_batch_free(fd, spin);
> +
> + fence = execbuf.rsvd2 >> 32;
> +
> +#if 0
> + igt_info("fence:%d, fence status : %d EIO: %d ctx_id:%d\n",fence, sync_fence_status(fence),-EIO, ctx_id);
> +#endif
> +
> + close(fence);
> +
> + gem_close(fd, obj[1].handle);
> + gem_quiescent_gpu(fd);
> +}
> +
> +static uint32_t create_ctx_with_priority(int fd, int ctx_prio)
> +{
> + uint32_t ctx = gem_context_create(fd);
> +
> + switch (ctx_prio) {
> + case HIGH:
> + __gem_context_set_priority(fd, ctx, MAX_PRIO);
> + igt_info("Setting MAX priority %d\n", ctx_prio);
> + break;
> + case LOW:
> + __gem_context_set_priority(fd, ctx, MIN_PRIO);
> + igt_info("Setting MIN priority %d\n", ctx_prio);
> + break;
> + default:
> + igt_info("Ignoring context priority %d\n", ctx_prio);
> + break;
> + }
> + return ctx;
> +}
> +
> +static void bb_factory(uint32_t fd, uint32_t *ctx, int prio, uint64_t timeout, int pid)
I would scrap this function. AFAICT you use it both for submitting a
canary batch and for creating contexts. Do it explicitly in the test or
somewhere it is clear what is happening.
> +{
> + uint32_t scratch;
> + unsigned int nengine = 0;
> + unsigned int engines[16];
> + int i, j;
> + unsigned int engine;
> +
> + for_each_physical_engine(fd, engine)
> + engines[nengine++] = engine;
> +
> + igt_require(nengine);
> +
> + for (i = 0; i < nengine; i++) {
> + scratch = gem_create(fd, 4096);
> + pthread_mutex_lock(&list_lock);
> +
> + if (prio == HIGH)
> + create_ctx_with_priority(fd, HIGH);
missing "ctx[j] =" ^^^ and ...
> + else if (prio == LOW)
> + create_ctx_with_priority(fd, LOW);
... here ^^^ ?
> + else
> + ctx[j] = gem_context_create(fd);
> +
> + send_canary(fd, ctx[j], engines[i], scratch, 0, NULL, timeout);
> + gem_close(fd, scratch);
> + pthread_mutex_unlock(&list_lock);
> + j++;
> + }
> +}
> +
> +static void inject_hang(uint32_t fd, unsigned ring, uint32_t ctx_id, unsigned flags)
> +{
> + igt_hang_t hang;
> + hang = igt_hang_ctx(fd, ctx_id, ring, flags);
> + gem_sync(fd, hang.spin->handle);
> +}
> +
> +static void gpu_watchdog_long_batch_2_contexts(int fd)
> +{
> + unsigned flags = HANG_ALLOW_CAPTURE;
> + int i = 0, j=0;
> +
> + igt_fork(child, 1) {
Why do you want to do this from two separate processes?
> + /* Child process */
> + uint32_t ctx[5];
> + const uint64_t batch_timeout_ms = timeout_100ms*3;
> + memset(&ctx, 0, sizeof(ctx));
> + bb_factory(fd, ctx, -1, batch_timeout_ms, syscall(SYS_gettid));
> +
> + sleep(6);
> +
> + context_set_watchdog(fd, 1, 1, WATCHDOG_THRESHOLD);
> + clear_error_state(fd);
> +
> + /* Engine ID: RCS0, Context ID: 1 */
> + inject_hang(fd, 1, 1, flags);
> +
> + for (i = 0; i <5; i++) {
> + printf("ctx_child: %u ctx_cnt: %d tid:%d\n",ctx[i], syscall(SYS_gettid));
> + //gem_context_destroy(fd, ctx[i]);
> + }
> + }
> +
> + {
> + /* Parent process */
> + uint32_t ctx[5];
> + const uint64_t batch_timeout_ms = timeout_100ms*1;
> + memset(&ctx, 0, sizeof(ctx));
> + printf("%s %d , tid: %d\n", __FUNCTION__,__LINE__, syscall(SYS_gettid));
> +
> + bb_factory(fd, ctx, -1, batch_timeout_ms, syscall(SYS_gettid));
> +
> + sleep(6);
> +
> + context_set_watchdog(fd, 4, 10, WATCHDOG_THRESHOLD);
> + clear_error_state(fd);
> +
> + /* Engine ID: VECS0, Context ID: 10 */
> + inject_hang(fd, 4, 10, flags);
Why hardcoding the context ids? You can use the ones coming from
gem_context_create();
> +
> + for (i = 0; i <5; i++) {
> + igt_info("ctx_dad: %u ctx_cnt: %d tid:%d \n",ctx[i], i, syscall(SYS_gettid));
> + //gem_context_destroy(fd, ctx[i]);
> + }
> + }
> +
> + igt_waitchildren();
> +}
> +
> +static void gpu_watchodg_hang_long_batch_single_engine(int fd, unsigned engine_id, int nengine)
> +{
> + uint32_t *ctx;
> + int i;
> + unsigned flags = HANG_ALLOW_CAPTURE;
> + const uint64_t batch_timeout_ms = timeout_100ms*4;
> +
> + igt_require(nengine);
> +
> + ctx = (uint32_t *)malloc(sizeof(uint32_t)*nengine);
> +
> + if (!ctx) {
> + igt_info("Out of memory\n");
> + exit(1);
> + }
> +
> + bb_factory(fd, ctx, -1, batch_timeout_ms, syscall(SYS_gettid));
The call to bb_factory sends a canary batch to each engine; it would
make more sense to do that after the hang has triggered.
> +
> + if (engine_id == 1) {
Not sure what this if{}else block is for, can you add a comment?
> +		context_set_watchdog(fd, engine_id, 1, WATCHDOG_THRESHOLD);
> +		clear_error_state(fd);
> + inject_hang(fd, engine_id, 1, flags);
> + } else if (engine_id == 8194) {
> + context_set_watchdog(fd, engine_id, 2, WATCHDOG_THRESHOLD);
> + clear_error_state(fd);
> + inject_hang(fd, engine_id, 2, flags);
> + } else if (engine_id == 16386) {
> + context_set_watchdog(fd, engine_id, 3, WATCHDOG_THRESHOLD);
> + clear_error_state(fd);
> + inject_hang(fd, engine_id, 3, flags);
> + } else if (engine_id == 4) {
> + context_set_watchdog(fd, engine_id, 5, WATCHDOG_THRESHOLD);
> + clear_error_state(fd);
> + inject_hang(fd, engine_id, 5, flags);
> + }
context ids hardcoded above as well.
> +
> + for (i = 0; i <nengine; i++)
> + gem_context_destroy(fd, ctx[i]);
> + free(ctx);
> +}
> +static void no_gpu_hang_long_batch_all_engines(int fd, int nengine, int prio)
Wouldn't this test case be covered by the other tests that do normal
submissions?
> +{
> + uint32_t *ctx;
> + int i;
> + const uint64_t batch_timeout_ms = timeout_100ms*4;
> +
> + ctx = (uint32_t *)malloc(sizeof(uint32_t)*nengine);
> +
> + if (!ctx) {
> + printf("Out of memory\n");
> + exit(1);
> + }
> +
> + bb_factory(fd, ctx, prio, batch_timeout_ms, syscall(SYS_gettid));
> +
> + for (i = 0; i <nengine; i++)
> + gem_context_destroy(fd, ctx[i]);
> +
> + free(ctx);
> +}
> +
> +igt_main
> +{
> + int fd;
> + unsigned int nengine = 0;
> + unsigned int engine;
> + unsigned int engines[16];
> +
> + igt_skip_on_simulation();
> +
> + igt_fixture {
> + fd = drm_open_driver(DRIVER_INTEL);
> + igt_require_gem(fd);
> +
> + for_each_physical_engine(fd, engine)
> + engines[nengine++] = engine;
> + igt_require(nengine);
I think this test should run with the hang detector always on.
> + }
> +
> + igt_subtest_group {
I think we are missing a test where we set more than one engine's
threshold and check that the watchdog only applies to the engine it got
set for. There is also some low-hanging fruit in passing invalid
parameters into the IOCTL.
Antonio
> + igt_subtest_f("no-gpu-watchdog-long-batch-all-engines-no-priority") {
> + int prio = -1;
> + no_gpu_hang_long_batch_all_engines(fd, nengine, prio);
> + }
> +
> + igt_subtest_f("no-gpu-watchdog-long-batch-all-engines-low-priority") {
> + int prio = LOW;
> + no_gpu_hang_long_batch_all_engines(fd, nengine, prio);
> + }
> +
> + igt_subtest_f("no-gpu-watchdog-long-batch-all-engines-high-priority") {
> + int prio = HIGH;
> + no_gpu_hang_long_batch_all_engines(fd, nengine, prio);
> + }
> +
> + for (const struct intel_execution_engine *e = intel_execution_engines; e->name; e++) {
> + /* no support for gpu watchdog on BLT */
> + if (e->exec_id == 0 || e->exec_id == I915_EXEC_BLT)
> + continue;
> +
> + igt_subtest_f("gpu-watchdog-long-batch-%s", e->name) {
> + igt_require(gem_ring_has_physical_engine(fd, e->exec_id | e->flags));
> + gpu_watchodg_hang_long_batch_single_engine(fd, e->exec_id | e->flags, nengine);
> + }
> + }
> +
> + igt_subtest_f("gpu-watchdog-long-batch-2-contexts") {
> + gpu_watchdog_long_batch_2_contexts(fd);
> + }
> + }
> +
> + igt_fixture {
> + close(fd);
> + }
> +}
> diff --git a/tests/meson.build b/tests/meson.build
> index 5167a6c..b281b75 100644
> --- a/tests/meson.build
> +++ b/tests/meson.build
> @@ -210,6 +210,7 @@ i915_progs = [
> 'gem_unref_active_buffers',
> 'gem_userptr_blits',
> 'gem_wait',
> + 'gem_watchdog',
> 'gem_workarounds',
> 'gem_write_read_ring_switch',
> 'i915_fb_tiling',
>
More information about the igt-dev
mailing list