[igt-dev] [RFC] tests/gem_watchdog: Initial set of tests for GPU watchdog

Carlos Santa carlos.santa at intel.com
Mon Apr 15 18:22:51 UTC 2019


This patch adds a basic set of tests that reset the different
GPU engines through the watchdog timer.
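
The watchdog is armed per context through a new context parameter. Below
is a minimal sketch of the flow used by context_set_watchdog() in the
test; note that LOCAL_I915_CONTEXT_PARAM_WATCHDOG, the engine-class
indices and WATCHDOG_THRESHOLD are local definitions carried by this
patch, not yet upstream uAPI:

    struct drm_i915_gem_watchdog_timeout thresholds[MAX_ENGINES] = {};
    struct drm_i915_gem_context_param arg = {
            .param = LOCAL_I915_CONTEXT_PARAM_WATCHDOG,
            .ctx_id = ctx_id,
            .size = sizeof(thresholds),
            .value = to_user_pointer(thresholds),
    };

    gem_context_get_param(fd, &arg);   /* read back current per-engine values */
    thresholds[RENDER_CLASS].timeout_us = WATCHDOG_THRESHOLD; /* arm rcs only */
    gem_context_set_param(fd, &arg);   /* apply to this context */

The test clears timeout_us on all other engine classes so that only the
targeted engine is armed.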

Credits to Antonio for the original codebase this is based on.

Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
Cc: Antonio Argenziano <antonio.argenziano at intel.com>
Cc: Chris Wilson <chris at chris-wilson.co.uk>
Signed-off-by: Carlos Santa <carlos.santa at intel.com>
---
 tests/Makefile.sources    |   3 +
 tests/i915/gem_watchdog.c | 439 ++++++++++++++++++++++++++++++++++++++++++++++
 tests/meson.build         |   1 +
 3 files changed, 443 insertions(+)
 create mode 100644 tests/i915/gem_watchdog.c

diff --git a/tests/Makefile.sources b/tests/Makefile.sources
index 214698d..7f17f20 100644
--- a/tests/Makefile.sources
+++ b/tests/Makefile.sources
@@ -444,6 +444,9 @@ gem_userptr_blits_SOURCES = i915/gem_userptr_blits.c
 TESTS_progs += gem_wait
 gem_wait_SOURCES = i915/gem_wait.c
 
+TESTS_progs += gem_watchdog
+gem_watchdog_SOURCES = i915/gem_watchdog.c
+
 TESTS_progs += gem_workarounds
 gem_workarounds_SOURCES = i915/gem_workarounds.c
 
diff --git a/tests/i915/gem_watchdog.c b/tests/i915/gem_watchdog.c
new file mode 100644
index 0000000..65e2f5b
--- /dev/null
+++ b/tests/i915/gem_watchdog.c
@@ -0,0 +1,439 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#include "igt.h"
+#include "igt_sysfs.h"
+#include "sw_sync.h"
+
+#include <pthread.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include <sys/ioctl.h>
+#include <sys/poll.h>
+#include <sys/signal.h>
+#include <sys/syscall.h>
+
+#include "i915/gem_ring.h"
+
+#define MAX_PRIO LOCAL_I915_CONTEXT_MAX_USER_PRIORITY
+#define MIN_PRIO LOCAL_I915_CONTEXT_MIN_USER_PRIORITY
+#define HIGH 1
+#define LOW 0
+#define LOCAL_EXEC_FENCE_OUT (1 << 17)
+#define WATCHDOG_THRESHOLD (100)
+#define MAX_ENGINES 5
+#define RENDER_CLASS 0
+#define VIDEO_DECODE_CLASS 1
+#define VIDEO_ENHANCEMENT_CLASS 2
+#define COPY_ENGINE_CLASS 3
+#define LOCAL_I915_CONTEXT_PARAM_WATCHDOG 0x10
+
+const uint64_t timeout_100ms = 100000000LL;
+static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
+
+struct drm_i915_gem_watchdog_timeout {
+	union {
+		struct {
+			/*
+			 * Engine class & instance to be configured or queried.
+			 */
+			__u16 engine_class;
+			__u16 engine_instance;
+		};
+		/* Index based addressing mode */
+		__u32 index;
+	};
+	/* GPU Engine watchdog resets timeout in us */
+	__u32 timeout_us;
+};
+
+static void clear_error_state(int fd)
+{
+	int dir;
+
+	dir = igt_sysfs_open(fd);
+
+	if (dir < 0)
+		return;
+
+	/* Any write to the error state clears it */
+	igt_sysfs_set(dir, "error", "");
+	close(dir);
+}
+
+static bool check_error_state(int fd)
+{
+	char *error, *str;
+	bool found = false;
+	int dir;
+
+	dir = igt_sysfs_open(fd);
+
+	error = igt_sysfs_get(dir, "error");
+	igt_sysfs_set(dir, "error", "Begone!");
+
+	igt_assert(error);
+	igt_debug("Error: %s\n", error);
+
+	if ((str = strstr(error, "GPU HANG"))) {
+		igt_debug("Found error state! GPU hang triggered! %s\n", str);
+		found = true;
+	}
+
+	close(dir);
+
+	return found;
+}
+
+static void context_set_watchdog(int fd, int engine_id,
+				 unsigned ctx_id, unsigned threshold)
+{
+	struct drm_i915_gem_watchdog_timeout engines_threshold[MAX_ENGINES];
+	struct drm_i915_gem_context_param arg = {
+		.param = LOCAL_I915_CONTEXT_PARAM_WATCHDOG,
+		.ctx_id = ctx_id,
+		.size = sizeof(engines_threshold),
+		.value = to_user_pointer(engines_threshold)
+	};
+
+	memset(&engines_threshold, 0, sizeof(engines_threshold));
+
+	/* read existing values */
+	gem_context_get_param(fd, &arg);
+
+	switch (engine_id & I915_EXEC_RING_MASK) {
+	case I915_EXEC_RENDER:
+		engines_threshold[RENDER_CLASS].timeout_us = threshold;
+		engines_threshold[VIDEO_DECODE_CLASS].timeout_us = 0;
+		engines_threshold[VIDEO_ENHANCEMENT_CLASS].timeout_us = 0;
+		engines_threshold[COPY_ENGINE_CLASS].timeout_us = 0;
+		break;
+	case I915_EXEC_BSD:
+		engines_threshold[RENDER_CLASS].timeout_us = 0;
+		engines_threshold[VIDEO_DECODE_CLASS].timeout_us = threshold;
+		engines_threshold[VIDEO_ENHANCEMENT_CLASS].timeout_us = 0;
+		engines_threshold[COPY_ENGINE_CLASS].timeout_us = 0;
+		break;
+	case I915_EXEC_VEBOX:
+		engines_threshold[RENDER_CLASS].timeout_us = 0;
+		engines_threshold[VIDEO_DECODE_CLASS].timeout_us = 0;
+		engines_threshold[VIDEO_ENHANCEMENT_CLASS].timeout_us = threshold;
+		engines_threshold[COPY_ENGINE_CLASS].timeout_us = 0;
+		break;
+	default:
+		engines_threshold[RENDER_CLASS].timeout_us = 0;
+		engines_threshold[VIDEO_DECODE_CLASS].timeout_us = 0;
+		engines_threshold[VIDEO_ENHANCEMENT_CLASS].timeout_us = 0;
+		engines_threshold[COPY_ENGINE_CLASS].timeout_us = 0;
+		break;
+	}
+
+	gem_context_set_param(fd, &arg);
+}
+
+static void send_canary(uint32_t fd, uint32_t ctx_id, unsigned exec_id,
+			uint32_t target, uint32_t offset, uint32_t *handle,
+			uint64_t timeout)
+{
+	struct drm_i915_gem_exec_object2 obj[2];
+	struct drm_i915_gem_relocation_entry reloc;
+	struct drm_i915_gem_execbuffer2 execbuf;
+	igt_spin_t *spin = NULL;
+	const uint32_t bbe = MI_BATCH_BUFFER_END;
+	int i = 0;
+	int fence;
+
+	gem_quiescent_gpu(fd);
+
+	memset(&execbuf, 0, sizeof(execbuf));
+	memset(&obj, 0, sizeof(obj));
+	memset(&reloc, 0, sizeof(reloc));
+
+	execbuf.buffers_ptr = to_user_pointer(obj);
+	execbuf.buffer_count = 2;
+	execbuf.flags = exec_id | LOCAL_EXEC_FENCE_OUT;
+
+	obj[0].handle = target;
+	obj[1].handle = gem_create(fd, 4096);
+
+	obj[1].relocation_count = 1;
+	obj[1].relocs_ptr = to_user_pointer(&reloc);
+
+	reloc.target_handle = obj[0].handle;
+	reloc.read_domains = I915_GEM_DOMAIN_COMMAND;
+	reloc.write_domain = I915_GEM_DOMAIN_COMMAND;
+	reloc.delta = offset * sizeof(uint32_t);
+	reloc.offset = i * sizeof(uint32_t);
+
+	gem_write(fd, obj[1].handle, 0, &bbe, sizeof(bbe));
+
+	__sync_synchronize();
+
+	if (handle) {
+		*handle = obj[1].handle;
+		return;
+	}
+
+	gem_sync(fd, obj[1].handle);
+	execbuf.rsvd1 = ctx_id;
+	execbuf.rsvd2 = -1;
+
+	spin = igt_spin_batch_new(fd, .dependency = obj[0].handle);
+	igt_spin_batch_set_timeout(spin, timeout);
+	igt_assert(gem_bo_busy(fd, obj[0].handle));
+
+	gem_execbuf_wr(fd, &execbuf);
+	igt_spin_batch_free(fd, spin);
+
+	fence = execbuf.rsvd2 >> 32;
+	igt_debug("fence: %d, fence status: %d, EIO: %d, ctx_id: %u\n",
+		  fence, sync_fence_status(fence), -EIO, ctx_id);
+	close(fence);
+
+	gem_close(fd, obj[1].handle);
+	gem_quiescent_gpu(fd);
+}
+
+static uint32_t create_ctx_with_priority(int fd, int ctx_prio)
+{
+	uint32_t ctx = gem_context_create(fd);
+
+	switch (ctx_prio) {
+	case HIGH:
+		__gem_context_set_priority(fd, ctx, MAX_PRIO);
+		igt_info("Setting MAX priority %d\n", ctx_prio);
+		break;
+	case LOW:
+		__gem_context_set_priority(fd, ctx, MIN_PRIO);
+		igt_info("Setting MIN priority %d\n", ctx_prio);
+		break;
+	default:
+		igt_info("Ignoring context priority %d\n", ctx_prio);
+		break;
+	}
+	return ctx;
+}
+
+static void bb_factory(uint32_t fd, uint32_t *ctx, int prio, uint64_t timeout,
+		       int pid)
+{
+	uint32_t scratch;
+	unsigned int nengine = 0;
+	unsigned int engines[16];
+	unsigned int engine;
+	int i;
+
+	for_each_physical_engine(fd, engine)
+		engines[nengine++] = engine;
+
+	igt_require(nengine);
+
+	for (i = 0; i < nengine; i++) {
+		scratch = gem_create(fd, 4096);
+		pthread_mutex_lock(&list_lock);
+
+		if (prio == HIGH)
+			ctx[i] = create_ctx_with_priority(fd, HIGH);
+		else if (prio == LOW)
+			ctx[i] = create_ctx_with_priority(fd, LOW);
+		else
+			ctx[i] = gem_context_create(fd);
+
+		send_canary(fd, ctx[i], engines[i], scratch, 0, NULL, timeout);
+		gem_close(fd, scratch);
+		pthread_mutex_unlock(&list_lock);
+	}
+}
+
+static void inject_hang(uint32_t fd, unsigned ring, uint32_t ctx_id,  unsigned flags)
+{
+	igt_hang_t hang;
+	hang = igt_hang_ctx(fd, ctx_id, ring, flags);
+	gem_sync(fd, hang.spin->handle);
+}
+
+static void gpu_watchdog_long_batch_2_contexts(int fd)
+{
+	unsigned flags = HANG_ALLOW_CAPTURE;
+	int i;
+
+	igt_fork(child, 1) {
+		/* Child process */
+		uint32_t ctx[5];
+		const uint64_t batch_timeout_ms = timeout_100ms * 3;
+
+		memset(&ctx, 0, sizeof(ctx));
+		bb_factory(fd, ctx, -1, batch_timeout_ms, syscall(SYS_gettid));
+
+		sleep(6);
+
+		context_set_watchdog(fd, 1, 1, WATCHDOG_THRESHOLD);
+		clear_error_state(fd);
+
+		/* Engine ID: RCS0, Context ID: 1 */
+		inject_hang(fd, 1, 1, flags);
+
+		for (i = 0; i < 5; i++) {
+			igt_info("ctx_child: %u ctx_cnt: %d tid: %ld\n",
+				 ctx[i], i, (long)syscall(SYS_gettid));
+			//gem_context_destroy(fd, ctx[i]);
+		}
+	}
+
+	{
+		/* Parent process */
+		uint32_t ctx[5];
+		const uint64_t batch_timeout_ms = timeout_100ms * 1;
+
+		memset(&ctx, 0, sizeof(ctx));
+		igt_debug("%s:%d, tid: %ld\n", __func__, __LINE__,
+			  (long)syscall(SYS_gettid));
+
+		bb_factory(fd, ctx, -1, batch_timeout_ms, syscall(SYS_gettid));
+
+		sleep(6);
+
+		context_set_watchdog(fd, 4, 10, WATCHDOG_THRESHOLD);
+		clear_error_state(fd);
+
+		/* Engine ID: VECS0, Context ID: 10 */
+		inject_hang(fd, 4, 10, flags);
+
+		for (i = 0; i < 5; i++) {
+			igt_info("ctx_dad: %u ctx_cnt: %d tid: %ld\n",
+				 ctx[i], i, (long)syscall(SYS_gettid));
+			//gem_context_destroy(fd, ctx[i]);
+		}
+	}
+
+	igt_waitchildren();
+}
+
+static void gpu_watchdog_hang_long_batch_single_engine(int fd,
+							unsigned engine_id,
+							int nengine)
+{
+	const uint64_t batch_timeout_ms = timeout_100ms * 4;
+	unsigned flags = HANG_ALLOW_CAPTURE;
+	uint32_t *ctx;
+	int i;
+
+	igt_require(nengine);
+
+	ctx = malloc(sizeof(uint32_t) * nengine);
+	igt_assert(ctx);
+
+	bb_factory(fd, ctx, -1, batch_timeout_ms, syscall(SYS_gettid));
+
+	if (engine_id == 1) {
+		/* I915_EXEC_RENDER */
+		context_set_watchdog(fd, engine_id, 1, WATCHDOG_THRESHOLD);
+		clear_error_state(fd);
+		inject_hang(fd, engine_id, 1, flags);
+	} else if (engine_id == 8194) {
+		/* I915_EXEC_BSD | I915_EXEC_BSD_RING1 */
+		context_set_watchdog(fd, engine_id, 2, WATCHDOG_THRESHOLD);
+		clear_error_state(fd);
+		inject_hang(fd, engine_id, 2, flags);
+	} else if (engine_id == 16386) {
+		/* I915_EXEC_BSD | I915_EXEC_BSD_RING2 */
+		context_set_watchdog(fd, engine_id, 3, WATCHDOG_THRESHOLD);
+		clear_error_state(fd);
+		inject_hang(fd, engine_id, 3, flags);
+	} else if (engine_id == 4) {
+		/* I915_EXEC_VEBOX */
+		context_set_watchdog(fd, engine_id, 5, WATCHDOG_THRESHOLD);
+		clear_error_state(fd);
+		inject_hang(fd, engine_id, 5, flags);
+	}
+
+	for (i = 0; i < nengine; i++)
+		gem_context_destroy(fd, ctx[i]);
+	free(ctx);
+}
+
+static void no_gpu_hang_long_batch_all_engines(int fd, int nengine, int prio)
+{
+	const uint64_t batch_timeout_ms = timeout_100ms * 4;
+	uint32_t *ctx;
+	int i;
+
+	ctx = malloc(sizeof(uint32_t) * nengine);
+	igt_assert(ctx);
+
+	bb_factory(fd, ctx, prio, batch_timeout_ms, syscall(SYS_gettid));
+
+	for (i = 0; i < nengine; i++)
+		gem_context_destroy(fd, ctx[i]);
+
+	free(ctx);
+}
+
+igt_main
+{
+	int fd;
+	unsigned int nengine = 0;
+	unsigned int engine;
+	unsigned int engines[16];
+
+	igt_skip_on_simulation();
+
+	igt_fixture {
+		fd = drm_open_driver(DRIVER_INTEL);
+		igt_require_gem(fd);
+
+		for_each_physical_engine(fd, engine)
+			engines[nengine++] = engine;
+		igt_require(nengine);
+	}
+
+	igt_subtest_group {
+		igt_subtest_f("no-gpu-watchdog-long-batch-all-engines-no-priority") {
+			int prio = -1;
+			no_gpu_hang_long_batch_all_engines(fd, nengine, prio);
+		}
+
+		igt_subtest_f("no-gpu-watchdog-long-batch-all-engines-low-priority") {
+			int prio = LOW;
+			no_gpu_hang_long_batch_all_engines(fd, nengine, prio);
+		}
+
+		igt_subtest_f("no-gpu-watchdog-long-batch-all-engines-high-priority") {
+			int prio = HIGH;
+			no_gpu_hang_long_batch_all_engines(fd, nengine, prio);
+		}
+
+		for (const struct intel_execution_engine *e = intel_execution_engines; e->name; e++) {
+			/* no support for gpu watchdog on BLT */
+			if (e->exec_id == 0 || e->exec_id == I915_EXEC_BLT)
+				continue;
+
+			igt_subtest_f("gpu-watchdog-long-batch-%s", e->name) {
+				igt_require(gem_ring_has_physical_engine(fd, e->exec_id | e->flags));
+				gpu_watchdog_hang_long_batch_single_engine(fd, e->exec_id | e->flags, nengine);
+			}
+		}
+
+		igt_subtest_f("gpu-watchdog-long-batch-2-contexts") {
+			gpu_watchdog_long_batch_2_contexts(fd);
+		}
+    }
+
+    igt_fixture {
+	close(fd);
+    }
+}
diff --git a/tests/meson.build b/tests/meson.build
index 5167a6c..b281b75 100644
--- a/tests/meson.build
+++ b/tests/meson.build
@@ -210,6 +210,7 @@ i915_progs = [
 	'gem_unref_active_buffers',
 	'gem_userptr_blits',
 	'gem_wait',
+	'gem_watchdog',
 	'gem_workarounds',
 	'gem_write_read_ring_switch',
 	'i915_fb_tiling',
-- 
2.7.4


