[PATCH i-g-t v3 17/19] tests/core_hotunplug: More thorough i915 healthcheck and recovery

Janusz Krzysztofik janusz.krzysztofik at linux.intel.com
Mon Aug 17 15:15:03 UTC 2020


The test currently assumes the i915 driver is able to identify potential
hardware or driver issues while rebinding to a device and indicate them
by marking the GPU wedged.  Should that assumption prove wrong, the
health check phase of the test would happily succeed while potentially
leaving the device in an unusable state.  That would not only give us
false positive test results but could also affect subsequently run
applications.  Therefore, we should examine the health of the exercised
device more thoroughly and try harder to recover it from any detected
stalls.

We could use the gem_test_engine() library function, which submits and
asserts successful execution of a NOP batch on each physical engine.
Unfortunately, on failure this function jumps out of the IGT test
section it is called from, while we would like to continue with
recovery steps, preferably without adding another level of test section
group nesting.  Moreover, the function opens the device again and doesn't
close the extra file descriptor before the jump, while we care about
being able to close the exercised device completely before running
certain subtest operations.  Therefore, reimplement the function locally
with those issues fixed and use it as an i915 healthcheck.  Call it
also on test startup so that operations performed by the test are never
blamed for driver or hardware issues which may already exist and be
detectable on test start.

Should the i915 GPU be found unresponsive by the health check after
driver rebind, try harder to recover it to a usable state with a GPU
reset.

Signed-off-by: Janusz Krzysztofik <janusz.krzysztofik at linux.intel.com>
---
 tests/core_hotunplug.c | 73 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 70 insertions(+), 3 deletions(-)

diff --git a/tests/core_hotunplug.c b/tests/core_hotunplug.c
index 13f9e474e..57b09e4cd 100644
--- a/tests/core_hotunplug.c
+++ b/tests/core_hotunplug.c
@@ -25,6 +25,7 @@
 #include <limits.h>
 #include <stdlib.h>
 #include <string.h>
+#include <sys/ioctl.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>
@@ -202,6 +203,60 @@ static void cleanup(struct hotunplug *priv)
 	priv->fd.sysfs_dev = local_close(priv->fd.sysfs_dev);
 }
 
+static bool local_i915_is_wedged(int i915) /* true only if the throttle ioctl on i915 fails with EIO */
+{
+	int err = 0;
+
+	if (ioctl(i915, DRM_IOCTL_I915_GEM_THROTTLE)) /* non-zero result means the ioctl failed */
+		err = -errno; /* save errno right after the failing call */
+	return err == -EIO; /* any other error is not reported as wedged */
+}
+
+static int local_i915_healthcheck(int i915) /* 0 if healthy, -EIO if wedged, else the non-zero __gem_execbuf() result */
+{
+	const uint32_t bbe = MI_BATCH_BUFFER_END; /* the whole batch is this single command */
+	struct drm_i915_gem_exec_object2 obj = { };
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj), /* execbuf refers to the one object above */
+		.buffer_count = 1,
+	};
+	const struct intel_execution_engine2 *engine;
+	int ret = 0;
+
+	igt_debug("running i915 GPU healthcheck\n");
+
+	if (local_i915_is_wedged(i915)) /* bail out before creating any object */
+		return -EIO;
+
+	obj.handle = gem_create(i915, 4096); /* buffer object that will hold the batch */
+	gem_write(i915, obj.handle, 0, &bbe, sizeof(bbe)); /* store bbe at offset 0 */
+
+	__for_each_physical_engine(i915, engine) { /* submit the same batch on each engine */
+		execbuf.flags = engine->flags;
+		ret = __gem_execbuf(i915, &execbuf);
+		if (ret) /* stop at the first submission error; ret keeps it */
+			break;
+	}
+
+	gem_sync(i915, obj.handle); /* reached on both paths; presumably waits for the batch - TODO confirm */
+	gem_close(i915, obj.handle); /* object is released before any return below */
+
+	if (ret) /* report the submission error recorded in the loop */
+		return ret;
+
+	if (local_i915_is_wedged(i915)) /* check the wedged state again after the submissions */
+		return -EIO;
+
+	return 0;
+}
+
+static int local_i915_recover(int i915) /* force a GPU reset, then return the healthcheck result */
+{
+	igt_debug("forcing i915 GPU reset\n");
+	igt_force_gpu_reset(i915);
+	return local_i915_healthcheck(i915); /* 0 only if the device passes the healthcheck after the reset */
+}
+
 static void healthcheck(struct hotunplug *priv)
 {
 	int fd_drm;
@@ -216,11 +271,22 @@ static void healthcheck(struct hotunplug *priv)
 		priv->fd.drm = fd_drm;
 
 	if (is_i915_device(fd_drm)) {
-		priv->failure = "GEM failure";
-		igt_require_gem(fd_drm);
+		/* Don't report library failed asserts as healthcheck failure */
+		priv->failure = "Unrecoverable test failure";
+		if (local_i915_healthcheck(fd_drm) &&
+		    local_i915_recover(fd_drm))
+			priv->failure = "Healthcheck failure!";
+		else
+			priv->failure = NULL;
+
+	} else {
+		/* No device specific healthcheck, rely on reopen result */
 		priv->failure = NULL;
 	}
 
+	/* Not only request igt_abort on failure, also fail the health check */
+	igt_fail_on_f(priv->failure, "%s\n", priv->failure);
+
 	igt_assert_eq(local_close_driver(fd_drm, priv), -1);
 }
 
@@ -322,7 +388,8 @@ igt_main
 
 		if (is_i915_device(fd_drm)) {
 			gem_quiescent_gpu(fd_drm);
-			igt_require_gem(fd_drm);
+			igt_skip_on_f(local_i915_healthcheck(fd_drm),
+				      "i915 device not healthy on test start\n");
 		}
 
 		/* Make sure subtests always reopen the same device */
-- 
2.21.1



More information about the Intel-gfx-trybot mailing list