[PATCH i-g-t v3 12/13] tests/core_hotunplug: More thorough healthcheck, more careful recovery

Janusz Krzysztofik janusz.krzysztofik at linux.intel.com
Mon Aug 10 17:30:57 UTC 2020


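Proof of concept: replace the igt_require_gem() based health check with an
explicit i915 check (wedged test plus a no-op batch submitted to each
physical engine), add a recovery step that forces a GPU reset and repeats
the check, let callers choose the timeout of the unbind / re-bind / unplug /
rescan operations, and rescan the IGT device list right after a driver
re-bind or a bus rescan.  Subtest group fixtures now call restore() in
place of recover().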

Signed-off-by: Janusz Krzysztofik <janusz.krzysztofik at linux.intel.com>
---
 tests/core_hotunplug.c | 210 ++++++++++++++++++++++++++++-------------
 1 file changed, 142 insertions(+), 68 deletions(-)

diff --git a/tests/core_hotunplug.c b/tests/core_hotunplug.c
index 313c44784..222c6087e 100644
--- a/tests/core_hotunplug.c
+++ b/tests/core_hotunplug.c
@@ -25,6 +25,7 @@
 #include <limits.h>
 #include <stdlib.h>
 #include <string.h>
+#include <sys/ioctl.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>
@@ -90,27 +91,34 @@ static void prepare(struct hotunplug *priv)
 }
 
 /* Unbind the driver from the device */
-static void driver_unbind(struct hotunplug *priv, const char *prefix)
+static void driver_unbind(struct hotunplug *priv, const char *method,
+			  int timeout)
 {
-	igt_debug("%sunbinding the driver from the device\n", prefix);
+	igt_debug("%sunbinding the driver from the device\n", method);
+	priv->failure = "Driver unbind failure!";
 
-	igt_set_timeout(60, "Driver unbind timeout!");
+	igt_set_timeout(timeout, "Driver unbind timeout!");
 	igt_sysfs_set(priv->fd.sysfs_drv, "unbind", priv->dev_bus_addr);
 	igt_reset_timeout();
 }
 
 /* Re-bind the driver to the device */
-static void driver_bind(struct hotunplug *priv)
+static void driver_bind(struct hotunplug *priv, int timeout)
 {
 	igt_debug("rebinding the driver to the device\n");
+	priv->failure = "Driver re-bind failure!";
 
-	igt_set_timeout(60, "Driver re-bind timeout!");
+	igt_set_timeout(timeout, "Driver re-bind timeout!");
 	igt_sysfs_set(priv->fd.sysfs_drv, "bind", priv->dev_bus_addr);
 	igt_reset_timeout();
+
+	/* device name may have changed, rebuild IGT device list */
+	igt_devices_scan(true);
 }
 
 /* Remove (virtually unplug) the device from its bus */
-static void device_unplug(struct hotunplug *priv, const char *prefix)
+static void device_unplug(struct hotunplug *priv, const char *method,
+			  int timeout)
 {
 	igt_require_f(priv->fd.sysfs_dev == -1,
 		      "Device sysfs node not closed properly\n");
@@ -119,9 +127,10 @@ static void device_unplug(struct hotunplug *priv, const char *prefix)
 				    O_DIRECTORY);
 	igt_assert_fd(priv->fd.sysfs_dev);
 
-	igt_debug("%sunplugging the device\n", prefix);
+	igt_debug("%sunplugging the device\n", method);
+	priv->failure = "Device unplug failure!";
 
-	igt_set_timeout(60, "Device unplug timeout!");
+	igt_set_timeout(timeout, "Device unplug timeout!");
 	igt_sysfs_set(priv->fd.sysfs_dev, "remove", "1");
 	igt_reset_timeout();
 
@@ -131,13 +140,17 @@ static void device_unplug(struct hotunplug *priv, const char *prefix)
 }
 
 /* Re-discover the device by rescanning its bus */
-static void bus_rescan(struct hotunplug *priv)
+static void bus_rescan(struct hotunplug *priv, int timeout)
 {
-	igt_debug("recovering the device\n");
+	igt_debug("restoring the device\n");
+	priv->failure = "Bus rescan failure!";
 
-	igt_set_timeout(60, "Bus rescan timeout!");
+	igt_set_timeout(timeout, "Bus rescan timeout!");
 	igt_sysfs_set(priv->fd.sysfs_bus, "../rescan", "1");
 	igt_reset_timeout();
+
+	/* device name may have changed, rebuild IGT device list */
+	igt_devices_scan(true);
 }
 
 static void cleanup(struct hotunplug *priv)
@@ -146,51 +159,120 @@ static void cleanup(struct hotunplug *priv)
 	priv->fd.sysfs_dev = local_close(priv->fd.sysfs_dev);
 }
 
-static void healthcheck(struct hotunplug *priv)
+static int local_open_driver(int chipset, const char *reason)
 {
-	/* preserve error code potentially stored before in priv->fd.drm */
 	int fd_drm;
 
-	if (faccessat(priv->fd.sysfs_bus, priv->dev_bus_addr, F_OK, 0)) {
-		priv->failure = "Bus rescan failed!";
-		bus_rescan(priv);
-		priv->failure = NULL;
+	igt_debug("%sopening the device\n", reason);
+
+	fd_drm = __drm_open_driver(chipset);
+	igt_assert_fd(fd_drm);
+
+	return fd_drm;
+}
+
+static void local_close_driver(int fd_drm, struct hotunplug *priv)
+{
+	fd_drm = local_close(fd_drm);
+
+	if (priv->fd.drm == -1)
+		priv->fd.drm = fd_drm;
+
+	igt_warn_on_f(fd_drm != -1, "Device close failed\n");
+}
+
+static bool local_i915_is_wedged(int i915)
+{
+	int err = 0;
+
+	if (ioctl(i915, DRM_IOCTL_I915_GEM_THROTTLE))
+		err = -errno;
+	return err == -EIO;
+}
+
+static void local_i915_healthcheck(int i915, struct hotunplug *priv)
+{
+	const uint32_t bbe = MI_BATCH_BUFFER_END;
+	struct drm_i915_gem_exec_object2 obj = { };
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+	};
+	const struct intel_execution_engine2 *engine;
+
+	priv->failure = "Healthcheck failure!";
+	if (local_i915_is_wedged(i915))
+		return;
+
+	priv->failure = "Unrecoverable test failure";
+	obj.handle = gem_create(i915, 4096);
+	gem_write(i915, obj.handle, 0, &bbe, sizeof(bbe));
+
+	priv->failure = "Healthcheck failure!";
+	__for_each_physical_engine(i915, engine) {
+		execbuf.flags = engine->flags;
+		if (__gem_execbuf(i915, &execbuf))
+			return;
 	}
 
-	if (faccessat(priv->fd.sysfs_drv, priv->dev_bus_addr, F_OK, 0)) {
-		priv->failure = "Driver re-bind failed!";
-		driver_bind(priv);
+	priv->failure = "Unrecoverable test failure";
+	gem_sync(i915, obj.handle);
+	gem_close(i915, obj.handle);
+
+	if (local_i915_is_wedged(i915))
+		priv->failure = "Healthcheck failure!";
+	else
 		priv->failure = NULL;
-	}
+}
 
-	/* device name may have changed, rebuild IGT device list */
-	igt_devices_scan(true);
+static void local_i915_recover(int i915, struct hotunplug *priv)
+{
+	priv->failure = "Device recovery failed!";
+
+	igt_force_gpu_reset(i915);
+	local_i915_healthcheck(i915, priv);
+}
+
+static void healthcheck(struct hotunplug *priv)
+{
+	/* preserve error code potentially stored before in priv->fd.drm */
+	int fd_drm;
 
-	igt_debug("reopening the device\n");
-	fd_drm = __drm_open_driver(DRIVER_ANY);
-	igt_abort_on_f(fd_drm < 0, "Device reopen failure\n");
+	fd_drm = local_open_driver(DRIVER_ANY, "re");
 
-	if (is_i915_device(fd_drm)) {
-		priv->failure = "GEM failure";
-		igt_require_gem(fd_drm);
+	if (is_i915_device(fd_drm))
+		local_i915_healthcheck(fd_drm, priv);
+	else
 		priv->failure = NULL;
-	}
 
-	fd_drm = local_close(fd_drm);
-	if (priv->fd.drm == -1)
-		priv->fd.drm = fd_drm;
-	igt_assert_f(fd_drm == -1, "Device close failed\n");
+	local_close_driver(fd_drm, priv);
 }
 
 static void recover(struct hotunplug *priv)
+{
+	int fd_drm = local_open_driver(DRIVER_ANY, "re");
+
+	if (is_i915_device(fd_drm))
+		local_i915_recover(fd_drm, priv);
+
+	local_close_driver(fd_drm, priv);
+}
+
+static void restore(struct hotunplug *priv)
 {
 	cleanup(priv);
 
-	if (!priv->failure)
-		return;
-	priv->failure = NULL;
+	if (faccessat(priv->fd.sysfs_bus, priv->dev_bus_addr, F_OK, 0))
+		bus_rescan(priv, 60);
 
-	healthcheck(priv);
+	else if (faccessat(priv->fd.sysfs_drv, priv->dev_bus_addr, F_OK, 0))
+		driver_bind(priv, 60);
+
+	if (priv->failure)
+		healthcheck(priv);
+
+	if (priv->failure)
+		recover(priv);
 }
 
 static void post_healthcheck(struct hotunplug *priv)
@@ -219,54 +301,44 @@ static void set_filter_from_device(int fd)
 
 static void unbind_rebind(struct hotunplug *priv)
 {
-	priv->failure = "need healthcheck";
+	driver_unbind(priv, "", 0);
 
-	driver_unbind(priv, "");
+	driver_bind(priv, 0);
 
-	driver_bind(priv);
+	healthcheck(priv);
 }
 
 static void unplug_rescan(struct hotunplug *priv)
 {
-	priv->failure = "need healthcheck";
+	device_unplug(priv, "", 0);
 
-	device_unplug(priv, "");
+	bus_rescan(priv, 0);
 
-	bus_rescan(priv);
+	healthcheck(priv);
 }
 
 static void hotunbind_lateclose(struct hotunplug *priv)
 {
-	igt_debug("opening device\n");
-	priv->fd.drm = __drm_open_driver(DRIVER_ANY);
-	igt_assert_fd(priv->fd.drm);
-
-	priv->failure = "need healthcheck";
+	priv->fd.drm = local_open_driver(DRIVER_ANY, "");
 
-	driver_unbind(priv, "hot ");
+	driver_unbind(priv, "hot ", 60);
 
-	driver_bind(priv);
+	driver_bind(priv, 60);
 
 	igt_debug("late closing the unbound device instance\n");
-	priv->fd.drm = local_close(priv->fd.drm);
-	igt_warn_on_f(priv->fd.drm != -1, "Device close failed\n");
+	local_close_driver(priv->fd.drm, priv);
 }
 
 static void hotunplug_lateclose(struct hotunplug *priv)
 {
-	igt_debug("opening device\n");
-	priv->fd.drm = __drm_open_driver(DRIVER_ANY);
-	igt_assert_fd(priv->fd.drm);
-
-	priv->failure = "need healthcheck";
+	priv->fd.drm = local_open_driver(DRIVER_ANY, "");
 
-	device_unplug(priv, "hot ");
+	device_unplug(priv, "hot ", 60);
 
-	bus_rescan(priv);
+	bus_rescan(priv, 60);
 
 	igt_debug("late closing the removed device instance\n");
-	priv->fd.drm = local_close(priv->fd.drm);
-	igt_warn_on_f(priv->fd.drm != -1, "Device close failed\n");
+	local_close_driver(priv->fd.drm, priv);
 }
 
 /* Main */
@@ -289,8 +361,10 @@ igt_main
 		fd_drm = __drm_open_driver(DRIVER_ANY);
 		igt_assert_fd(fd_drm);
 
-		if (is_i915_device(fd_drm))
-			igt_require_gem(fd_drm);
+		if (is_i915_device(fd_drm)) {
+			local_i915_healthcheck(fd_drm, &priv);
+			igt_skip_on_f(priv.failure, "%s\n", priv.failure);
+		}
 
 		/* Make sure subtests always reopen the same device */
 		set_filter_from_device(fd_drm);
@@ -306,7 +380,7 @@ igt_main
 			unbind_rebind(&priv);
 
 		igt_fixture
-			recover(&priv);
+			restore(&priv);
 	}
 
 	igt_fixture
@@ -318,7 +392,7 @@ igt_main
 			unplug_rescan(&priv);
 
 		igt_fixture
-			recover(&priv);
+			restore(&priv);
 	}
 
 	igt_fixture
@@ -330,7 +404,7 @@ igt_main
 			hotunbind_lateclose(&priv);
 
 		igt_fixture
-			recover(&priv);
+			restore(&priv);
 	}
 
 	igt_fixture
@@ -342,7 +416,7 @@ igt_main
 			hotunplug_lateclose(&priv);
 
 		igt_fixture
-			recover(&priv);
+			restore(&priv);
 	}
 
 	igt_fixture {
-- 
2.21.1



More information about the Intel-gfx-trybot mailing list