[PATCH i-g-t v5 18/21] tests/core_hotunplug: Add 'lateclose before restore' variants

Janusz Krzysztofik janusz.krzysztofik at linux.intel.com
Thu Aug 27 14:08:37 UTC 2020


If a GPU gets wedged during driver rebind or device re-plug for some
reason, the current hotunbind/hotunplug test variants may time out
before the lateclose phase, resulting in incomplete CI reports.

Add new test variants which close the device before restoring it.  Also
rename the old variants to the more appropriate
hotrebind/hotreplug-lateclose and perform health checks both before and
after late close.

v2: Rebase on upstream.
v3: Refresh,
  - further rename hotunbind/hotunplug-lateclose to hotunbind-rebind
    and hotunplug-rescan respectively, then add two more variants under
    the old names which only exercise late close, leaving rebind /
    rescan to be taken care of in the post-subtest recovery phase,
  - also update descriptions of unmodified subtests for consistency.
v4: Refresh,
  - drop subtests with no health checks, adjust timeouts in successors,
  - perform health checks of hot restored devices also before late
    close,
  - in order to be able to safely run a health check while still
    keeping an unbound / unplugged device instance open, also preserve
    the open device fd, not only a close error,
  - adjust subtest descriptions.

Signed-off-by: Janusz Krzysztofik <janusz.krzysztofik at linux.intel.com>
Reviewed-by: Michał Winiarski <michal.winiarski at intel.com> # v2
---
 tests/core_hotunplug.c | 98 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 80 insertions(+), 18 deletions(-)

diff --git a/tests/core_hotunplug.c b/tests/core_hotunplug.c
index 1f211a820..305c57a3f 100644
--- a/tests/core_hotunplug.c
+++ b/tests/core_hotunplug.c
@@ -276,17 +276,19 @@ static int local_i915_recover(int i915)
 
 static void healthcheck(struct hotunplug *priv, bool recover)
 {
-	/* preserve error code potentially stored before in priv->fd.drm */
+	/* preserve device fd / close status stored in priv->fd.drm */
+	int fd_drm, saved_fd_drm = priv->fd.drm;
 	bool closed = priv->fd.drm == -1;
-	int fd_drm;
 
 	/* device name may have changed, rebuild IGT device list */
 	igt_devices_scan(true);
 
 	priv->failure = "Device reopen failure!";
 	fd_drm = local_drm_open_driver("re", " for healthcheck");
-	if (closed)	/* store for cleanup if no error code to preserve */
+	if (closed)	/* store for cleanup if not dirty */
 		priv->fd.drm = fd_drm;
+	else		/* force close error should we fail prematurely */
+		priv->fd.drm = -EBADF;
 
 	if (is_i915_device(fd_drm)) {
 		const char *failure = NULL;
@@ -308,8 +310,10 @@ static void healthcheck(struct hotunplug *priv, bool recover)
 	}
 
 	fd_drm = close_device(fd_drm);
-	if (closed)	/* store result if no error code to preserve */
+	if (closed)	/* store result if no dirty status to preserve */
 		priv->fd.drm = fd_drm;
+	else if (fd_drm == -1)	/* cancel fake error, restore saved status */
+		priv->fd.drm = saved_fd_drm;
 
 	/* not only request igt_abort on failure, also fail the health check */
 	igt_fail_on_f(priv->failure, "%s\n", priv->failure);
@@ -381,31 +385,65 @@ static void unplug_rescan(struct hotunplug *priv)
 	healthcheck(priv, false);
 }
 
-static void hotunbind_lateclose(struct hotunplug *priv)
+static void hotunbind_rebind(struct hotunplug *priv)
 {
 	igt_assert_eq(priv->fd.drm, -1);
-	priv->fd.drm = local_drm_open_driver("", " for hotunbind");
+	priv->fd.drm = local_drm_open_driver("", " for hotrebind");
 
 	driver_unbind(priv, "hot ", 0);
 
-	driver_bind(priv, 0);
-
 	igt_debug("late closing the unbound device instance\n");
 	priv->fd.drm = close_device(priv->fd.drm);
 	igt_assert_eq(priv->fd.drm, -1);
 
+	driver_bind(priv, 0);
+
 	healthcheck(priv, false);
 }
 
-static void hotunplug_lateclose(struct hotunplug *priv)
+static void hotunplug_rescan(struct hotunplug *priv)
 {
 	igt_assert_eq(priv->fd.drm, -1);
-	priv->fd.drm = local_drm_open_driver("", " for hotunplug");
+	priv->fd.drm = local_drm_open_driver("", " for hotreplug");
 
 	device_unplug(priv, "hot ", 0);
 
+	igt_debug("late closing the removed device instance\n");
+	priv->fd.drm = close_device(priv->fd.drm);
+	igt_assert_eq(priv->fd.drm, -1);
+
 	bus_rescan(priv, 0);
 
+	healthcheck(priv, false);
+}
+
+static void hotrebind_lateclose(struct hotunplug *priv)
+{
+	priv->fd.drm = local_drm_open_driver("", " for hotrebind");
+
+	driver_unbind(priv, "hot ", 60);
+
+	driver_bind(priv, 0);
+
+	healthcheck(priv, false);
+
+	igt_debug("late closing the unbound device instance\n");
+	priv->fd.drm = close_device(priv->fd.drm);
+	igt_assert_eq(priv->fd.drm, -1);
+
+	healthcheck(priv, false);
+}
+
+static void hotreplug_lateclose(struct hotunplug *priv)
+{
+	priv->fd.drm = local_drm_open_driver("", " for hotreplug");
+
+	device_unplug(priv, "hot ", 60);
+
+	bus_rescan(priv, 0);
+
+	healthcheck(priv, false);
+
 	igt_debug("late closing the removed device instance\n");
 	priv->fd.drm = close_device(priv->fd.drm);
 	igt_assert_eq(priv->fd.drm, -1);
@@ -443,7 +481,7 @@ igt_main
 	}
 
 	igt_subtest_group {
-		igt_describe("Check if the driver can be cleanly unbound from a device believed to be closed");
+		igt_describe("Check if the driver can be cleanly unbound from a device believed to be closed, then rebound");
 		igt_subtest("unbind-rebind")
 			unbind_rebind(&priv);
 
@@ -455,7 +493,7 @@ igt_main
 		post_healthcheck(&priv);
 
 	igt_subtest_group {
-		igt_describe("Check if a device believed to be closed can be cleanly unplugged");
+		igt_describe("Check if a device believed to be closed can be cleanly unplugged, then restored");
 		igt_subtest("unplug-rescan")
 			unplug_rescan(&priv);
 
@@ -467,9 +505,33 @@ igt_main
 		post_healthcheck(&priv);
 
 	igt_subtest_group {
-		igt_describe("Check if the driver can be cleanly unbound from a still open device, then released");
-		igt_subtest("hotunbind-lateclose")
-			hotunbind_lateclose(&priv);
+		igt_describe("Check if the driver can be cleanly unbound from an open device, then released and rebound");
+		igt_subtest("hotunbind-rebind")
+			hotunbind_rebind(&priv);
+
+		igt_fixture
+			recover(&priv);
+	}
+
+	igt_fixture
+		post_healthcheck(&priv);
+
+	igt_subtest_group {
+		igt_describe("Check if an open device can be cleanly unplugged, then released and restored");
+		igt_subtest("hotunplug-rescan")
+			hotunplug_rescan(&priv);
+
+		igt_fixture
+			recover(&priv);
+	}
+
+	igt_fixture
+		post_healthcheck(&priv);
+
+	igt_subtest_group {
+		igt_describe("Check if the driver hot unbound from a still open device can be cleanly rebound, then the old instance released");
+		igt_subtest("hotrebind-lateclose")
+			hotrebind_lateclose(&priv);
 
 		igt_fixture
 			recover(&priv);
@@ -479,9 +541,9 @@ igt_main
 		post_healthcheck(&priv);
 
 	igt_subtest_group {
-		igt_describe("Check if a still open device can be cleanly unplugged, then released");
-		igt_subtest("hotunplug-lateclose")
-			hotunplug_lateclose(&priv);
+		igt_describe("Check if a still open while hot unplugged device can be cleanly restored, then the old instance released");
+		igt_subtest("hotreplug-lateclose")
+			hotreplug_lateclose(&priv);
 
 		igt_fixture
 			recover(&priv);
-- 
2.21.1



More information about the Intel-gfx-trybot mailing list