[igt-dev] [PATCH i-g-t v4 11/20] tests/core_hotunplug: Recover from subtest failures

Janusz Krzysztofik janusz.krzysztofik at linux.intel.com
Fri Aug 21 15:37:58 UTC 2020


Subtests now forcibly call or request igt_abort on failures in order to
avoid silently leaving an exercised device in an unusable state.
However, a failure inside a subtest doesn't always mean the device is
no longer working correctly and reboot is needed.  On the other hand,
if a subtest just fails without aborting, that doesn't mean in turn the
device is healthy.  We should still perform a device health check
in that case before deciding on next steps.

Reuse the 'failure' structure field as a mark which is set before each
critical operation that must be followed by a successful health check
in order to avoid aborting the test.  Then, move health
checks not essential for subtests out of those subtest bodies, or just
copy them if essential, to subtest associated individual follow-up
igt_fixture sections, from where device file descriptors potentially
left open are closed, device rediscover or driver rebind operation is
run as needed, and finally the health check is run if the preceding
igt_subtest section exited with the marker set.

v2: Start each recovery phase from unconditionally closing file
    descriptors potentially left open by a subtest before it entered
    its critical section,
  - replace igt_require() with 'if() return;' construct in recover() to
    reduce noise,
  - replace "subtest failure" message used as a request for healthcheck
    with a more appropriate "need healthcheck" for clarity,
  - rebase on current upstream master.
v3: Refresh,
  - move bus_rescan() and driver_bind() function calls back from
    healthcheck() to recover() so a pure health check can still be
    called from a subtest if essential,
  - move failure mark assignments back from subtests to helpers for
    more adequate abort reason reporting but clean the mark only on
    health check success,
  - call cleanup() also from post_healthcheck() in order to close a
    device file descriptor potentially left open by a failed health
    check,
  - reword commit message and update description.

Signed-off-by: Janusz Krzysztofik <janusz.krzysztofik at linux.intel.com>
Reviewed-by: Michał Winiarski <michal.winiarski at intel.com> # v1
---
 tests/core_hotunplug.c | 104 +++++++++++++++++++++++++++++------------
 1 file changed, 74 insertions(+), 30 deletions(-)

diff --git a/tests/core_hotunplug.c b/tests/core_hotunplug.c
index 602a91cf8..145593683 100644
--- a/tests/core_hotunplug.c
+++ b/tests/core_hotunplug.c
@@ -69,6 +69,9 @@ static int local_drm_open_driver(const char *prefix, const char *suffix)
 
 static int local_close(int fd, const char *message)
 {
+	if (fd < 0)	/* not open - return current status */
+		return fd;
+
 	errno = 0;
 	if (igt_warn_on_f(close(fd), "%s\n", message))
 		return -errno;	/* (never -1) */
@@ -115,24 +118,22 @@ static void prepare(struct hotunplug *priv)
 static void driver_unbind(struct hotunplug *priv, const char *prefix)
 {
 	igt_debug("%sunbinding the driver from the device\n", prefix);
+	priv->failure = "Driver unbind failure!";
 
-	priv->failure = "Driver unbind timeout!";
-	igt_set_timeout(60, priv->failure);
+	igt_set_timeout(60, "Driver unbind timeout!");
 	igt_sysfs_set(priv->fd.sysfs_drv, "unbind", priv->dev_bus_addr);
 	igt_reset_timeout();
-	priv->failure = NULL;
 }
 
 /* Re-bind the driver to the device */
 static void driver_bind(struct hotunplug *priv)
 {
 	igt_debug("rebinding the driver to the device\n");
+	priv->failure = "Driver re-bind failure!";
 
-	priv->failure = "Driver re-bind timeout!";
-	igt_set_timeout(60, priv->failure);
+	igt_set_timeout(60, "Driver re-bind timeout!");
 	igt_sysfs_set(priv->fd.sysfs_drv, "bind", priv->dev_bus_addr);
 	igt_reset_timeout();
-	priv->failure = NULL;
 }
 
 /* Remove (virtually unplug) the device from its bus */
@@ -145,12 +146,11 @@ static void device_unplug(struct hotunplug *priv, const char *prefix)
 	igt_assert_fd(priv->fd.sysfs_dev);
 
 	igt_debug("%sunplugging the device\n", prefix);
+	priv->failure = "Device unplug failure!";
 
-	priv->failure = "Device unplug timeout!";
-	igt_set_timeout(60, priv->failure);
+	igt_set_timeout(60, "Device unplug timeout!");
 	igt_sysfs_set(priv->fd.sysfs_dev, "remove", "1");
 	igt_reset_timeout();
-	priv->failure = NULL;
 
 	priv->fd.sysfs_dev = close_sysfs(priv->fd.sysfs_dev);
 }
@@ -159,17 +159,23 @@ static void device_unplug(struct hotunplug *priv, const char *prefix)
 static void bus_rescan(struct hotunplug *priv)
 {
 	igt_debug("rediscovering the device\n");
+	priv->failure = "Bus rescan failure!";
 
-	priv->failure = "Bus rescan timeout!";
-	igt_set_timeout(60, priv->failure);
+	igt_set_timeout(60, "Bus rescan timeout!");
 	igt_sysfs_set(priv->fd.sysfs_bus, "../rescan", "1");
 	igt_reset_timeout();
-	priv->failure = NULL;
+}
+
+static void cleanup(struct hotunplug *priv)
+{
+	priv->fd.drm = close_device(priv->fd.drm);
+	priv->fd.sysfs_dev = close_sysfs(priv->fd.sysfs_dev);
 }
 
 static void healthcheck(struct hotunplug *priv)
 {
 	/* preserve error code potentially stored before in priv->fd.drm */
+	bool closed = priv->fd.drm == -1;
 	int fd_drm;
 
 	/* device name may have changed, rebuild IGT device list */
@@ -177,23 +183,45 @@ static void healthcheck(struct hotunplug *priv)
 
 	priv->failure = "Device reopen failure!";
 	fd_drm = local_drm_open_driver("re", " for healthcheck");
-	priv->failure = NULL;
+	if (closed)	/* store for cleanup if no error code to preserve */
+		priv->fd.drm = fd_drm;
 
 	if (is_i915_device(fd_drm)) {
 		priv->failure = "GEM failure";
 		igt_require_gem(fd_drm);
 		priv->failure = NULL;
+	} else {
+		/* no device specific healthcheck, rely on reopen result */
+		priv->failure = NULL;
 	}
 
+	/* not only request igt_abort on failure, also fail the health check */
+	igt_fail_on_f(priv->failure, "%s\n", priv->failure);
+
 	fd_drm = close_device(fd_drm);
-	if (priv->fd.drm == -1)	/* store result if no error code to preserve */
+	if (closed)	/* store result if no error code to preserve */
 		priv->fd.drm = fd_drm;
 }
 
+static void recover(struct hotunplug *priv)
+{
+	cleanup(priv);
+
+	if (faccessat(priv->fd.sysfs_bus, priv->dev_bus_addr, F_OK, 0))
+		bus_rescan(priv);
+
+	else if (faccessat(priv->fd.sysfs_drv, priv->dev_bus_addr, F_OK, 0))
+		driver_bind(priv);
+
+	if (priv->failure)
+		healthcheck(priv);
+}
+
 static void post_healthcheck(struct hotunplug *priv)
 {
 	igt_abort_on_f(priv->failure, "%s\n", priv->failure);
 
+	cleanup(priv);
 	igt_require(priv->fd.drm == -1);
 }
 
@@ -242,8 +270,6 @@ static void hotunbind_lateclose(struct hotunplug *priv)
 
 	igt_debug("late closing the unbound device instance\n");
 	priv->fd.drm = close_device(priv->fd.drm);
-
-	healthcheck(priv);
 }
 
 static void hotunplug_lateclose(struct hotunplug *priv)
@@ -256,8 +282,6 @@ static void hotunplug_lateclose(struct hotunplug *priv)
 
 	igt_debug("late closing the removed device instance\n");
 	priv->fd.drm = close_device(priv->fd.drm);
-
-	healthcheck(priv);
 }
 
 /* Main */
@@ -288,30 +312,50 @@ igt_main
 		prepare(&priv);
 	}
 
-	igt_describe("Check if the driver can be cleanly unbound from a device believed to be closed");
-	igt_subtest("unbind-rebind")
-		unbind_rebind(&priv);
+	igt_subtest_group {
+		igt_describe("Check if the driver can be cleanly unbound from a device believed to be closed");
+		igt_subtest("unbind-rebind")
+			unbind_rebind(&priv);
+
+		igt_fixture
+			recover(&priv);
+	}
 
 	igt_fixture
 		post_healthcheck(&priv);
 
-	igt_describe("Check if a device believed to be closed can be cleanly unplugged");
-	igt_subtest("unplug-rescan")
-		unplug_rescan(&priv);
+	igt_subtest_group {
+		igt_describe("Check if a device believed to be closed can be cleanly unplugged");
+		igt_subtest("unplug-rescan")
+			unplug_rescan(&priv);
+
+		igt_fixture
+			recover(&priv);
+	}
 
 	igt_fixture
 		post_healthcheck(&priv);
 
-	igt_describe("Check if the driver can be cleanly unbound from a still open device, then released");
-	igt_subtest("hotunbind-lateclose")
-		hotunbind_lateclose(&priv);
+	igt_subtest_group {
+		igt_describe("Check if the driver can be cleanly unbound from a still open device, then released");
+		igt_subtest("hotunbind-lateclose")
+			hotunbind_lateclose(&priv);
+
+		igt_fixture
+			recover(&priv);
+	}
 
 	igt_fixture
 		post_healthcheck(&priv);
 
-	igt_describe("Check if a still open device can be cleanly unplugged, then released");
-	igt_subtest("hotunplug-lateclose")
-		hotunplug_lateclose(&priv);
+	igt_subtest_group {
+		igt_describe("Check if a still open device can be cleanly unplugged, then released");
+		igt_subtest("hotunplug-lateclose")
+			hotunplug_lateclose(&priv);
+
+		igt_fixture
+			recover(&priv);
+	}
 
 	igt_fixture {
 		post_healthcheck(&priv);
-- 
2.21.1



More information about the igt-dev mailing list