[PATCH i-g-t v3 12/13] tests/core_hotunplug: More thorough healthcheck, more careful recovery
Janusz Krzysztofik
janusz.krzysztofik at linux.intel.com
Mon Aug 10 17:30:57 UTC 2020
Proof of concept
Signed-off-by: Janusz Krzysztofik <janusz.krzysztofik at linux.intel.com>
---
tests/core_hotunplug.c | 210 ++++++++++++++++++++++++++++-------------
1 file changed, 142 insertions(+), 68 deletions(-)
diff --git a/tests/core_hotunplug.c b/tests/core_hotunplug.c
index 313c44784..222c6087e 100644
--- a/tests/core_hotunplug.c
+++ b/tests/core_hotunplug.c
@@ -25,6 +25,7 @@
#include <limits.h>
#include <stdlib.h>
#include <string.h>
+#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
@@ -90,27 +91,34 @@ static void prepare(struct hotunplug *priv)
}
/* Unbind the driver from the device */
-static void driver_unbind(struct hotunplug *priv, const char *prefix)
+static void driver_unbind(struct hotunplug *priv, const char *method,
+ int timeout)
{
- igt_debug("%sunbinding the driver from the device\n", prefix);
+ igt_debug("%sunbinding the driver from the device\n", method);
+ priv->failure = "Driver unbind failure!";
- igt_set_timeout(60, "Driver unbind timeout!");
+ igt_set_timeout(timeout, "Driver unbind timeout!");
igt_sysfs_set(priv->fd.sysfs_drv, "unbind", priv->dev_bus_addr);
igt_reset_timeout();
}
/* Re-bind the driver to the device */
-static void driver_bind(struct hotunplug *priv)
+static void driver_bind(struct hotunplug *priv, int timeout)
{
igt_debug("rebinding the driver to the device\n");
+ priv->failure = "Driver re-bind failure!";
- igt_set_timeout(60, "Driver re-bind timeout!");
+ igt_set_timeout(timeout, "Driver re-bind timeout!");
igt_sysfs_set(priv->fd.sysfs_drv, "bind", priv->dev_bus_addr);
igt_reset_timeout();
+
+ /* device name may have changed, rebuild IGT device list */
+ igt_devices_scan(true);
}
/* Remove (virtually unplug) the device from its bus */
-static void device_unplug(struct hotunplug *priv, const char *prefix)
+static void device_unplug(struct hotunplug *priv, const char *method,
+ int timeout)
{
igt_require_f(priv->fd.sysfs_dev == -1,
"Device sysfs node not closed properly\n");
@@ -119,9 +127,10 @@ static void device_unplug(struct hotunplug *priv, const char *prefix)
O_DIRECTORY);
igt_assert_fd(priv->fd.sysfs_dev);
- igt_debug("%sunplugging the device\n", prefix);
+ igt_debug("%sunplugging the device\n", method);
+ priv->failure = "Device unplug failure!";
- igt_set_timeout(60, "Device unplug timeout!");
+ igt_set_timeout(timeout, "Device unplug timeout!");
igt_sysfs_set(priv->fd.sysfs_dev, "remove", "1");
igt_reset_timeout();
@@ -131,13 +140,17 @@ static void device_unplug(struct hotunplug *priv, const char *prefix)
}
/* Re-discover the device by rescanning its bus */
-static void bus_rescan(struct hotunplug *priv)
+static void bus_rescan(struct hotunplug *priv, int timeout)
{
- igt_debug("recovering the device\n");
+ igt_debug("restoring the device\n");
+ priv->failure = "Bus rescan failure!";
- igt_set_timeout(60, "Bus rescan timeout!");
+ igt_set_timeout(timeout, "Bus rescan timeout!");
igt_sysfs_set(priv->fd.sysfs_bus, "../rescan", "1");
igt_reset_timeout();
+
+ /* device name may have changed, rebuild IGT device list */
+ igt_devices_scan(true);
}
static void cleanup(struct hotunplug *priv)
@@ -146,51 +159,120 @@ static void cleanup(struct hotunplug *priv)
priv->fd.sysfs_dev = local_close(priv->fd.sysfs_dev);
}
-static void healthcheck(struct hotunplug *priv)
+static int local_open_driver(int chipset, const char *reason)
{
- /* preserve error code potentially stored before in priv->fd.drm */
int fd_drm;
- if (faccessat(priv->fd.sysfs_bus, priv->dev_bus_addr, F_OK, 0)) {
- priv->failure = "Bus rescan failed!";
- bus_rescan(priv);
- priv->failure = NULL;
+ igt_debug("%sopening the device\n", reason);
+
+ fd_drm = __drm_open_driver(chipset);
+ igt_assert_fd(fd_drm);
+
+ return fd_drm;
+}
+
+static void local_close_driver(int fd_drm, struct hotunplug *priv)
+{
+ fd_drm = local_close(fd_drm);
+
+ if (priv->fd.drm == -1)
+ priv->fd.drm = fd_drm;
+
+ igt_warn_on_f(fd_drm != -1, "Device close failed\n");
+}
+
+static bool local_i915_is_wedged(int i915)
+{
+ int err = 0;
+
+ if (ioctl(i915, DRM_IOCTL_I915_GEM_THROTTLE))
+ err = -errno;
+ return err == -EIO;
+}
+
+static void local_i915_healthcheck(int i915, struct hotunplug *priv)
+{
+ const uint32_t bbe = MI_BATCH_BUFFER_END;
+ struct drm_i915_gem_exec_object2 obj = { };
+ struct drm_i915_gem_execbuffer2 execbuf = {
+ .buffers_ptr = to_user_pointer(&obj),
+ .buffer_count = 1,
+ };
+ const struct intel_execution_engine2 *engine;
+
+ priv->failure = "Healthcheck failure!";
+ if (local_i915_is_wedged(i915))
+ return;
+
+ priv->failure = "Unrecoverable test failure";
+ obj.handle = gem_create(i915, 4096);
+ gem_write(i915, obj.handle, 0, &bbe, sizeof(bbe));
+
+ priv->failure = "Healthcheck failure!";
+ __for_each_physical_engine(i915, engine) {
+ execbuf.flags = engine->flags;
+ if (__gem_execbuf(i915, &execbuf))
+ return;
}
- if (faccessat(priv->fd.sysfs_drv, priv->dev_bus_addr, F_OK, 0)) {
- priv->failure = "Driver re-bind failed!";
- driver_bind(priv);
+ priv->failure = "Unrecoverable test failure";
+ gem_sync(i915, obj.handle);
+ gem_close(i915, obj.handle);
+
+ if (local_i915_is_wedged(i915))
+ priv->failure = "Healthcheck failure!";
+ else
priv->failure = NULL;
- }
+}
- /* device name may have changed, rebuild IGT device list */
- igt_devices_scan(true);
+static void local_i915_recover(int i915, struct hotunplug *priv)
+{
+ priv->failure = "Device recovery failed!";
+
+ igt_force_gpu_reset(i915);
+ local_i915_healthcheck(i915, priv);
+}
+
+static void healthcheck(struct hotunplug *priv)
+{
+ /* preserve error code potentially stored before in priv->fd.drm */
+ int fd_drm;
- igt_debug("reopening the device\n");
- fd_drm = __drm_open_driver(DRIVER_ANY);
- igt_abort_on_f(fd_drm < 0, "Device reopen failure\n");
+ fd_drm = local_open_driver(DRIVER_ANY, "re");
- if (is_i915_device(fd_drm)) {
- priv->failure = "GEM failure";
- igt_require_gem(fd_drm);
+ if (is_i915_device(fd_drm))
+ local_i915_healthcheck(fd_drm, priv);
+ else
priv->failure = NULL;
- }
- fd_drm = local_close(fd_drm);
- if (priv->fd.drm == -1)
- priv->fd.drm = fd_drm;
- igt_assert_f(fd_drm == -1, "Device close failed\n");
+ local_close_driver(fd_drm, priv);
}
static void recover(struct hotunplug *priv)
+{
+ int fd_drm = local_open_driver(DRIVER_ANY, "re");
+
+ if (is_i915_device(fd_drm))
+ local_i915_recover(fd_drm, priv);
+
+ local_close_driver(fd_drm, priv);
+}
+
+static void restore(struct hotunplug *priv)
{
cleanup(priv);
- if (!priv->failure)
- return;
- priv->failure = NULL;
+ if (faccessat(priv->fd.sysfs_bus, priv->dev_bus_addr, F_OK, 0))
+ bus_rescan(priv, 60);
- healthcheck(priv);
+ else if (faccessat(priv->fd.sysfs_drv, priv->dev_bus_addr, F_OK, 0))
+ driver_bind(priv, 60);
+
+ if (priv->failure)
+ healthcheck(priv);
+
+ if (priv->failure)
+ recover(priv);
}
static void post_healthcheck(struct hotunplug *priv)
@@ -219,54 +301,44 @@ static void set_filter_from_device(int fd)
static void unbind_rebind(struct hotunplug *priv)
{
- priv->failure = "need healthcheck";
+ driver_unbind(priv, "", 0);
- driver_unbind(priv, "");
+ driver_bind(priv, 0);
- driver_bind(priv);
+ healthcheck(priv);
}
static void unplug_rescan(struct hotunplug *priv)
{
- priv->failure = "need healthcheck";
+ device_unplug(priv, "", 0);
- device_unplug(priv, "");
+ bus_rescan(priv, 0);
- bus_rescan(priv);
+ healthcheck(priv);
}
static void hotunbind_lateclose(struct hotunplug *priv)
{
- igt_debug("opening device\n");
- priv->fd.drm = __drm_open_driver(DRIVER_ANY);
- igt_assert_fd(priv->fd.drm);
-
- priv->failure = "need healthcheck";
+ priv->fd.drm = local_open_driver(DRIVER_ANY, "");
- driver_unbind(priv, "hot ");
+ driver_unbind(priv, "hot ", 60);
- driver_bind(priv);
+ driver_bind(priv, 60);
igt_debug("late closing the unbound device instance\n");
- priv->fd.drm = local_close(priv->fd.drm);
- igt_warn_on_f(priv->fd.drm != -1, "Device close failed\n");
+ local_close_driver(priv->fd.drm, priv);
}
static void hotunplug_lateclose(struct hotunplug *priv)
{
- igt_debug("opening device\n");
- priv->fd.drm = __drm_open_driver(DRIVER_ANY);
- igt_assert_fd(priv->fd.drm);
-
- priv->failure = "need healthcheck";
+ priv->fd.drm = local_open_driver(DRIVER_ANY, "");
- device_unplug(priv, "hot ");
+ device_unplug(priv, "hot ", 60);
- bus_rescan(priv);
+ bus_rescan(priv, 60);
igt_debug("late closing the removed device instance\n");
- priv->fd.drm = local_close(priv->fd.drm);
- igt_warn_on_f(priv->fd.drm != -1, "Device close failed\n");
+ local_close_driver(priv->fd.drm, priv);
}
/* Main */
@@ -289,8 +361,10 @@ igt_main
fd_drm = __drm_open_driver(DRIVER_ANY);
igt_assert_fd(fd_drm);
- if (is_i915_device(fd_drm))
- igt_require_gem(fd_drm);
+ if (is_i915_device(fd_drm)) {
+ local_i915_healthcheck(fd_drm, &priv);
+ igt_skip_on_f(priv.failure, "%s\n", priv.failure);
+ }
/* Make sure subtests always reopen the same device */
set_filter_from_device(fd_drm);
@@ -306,7 +380,7 @@ igt_main
unbind_rebind(&priv);
igt_fixture
- recover(&priv);
+ restore(&priv);
}
igt_fixture
@@ -318,7 +392,7 @@ igt_main
unplug_rescan(&priv);
igt_fixture
- recover(&priv);
+ restore(&priv);
}
igt_fixture
@@ -330,7 +404,7 @@ igt_main
hotunbind_lateclose(&priv);
igt_fixture
- recover(&priv);
+ restore(&priv);
}
igt_fixture
@@ -342,7 +416,7 @@ igt_main
hotunplug_lateclose(&priv);
igt_fixture
- recover(&priv);
+ restore(&priv);
}
igt_fixture {
--
2.21.1
More information about the Intel-gfx-trybot
mailing list