[igt-dev] [PATCH] tests: Add a new test for driver/device hot reload

Chris Wilson chris at chris-wilson.co.uk
Fri Apr 5 14:11:48 UTC 2019


Quoting Janusz Krzysztofik (2019-04-03 17:01:22)
> From: Janusz Krzysztofik <janusz.krzysztofik at intel.com>
> 
> Run a dummy load in background to put some workload on a device, then try
> to either remove (unplug) the device from its bus, or unbind the device's
> driver from it, depending on which subtest has been selected.  If
> succeeded, unload the driver, rescan the device's bus if needed and
> perform health checks on the device with the driver reloaded.
> 
> The driver hot unbind / device hot unplug operation is expected to succeed
> in a reasonable time, however long timeouts are used to let kernel level
> timeouts pop up first if hit by a bug.
> 
> The dummy load works only with i915 driver.  The test is skipped on other
> hardware unless they provide their implementation of igt_spin_batch_new()
> and friends.
> 
> Signed-off-by: Janusz Krzysztofik <janusz.krzysztofik at intel.com>
> ---
> Changelog:
> v1->v2:
> - renamed to core_hot_reload,
> - extended with module unload, device rediscover (unplug case), module reload 
>   and healthcheck operations following the initial unplug/unbind, as requested
>   by Daniel and Peter,
> - fail if anything goes wrong, as requested by Daniel
> - refactored a lot to reduce code redundancy.
> 
> Resending now with the igt-dev list address added to Cc:, sorry
> 
>  tests/Makefile.sources  |   1 +
>  tests/core_hot_reload.c | 248 ++++++++++++++++++++++++++++++++++++++++
>  tests/meson.build       |   1 +
>  3 files changed, 250 insertions(+)
>  create mode 100644 tests/core_hot_reload.c
> 
> diff --git a/tests/Makefile.sources b/tests/Makefile.sources
> index 86ad301e..5f9dba30 100644
> --- a/tests/Makefile.sources
> +++ b/tests/Makefile.sources
> @@ -16,6 +16,7 @@ TESTS_progs = \
>         core_getclient \
>         core_getstats \
>         core_getversion \
> +       core_hot_reload \
>         core_setmaster_vs_auth \
>         debugfs_test \
>         drm_import_export \
> diff --git a/tests/core_hot_reload.c b/tests/core_hot_reload.c
> new file mode 100644
> index 00000000..96f0efd0
> --- /dev/null
> +++ b/tests/core_hot_reload.c
> @@ -0,0 +1,248 @@
> +/*
> + * Copyright © 2019 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + */
> +
> +#include "igt.h"
> +#include "igt_device.h"
> +#include "igt_dummyload.h"
> +#include "igt_kmod.h"
> +#include "igt_sysfs.h"
> +
> +#include <getopt.h>
> +#include <limits.h>
> +#include <string.h>
> +#include <unistd.h>
> +
> +
> +typedef void (*workload_wait_t)(int device, void *priv);
> +typedef int (*action_t)(int dir);
> +typedef int (*workload_t)(int device, void *priv, action_t action);
> +
> +
> +/*
> + * Actions
> + */
> +
> +/* Unbind the driver from the device */
> +static int driver_unbind(int dir)
> +{
> +       char path[PATH_MAX], *dev_bus_addr;
> +       int len;
> +
> +       len = readlinkat(dir, "device", path, sizeof(path) - 1);
> +       path[len] = '\0';
> +       dev_bus_addr = strrchr(path, '/') + 1;
> +
> +       igt_set_timeout(60, "Driver unbind timeout!");
> +       igt_sysfs_set(dir, "device/driver/unbind", dev_bus_addr);
> +
> +       /* No need for bus rescan */
> +       return -1;
> +}
> +
> +/* Returns open file descriptor to a directory of a bus to rescan for re-plug */
> +static int device_unplug(int dir)
> +{
> +       int bus;
> +
> +       bus = openat(dir, "device/subsystem", O_DIRECTORY);
> +       igt_assert(bus >= 0);
> +
> +       igt_set_timeout(60, "Device unplug timeout!");
> +       igt_sysfs_set(dir, "device/remove", "1");
> +
> +       return bus;
> +}
> +
> +
> +/*
> + * Operation template
> + */
> +static int operation(int device, action_t action, workload_wait_t workload_wait,
> +                    void *priv)
> +{
> +       int dir, bus;
> +
> +       dir = igt_sysfs_open(device);
> +
> +       bus = action(dir);
> +       close(dir);
> +
> +       if (workload_wait)
> +               workload_wait(device, priv);
> +       igt_reset_timeout();
> +
> +       return bus;
> +}
> +
> +
> +/*
> + * Workloads
> + */
> +
> +/* Workload using igt_spin_batch_run() */
> +
> +static void spin_batch_wait(int device, void *priv)
> +{
> +       igt_spin_t *spin = priv;
> +
> +       /* wait until the workload has crashed */
> +       gem_wait(device, spin->handle, NULL);

gem_sync().

> +}
> +
> +static int spin_batch(int device, void *priv, action_t action)
> +{
> +       igt_spin_t *spin;
> +       int bus;
> +
> +       igt_assert(!priv);
> +
> +       /* Start the workload in background.
> +        * Make sure it is running and times out after 90 seconds */
> +       spin = igt_spin_batch_new(device, .flags = IGT_SPIN_POLL_RUN);
> +       igt_spin_busywait_until_running(spin);

We don't need to wait until it's running for it to be an issue. In fact,
testing with requests that can't be run due to unresolved external
dependencies is also required. And also trying to race the actual
submission tasklets running in the background against the unplug.

> +       igt_spin_batch_set_timeout(spin, 90000000000);

Nah, just let it run indefinitely. You have the timeouts in place to
enforce the driver does something. And a hanging batch is about a hard a
task to resolve as you can create (make it a bit harder by disallowing
reset!).

One other thing to do here would be something like
	igt_fork(child, N) {
		uint32_t name = gem_flink(device, spin->handle);
		int fd = gem_reopen_device(device);
		uint32_t handle = gem_open(fd, name);
		gem_sync(fd, handle);
	}


Another once is to run a nop loop (with and without obj/ctx create) as
it unbinds, an mmap loop (over both CPU and GTT / mmap-offset paths),
pread/pwrite. set-tiling and set-caching both have windows of
opportunity for badness, but would be much harder to exercise.

And we need to cover flip queues.

> +       /* run the tested action */
> +       bus = operation(device, action, spin_batch_wait, spin);
> +
> +       /*
> +        * Clean up driver resources possibly not released on workload crash
> +        * so the driver module can be successfully unloaded
> +        */
> +       igt_spin_batch_free(device, spin);
> +
> +       return bus;
> +}
> +
> +
> +/*
> + * Skeleton
> + */
> +
> +static void healthcheck(int chipset, int *device)
> +{
> +       if (chipset == DRIVER_INTEL) {
> +               /*
> +                * We have it perfectly implemented in i915_module_load,
> +                * just use it.
> +                */
> +               igt_assert(igt_system_quiet("i915_module_load --run-subtest reload")
> +                          == IGT_EXIT_SUCCESS);

Ahem.

> +       } else {
> +               /*
> +                * We don't know how to check an unidentified device for health,
> +                * just reopen it.
> +                */
> +       }
> +}
> +
> +static void driver_unload(int chipset, char *driver)
> +{
> +       if (chipset == DRIVER_INTEL)
> +               igt_assert(igt_i915_driver_unload() == IGT_EXIT_SUCCESS);
> +       else
> +               igt_assert(igt_kmod_unload(driver, 0) == 0);
> +}
> +
> +static void __subtest(int device, int chipset, char *driver,
> +                     workload_t workload, void *priv, action_t action)
> +{
> +       int bus = workload(device, priv, action);
> +
> +       close(device);
> +       driver_unload(chipset, driver);
> +
> +       /* Valid file descriptor indicates we should rescan the bus */
> +       if (bus >= 0) {
> +               igt_sysfs_set(bus, "rescan", "1");
> +               close(bus);
> +       }
> +}
> +
> +static void run_subtest(int *device, int chipset, char *driver,
> +                       workload_t workload, void *priv, action_t action)
> +{
> +       __subtest(*device, chipset, driver, workload, priv, action);
> +
> +       sleep(2);

Why 2s?

The subtest is synchronous, the module load in drm_open_driver is also
synchronous, so this should just work without any delay.

> +
> +       *device = __drm_open_driver(chipset);
> +       healthcheck(chipset, device);
> +
> +       igt_assert(*device >= 0);
> +}
> +
> +
> +igt_main {
> +       int device, chipset;
> +       char *driver;
> +
> +       igt_fixture {
> +               char path[PATH_MAX];
> +               int dir, len;
> +
> +               /*
> +                * Since the test depends on successful unload of driver module,
> +                * don't use drm_open_driver() as it keeps a file descriptor
> +                * open for exit handler use that effectively locks the module.
> +                */
> +               device = __drm_open_driver(DRIVER_ANY);
> +               igt_assert(device >= 0);
> +
> +               if (is_i915_device(device)) {
> +                       chipset = DRIVER_INTEL;
> +                       driver = strdup("i915");
> +               } else {
> +                       chipset = DRIVER_ANY;
> +
> +                       /* Capture module name to be unloaded */
> +                       dir = igt_sysfs_open(device);
> +                       len = readlinkat(dir, "device/driver/module", path,
> +                                        sizeof(path) - 1);
> +                       close(dir);
> +                       path[len] = '\0';
> +                       driver = strdup(strrchr(path, '/') + 1);
> +               }
> +               igt_info("Running the test on driver \"%s\", chipset mask %#0x\n",
> +                        driver, chipset);
> +       }
> +
> +       igt_subtest("unplug")
> +               run_subtest(&device, chipset, driver, spin_batch, NULL,
> +                           device_unplug);

Why not just open and close in the test?

> +       igt_subtest("unbind")
> +               run_subtest(&device, chipset, driver, spin_batch, NULL,
> +                           driver_unbind);
> +
> +       igt_fixture {
> +               free(driver);
> +               close(device);
> +               /*
> +                * Now reopen the device with drm_open_driver() for a while
> +                * so exit handler is registered and can do its job on exit.
> +                */

Just opening the device for the exithandler? The exithandler doesn't
need to do anything.

> +               device = drm_open_driver(chipset);
> +               close(device);

As you note, drm_open_driver() will autoreload the module. So will the
next test.
-Chris


More information about the igt-dev mailing list