[Intel-gfx] [igt-dev] [PATCH i-g-t v3] tests/core_hotunplug: Restore i915 debugfs health check

Marcin Bernatowicz marcin.bernatowicz at linux.intel.com
Thu Oct 15 07:15:00 UTC 2020


On Tue, 2020-10-13 at 13:02 +0200, Janusz Krzysztofik wrote:
> Removal of igt_fork_hang_detector() from local_i915_healthcheck() by
> commit 1fbd127bd4e1 ("core_hotplug: Teach the healthcheck how to check
> execution status") resulted in the unintentional removal of an
> important, though implicit, test feature: detecting, reporting as
> failures, and recovering from potentially missing debugfs subdirs of
> hot rebound i915 devices.  As a consequence, unexpected failures or
> skips of other unrelated but subsequently run tests have been observed
> on CI.
> 
> On the other hand, removing the debugfs issue detection (and the
> resulting subtest failures) from right after hot rebinding the driver
> enabled the better version of the i915 GPU health check, fixed by the
> same commit, to detect and report other issues potentially triggered
> by device late close.
> 
> Restore the missing test feature by introducing an explicit sysfs
> health check, not limited to i915, that verifies the existence of the
> device sysfs and debugfs areas.  Also, split each of the
> hotrebind/hotreplug scenarios into a pair: one that performs the health
> check right after hot rebind/replug and delegates the device late close
> step to a follow-up recovery phase, and another that checks device
> health only after late closing it.
> 
> v2: Give GPU health check a better chance to detect issues - run it
>     before sysfs health checks.
> v3: Run sysfs health check on any hardware, not only i915.
> 
> Signed-off-by: Janusz Krzysztofik <janusz.krzysztofik at linux.intel.com>
> Cc: Chris Wilson <chris at chris-wilson.co.uk>
> ---
> Even though the root cause turned out to be on the IGT lib side and
> has already been fixed by commit 937526629344 ("lib: Don't fail
> debugfs lookup on an expected absent drm device"), I think we should
> restore the debugfs health check in case new issues with similar
> symptoms appear in the future and start silently affecting subsequent
> tests.
> 
> Thanks,
> Janusz
> 
>  tests/core_hotunplug.c | 68 ++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 62 insertions(+), 6 deletions(-)
> 
> diff --git a/tests/core_hotunplug.c b/tests/core_hotunplug.c
> index 70669c590..cdc07c85d 100644
> --- a/tests/core_hotunplug.c
> +++ b/tests/core_hotunplug.c
> @@ -308,7 +308,7 @@ static void node_healthcheck(struct hotunplug
> *priv, unsigned flags)
>  		priv->failure = "Unrecoverable test failure";
>  		if (local_i915_healthcheck(fd_drm, "") &&
>  		    (!(flags & FLAG_RECOVER) ||
> local_i915_recover(fd_drm)))
> -			priv->failure = "Healthcheck failure!";
> +			priv->failure = "GPU healthcheck failure!";
>  		else
>  			priv->failure = NULL;
>  
> @@ -317,6 +317,16 @@ static void node_healthcheck(struct hotunplug
> *priv, unsigned flags)
>  		priv->failure = NULL;
>  	}
>  
> +	if (!priv->failure) {
> +		char path[200];
> +
> +		priv->failure = "Device sysfs healthckeck failure!";
> +		local_debug("%s\n", "running device sysfs
> healthcheck");
> +		igt_assert(igt_sysfs_path(fd_drm, path, sizeof(path)));
> +		igt_assert(igt_debugfs_path(fd_drm, path,
> sizeof(path)));
> +		priv->failure = NULL;
> +	}
> +

LGTM,
Reviewed-by: Marcin Bernatowicz <marcin.bernatowicz at linux.intel.com>

>  	fd_drm = close_device(fd_drm, "", "health checked ");
>  	if (closed || fd_drm < -1)	/* update status for
> post_healthcheck */
>  		priv->fd.drm_hc = fd_drm;
> @@ -437,7 +447,7 @@ static void hotunplug_rescan(struct hotunplug
> *priv)
>  	healthcheck(priv, false);
>  }
>  
> -static void hotrebind_lateclose(struct hotunplug *priv)
> +static void hotrebind(struct hotunplug *priv)
>  {
>  	igt_assert_eq(priv->fd.drm, -1);
>  	igt_assert_eq(priv->fd.drm_hc, -1);
> @@ -448,6 +458,30 @@ static void hotrebind_lateclose(struct hotunplug
> *priv)
>  	driver_bind(priv, 0);
>  
>  	healthcheck(priv, false);
> +}
> +
> +static void hotreplug(struct hotunplug *priv)
> +{
> +	igt_assert_eq(priv->fd.drm, -1);
> +	igt_assert_eq(priv->fd.drm_hc, -1);
> +	priv->fd.drm = local_drm_open_driver(false, "", " for hot
> replug");
> +
> +	device_unplug(priv, "hot ", 60);
> +
> +	bus_rescan(priv, 0);
> +
> +	healthcheck(priv, false);
> +}
> +
> +static void hotrebind_lateclose(struct hotunplug *priv)
> +{
> +	igt_assert_eq(priv->fd.drm, -1);
> +	igt_assert_eq(priv->fd.drm_hc, -1);
> +	priv->fd.drm = local_drm_open_driver(false, "", " for hot
> rebind");
> +
> +	driver_unbind(priv, "hot ", 60);
> +
> +	driver_bind(priv, 0);
>  
>  	priv->fd.drm = close_device(priv->fd.drm, "late ", "unbound ");
>  	igt_assert_eq(priv->fd.drm, -1);
> @@ -465,8 +499,6 @@ static void hotreplug_lateclose(struct hotunplug
> *priv)
>  
>  	bus_rescan(priv, 0);
>  
> -	healthcheck(priv, false);
> -
>  	priv->fd.drm = close_device(priv->fd.drm, "late ", "removed ");
>  	igt_assert_eq(priv->fd.drm, -1);
>  
> @@ -570,7 +602,31 @@ igt_main
>  		post_healthcheck(&priv);
>  
>  	igt_subtest_group {
> -		igt_describe("Check if the driver hot unbound from a
> still open device can be cleanly rebound, then the old instance
> released");
> +		igt_describe("Check if the driver can be cleanly
> rebound to a device with a still open hot unbound driver instance");
> +		igt_subtest("hotrebind")
> +			hotrebind(&priv);
> +
> +		igt_fixture
> +			recover(&priv);
> +	}
> +
> +	igt_fixture
> +		post_healthcheck(&priv);
> +
> +	igt_subtest_group {
> +		igt_describe("Check if a hot unplugged and still open
> device can be cleanly restored");
> +		igt_subtest("hotreplug")
> +			hotreplug(&priv);
> +
> +		igt_fixture
> +			recover(&priv);
> +	}
> +
> +	igt_fixture
> +		post_healthcheck(&priv);
> +
> +	igt_subtest_group {
> +		igt_describe("Check if a hot unbound driver instance
> still open after hot rebind can be cleanly released");
>  		igt_subtest("hotrebind-lateclose")
>  			hotrebind_lateclose(&priv);
>  
> @@ -582,7 +638,7 @@ igt_main
>  		post_healthcheck(&priv);
>  
>  	igt_subtest_group {
> -		igt_describe("Check if a still open while hot unplugged
> device can be cleanly restored, then the old instance released");
> +		igt_describe("Check if an instance of a still open
> while hot replugged device can be cleanly released");
>  		igt_subtest("hotreplug-lateclose")
>  			hotreplug_lateclose(&priv);
>  



More information about the Intel-gfx mailing list