[igt-dev] [PATCH i-g-t 1/2 v2] runner: Add support for aborting on network failure

Daniel Vetter daniel at ffwll.ch
Thu May 16 12:51:39 UTC 2019


On Thu, May 16, 2019 at 02:27:54PM +0300, Petri Latvala wrote:
> If the network goes down while testing, CI tends to interpret that as
> the device being down, cutting its power after a while. This causes an
> incomplete to an innocent test, increasing noise in the results.
> 
> A new flag to --abort-on-monitored-error, "ping", uses liboping to
> ping a host configured in .igtrc with one ping after each test
> execution and aborts the run if there is no reply in a hardcoded
> amount of time.
> 
> v2:
>  - Use a higher timeout
>  - Allow hostname configuration from environment
> 
> Signed-off-by: Petri Latvala <petri.latvala at intel.com>
> Cc: Arkadiusz Hiler <arkadiusz.hiler at intel.com>
> Cc: Martin Peres <martin.peres at linux.intel.com>
> Cc: Tomi Sarvela <tomi.p.sarvela at intel.com>
> ---
>  meson.build        |   1 +
>  meson_options.txt  |   6 ++
>  runner/executor.c  | 135 +++++++++++++++++++++++++++++++++++++++++++++
>  runner/meson.build |  12 +++-
>  runner/settings.c  |   4 ++
>  runner/settings.h  |   5 +-
>  6 files changed, 160 insertions(+), 3 deletions(-)
> 
> diff --git a/meson.build b/meson.build
> index a05e912c..aac67f1a 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -100,6 +100,7 @@ build_tests = get_option('build_tests')
>  with_libdrm = get_option('with_libdrm')
>  with_libunwind = get_option('with_libunwind')
>  build_runner = get_option('build_runner')
> +with_oping = get_option('with_oping')
>  
>  _build_overlay = build_overlay != 'false'
>  _overlay_required = build_overlay == 'true'
> diff --git a/meson_options.txt b/meson_options.txt
> index 888efe56..935f883e 100644
> --- a/meson_options.txt
> +++ b/meson_options.txt
> @@ -58,6 +58,12 @@ option('build_runner',
>         choices : ['auto', 'true', 'false'],
>         description : 'Build test runner')
>  
> +option('with_oping',
> +       type : 'combo',
> +       value : 'auto',
> +       choices : ['auto', 'true', 'false'],
> +       description : 'Build igt_runner with liboping')
> +
>  option('use_rpath',
>         type : 'boolean',
>         value : false,
> diff --git a/runner/executor.c b/runner/executor.c
> index 7e5fbe8f..be78474c 100644
> --- a/runner/executor.c
> +++ b/runner/executor.c
> @@ -1,6 +1,10 @@
>  #include <errno.h>
>  #include <fcntl.h>
> +#include <glib.h>
>  #include <linux/watchdog.h>
> +#if HAVE_OPING
> +#include <oping.h>
> +#endif
>  #include <signal.h>
>  #include <stdio.h>
>  #include <stdlib.h>
> @@ -108,6 +112,133 @@ static void ping_watchdogs(void)
>  	}
>  }
>  
> +#if HAVE_OPING
> +static pingobj_t *pingobj = NULL;
> +
> +static bool load_ping_config_from_file(void)
> +{
> +	char *key_file_env = NULL;
> +	char *key_file_loc = NULL;
> +	GError *error = NULL;
> +	GKeyFile *key_file = NULL;
> +	const char *ping_hostname;
> +	int ret;
> +
> +	/* Determine igt config path */
> +	key_file_env = getenv("IGT_CONFIG_PATH");
> +	if (key_file_env) {
> +		key_file_loc = strdup(key_file_env);
> +	} else {
> +		asprintf(&key_file_loc, "%s/.igtrc", g_get_home_dir());
> +	}
> +
> +	/* Load igt config file */
> +	key_file = g_key_file_new();
> +	ret = g_key_file_load_from_file(key_file, key_file_loc,
> +					G_KEY_FILE_NONE, &error);
> +	free(key_file_loc);
> +
> +	if (!ret) {
> +		g_error_free(error);
> +		g_key_file_free(key_file);
> +		return false;
> +	}
> +
> +	g_clear_error(&error);
> +
> +	ping_hostname =
> +		g_key_file_get_string(key_file, "DUT",
> +				      "PingHostName", &error);
> +
> +	g_clear_error(&error);
> +	g_key_file_free(key_file);
> +
> +	if (!ping_hostname)
> +		return false;
> +
> +	if (ping_host_add(pingobj, ping_hostname)) {
> +		fprintf(stderr,
> +			"abort on ping: Cannot use hostname from config file\n");
> +		return false;
> +	}
> +
> +	return true;
> +}
> +
> +static bool load_ping_config_from_env(void)
> +{
> +	const char *ping_hostname;
> +
> +	ping_hostname = getenv("IGT_PING_HOSTNAME");
> +	if (!ping_hostname)
> +		return false;
> +
> +	if (ping_host_add(pingobj, ping_hostname)) {
> +		fprintf(stderr,
> +			"abort on ping: Cannot use hostname from environment\n");
> +		return false;
> +	}
> +
> +	return true;
> +}
> +
> +#endif
> +
> +static void ping_config(void)
> +{
> +#if HAVE_OPING
> +	double timeout = 20.0; /* Fair dice roll */
> +
> +	if (pingobj)
> +		return;
> +
> +	pingobj = ping_construct();
> +
> +	/* Try env first, then config file */
> +	if (!load_ping_config_from_env() && !load_ping_config_from_file()) {
> +		fprintf(stderr,
> +			"abort on ping: No host to ping configured\n");
> +		ping_destroy(pingobj);
> +		pingobj = NULL;
> +		return;
> +	}
> +
> +	ping_setopt(pingobj, PING_OPT_TIMEOUT, &timeout);
> +#endif
> +}
> +
> +static char *handle_ping(void)
> +{
> +#if HAVE_OPING
> +	if (pingobj) {
> +		pingobj_iter_t *iter;
> +
> +		ping_send(pingobj);

I think we should try to ping a few times, in case our ping (or the reply
got lost). Networks aren't 100% reliable, even when they work. Throwing
out 3 pings or so should be good enough, maybe try that twice.

As long as we die quicker than jenkins declares us dead we should be good
enough.

btw, could we also somehow check the ssh connection itself that we're
running under? I think that would more directly make sure that Jenkins
still knows we're doing well ...
-Daniel

> +
> +		for (iter = ping_iterator_get(pingobj);
> +		     iter != NULL;
> +		     iter = ping_iterator_next(iter)) {
> +			double latency;
> +			size_t len = sizeof(latency);
> +
> +			ping_iterator_get_info(iter,
> +					       PING_INFO_LATENCY,
> +					       &latency,
> +					       &len);
> +			if (latency < 0.0) {
> +				char *reason;
> +
> +				asprintf(&reason,
> +					 "Ping host did not respond to ping, network down");
> +				return reason;
> +			}
> +		}
> +	}
> +#endif
> +
> +	return NULL;
> +}
> +
>  static char *handle_lockdep(void)
>  {
>  	const char *header = "Lockdep not active\n\n/proc/lockdep_stats contents:\n";
> @@ -197,6 +328,7 @@ static const struct {
>  } abort_handlers[] = {
>  	{ ABORT_LOCKDEP, handle_lockdep },
>  	{ ABORT_TAINT, handle_taint },
> +	{ ABORT_PING, handle_ping },
>  	{ 0, 0 },
>  };
>  
> @@ -1288,6 +1420,9 @@ bool execute(struct execute_state *state,
>  
>  	init_watchdogs(settings);
>  
> +	if (settings->abort_mask & ABORT_PING)
> +		ping_config();
> +
>  	if (!uname(&unamebuf)) {
>  		dprintf(unamefd, "%s %s %s %s %s\n",
>  			unamebuf.sysname,
> diff --git a/runner/meson.build b/runner/meson.build
> index b658f1d2..a54aaab4 100644
> --- a/runner/meson.build
> +++ b/runner/meson.build
> @@ -1,4 +1,13 @@
>  jsonc = dependency('json-c', required: _runner_required)
> +runner_deps = [jsonc, glib]
> +have_oping = []
> +if with_oping != 'false'
> +	oping = dependency('liboping', required: with_oping == 'true')
> +	if oping.found()
> +		runner_deps += oping
> +		have_oping = '-DHAVE_OPING=1'
> +	endif
> +endif
>  
>  runnerlib_sources = [ 'settings.c',
>  		      'job_list.c',
> @@ -21,7 +30,8 @@ if _build_runner and _build_tests and jsonc.found()
>  
>  	runnerlib = static_library('igt_runner', runnerlib_sources,
>  				   include_directories : inc,
> -				   dependencies : [jsonc, glib])
> +				   c_args : have_oping,
> +				   dependencies : runner_deps)
>  
>  	runner = executable('igt_runner', runner_sources,
>  			    link_with : runnerlib,
> diff --git a/runner/settings.c b/runner/settings.c
> index ad38ae8d..b57d1a6a 100644
> --- a/runner/settings.c
> +++ b/runner/settings.c
> @@ -48,6 +48,7 @@ static struct {
>  } abort_conditions[] = {
>  	{ ABORT_TAINT, "taint" },
>  	{ ABORT_LOCKDEP, "lockdep" },
> +	{ ABORT_PING, "ping" },
>  	{ ABORT_ALL, "all" },
>  	{ 0, 0 },
>  };
> @@ -136,6 +137,9 @@ static const char *usage_str =
>  	"                        Possible conditions:\n"
>  	"                         lockdep - abort when kernel lockdep has been angered.\n"
>  	"                         taint   - abort when kernel becomes fatally tainted.\n"
> +	"                         ping    - abort when a host configured in .igtrc or\n"
> +	"                                   environment variable IGT_PING_HOSTNAME does\n"
> +	"                                   not respond to ping.\n"
>  	"                         all     - abort for all of the above.\n"
>  	"  -s, --sync            Sync results to disk after every test\n"
>  	"  -l {quiet,verbose,dummy}, --log-level {quiet,verbose,dummy}\n"
> diff --git a/runner/settings.h b/runner/settings.h
> index 0a1ee08d..2b6e65d0 100644
> --- a/runner/settings.h
> +++ b/runner/settings.h
> @@ -15,9 +15,10 @@ enum {
>  
>  #define ABORT_TAINT   (1 << 0)
>  #define ABORT_LOCKDEP (1 << 1)
> -#define ABORT_ALL     (ABORT_TAINT | ABORT_LOCKDEP)
> +#define ABORT_PING    (1 << 2)
> +#define ABORT_ALL     (ABORT_TAINT | ABORT_LOCKDEP | ABORT_PING)
>  
> -_Static_assert(ABORT_ALL == (ABORT_TAINT | ABORT_LOCKDEP), "ABORT_ALL must be all conditions bitwise or'd");
> +_Static_assert(ABORT_ALL == (ABORT_TAINT | ABORT_LOCKDEP | ABORT_PING), "ABORT_ALL must be all conditions bitwise or'd");
>  
>  struct regex_list {
>  	char **regex_strings;
> -- 
> 2.19.1
> 
> _______________________________________________
> igt-dev mailing list
> igt-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/igt-dev

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch


More information about the igt-dev mailing list