[igt-dev] [i-g-t, v3, 1/2] runner: Add support for aborting on network failure

Arkadiusz Hiler arkadiusz.hiler at intel.com
Mon Sep 9 10:53:44 UTC 2019


On Mon, May 20, 2019 at 01:16:22PM +0300, Petri Latvala wrote:
> If the network goes down while testing, CI tends to interpret that as
> the device being down, cutting its power after a while. This causes an
> incomplete to an innocent test, increasing noise in the results.
> 
> A new flag to --abort-on-monitored-error, "ping", uses liboping to
> ping a host configured in .igtrc with one ping after each test
> execution and aborts the run if there is no reply in a hardcoded
> amount of time.
> 
> v2:
>  - Use a higher timeout
>  - Allow hostname configuration from environment
> v3:
>  - Use runner_c_args for holding c args for runner
>  - Handle runner's meson options in runner/meson.build
>  - Instead of one ping with 20 second timeout, ping with 1 second timeout
>    for a duration of 20 seconds
> 
> Signed-off-by: Petri Latvala <petri.latvala at intel.com>
> Cc: Arkadiusz Hiler <arkadiusz.hiler at intel.com>
> Cc: Martin Peres <martin.peres at linux.intel.com>
> Cc: Tomi Sarvela <tomi.p.sarvela at intel.com>
> Cc: Daniel Vetter <daniel at ffwll.ch>
> ---
>  meson_options.txt  |   6 ++
>  runner/executor.c  | 150 +++++++++++++++++++++++++++++++++++++++++++++
>  runner/meson.build |  14 ++++-
>  runner/settings.c  |   4 ++
>  runner/settings.h  |   5 +-
>  5 files changed, 176 insertions(+), 3 deletions(-)
> 
> diff --git a/meson_options.txt b/meson_options.txt
> index 888efe56..935f883e 100644
> --- a/meson_options.txt
> +++ b/meson_options.txt
> @@ -58,6 +58,12 @@ option('build_runner',
>         choices : ['auto', 'true', 'false'],
>         description : 'Build test runner')
>  
> +option('with_oping',
> +       type : 'combo',
> +       value : 'auto',
> +       choices : ['auto', 'true', 'false'],
> +       description : 'Build igt_runner with liboping')
> +
>  option('use_rpath',
>         type : 'boolean',
>         value : false,
> diff --git a/runner/executor.c b/runner/executor.c
> index 7e5fbe8f..c07e53fa 100644
> --- a/runner/executor.c
> +++ b/runner/executor.c
> @@ -1,6 +1,10 @@
>  #include <errno.h>
>  #include <fcntl.h>
> +#include <glib.h>
>  #include <linux/watchdog.h>
> +#if HAVE_OPING
> +#include <oping.h>
> +#endif
>  #include <signal.h>
>  #include <stdio.h>
>  #include <stdlib.h>
> @@ -16,6 +20,7 @@
>  #include <time.h>
>  #include <unistd.h>
>  
> +#include "igt_aux.h"
>  #include "igt_core.h"
>  #include "executor.h"
>  #include "output_strings.h"
> @@ -108,6 +113,147 @@ static void ping_watchdogs(void)
>  	}
>  }
>  
> +#if HAVE_OPING
> +static pingobj_t *pingobj = NULL;
> +
> +static bool load_ping_config_from_file(void)
> +{
> +	char *key_file_env = NULL;
> +	char *key_file_loc = NULL;
> +	GError *error = NULL;
> +	GKeyFile *key_file = NULL;
> +	const char *ping_hostname;
> +	int ret;
> +
> +	/* Determine igt config path */
> +	key_file_env = getenv("IGT_CONFIG_PATH");
> +	if (key_file_env) {
> +		key_file_loc = strdup(key_file_env);
> +	} else {
> +		asprintf(&key_file_loc, "%s/.igtrc", g_get_home_dir());
> +	}
> +
> +	/* Load igt config file */
> +	key_file = g_key_file_new();
> +	ret = g_key_file_load_from_file(key_file, key_file_loc,
> +					G_KEY_FILE_NONE, &error);
> +	free(key_file_loc);
> +
> +	if (!ret) {
> +		g_error_free(error);
> +		g_key_file_free(key_file);
> +		return false;
> +	}
> +
> +	g_clear_error(&error);

put that in

GKeyFile *igt_open_igtrc();

and use it in igt_core, so we have only one implementation

> +
> +	ping_hostname =
> +		g_key_file_get_string(key_file, "DUT",
> +				      "PingHostName", &error);
> +
> +	g_clear_error(&error);
> +	g_key_file_free(key_file);
> +
> +	if (!ping_hostname)
> +		return false;
> +
> +	if (ping_host_add(pingobj, ping_hostname)) {
> +		fprintf(stderr,
> +			"abort on ping: Cannot use hostname from config file\n");
> +		return false;
> +	}
> +
> +	return true;
> +}
> +
> +static bool load_ping_config_from_env(void)
> +{
> +	const char *ping_hostname;
> +
> +	ping_hostname = getenv("IGT_PING_HOSTNAME");
> +	if (!ping_hostname)
> +		return false;
> +
> +	if (ping_host_add(pingobj, ping_hostname)) {
> +		fprintf(stderr,
> +			"abort on ping: Cannot use hostname from environment\n");
> +		return false;
> +	}
> +
> +	return true;
> +}
> +
> +static bool can_ping(void)
> +{
> +	/*
> +	 * On some hosts, getting network back up after suspend takes
> +	 * upwards of 10 seconds. 20 seconds should be enough to see
> +	 * if network comes back at all, and hopefully not too long to
> +	 * make external monitoring freak out.
> +	 */
> +	igt_until_timeout(20) {

make 20 a #define

> +		pingobj_iter_t *iter;
> +
> +		ping_send(pingobj);
> +
> +		for (iter = ping_iterator_get(pingobj);
> +		     iter != NULL;
> +		     iter = ping_iterator_next(iter)) {
> +			double latency;
> +			size_t len = sizeof(latency);
> +
> +			ping_iterator_get_info(iter,
> +					       PING_INFO_LATENCY,
> +					       &latency,
> +					       &len);
> +			if (latency >= 0.0)
> +				return true;
> +		}
> +	}
> +
> +	return false;
> +}
> +
> +#endif
> +
> +static void ping_config(void)
> +{
> +#if HAVE_OPING
> +	double timeout = 1.0;

single_attempt_timeout

> +
> +	if (pingobj)
> +		return;
> +
> +	pingobj = ping_construct();
> +
> +	/* Try env first, then config file */
> +	if (!load_ping_config_from_env() && !load_ping_config_from_file()) {
> +		fprintf(stderr,
> +			"abort on ping: No host to ping configured\n");
> +		ping_destroy(pingobj);
> +		pingobj = NULL;
> +		return;
> +	}
> +
> +	ping_setopt(pingobj, PING_OPT_TIMEOUT, &timeout);
> +#endif
> +}
> +
> +static char *handle_ping(void)
> +{
> +#if HAVE_OPING
> +	if (pingobj && !can_ping()) {
> +		char *reason;
> +
> +		asprintf(&reason,
> +			 "Ping host did not respond to ping, network down");
> +		return reason;
> +	}
> +#endif
> +
> +	return NULL;
> +}
> +
>  static char *handle_lockdep(void)
>  {
>  	const char *header = "Lockdep not active\n\n/proc/lockdep_stats contents:\n";
> @@ -197,6 +343,7 @@ static const struct {
>  } abort_handlers[] = {
>  	{ ABORT_LOCKDEP, handle_lockdep },
>  	{ ABORT_TAINT, handle_taint },
> +	{ ABORT_PING, handle_ping },
>  	{ 0, 0 },
>  };

So the handlers are used need_to_abort():
 * when we start execution
 * after each tests

What worries me here are the suspend tests. You have mentioned that
getting network up can take up to 17s, so potentially we are adding
17s * numer_of_executed_suspend_tests to the total execution time.

Have you checked the actual impact?

Also this does not cover any network issues that happen when executing
longer tests - we still can get killed by any external watchdogs.

What is the maximum ping timeout on those external watchdogs? Maybe we
should get strict on tests' time budget for BAT/shards first...

- Arek

>  
> @@ -1288,6 +1435,9 @@ bool execute(struct execute_state *state,
>  
>  	init_watchdogs(settings);
>  
> +	if (settings->abort_mask & ABORT_PING)
> +		ping_config();
> +
>  	if (!uname(&unamebuf)) {
>  		dprintf(unamefd, "%s %s %s %s %s\n",
>  			unamebuf.sysname,
> diff --git a/runner/meson.build b/runner/meson.build
> index b658f1d2..6d60c530 100644
> --- a/runner/meson.build
> +++ b/runner/meson.build
> @@ -1,4 +1,15 @@
>  jsonc = dependency('json-c', required: _runner_required)
> +runner_deps = [jsonc, glib]
> +runner_c_args = []
> +
> +with_oping = get_option('with_oping')
> +if with_oping != 'false'
> +	oping = dependency('liboping', required: with_oping == 'true')
> +	if oping.found()
> +		runner_deps += oping
> +		runner_c_args += '-DHAVE_OPING=1'
> +	endif
> +endif
>  
>  runnerlib_sources = [ 'settings.c',
>  		      'job_list.c',
> @@ -21,7 +32,8 @@ if _build_runner and _build_tests and jsonc.found()
>  
>  	runnerlib = static_library('igt_runner', runnerlib_sources,
>  				   include_directories : inc,
> -				   dependencies : [jsonc, glib])
> +				   c_args : runner_c_args,
> +				   dependencies : runner_deps)
>  
>  	runner = executable('igt_runner', runner_sources,
>  			    link_with : runnerlib,
> diff --git a/runner/settings.c b/runner/settings.c
> index ad38ae8d..b57d1a6a 100644
> --- a/runner/settings.c
> +++ b/runner/settings.c
> @@ -48,6 +48,7 @@ static struct {
>  } abort_conditions[] = {
>  	{ ABORT_TAINT, "taint" },
>  	{ ABORT_LOCKDEP, "lockdep" },
> +	{ ABORT_PING, "ping" },
>  	{ ABORT_ALL, "all" },
>  	{ 0, 0 },
>  };
> @@ -136,6 +137,9 @@ static const char *usage_str =
>  	"                        Possible conditions:\n"
>  	"                         lockdep - abort when kernel lockdep has been angered.\n"
>  	"                         taint   - abort when kernel becomes fatally tainted.\n"
> +	"                         ping    - abort when a host configured in .igtrc or\n"
> +	"                                   environment variable IGT_PING_HOSTNAME does\n"
> +	"                                   not respond to ping.\n"
>  	"                         all     - abort for all of the above.\n"
>  	"  -s, --sync            Sync results to disk after every test\n"
>  	"  -l {quiet,verbose,dummy}, --log-level {quiet,verbose,dummy}\n"
> diff --git a/runner/settings.h b/runner/settings.h
> index 0a1ee08d..2b6e65d0 100644
> --- a/runner/settings.h
> +++ b/runner/settings.h
> @@ -15,9 +15,10 @@ enum {
>  
>  #define ABORT_TAINT   (1 << 0)
>  #define ABORT_LOCKDEP (1 << 1)
> -#define ABORT_ALL     (ABORT_TAINT | ABORT_LOCKDEP)
> +#define ABORT_PING    (1 << 2)
> +#define ABORT_ALL     (ABORT_TAINT | ABORT_LOCKDEP | ABORT_PING)
>  
> -_Static_assert(ABORT_ALL == (ABORT_TAINT | ABORT_LOCKDEP), "ABORT_ALL must be all conditions bitwise or'd");
> +_Static_assert(ABORT_ALL == (ABORT_TAINT | ABORT_LOCKDEP | ABORT_PING), "ABORT_ALL must be all conditions bitwise or'd");
>  
>  struct regex_list {
>  	char **regex_strings;


More information about the igt-dev mailing list