[igt-dev] [i-g-t, v3, 1/2] runner: Add support for aborting on network failure

Petri Latvala petri.latvala at intel.com
Mon Sep 9 11:27:50 UTC 2019


On Mon, Sep 09, 2019 at 01:53:44PM +0300, Arkadiusz Hiler wrote:
> On Mon, May 20, 2019 at 01:16:22PM +0300, Petri Latvala wrote:
> > If the network goes down while testing, CI tends to interpret that as
> > the device being down, cutting its power after a while. This causes an
> > incomplete to an innocent test, increasing noise in the results.
> > 
> > A new flag to --abort-on-monitored-error, "ping", uses liboping to
> > ping a host configured in .igtrc with one ping after each test
> > execution and aborts the run if there is no reply in a hardcoded
> > amount of time.
> > 
> > v2:
> >  - Use a higher timeout
> >  - Allow hostname configuration from environment
> > v3:
> >  - Use runner_c_args for holding c args for runner
> >  - Handle runner's meson options in runner/meson.build
> >  - Instead of one ping with 20 second timeout, ping with 1 second timeout
> >    for a duration of 20 seconds
> > 
> > Signed-off-by: Petri Latvala <petri.latvala at intel.com>
> > Cc: Arkadiusz Hiler <arkadiusz.hiler at intel.com>
> > Cc: Martin Peres <martin.peres at linux.intel.com>
> > Cc: Tomi Sarvela <tomi.p.sarvela at intel.com>
> > Cc: Daniel Vetter <daniel at ffwll.ch>
> > ---
> >  meson_options.txt  |   6 ++
> >  runner/executor.c  | 150 +++++++++++++++++++++++++++++++++++++++++++++
> >  runner/meson.build |  14 ++++-
> >  runner/settings.c  |   4 ++
> >  runner/settings.h  |   5 +-
> >  5 files changed, 176 insertions(+), 3 deletions(-)
> > 
> > diff --git a/meson_options.txt b/meson_options.txt
> > index 888efe56..935f883e 100644
> > --- a/meson_options.txt
> > +++ b/meson_options.txt
> > @@ -58,6 +58,12 @@ option('build_runner',
> >         choices : ['auto', 'true', 'false'],
> >         description : 'Build test runner')
> >  
> > +option('with_oping',
> > +       type : 'combo',
> > +       value : 'auto',
> > +       choices : ['auto', 'true', 'false'],
> > +       description : 'Build igt_runner with liboping')
> > +
> >  option('use_rpath',
> >         type : 'boolean',
> >         value : false,
> > diff --git a/runner/executor.c b/runner/executor.c
> > index 7e5fbe8f..c07e53fa 100644
> > --- a/runner/executor.c
> > +++ b/runner/executor.c
> > @@ -1,6 +1,10 @@
> >  #include <errno.h>
> >  #include <fcntl.h>
> > +#include <glib.h>
> >  #include <linux/watchdog.h>
> > +#if HAVE_OPING
> > +#include <oping.h>
> > +#endif
> >  #include <signal.h>
> >  #include <stdio.h>
> >  #include <stdlib.h>
> > @@ -16,6 +20,7 @@
> >  #include <time.h>
> >  #include <unistd.h>
> >  
> > +#include "igt_aux.h"
> >  #include "igt_core.h"
> >  #include "executor.h"
> >  #include "output_strings.h"
> > @@ -108,6 +113,147 @@ static void ping_watchdogs(void)
> >  	}
> >  }
> >  
> > +#if HAVE_OPING
> > +static pingobj_t *pingobj = NULL;
> > +
> > +static bool load_ping_config_from_file(void)
> > +{
> > +	char *key_file_env = NULL;
> > +	char *key_file_loc = NULL;
> > +	GError *error = NULL;
> > +	GKeyFile *key_file = NULL;
> > +	const char *ping_hostname;
> > +	int ret;
> > +
> > +	/* Determine igt config path */
> > +	key_file_env = getenv("IGT_CONFIG_PATH");
> > +	if (key_file_env) {
> > +		key_file_loc = strdup(key_file_env);
> > +	} else {
> > +		asprintf(&key_file_loc, "%s/.igtrc", g_get_home_dir());
> > +	}
> > +
> > +	/* Load igt config file */
> > +	key_file = g_key_file_new();
> > +	ret = g_key_file_load_from_file(key_file, key_file_loc,
> > +					G_KEY_FILE_NONE, &error);
> > +	free(key_file_loc);
> > +
> > +	if (!ret) {
> > +		g_error_free(error);
> > +		g_key_file_free(key_file);
> > +		return false;
> > +	}
> > +
> > +	g_clear_error(&error);
> 
> put that in
> 
> GKeyFile *igt_open_igtrc();
> 
> and use it in igt_core, so we have only one implementation
> 
> > +
> > +	ping_hostname =
> > +		g_key_file_get_string(key_file, "DUT",
> > +				      "PingHostName", &error);
> > +
> > +	g_clear_error(&error);
> > +	g_key_file_free(key_file);
> > +
> > +	if (!ping_hostname)
> > +		return false;
> > +
> > +	if (ping_host_add(pingobj, ping_hostname)) {
> > +		fprintf(stderr,
> > +			"abort on ping: Cannot use hostname from config file\n");
> > +		return false;
> > +	}
> > +
> > +	return true;
> > +}
> > +
> > +static bool load_ping_config_from_env(void)
> > +{
> > +	const char *ping_hostname;
> > +
> > +	ping_hostname = getenv("IGT_PING_HOSTNAME");
> > +	if (!ping_hostname)
> > +		return false;
> > +
> > +	if (ping_host_add(pingobj, ping_hostname)) {
> > +		fprintf(stderr,
> > +			"abort on ping: Cannot use hostname from environment\n");
> > +		return false;
> > +	}
> > +
> > +	return true;
> > +}
> > +
> > +static bool can_ping(void)
> > +{
> > +	/*
> > +	 * On some hosts, getting network back up after suspend takes
> > +	 * upwards of 10 seconds. 20 seconds should be enough to see
> > +	 * if network comes back at all, and hopefully not too long to
> > +	 * make external monitoring freak out.
> > +	 */
> > +	igt_until_timeout(20) {
> 
> make 20 a #define
> 
> > +		pingobj_iter_t *iter;
> > +
> > +		ping_send(pingobj);
> > +
> > +		for (iter = ping_iterator_get(pingobj);
> > +		     iter != NULL;
> > +		     iter = ping_iterator_next(iter)) {
> > +			double latency;
> > +			size_t len = sizeof(latency);
> > +
> > +			ping_iterator_get_info(iter,
> > +					       PING_INFO_LATENCY,
> > +					       &latency,
> > +					       &len);
> > +			if (latency >= 0.0)
> > +				return true;
> > +		}
> > +	}
> > +
> > +	return false;
> > +}
> > +
> > +#endif
> > +
> > +static void ping_config(void)
> > +{
> > +#if HAVE_OPING
> > +	double timeout = 1.0;
> 
> single_attempt_timeout
> 
> > +
> > +	if (pingobj)
> > +		return;
> > +
> > +	pingobj = ping_construct();
> > +
> > +	/* Try env first, then config file */
> > +	if (!load_ping_config_from_env() && !load_ping_config_from_file()) {
> > +		fprintf(stderr,
> > +			"abort on ping: No host to ping configured\n");
> > +		ping_destroy(pingobj);
> > +		pingobj = NULL;
> > +		return;
> > +	}
> > +
> > +	ping_setopt(pingobj, PING_OPT_TIMEOUT, &timeout);
> > +#endif
> > +}
> > +
> > +static char *handle_ping(void)
> > +{
> > +#if HAVE_OPING
> > +	if (pingobj && !can_ping()) {
> > +		char *reason;
> > +
> > +		asprintf(&reason,
> > +			 "Ping host did not respond to ping, network down");
> > +		return reason;
> > +	}
> > +#endif
> > +
> > +	return NULL;
> > +}
> > +
> >  static char *handle_lockdep(void)
> >  {
> >  	const char *header = "Lockdep not active\n\n/proc/lockdep_stats contents:\n";
> > @@ -197,6 +343,7 @@ static const struct {
> >  } abort_handlers[] = {
> >  	{ ABORT_LOCKDEP, handle_lockdep },
> >  	{ ABORT_TAINT, handle_taint },
> > +	{ ABORT_PING, handle_ping },
> >  	{ 0, 0 },
> >  };
> 
> So the handlers are used need_to_abort():
>  * when we start execution
>  * after each tests
> 
> What worries me here are the suspend tests. You have mentioned that
> getting network up can take up to 17s, so potentially we are adding
> 17s * numer_of_executed_suspend_tests to the total execution time.
> 
> Have you checked the actual impact?

No, let's take a look at that after I send v4.

Even with that extra overhead, an argument can be made that getting
incompletes without clear indicators why they happen is worse.


> Also this does not cover any network issues that happen when executing
> longer tests - we still can get killed by any external watchdogs.

Yeah, and we don't get easily machine-parseable timing data on how
long those running tests were fine for. Note to self: Make runner dump
periodical still-alive marks to journal. Unrelated to this patch
though...

Needs to be said that this patch won't make much of a difference if
you only look at the passrates or amount of incompletes. What this
does is allowing us to close that one silly semi-catchall cibuglog
filter for incompletes without good data, and thus forcing proper
issue binning.

> 
> What is the maximum ping timeout on those external watchdogs? Maybe we
> should get strict on tests' time budget for BAT/shards first...

Don't know actually. Tomi?


-- 
Petri Latvala


More information about the igt-dev mailing list