[igt-dev] [PATCH i-g-t v3 1/2] runner: Add support for aborting on network failure
Petri Latvala
petri.latvala at intel.com
Mon May 20 10:16:22 UTC 2019
If the network goes down while testing, CI tends to interpret that as
the device being down, cutting its power after a while. This causes an
incomplete to an innocent test, increasing noise in the results.
A new flag to --abort-on-monitored-error, "ping", uses liboping to
ping a host configured in .igtrc with one ping after each test
execution and aborts the run if there is no reply in a hardcoded
amount of time.
v2:
- Use a higher timeout
- Allow hostname configuration from environment
v3:
- Use runner_c_args for holding c args for runner
- Handle runner's meson options in runner/meson.build
- Instead of one ping with 20 second timeout, ping with 1 second timeout
for a duration of 20 seconds
Signed-off-by: Petri Latvala <petri.latvala at intel.com>
Cc: Arkadiusz Hiler <arkadiusz.hiler at intel.com>
Cc: Martin Peres <martin.peres at linux.intel.com>
Cc: Tomi Sarvela <tomi.p.sarvela at intel.com>
Cc: Daniel Vetter <daniel at ffwll.ch>
---
meson_options.txt | 6 ++
runner/executor.c | 150 +++++++++++++++++++++++++++++++++++++++++++++
runner/meson.build | 14 ++++-
runner/settings.c | 4 ++
runner/settings.h | 5 +-
5 files changed, 176 insertions(+), 3 deletions(-)
diff --git a/meson_options.txt b/meson_options.txt
index 888efe56..935f883e 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -58,6 +58,12 @@ option('build_runner',
choices : ['auto', 'true', 'false'],
description : 'Build test runner')
+option('with_oping',
+ type : 'combo',
+ value : 'auto',
+ choices : ['auto', 'true', 'false'],
+ description : 'Build igt_runner with liboping')
+
option('use_rpath',
type : 'boolean',
value : false,
diff --git a/runner/executor.c b/runner/executor.c
index 7e5fbe8f..c07e53fa 100644
--- a/runner/executor.c
+++ b/runner/executor.c
@@ -1,6 +1,10 @@
#include <errno.h>
#include <fcntl.h>
+#include <glib.h>
#include <linux/watchdog.h>
+#if HAVE_OPING
+#include <oping.h>
+#endif
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
@@ -16,6 +20,7 @@
#include <time.h>
#include <unistd.h>
+#include "igt_aux.h"
#include "igt_core.h"
#include "executor.h"
#include "output_strings.h"
@@ -108,6 +113,147 @@ static void ping_watchdogs(void)
}
}
+#if HAVE_OPING
+static pingobj_t *pingobj = NULL;
+
+static bool load_ping_config_from_file(void)
+{
+ char *key_file_env = NULL;
+ char *key_file_loc = NULL;
+ GError *error = NULL;
+ GKeyFile *key_file = NULL;
+ const char *ping_hostname;
+ int ret;
+
+ /* Determine igt config path */
+ key_file_env = getenv("IGT_CONFIG_PATH");
+ if (key_file_env) {
+ key_file_loc = strdup(key_file_env);
+ } else {
+ asprintf(&key_file_loc, "%s/.igtrc", g_get_home_dir());
+ }
+
+ /* Load igt config file */
+ key_file = g_key_file_new();
+ ret = g_key_file_load_from_file(key_file, key_file_loc,
+ G_KEY_FILE_NONE, &error);
+ free(key_file_loc);
+
+ if (!ret) {
+ g_error_free(error);
+ g_key_file_free(key_file);
+ return false;
+ }
+
+ g_clear_error(&error);
+
+ ping_hostname =
+ g_key_file_get_string(key_file, "DUT",
+ "PingHostName", &error);
+
+ g_clear_error(&error);
+ g_key_file_free(key_file);
+
+ if (!ping_hostname)
+ return false;
+
+ if (ping_host_add(pingobj, ping_hostname)) {
+ fprintf(stderr,
+ "abort on ping: Cannot use hostname from config file\n");
+ return false;
+ }
+
+ return true;
+}
+
+static bool load_ping_config_from_env(void)
+{
+ const char *ping_hostname;
+
+ ping_hostname = getenv("IGT_PING_HOSTNAME");
+ if (!ping_hostname)
+ return false;
+
+ if (ping_host_add(pingobj, ping_hostname)) {
+ fprintf(stderr,
+ "abort on ping: Cannot use hostname from environment\n");
+ return false;
+ }
+
+ return true;
+}
+
+static bool can_ping(void)
+{
+ /*
+ * On some hosts, getting network back up after suspend takes
+ * upwards of 10 seconds. 20 seconds should be enough to see
+ * if network comes back at all, and hopefully not too long to
+ * make external monitoring freak out.
+ */
+ igt_until_timeout(20) {
+ pingobj_iter_t *iter;
+
+ ping_send(pingobj);
+
+ for (iter = ping_iterator_get(pingobj);
+ iter != NULL;
+ iter = ping_iterator_next(iter)) {
+ double latency;
+ size_t len = sizeof(latency);
+
+ ping_iterator_get_info(iter,
+ PING_INFO_LATENCY,
+ &latency,
+ &len);
+ if (latency >= 0.0)
+ return true;
+ }
+ }
+
+ return false;
+}
+
+#endif
+
+static void ping_config(void)
+{
+#if HAVE_OPING
+ double timeout = 1.0;
+
+ if (pingobj)
+ return;
+
+ pingobj = ping_construct();
+
+ /* Try env first, then config file */
+ if (!load_ping_config_from_env() && !load_ping_config_from_file()) {
+ fprintf(stderr,
+ "abort on ping: No host to ping configured\n");
+ ping_destroy(pingobj);
+ pingobj = NULL;
+ return;
+ }
+
+ ping_setopt(pingobj, PING_OPT_TIMEOUT, &timeout);
+#endif
+}
+
+static char *handle_ping(void)
+{
+#if HAVE_OPING
+ if (pingobj && !can_ping()) {
+ char *reason;
+
+ asprintf(&reason,
+ "Ping host did not respond to ping, network down");
+ return reason;
+ }
+#endif
+
+ return NULL;
+}
+
static char *handle_lockdep(void)
{
const char *header = "Lockdep not active\n\n/proc/lockdep_stats contents:\n";
@@ -197,6 +343,7 @@ static const struct {
} abort_handlers[] = {
{ ABORT_LOCKDEP, handle_lockdep },
{ ABORT_TAINT, handle_taint },
+ { ABORT_PING, handle_ping },
{ 0, 0 },
};
@@ -1288,6 +1435,9 @@ bool execute(struct execute_state *state,
init_watchdogs(settings);
+ if (settings->abort_mask & ABORT_PING)
+ ping_config();
+
if (!uname(&unamebuf)) {
dprintf(unamefd, "%s %s %s %s %s\n",
unamebuf.sysname,
diff --git a/runner/meson.build b/runner/meson.build
index b658f1d2..6d60c530 100644
--- a/runner/meson.build
+++ b/runner/meson.build
@@ -1,4 +1,15 @@
jsonc = dependency('json-c', required: _runner_required)
+runner_deps = [jsonc, glib]
+runner_c_args = []
+
+with_oping = get_option('with_oping')
+if with_oping != 'false'
+ oping = dependency('liboping', required: with_oping == 'true')
+ if oping.found()
+ runner_deps += oping
+ runner_c_args += '-DHAVE_OPING=1'
+ endif
+endif
runnerlib_sources = [ 'settings.c',
'job_list.c',
@@ -21,7 +32,8 @@ if _build_runner and _build_tests and jsonc.found()
runnerlib = static_library('igt_runner', runnerlib_sources,
include_directories : inc,
- dependencies : [jsonc, glib])
+ c_args : runner_c_args,
+ dependencies : runner_deps)
runner = executable('igt_runner', runner_sources,
link_with : runnerlib,
diff --git a/runner/settings.c b/runner/settings.c
index ad38ae8d..b57d1a6a 100644
--- a/runner/settings.c
+++ b/runner/settings.c
@@ -48,6 +48,7 @@ static struct {
} abort_conditions[] = {
{ ABORT_TAINT, "taint" },
{ ABORT_LOCKDEP, "lockdep" },
+ { ABORT_PING, "ping" },
{ ABORT_ALL, "all" },
{ 0, 0 },
};
@@ -136,6 +137,9 @@ static const char *usage_str =
" Possible conditions:\n"
" lockdep - abort when kernel lockdep has been angered.\n"
" taint - abort when kernel becomes fatally tainted.\n"
+ " ping - abort when a host configured in .igtrc or\n"
+ " environment variable IGT_PING_HOSTNAME does\n"
+ " not respond to ping.\n"
" all - abort for all of the above.\n"
" -s, --sync Sync results to disk after every test\n"
" -l {quiet,verbose,dummy}, --log-level {quiet,verbose,dummy}\n"
diff --git a/runner/settings.h b/runner/settings.h
index 0a1ee08d..2b6e65d0 100644
--- a/runner/settings.h
+++ b/runner/settings.h
@@ -15,9 +15,10 @@ enum {
#define ABORT_TAINT (1 << 0)
#define ABORT_LOCKDEP (1 << 1)
-#define ABORT_ALL (ABORT_TAINT | ABORT_LOCKDEP)
+#define ABORT_PING (1 << 2)
+#define ABORT_ALL (ABORT_TAINT | ABORT_LOCKDEP | ABORT_PING)
-_Static_assert(ABORT_ALL == (ABORT_TAINT | ABORT_LOCKDEP), "ABORT_ALL must be all conditions bitwise or'd");
+_Static_assert(ABORT_ALL == (ABORT_TAINT | ABORT_LOCKDEP | ABORT_PING), "ABORT_ALL must be all conditions bitwise or'd");
struct regex_list {
char **regex_strings;
--
2.19.1
More information about the igt-dev
mailing list