[PATCH i-g-t v2] runner/executor: Abort if dmesg is flooded
Peter Senna Tschudin
peter.senna at linux.intel.com
Tue Jul 1 10:25:33 UTC 2025
On 6/30/2025 9:02 PM, Kamil Konieczny wrote:
> Current disk limit triggers once when it is exceeded during test
> monitoring. After that happens executor no longer checks if
> kernel is still printing plenty of messages.
>
> Create a way to abort test in such scenarios with the help of
> measureing kernel kmsg activity before first test is executed
> and then also check it twice, first time after exceeding disk
> limit and second time after each test ends.
>
> v2: fix error when kmsg open fails, fix reading proc (Kamil)
> changed calculation of max bps (Peter)
Tested-by: Peter Senna Tschudin <peter.senna at linux.intel.com>
>
> Cc: Ewelina Musial <ewelina.musial at intel.com>
> Cc: Karol Krol <karol.krol at intel.com>
> Cc: Petri Latvala <adrinael at adrinael.net>
> Cc: Peter Senna Tschudin <peter.senna at linux.intel.com>
> Cc: Ryszard Knop <ryszard.knop at intel.com>
> Cc: Vitaly Prosyak <vitaly.prosyak at amd.com>
> Cc: "Zbigniew Kempczyński" <zbigniew.kempczynski at intel.com>
> Signed-off-by: Kamil Konieczny <kamil.konieczny at linux.intel.com>
> ---
> runner/executor.c | 206 +++++++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 204 insertions(+), 2 deletions(-)
>
> diff --git a/runner/executor.c b/runner/executor.c
> index 13180a0a4..c50e2ca7e 100644
> --- a/runner/executor.c
> +++ b/runner/executor.c
> @@ -637,6 +637,11 @@ const char *get_out_filename(int fid)
> return "output-filename-index-error";
> }
>
> +static int open_kmsg_rdonly(void)
> +{
> + return open("/dev/kmsg", O_RDONLY | O_CLOEXEC | O_NONBLOCK);
> +}
> +
> /* Returns the number of bytes written to disk, or a negative number on error */
> static long dump_dmesg(int kmsgfd, int outfd, ssize_t size)
> {
> @@ -734,6 +739,90 @@ static long dump_dmesg(int kmsgfd, int outfd, ssize_t size)
> }
> }
>
> +/*
> + * Measure how many bytes appears in dmesg.
> + * This is the same as above dump_dmesg() but with reading only.
> + */
> +static unsigned long long read_dmesg_to_null(int kmsgfd, unsigned long long size)
> +{
> + bool underflow_once = false;
> + int comparefd;
> + char buf[2048];
> + unsigned int flags;
> + unsigned long long readed = 0;
> + unsigned long long seq, cmpseq, usec;
> + char cont;
> + ssize_t r;
> +
> + if (kmsgfd < 0)
> + return 0;
> +
> + comparefd = open_kmsg_rdonly();
> + if (comparefd < 0) {
> + errf("Error opening another fd for /dev/kmsg\n");
> + return 0;
> + }
> +
> + lseek(comparefd, 0, SEEK_END);
> +
> + while (1) {
> + if (comparefd >= 0) {
> + r = read(comparefd, buf, sizeof(buf) - 1);
> + if (r < 0) {
> + if (errno != EAGAIN && errno != EPIPE) {
> + errf("Warning: Error reading kmsg comparison record: %m\n");
> + close(comparefd);
> + return readed;
> + }
> + } else {
> + buf[r] = '\0';
> + if (sscanf(buf, "%u,%llu,%llu,%c;",
> + &flags, &cmpseq, &usec, &cont) == 4) {
> + close(comparefd);
> + comparefd = -1;
> + }
> + }
> + }
> +
> + r = read(kmsgfd, buf, sizeof(buf));
> + if (r < 0) {
> + if (errno == EPIPE) {
> + if (!underflow_once) {
> + errf("Warning: kernel log ringbuffer underflow, some records lost.\n");
> + underflow_once = true;
> + }
> + continue;
> + } else if (errno == EINVAL) {
> + errf("Warning: Buffer too small for kernel log record, record lost.\n");
> + continue;
> + } else if (errno != EAGAIN) {
> + errf("Error reading from kmsg: %m\n");
> + return 0;
> + }
> +
> + /* EAGAIN, so we're done reading */
> + close(comparefd);
> +
> + return readed;
> + }
> +
> + readed += r;
> +
> + if (comparefd < 0 && sscanf(buf, "%u,%llu,%llu,%c;",
> + &flags, &seq, &usec, &cont) == 4) {
> + if (seq >= cmpseq)
> + return readed;
> + }
> +
> + if (size && readed >= size) {
> + if (comparefd >= 0)
> + close(comparefd);
> +
> + return readed;
> + }
> + } /* while(1) */
> +}
> +
> static bool kill_child(int sig, pid_t child)
> {
> /*
> @@ -943,6 +1032,86 @@ static size_t calc_last_dmesg_chunk(size_t limit, size_t disk_usage)
> return dt != 0 ? dt : -1;
> }
>
> +/*
> + * Returns:
> + * =>0.0 - Success, measured kmsg activity in bytes/second
> + * -1.0 - Failure
> + */
> +static double measure_dmesg_bytes_per_sec(void)
> +{
> + struct timespec time_beg, time_now, nsec_sleep;
> + unsigned long long readed, rnow;
> + double time;
> + int kmsgfd, cnt, zero;
> +
> + if ((kmsgfd = open_kmsg_rdonly()) < 0) {
> + errf("Warning: Cannot open /dev/kmsg\n");
> +
> + return -1.0;
> + }
> +
> + lseek(kmsgfd, 0, SEEK_END);
> + nsec_sleep.tv_sec = 0;
> + nsec_sleep.tv_nsec = 1000ULL * 1000ULL; /* 10^6 nanoseconds = 10^-3 sec */
> + runner_gettime(&time_beg);
> + for (cnt = 0, zero = 0, time = 0.0, readed = 0.0; cnt < 11 && time < 0.01; ++cnt) {
> + if (zero && !nanosleep(&nsec_sleep, NULL))
> + break;
> +
> + rnow = read_dmesg_to_null(kmsgfd, 128 * 1024); /* 64KB max */
> + readed += rnow;
> + runner_gettime(&time_now);
> + time = igt_time_elapsed(&time_beg, &time_now);
> + if (time <= 0.0) {
> + errf("Warning: Time underflow\n");
> +
> + return -1.0;
> + }
> +
> + if (rnow == 0.0)
> + ++zero;
> + else
> + zero = 0;
> +
> + if (zero > 2)
> + return 0.0;
> + }
> +
> + runner_gettime(&time_now);
> + time = igt_time_elapsed(&time_beg, &time_now);
> + if (time <= 0.0) {
> + errf("Warning: Time underflow\n");
> +
> + return -1.0;
> + }
> +
> + return (double)readed / time;
> +}
> +
> +/* Returns: true if ratio lower than maxratio */
> +static bool check_dmesg_ratio(double maxratio, int log_level, bool sync)
> +{
> + double new_dmesg_bps = measure_dmesg_bytes_per_sec();
> +
> + if (new_dmesg_bps > maxratio) {
> + char msg[1024];
> +
> + snprintf(msg, sizeof(msg), "Dmesg rate overflow %0.3fKB/s > %0.3fKB/s, stopping.\n",
> + new_dmesg_bps / 1024, maxratio / 1024);
> + if (log_level >= LOG_LEVEL_NORMAL) {
> + outf("%s", msg);
> + if (sync)
> + fflush(stdout);
> + }
> +
> + errf("%s", msg);
> +
> + return false;
> + }
> +
> + return true;
> +}
> +
> /*
> * Returns:
> * =0 - Success
> @@ -955,6 +1124,7 @@ static int monitor_output(pid_t child,
> int *outputs,
> double *time_spent,
> struct settings *settings,
> + double max_dmesg_ratio,
> char **abortreason,
> bool *abort_already_written)
> {
> @@ -1596,6 +1766,11 @@ static int monitor_output(pid_t child,
> }
> }
>
> + if (!aborting && !check_dmesg_ratio(max_dmesg_ratio, settings->log_level, settings->sync)) {
> + asprintf(abortreason, "Dmesg ratio exceeded.");
> + aborting = true;
> + }
> +
> free(buf);
> free(outbuf);
> close(outfd);
> @@ -1760,6 +1935,7 @@ static int execute_next_entry(struct execute_state *state,
> size_t total,
> double *time_spent,
> struct settings *settings,
> + double max_dmesg_ratio,
> struct job_list_entry *entry,
> int testdirfd, int resdirfd,
> int sigfd, sigset_t *sigmask,
> @@ -1808,7 +1984,7 @@ static int execute_next_entry(struct execute_state *state,
> goto out_pipe;
> }
>
> - if ((kmsgfd = open("/dev/kmsg", O_RDONLY | O_CLOEXEC | O_NONBLOCK)) < 0) {
> + if ((kmsgfd = open_kmsg_rdonly()) < 0) {
> errf("Warning: Cannot open /dev/kmsg\n");
> } else {
> /* TODO: Checking of abort conditions in pre-execute dmesg */
> @@ -1879,7 +2055,7 @@ static int execute_next_entry(struct execute_state *state,
>
> result = monitor_output(child, outfd, errfd, socketfd,
> kmsgfd, sigfd,
> - outputs, time_spent, settings,
> + outputs, time_spent, settings, max_dmesg_ratio,
> abortreason, abort_already_written);
>
> out_kmsgfd:
> @@ -2412,6 +2588,8 @@ bool execute(struct execute_state *state,
> struct settings *settings,
> struct job_list *job_list)
> {
> + static double dmesg_bps = -1.0;
> + static double max_dmesg_bps = -1.0;
> int resdirfd, testdirfd, unamefd, timefd, sigfd;
> struct environment_variable *env_var;
> struct utsname unamebuf;
> @@ -2554,6 +2732,26 @@ bool execute(struct execute_state *state,
> }
> }
>
> + if (max_dmesg_bps < 0.0) {
> + double ncpu_bps = 4 * 1024 * max_t(size_t, sysconf(_SC_NPROCESSORS_ONLN), 4);
> + double set_bps = 0.0;
> +
> + dmesg_bps = measure_dmesg_bytes_per_sec();
> +
> + if (settings->disk_usage_limit > 0 && settings->per_test_timeout > 0)
> + set_bps = (double)settings->disk_usage_limit / (double)settings->per_test_timeout;
> +
> + outf("Dmesg KB/s ratio settings:%0.3f ncpu:%0.3f current:%0.3f\n",
> + set_bps / 1024, ncpu_bps / 1024, dmesg_bps / 1024);
> +
> + if (set_bps > 0.0)
> + max_dmesg_bps = set_bps;
> + else
> + max_dmesg_bps = dmesg_bps > ncpu_bps ? dmesg_bps : ncpu_bps;
> +
> + outf("Using max level dmesg ratio %0.3fKB/s\n", max_dmesg_bps / 1024);
> + }
> +
> for (; state->next < job_list->size;
> state->next++) {
> char *reason = NULL;
> @@ -2590,6 +2788,7 @@ bool execute(struct execute_state *state,
> job_list->size,
> &time_spent,
> settings,
> + max_dmesg_bps,
> &job_list->entries[state->next],
> testdirfd, resdirfd,
> sigfd, &sigmask,
> @@ -2645,6 +2844,9 @@ bool execute(struct execute_state *state,
> break;
> }
>
> + if (!check_dmesg_ratio(max_dmesg_bps, settings->log_level, settings->sync))
> + break;
> +
> if (result > 0) {
> double time_left = state->time_left;
>
More information about the igt-dev
mailing list