[PATCH i-g-t v4] runner/executor: Abort if dmesg is flooded

Kamil Konieczny kamil.konieczny at linux.intel.com
Wed Aug 27 07:31:54 UTC 2025


Hi Vitaly,
On 2025-08-26 at 20:28:25 -0400, vitaly prosyak wrote:
> Reviewed-by Vitaly Prosyak <vitaly.prosyak at amd.com>
> 

Could you look into v5? I fixed a reading loop there.
Thank you in advance!

Regards,
Kamil

> On 2025-07-24 07:55, Peter Senna Tschudin wrote:
> >
> > On 7/23/2025 6:06 PM, Kamil Konieczny wrote:
> >> Current disk limit triggers once when it is exceeded during test
> >> monitoring. After that happens executor no longer checks if
> >> kernel is still printing plenty of messages.
> >>
> >> Create a way to abort test in such scenarios with the help of
> >> measureing kernel kmsg activity before first test is executed
> >> and then also check it after exceeding disk limit.
> >>
> >> Also print info about it after each test ends.
> >>
> >> v2: fix error when kmsg open fails, fix reading proc (Kamil)
> >>   changed calculation of max bps (Peter)
> >> v3: abort only when disk limit was actually exceeded,
> >>   print info instead of aborting after every test (Kamil)
> >> v4: added abort at main monitoring loop, also cleaning up
> >>   printing messages (Kamil)
> >>
> >> Cc: Ewelina Musial <ewelina.musial at intel.com>
> >> Cc: Karol Krol <karol.krol at intel.com>
> >> Cc: Petri Latvala <adrinael at adrinael.net>
> >> Cc: Peter Senna Tschudin <peter.senna at linux.intel.com>
> >> Cc: Ryszard Knop <ryszard.knop at intel.com>
> >> Cc: Vitaly Prosyak <vitaly.prosyak at amd.com>
> >> Cc: "Zbigniew Kempczyński" <zbigniew.kempczynski at intel.com>
> > Tested-by: Peter Senna Tschudin <peter.senna at linux.intel.com>>
> > Signed-off-by: Kamil Konieczny <kamil.konieczny at linux.intel.com>
> >> ---
> >>  runner/executor.c | 228 +++++++++++++++++++++++++++++++++++++++++++---
> >>  1 file changed, 214 insertions(+), 14 deletions(-)
> >>
> >> diff --git a/runner/executor.c b/runner/executor.c
> >> index 847abe481..db2c05b03 100644
> >> --- a/runner/executor.c
> >> +++ b/runner/executor.c
> >> @@ -637,6 +637,11 @@ const char *get_out_filename(int fid)
> >>  	return "output-filename-index-error";
> >>  }
> >>  
> >> +static int open_kmsg_rdonly(void)
> >> +{
> >> +	return open("/dev/kmsg", O_RDONLY | O_CLOEXEC | O_NONBLOCK);
> >> +}
> >> +
> >>  /* Returns the number of bytes written to disk, or a negative number on error */
> >>  static long dump_dmesg(int kmsgfd, int outfd, ssize_t size)
> >>  {
> >> @@ -734,6 +739,95 @@ static long dump_dmesg(int kmsgfd, int outfd, ssize_t size)
> >>  	}
> >>  }
> >>  
> >> +/*
> >> + * Measure how many bytes appears in dmesg.
> >> + * This is the same as above dump_dmesg() but with reading only.
> >> + */
> >> +static unsigned long long read_dmesg2(int kmsgfd, unsigned long long size, int wrfd)
> >> +{
> >> +	bool underflow_once = false;
> >> +	int comparefd;
> >> +	char buf[2048];
> >> +	unsigned int flags;
> >> +	unsigned long long readed = 0;
> >> +	unsigned long long seq, cmpseq, usec;
> >> +	char cont;
> >> +	ssize_t r;
> >> +
> >> +	if (kmsgfd < 0)
> >> +		return 0;
> >> +
> >> +	if (wrfd < 0)
> >> +		wrfd = 0;
> >> +
> >> +	comparefd = open_kmsg_rdonly();
> >> +	if (comparefd < 0) {
> >> +		errf("Error opening another fd for /dev/kmsg\n");
> >> +		return 0;
> >> +	}
> >> +
> >> +	lseek(comparefd, 0, SEEK_END);
> >> +
> >> +	while (1) {
> >> +		if (comparefd >= 0) {
> >> +			r = read(comparefd, buf, sizeof(buf) - 1);
> >> +			if (r < 0) {
> >> +				if (errno != EAGAIN && errno != EPIPE) {
> >> +					errf("Warning: Error reading kmsg comparison record: %m\n");
> >> +					close(comparefd);
> >> +					return readed;
> >> +				}
> >> +			} else {
> >> +				buf[r] = '\0';
> >> +				if (sscanf(buf, "%u,%llu,%llu,%c;",
> >> +					   &flags, &cmpseq, &usec, &cont) == 4) {
> >> +					close(comparefd);
> >> +					comparefd = -1;
> >> +				}
> >> +			}
> >> +		}
> >> +
> >> +		r = read(kmsgfd, buf, sizeof(buf));
> >> +		if (r < 0) {
> >> +			if (errno == EPIPE) {
> >> +				if (!underflow_once) {
> >> +					errf("Warning: kernel log ringbuffer underflow, some records lost.\n");
> >> +					underflow_once = true;
> >> +				}
> >> +				continue;
> >> +			} else if (errno == EINVAL) {
> >> +				errf("Warning: Buffer too small for kernel log record, record lost.\n");
> >> +				continue;
> >> +			} else if (errno != EAGAIN) {
> >> +				errf("Error reading from kmsg: %m\n");
> >> +				return 0;
> >> +			}
> >> +
> >> +			/* EAGAIN, so we're done reading */
> >> +			close(comparefd);
> >> +
> >> +			return readed;
> >> +		}
> >> +
> >> +		readed += r;
> >> +		if (wrfd && r)
> >> +			write(wrfd, buf, r);
> >> +
> >> +		if (comparefd < 0 && sscanf(buf, "%u,%llu,%llu,%c;",
> >> +					    &flags, &seq, &usec, &cont) == 4) {
> >> +			if (seq >= cmpseq)
> >> +				return readed;
> >> +		}
> >> +
> >> +		if (size && readed >= size) {
> >> +			if (comparefd >= 0)
> >> +				close(comparefd);
> >> +
> >> +			return readed;
> >> +		}
> >> +	} /* while(1) */
> >> +}
> >> +
> >>  static bool kill_child(int sig, pid_t child)
> >>  {
> >>  	/*
> >> @@ -948,6 +1042,78 @@ static size_t calc_last_dmesg_chunk(size_t limit, size_t disk_usage)
> >>  	return dt != 0 ? dt : -1;
> >>  }
> >>  
> >> +/*
> >> + * Returns:
> >> + *  =>0.0 - Success, measured kmsg activity in bytes/second
> >> + *  -1.0 - Failure
> >> + */
> >> +static double measure_dmesg_bytes_per_sec(int wrfd)
> >> +{
> >> +	struct timespec time_beg, time_now, nsec_sleep;
> >> +	unsigned long long readed, rnow;
> >> +	double time, ratio;
> >> +	int kmsgfd, cnt, zero;
> >> +
> >> +	if ((kmsgfd = open_kmsg_rdonly()) < 0) {
> >> +		errf("Warning: Cannot open /dev/kmsg\n");
> >> +
> >> +		return -1.0;
> >> +	}
> >> +
> >> +	if (wrfd < 0)
> >> +		wrfd = 0;
> >> +
> >> +	lseek(kmsgfd, 0, SEEK_END);
> >> +	nsec_sleep.tv_sec = 0;
> >> +	nsec_sleep.tv_nsec = 10ULL * 1000ULL * 1000ULL; /* 10^7 nanoseconds = 10^-2 sec */
> >> +	runner_gettime(&time_beg);
> >> +	for (cnt = 0, zero = 0, time = 0.0, readed = 0.0; cnt < 11 && time < 0.01; ++cnt) {
> >> +		if (zero)
> >> +			nanosleep(&nsec_sleep, NULL);
> >> +
> >> +		rnow = read_dmesg2(kmsgfd, 128 * 1024, wrfd); /* 64KB max */
> >> +		readed += rnow;
> >> +		runner_gettime(&time_now);
> >> +		time = igt_time_elapsed(&time_beg, &time_now);
> >> +		if (time <= 0.0) {
> >> +			errf("Warning: Time underflow\n");
> >> +
> >> +			return -1.0;
> >> +		}
> >> +
> >> +		if (rnow == 0)
> >> +			++zero;
> >> +		else
> >> +			zero = 0;
> >> +	}
> >> +
> >> +	runner_gettime(&time_now);
> >> +	time = igt_time_elapsed(&time_beg, &time_now);
> >> +	if (time <= 0.0) {
> >> +		errf("Warning: Time underflow\n");
> >> +
> >> +		return -1.0;
> >> +	}
> >> +
> >> +	ratio = (double)readed / time;
> >> +
> >> +	return ratio;
> >> +}
> >> +
> >> +/* Returns: true if ratio lower or equal than maxratio */
> >> +static bool check_dmesg_ratio(double maxratio, int log_level, bool sync, int wrfd)
> >> +{
> >> +	double new_dmesg_bps = measure_dmesg_bytes_per_sec(wrfd);
> >> +
> >> +	if (log_level >= LOG_LEVEL_NORMAL) {
> >> +		outf("Dmesg ratio %0.3fKB/s\n", new_dmesg_bps / 1024);
> >> +		if (sync)
> >> +			fflush(stdout);
> >> +	}
> >> +
> >> +	return new_dmesg_bps <= maxratio;
> >> +}
> >> +
> >>  /*
> >>   * Returns:
> >>   *  =0 - Success
> >> @@ -960,6 +1126,7 @@ static int monitor_output(pid_t child,
> >>  			  int *outputs,
> >>  			  double *time_spent,
> >>  			  struct settings *settings,
> >> +			  double max_dmesg_ratio,
> >>  			  char **abortreason,
> >>  			  bool *abort_already_written)
> >>  {
> >> @@ -1463,29 +1630,31 @@ static int monitor_output(pid_t child,
> >>  				 * exceeded the disk usage limit.
> >>  				 */
> >>  				if (killed && disk_usage_limit_exceeded(settings, disk_usage)) {
> >> +					char killmsg[256];
> >> +
> >>  					timeoutresult = false;
> >> +					snprintf(killmsg, sizeof(killmsg),
> >> +						 "runner: This test was killed due to exceeding disk usage limit. "
> >> +						 "(Used %zd bytes, limit %zd)\n",
> >> +						 disk_usage,
> >> +						 settings->disk_usage_limit);
> >>  
> >>  					if (socket_comms_used) {
> >>  						struct runnerpacket *message;
> >> -						char killmsg[256];
> >>  
> >> -						snprintf(killmsg, sizeof(killmsg),
> >> -							 "runner: This test was killed due to exceeding disk usage limit. "
> >> -							 "(Used %zd bytes, limit %zd)\n",
> >> -							 disk_usage,
> >> -							 settings->disk_usage_limit);
> >>  						message = runnerpacket_log(STDOUT_FILENO, killmsg);
> >>  						write_packet_with_canary(outputs[_F_SOCKET], message, settings->sync);
> >>  						free(message);
> >>  					} else {
> >> -						dprintf(outputs[_F_OUT],
> >> -							"\nrunner: This test was killed due to exceeding disk usage limit. "
> >> -							"(Used %zd bytes, limit %zd)\n",
> >> -							disk_usage,
> >> -							settings->disk_usage_limit);
> >> +						dprintf(outputs[_F_OUT], "%s", killmsg);
> >>  						if (settings->sync)
> >>  							fdatasync(outputs[_F_OUT]);
> >>  					}
> >> +
> >> +					if (!check_dmesg_ratio(max_dmesg_ratio, settings->log_level, settings->sync, outputs[_F_DMESG])) {
> >> +						asprintf(abortreason, "Dmesg ratio exceeded during test run.");
> >> +						aborting = true;
> >> +					}
> >>  				}
> >>  
> >>  				if (socket_comms_used) {
> >> @@ -1587,7 +1756,7 @@ static int monitor_output(pid_t child,
> >>  	dmesgwritten = dump_dmesg(kmsgfd, outputs[_F_DMESG], dmsg_chunk_size);
> >>  	if (settings->sync)
> >>  		fdatasync(outputs[_F_DMESG]);
> >> -	if (dmesgwritten > 0) {
> >> +	if (dmesgwritten > 0 && !aborting) {
> >>  		disk_usage += dmesgwritten;
> >>  		if (settings->disk_usage_limit && disk_usage > settings->disk_usage_limit) {
> >>  			char disk[1024];
> >> @@ -1599,6 +1768,11 @@ static int monitor_output(pid_t child,
> >>  			} else if (killed) {
> >>  				errf("%s", disk);
> >>  			}
> >> +
> >> +			if (!check_dmesg_ratio(max_dmesg_ratio, settings->log_level, settings->sync, outputs[_F_DMESG])) {
> >> +				asprintf(abortreason, "Dmesg ratio exceeded after test ends.");
> >> +				aborting = true;
> >> +			}
> >>  		}
> >>  	}
> >>  
> >> @@ -1766,6 +1940,7 @@ static int execute_next_entry(struct execute_state *state,
> >>  			      size_t total,
> >>  			      double *time_spent,
> >>  			      struct settings *settings,
> >> +			      double max_dmesg_ratio,
> >>  			      struct job_list_entry *entry,
> >>  			      int testdirfd, int resdirfd,
> >>  			      int sigfd, sigset_t *sigmask,
> >> @@ -1814,7 +1989,7 @@ static int execute_next_entry(struct execute_state *state,
> >>  		goto out_pipe;
> >>  	}
> >>  
> >> -	if ((kmsgfd = open("/dev/kmsg", O_RDONLY | O_CLOEXEC | O_NONBLOCK)) < 0) {
> >> +	if ((kmsgfd = open_kmsg_rdonly()) < 0) {
> >>  		errf("Warning: Cannot open /dev/kmsg\n");
> >>  	} else {
> >>  		/* TODO: Checking of abort conditions in pre-execute dmesg */
> >> @@ -1885,7 +2060,7 @@ static int execute_next_entry(struct execute_state *state,
> >>  
> >>  	result = monitor_output(child, outfd, errfd, socketfd,
> >>  				kmsgfd, sigfd,
> >> -				outputs, time_spent, settings,
> >> +				outputs, time_spent, settings, max_dmesg_ratio,
> >>  				abortreason, abort_already_written);
> >>  
> >>  out_kmsgfd:
> >> @@ -2418,6 +2593,8 @@ bool execute(struct execute_state *state,
> >>  	     struct settings *settings,
> >>  	     struct job_list *job_list)
> >>  {
> >> +	static double dmesg_bps = -1.0;
> >> +	static double max_dmesg_bps = -1.0;
> >>  	int resdirfd, testdirfd, unamefd, timefd, sigfd;
> >>  	struct environment_variable *env_var;
> >>  	struct utsname unamebuf;
> >> @@ -2560,6 +2737,26 @@ bool execute(struct execute_state *state,
> >>  		}
> >>  	}
> >>  
> >> +	if (max_dmesg_bps < 0.0) {
> >> +		double ncpu_bps = 4 * 1024 * max_t(size_t, sysconf(_SC_NPROCESSORS_ONLN), 4);
> >> +		double set_bps = 0.0;
> >> +
> >> +		dmesg_bps = measure_dmesg_bytes_per_sec(0);
> >> +
> >> +		if (settings->disk_usage_limit > 0 && settings->per_test_timeout > 0)
> >> +			set_bps = (double)settings->disk_usage_limit / (double)settings->per_test_timeout;
> >> +
> >> +		outf("Dmesg KB/s ratio settings:%0.3f ncpu:%0.3f current:%0.3f\n",
> >> +		     set_bps / 1024, ncpu_bps / 1024, dmesg_bps / 1024);
> >> +
> >> +		if (set_bps > 0.0)
> >> +			max_dmesg_bps = set_bps;
> >> +		else
> >> +			max_dmesg_bps = dmesg_bps > ncpu_bps ? dmesg_bps : ncpu_bps;
> >> +
> >> +		outf("Using max level dmesg ratio %0.3fKB/s\n", max_dmesg_bps / 1024);
> >> +	}
> >> +
> >>  	for (; state->next < job_list->size;
> >>  	     state->next++) {
> >>  		char *reason = NULL;
> >> @@ -2596,6 +2793,7 @@ bool execute(struct execute_state *state,
> >>  						    job_list->size,
> >>  						    &time_spent,
> >>  						    settings,
> >> +						    max_dmesg_bps,
> >>  						    &job_list->entries[state->next],
> >>  						    testdirfd, resdirfd,
> >>  						    sigfd, &sigmask,
> >> @@ -2651,6 +2849,8 @@ bool execute(struct execute_state *state,
> >>  			break;
> >>  		}
> >>  
> >> +		check_dmesg_ratio(max_dmesg_bps, settings->log_level, settings->sync, 0);
> >> +
> >>  		if (result > 0) {
> >>  			double time_left = state->time_left;
> >>  


More information about the igt-dev mailing list