[PATCH i-g-t v4] runner/executor: Abort if dmesg is flooded

Thu Jul 24 11:23:05 UTC 2025

Hi Kamil,

I posted some comments below.

[...]
> Create a way to abort test in such scenarios with the help of
> measureing kernel kmsg activity before first test is executed
measureing -> measuring

[...]
> +++ b/runner/executor.c
> +static int open_kmsg_rdonly(void)
> +{
> +	return open("/dev/kmsg", O_RDONLY | O_CLOEXEC | O_NONBLOCK);
> +}
> +
>  /* Returns the number of bytes written to disk, or a negative number on error */
>  static long dump_dmesg(int kmsgfd, int outfd, ssize_t size)
>  {
> @@ -734,6 +739,95 @@ static long dump_dmesg(int kmsgfd, int outfd, ssize_t size)
>  	}
>  }
>  
> +/*
> + * Measure how many bytes appears in dmesg.
> + * This is the same as above dump_dmesg() but with reading only.
> + */
> +static unsigned long long read_dmesg2(int kmsgfd, unsigned long long size, int wrfd)
> +{
> +	bool underflow_once = false;
> +	int comparefd;
> +	char buf[2048];
> +	unsigned int flags;
> +	unsigned long long readed = 0;
readed -> read, as this is irregular verb.

> +	unsigned long long seq, cmpseq, usec;
> +	char cont;
> +	ssize_t r;
> +
> +	if (kmsgfd < 0)
> +		return 0;
> +
> +	if (wrfd < 0)
> +		wrfd = 0;
> +
> +	comparefd = open_kmsg_rdonly();
> +	if (comparefd < 0) {
> +		errf("Error opening another fd for /dev/kmsg\n");
> +		return 0;
> +	}
> +
> +	lseek(comparefd, 0, SEEK_END);
> +
> +	while (1) {
> +		if (comparefd >= 0) {
> +			r = read(comparefd, buf, sizeof(buf) - 1);
> +			if (r < 0) {
> +				if (errno != EAGAIN && errno != EPIPE) {
> +					errf("Warning: Error reading kmsg comparison record: %m\n");
> +					close(comparefd);
> +					return readed;
> +				}
> +			} else {
> +				buf[r] = '\0';
> +				if (sscanf(buf, "%u,%llu,%llu,%c;",
> +					   &flags, &cmpseq, &usec, &cont) == 4) {
> +					close(comparefd);
> +					comparefd = -1;
> +				}
What would happen if the first call to
r = read(comparefd, buf, sizeof(buf) - 1);
failed and then
r = read(kmsgfd, buf, sizeof(buf));
would succeed? Would we be counting the comparefd bytes after we
already got some bytes from kmsgfd? Or is the comparison
operation optional?
> +			}
> +		}
> +
> +		r = read(kmsgfd, buf, sizeof(buf));
> +		if (r < 0) {
> +			if (errno == EPIPE) {
> +				if (!underflow_once) {
> +					errf("Warning: kernel log ringbuffer underflow, some records lost.\n");
> +					underflow_once = true;
> +				}
> +				continue;
> +			} else if (errno == EINVAL) {
> +				errf("Warning: Buffer too small for kernel log record, record lost.\n");
> +				continue;
> +			} else if (errno != EAGAIN) {
> +				errf("Error reading from kmsg: %m\n");
> +				return 0;
> +			}
> +
> +			/* EAGAIN, so we're done reading */
> +			close(comparefd);
> +
> +			return readed;
> +		}
> +
> +		readed += r;
> +		if (wrfd && r)
> +			write(wrfd, buf, r);
> +
> +		if (comparefd < 0 && sscanf(buf, "%u,%llu,%llu,%c;",
> +					    &flags, &seq, &usec, &cont) == 4) {
> +			if (seq >= cmpseq)
I wonder if seq can be a lower value than cmpseq - should we
care if that happens? If not, then maybe move the "seq >= cmpseq"
check into the if before, to reduce indentation a bit.

> +				return readed;
> +		}
> +
> +		if (size && readed >= size) {
> +			if (comparefd >= 0)
> +				close(comparefd);
> +
> +			return readed;
> +		}
> +	} /* while(1) */
> +}
> +
>  static bool kill_child(int sig, pid_t child)
>  {
>  	/*
> @@ -948,6 +1042,78 @@ static size_t calc_last_dmesg_chunk(size_t limit, size_t disk_usage)
>  	return dt != 0 ? dt : -1;
>  }
>  
> +/*
> + * Returns:
> + *  =>0.0 - Success, measured kmsg activity in bytes/second
> + *  -1.0 - Failure
> + */
> +static double measure_dmesg_bytes_per_sec(int wrfd)
> +{
> +	struct timespec time_beg, time_now, nsec_sleep;
> +	unsigned long long readed, rnow;
> +	double time, ratio;
> +	int kmsgfd, cnt, zero;
> +
> +	if ((kmsgfd = open_kmsg_rdonly()) < 0) {
> +		errf("Warning: Cannot open /dev/kmsg\n");
> +
> +		return -1.0;
> +	}
> +
> +	if (wrfd < 0)
> +		wrfd = 0;
> +
> +	lseek(kmsgfd, 0, SEEK_END);
> +	nsec_sleep.tv_sec = 0;
> +	nsec_sleep.tv_nsec = 10ULL * 1000ULL * 1000ULL; /* 10^7 nanoseconds = 10^-2 sec */
> +	runner_gettime(&time_beg);
> +	for (cnt = 0, zero = 0, time = 0.0, readed = 0.0; cnt < 11 && time < 0.01; ++cnt) {
> +		if (zero)
If "zero" only serves as a flag then I think below sleep could
be moved to the end of this for loop, because from execution
standpoint this will never trigger during the first pass
> +			nanosleep(&nsec_sleep, NULL);
> +
> +		rnow = read_dmesg2(kmsgfd, 128 * 1024, wrfd); /* 64KB max */
> +		readed += rnow;
> +		runner_gettime(&time_now);
> +		time = igt_time_elapsed(&time_beg, &time_now);
> +		if (time <= 0.0) {
> +			errf("Warning: Time underflow\n");
> +
> +			return -1.0;
> +		}
> +
> +		if (rnow == 0)
Move the nanosleep here and perhaps remove "zero" variable
completely. Then the code here would only contain a single if:
	if (rnow == 0)
		nanosleep(&nsec_sleep, NULL);

> +			++zero;
> +		else
> +			zero = 0;
> +	}
> +
> +	runner_gettime(&time_now);
> +	time = igt_time_elapsed(&time_beg, &time_now);
> +	if (time <= 0.0) {
> +		errf("Warning: Time underflow\n");
> +
You could skip this newline here.

> +		return -1.0;
> +	}
> +
> +	ratio = (double)readed / time;
> +
> +	return ratio;
You could return (double)readed / time; and remove ratio
variable.

Best Regards,
Krzysztof