[i-g-t,v2,2/4] runner/executor: Limit reading dmesg to chunks

Ryszard Knop rk at dragonic.eu
Wed Nov 27 15:12:13 UTC 2024


LGTM, with one question below.

Reviewed-by: Ryszard Knop <rk at dragonic.eu>

On Wed, 2024-11-20 at 19:32 +0100, Kamil Konieczny wrote:
> There was no disk limit checks in reading kernel dmesg and that
> could lead to writing really huge dumps longer than 400MB,
> greatly exceeding disk limits used by CI and hardly useful for
> developers. Make a dmesg dumping in chunks, size depending on
> number of CPUs present, with a minimum of 64KB.
> This could also allow to kick in disk limits checks if a driver
> starts spilling messages into dmesg.
> 
> v2: correct size at last dump, also inform user about exceeding
>   limits (Kamil)
> 
> Closes: https://gitlab.freedesktop.org/drm/igt-gpu-tools/-/issues/129
> Cc: Petri Latvala <adrinael at adrinael.net>
> Cc: Karol Krol <karol.krol at intel.com>
> Cc: Ewelina Musial <ewelina.musial at intel.com>
> Signed-off-by: Kamil Konieczny <kamil.konieczny at linux.intel.com>
> ---
>  runner/executor.c | 64 ++++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 55 insertions(+), 9 deletions(-)
> 
> diff --git a/runner/executor.c b/runner/executor.c
> index e4d1fc323..a5b5c0eab 100644
> --- a/runner/executor.c
> +++ b/runner/executor.c
> @@ -585,13 +585,14 @@ void close_outputs(int *fds)
>  }
>  
>  /* Returns the number of bytes written to disk, or a negative number on error */
> -static long dump_dmesg(int kmsgfd, int outfd)
> +static long dump_dmesg(int kmsgfd, int outfd, ssize_t size)
>  {
>  	/*
>  	 * Write kernel messages to the log file until we reach
> -	 * 'now'. Unfortunately, /dev/kmsg doesn't support seeking to
> -	 * -1 from SEEK_END so we need to use a second fd to read a
> -	 * message to match against, or stop when we reach EAGAIN.
> +	 * 'now' or we read at least size bytes. Unfortunately,
> +	 * /dev/kmsg doesn't support seeking to -1 from SEEK_END
> +	 * so we need to use a second fd to read a message to
> +	 *  match against, or stop when we reach EAGAIN.
>  	 */
>  
>  	int comparefd;
> @@ -606,6 +607,9 @@ static long dump_dmesg(int kmsgfd, int outfd)
>  	if (kmsgfd < 0)
>  		return 0;
>  
> +	if (size < 0)
> +		return 0;
> +
>  	comparefd = open("/dev/kmsg", O_RDONLY | O_NONBLOCK);
>  	if (comparefd < 0) {
>  		errf("Error opening another fd for /dev/kmsg\n");
> @@ -690,6 +694,13 @@ static long dump_dmesg(int kmsgfd, int outfd)
>  			if (seq >= cmpseq)
>  				return written;
>  		}
> +
> +		if (size && written >= size) {
> +			if (comparefd >= 0)
> +				close(comparefd);
> +
> +			return written;
> +		}
>  	}
>  }
>  
> @@ -883,6 +894,25 @@ static void write_packet_with_canary(int fd, struct runnerpacket *packet,
> bool s
>  /* TODO: Refactor this macro from here and from various tests to lib */
>  #define KB(x) ((x) * 1024)
>  
> +#if !defined(SIZE_MAX)
> +#define SIZE_MAX (((size_t)-1) ^ (1 << (8 * sizeof(size_t) - 1)))
> +#endif
> +
> +/* Calculates disk limit to use when dumping dmesg, returns:
> + * number > 0	: size of limit to use
> + * number < 0	: negative number when limit exceeded, no more dumping
> + **/
> +static size_t calc_last_dmesg_chunk(size_t limit, size_t disk_usage)
> +{
> +	size_t dt = limit - disk_usage;
> +
> +	assert(SIZE_MAX > 0);
> +	if (!limit)
> +		return SIZE_MAX; /* no limit */
> +
> +	return dt != 0 ? dt : -1;
> +}
> +
>  /*
>   * Returns:
>   *  =0 - Success
> @@ -915,6 +945,8 @@ static int monitor_output(pid_t child,
>  	unsigned long taints = 0;
>  	bool aborting = false;
>  	size_t disk_usage = 0;
> +	size_t dmsg_chunk_size = 4096 * max_t(size_t, sysconf(_SC_NPROCESSORS_ONLN), 16);

Any reason for 4096 here? If it's supposed to be the page size, maybe getpagesize() here?

> +	long dmesgwritten;
>  	bool socket_comms_used = false; /* whether the test actually uses comms */
>  	bool results_received = false; /* whether we already have test results that might need
> overriding if we detect an abort condition */
>  
> @@ -1240,11 +1272,9 @@ static int monitor_output(pid_t child,
>  	socket_end:
>  
>  		if (kmsgfd >= 0 && FD_ISSET(kmsgfd, &set)) {
> -			long dmesgwritten;
> -
>  			time_last_activity = time_now;
>  
> -			dmesgwritten = dump_dmesg(kmsgfd, outputs[_F_DMESG]);
> +			dmesgwritten = dump_dmesg(kmsgfd, outputs[_F_DMESG], dmsg_chunk_size);
>  			if (settings->sync)
>  				fdatasync(outputs[_F_DMESG]);
>  
> @@ -1482,7 +1512,8 @@ static int monitor_output(pid_t child,
>  					asprintf(abortreason, "Child refuses to die, tainted
> 0x%lx.", taints);
>  				}
>  
> -				dump_dmesg(kmsgfd, outputs[_F_DMESG]);
> +				dmsg_chunk_size = calc_last_dmesg_chunk(settings-
> >disk_usage_limit, disk_usage);
> +				dump_dmesg(kmsgfd, outputs[_F_DMESG], dmsg_chunk_size);
>  				if (settings->sync)
>  					fdatasync(outputs[_F_DMESG]);
>  
> @@ -1508,9 +1539,24 @@ static int monitor_output(pid_t child,
>  		}
>  	}
>  
> -	dump_dmesg(kmsgfd, outputs[_F_DMESG]);
> +	dmsg_chunk_size = calc_last_dmesg_chunk(settings->disk_usage_limit, disk_usage);
> +	dmesgwritten = dump_dmesg(kmsgfd, outputs[_F_DMESG], dmsg_chunk_size);
>  	if (settings->sync)
>  		fdatasync(outputs[_F_DMESG]);
> +	if (dmesgwritten > 0) {
> +		disk_usage += dmesgwritten;
> +		if (settings->disk_usage_limit && disk_usage > settings->disk_usage_limit) {
> +			char disk[1024];
> +
> +			snprintf(disk, sizeof(disk), "igt_runner: disk limit exceeded at dmesg
> dump, %ld > %ld\n", disk_usage, settings->disk_usage_limit);
> +			if (settings->log_level >= LOG_LEVEL_NORMAL) {
> +				outf("%s", disk);
> +				fflush(stdout);
> +			} else if (killed) {
> +				errf("%s", disk);
> +			}
> +		}
> +	}
>  
>  	free(buf);
>  	free(outbuf);



More information about the igt-dev mailing list