[igt-dev] [PATCH i-g-t 3/3] runner: Make sure that we are closing watchdogs on signals
Arkadiusz Hiler
arkadiusz.hiler at intel.com
Fri Jul 19 11:25:52 UTC 2019
On Thu, Jul 18, 2019 at 01:57:20PM +0300, Ser, Simon wrote:
> On Tue, 2019-07-09 at 15:23 +0300, Arkadiusz Hiler wrote:
> > There are a few short windows of opportunity when watchdogs are primed but
> > there is no signal handling in place, so the process may exit without
> > proper shutdown sequence.
> >
> > This patch rearranges the existing code so that we set up the signalfd
> > and BLOCK the signals before setting up watchdogs and UNBLOCK only after
> > the watchdogs are closed properly.
> >
> > If igt_runner exits due to signal, non-zero status code is returned.
> >
> > Cc: Petri Latvala <petri.latvala at intel.com>
> > Signed-off-by: Arkadiusz Hiler <arkadiusz.hiler at intel.com>
> > ---
> > runner/executor.c | 100 +++++++++++++++++++++++++++++++++-------------
> > 1 file changed, 73 insertions(+), 27 deletions(-)
> >
> > diff --git a/runner/executor.c b/runner/executor.c
> > index 6463ab96..62303ff8 100644
> > --- a/runner/executor.c
> > +++ b/runner/executor.c
> > @@ -7,6 +7,7 @@
> > #include <string.h>
> > #include <sys/ioctl.h>
> > #include <sys/select.h>
> > +#include <sys/poll.h>
> > #include <sys/signalfd.h>
> > #include <sys/stat.h>
> > #include <sys/time.h>
> > @@ -604,7 +605,6 @@ static int monitor_output(pid_t child,
> > close(outfd);
> > close(errfd);
> > close(kmsgfd);
> > - close(sigfd);
> > return -1;
> > }
> >
> > @@ -776,9 +776,8 @@ static int monitor_output(pid_t child,
> > *time_spent = time;
> > }
> >
> > - close(sigfd);
> > - sigfd = -1;
> > child = 0;
> > + sigfd = -1; /* we are dying, no signal handling for now */
> > }
> > }
> >
> > @@ -790,7 +789,6 @@ static int monitor_output(pid_t child,
> > close(outfd);
> > close(errfd);
> > close(kmsgfd);
> > - close(sigfd);
> >
> > if (aborting)
> > return -1;
> > @@ -908,13 +906,12 @@ static int execute_next_entry(struct execute_state *state,
> > double *time_spent,
> > struct settings *settings,
> > struct job_list_entry *entry,
> > - int testdirfd, int resdirfd)
> > + int testdirfd, int resdirfd,
> > + int sigfd, sigset_t *sigmask)
> > {
> > int dirfd;
> > int outputs[_F_LAST];
> > int kmsgfd;
> > - int sigfd;
> > - sigset_t mask;
> > int outpipe[2] = { -1, -1 };
> > int errpipe[2] = { -1, -1 };
> > int outfd, errfd;
> > @@ -954,21 +951,6 @@ static int execute_next_entry(struct execute_state *state,
> > lseek(kmsgfd, 0, SEEK_END);
> > }
> >
> > - sigemptyset(&mask);
> > - sigaddset(&mask, SIGCHLD);
> > - sigaddset(&mask, SIGINT);
> > - sigaddset(&mask, SIGTERM);
> > - sigaddset(&mask, SIGQUIT);
> > - sigaddset(&mask, SIGHUP);
> > - sigprocmask(SIG_BLOCK, &mask, NULL);
> > - sigfd = signalfd(-1, &mask, O_CLOEXEC);
> > -
> > - if (sigfd < 0) {
> > - /* TODO: Handle better */
> > - fprintf(stderr, "Cannot monitor child process with signalfd\n");
> > - result = -1;
> > - goto out_kmsgfd;
> > - }
> >
> > if (settings->log_level >= LOG_LEVEL_NORMAL) {
> > char *displayname;
> > @@ -1002,7 +984,7 @@ static int execute_next_entry(struct execute_state *state,
> > close(outpipe[0]);
> > close(errpipe[0]);
> >
> > - sigprocmask(SIG_UNBLOCK, &mask, NULL);
> > + sigprocmask(SIG_UNBLOCK, sigmask, NULL);
> >
> > setenv("IGT_SENTINEL_ON_STDERR", "1", 1);
> >
> > @@ -1261,12 +1243,41 @@ static void oom_immortal(void)
> > close(fd);
> > }
> >
> > +static bool should_die_because_signal(int sigfd)
> > +{
> > + struct signalfd_siginfo siginfo;
> > + int ret;
> > +
> > + struct pollfd sigpoll = { .fd = sigfd, .events = POLLIN | POLLRDBAND };
> > +
> > + if ((ret = poll(&sigpoll, 1, 0)) != 0) {
> > + if (ret == -1) {
>
> Seems like this is unintentionally left blank.
do {
ret = poll(&sigpoll, 1, 0);
} while (ret == -1 && (errno == EAGAIN || errno == EINTR));
if (ret != 0) {
if (ret == -1) {
fprintf(stderr, "Poll on signalfd failed with %s\n", strerror(errno));
return true; /* something is wrong, let's die */
}
Although this do-while is probably overkill for special fds.
> > + }
> > +
> > + ret = read(sigfd, &siginfo, sizeof(siginfo));
>
> Error handling is missing (ret == sizeof(siginfo)).
I am not sure about this check. This is not a normal fd and partial
reads should not be possible.
The other place we read it in is:
s = read(sigfd, &siginfo, sizeof(siginfo));
if (s < 0) {
fprintf(stderr, "Error reading from signalfd: %s\n",
strerror(errno));
continue;
I can add:
if (ret == -1) {
fprintf(stderr, "Error reading from signalfd: %s\n", strerror(errno));
return false; /* we may want to retry later */
}
> > + if (siginfo.ssi_signo == SIGCHLD) {
> > + fprintf(stderr, "Runner got stray SIGCHLD while not executing any tests.\n");
> > +
>
> Nit: extra blank line here
>
> > + } else {
> > + fprintf(stderr, "Runner is being killed by %s\n",
> > + strsignal(siginfo.ssi_signo));
> > + return true;
> > + }
> > +
> > + }
> > +
> > + return false;
> > +}
> > +
> > bool execute(struct execute_state *state,
> > struct settings *settings,
> > struct job_list *job_list)
> > {
> > struct utsname unamebuf;
> > int resdirfd, testdirfd, unamefd, timefd;
> > + sigset_t sigmask;
> > + int sigfd;
> > double time_spent = 0.0;
> > bool status = true;
> >
> > @@ -1310,6 +1321,22 @@ bool execute(struct execute_state *state,
> >
> > oom_immortal();
> >
> > + sigemptyset(&sigmask);
> > + sigaddset(&sigmask, SIGCHLD);
> > + sigaddset(&sigmask, SIGINT);
> > + sigaddset(&sigmask, SIGTERM);
> > + sigaddset(&sigmask, SIGQUIT);
> > + sigaddset(&sigmask, SIGHUP);
> > + sigfd = signalfd(-1, &sigmask, O_CLOEXEC);
> > + sigprocmask(SIG_BLOCK, &sigmask, NULL);
> > +
> > + if (sigfd < 0) {
> > + /* TODO: Handle better */
> > + fprintf(stderr, "Cannot mask signals\n");
> > + status = -1;
>
> This should probably be `status = false`. -1 is a truthy value.
>
> > + goto end;
> > + }
> > +
> > init_watchdogs(settings);
> >
> > if (!uname(&unamebuf)) {
> > @@ -1345,12 +1372,18 @@ bool execute(struct execute_state *state,
> > char *reason;
> > int result;
> >
> > + if (should_die_because_signal(sigfd)) {
> > + status = false;
>
> Should we close_watchdogs at this point?
"end" closes watchdogs:
end:
close_watchdogs(settings);
>
> > + goto end;
> > + }
> > +
> > result = execute_next_entry(state,
> > job_list->size,
> > &time_spent,
> > settings,
> > &job_list->entries[state->next],
> > - testdirfd, resdirfd);
> > + testdirfd, resdirfd,
> > + sigfd, &sigmask);
>
> The argument list is getting quite large. At some point it may be worth
> it to put everything (or part of these) in a struct.
>
> (In general I feel like this file could be improved a lot, these long
> functions are hard to read.)
Agreed. If I am going to do another substantial change to this file (or
another iteration of this series) it will get its own patch.
Cheers,
Arek
More information about the igt-dev
mailing list