[Ezbench-dev] [PATCH 21/25] utils/owatch: Overwatch, a watchdog wrapper

Martin Peres martin.peres at free.fr
Fri Feb 24 11:57:15 UTC 2017


On 24/02/17 13:19, Petri Latvala wrote:
> Overwatch will execute a program given as its parameters, quite like
> `timeout'. It will then monitor the stdout and stderr output of the
> child, and any activity on either will act as a heartbeat. If there is
> no heartbeat in the given timeout period, the child process (group)
> will be killed.
>
> If Overwatch is run as root, it will also use a hardware watchdog
> (/dev/watchdog[0-9]*) if it can open one. It will signal the hardware
> watchdog on child process heartbeat and will just stop pinging it
> (thus causing a reboot) if the child process does not heartbeat.

Can we get a description of this in the top-level README?

I would like to see the dependencies listed there, along with a note that
this will only be used by the tests that make use of it on their own.

> Signed-off-by: Petri Latvala <petri.latvala at intel.com>
> ---
>   utils/owatch/.gitignore |   2 +
>   utils/owatch/Makefile   |   8 ++
>   utils/owatch/owatch.c   | 258 ++++++++++++++++++++++++++++++++++++++++++++++++
>   3 files changed, 268 insertions(+)
>   create mode 100644 utils/owatch/.gitignore
>   create mode 100644 utils/owatch/Makefile
>   create mode 100644 utils/owatch/owatch.c
>
> diff --git a/utils/owatch/.gitignore b/utils/owatch/.gitignore
> new file mode 100644
> index 0000000..209df90
> --- /dev/null
> +++ b/utils/owatch/.gitignore
> @@ -0,0 +1,2 @@
> +owatch.o
> +owatch
> diff --git a/utils/owatch/Makefile b/utils/owatch/Makefile
> new file mode 100644
> index 0000000..c78666c
> --- /dev/null
> +++ b/utils/owatch/Makefile
> @@ -0,0 +1,8 @@
> +
> +all: owatch
> +
> +owatch.o: owatch.c
> +	$(CC) -c -Wall -o $@ $<
> +
> +owatch: owatch.o
> +	$(CC) -o $@ $<
> diff --git a/utils/owatch/owatch.c b/utils/owatch/owatch.c
> new file mode 100644
> index 0000000..488341c
> --- /dev/null
> +++ b/utils/owatch/owatch.c
> @@ -0,0 +1,258 @@
> +#include <fcntl.h>
> +#include <signal.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <unistd.h>
> +#include <linux/watchdog.h>
> +#include <sys/ioctl.h>
> +#include <sys/select.h>
> +#include <sys/types.h>
> +#include <sys/wait.h>
> +
> +int usage(const char* exe)
> +{
> +  printf("Usage: %s timeout command [parameters]\n", exe);
> +  printf(" Executes command and watches for output.\n");
> +  printf(" timeout - maximum time to wait for output before the process is killed.\n");
> +  return 1;
> +}
> +
> +int watchdogfd = -1;
> +
> +void wd_settimeout(int timeout)
> +{
> +  if (watchdogfd >= 0)
> +    ioctl(watchdogfd, WDIOC_SETTIMEOUT, &timeout);
> +}
> +
> +void wd_heartbeat()
> +{
> +  if (watchdogfd >= 0)
> +    ioctl(watchdogfd, WDIOC_KEEPALIVE, 0);
> +}
> +
> +void wd_close()
> +{
> +  if (watchdogfd < 0)
> +    return;
> +
> +  write(watchdogfd, "V", 1);
> +  close(watchdogfd);
> +  watchdogfd = -1;
> +}
> +
> +void open_watchdog_dev(int timeout)
> +{
> +  int fd;
> +  char buf[255];
> +  int i;
> +
> +  for (i = 0; i < 25; ++i) {
> +    snprintf(buf, 255, "/dev/watchdog%d", i);
> +    fd = open(buf, O_WRONLY);
> +    if (fd >= 0) {
> +      printf("owatch: Using watchdog device %s\n", buf);
> +      watchdogfd = fd;
> +      wd_settimeout(timeout);
> +      return;
> +    }
> +  }
> +}
> +
> +/*
> + * return 0 if no output occurred, 1 if it did.
> + * -1 on eof on either fd, -2 for other errors
> + */
> +int pipe_output(int timeout, int out, int err)
> +{
> +  struct timeval tv = { .tv_sec = timeout };
> +  fd_set set;
> +  int nfds = out > err ? out + 1 : err + 1;
> +  int n, ret;
> +  char buf[512];
> +
> +  FD_ZERO(&set);
> +  FD_SET(out, &set);
> +  FD_SET(err, &set);
> +
> +  n = select(nfds, &set, NULL, NULL, &tv);
> +  if (n < 0) {
> +    perror("select");
> +    return -2;
> +  }
> +  if (!n) {
> +    return 0;
> +  }
> +
> +  ret = -1;
> +  if (FD_ISSET(out, &set)) {
> +    ssize_t s = read(out, buf, sizeof(buf));
> +    if (s < 0) {
> +      perror("read");
> +      return -2;
> +    }
> +
> +    if (s > 0) {
> +      write(STDOUT_FILENO, buf, s);
> +      ret = 1;
> +    }
> +  }
> +  if (FD_ISSET(err, &set)) {
> +    ssize_t s = read(err, buf, sizeof(buf));
> +    if (s < 0) {
> +      perror("read");
> +      return -2;
> +    }
> +
> +    if (s > 0) {
> +      write(STDERR_FILENO, buf, s);
> +      ret = 1;
> +    }
> +  }
> +
> +  return ret;
> +}
> +
> +void overwatch(pid_t child, int timeout, int outpipe[2], int errpipe[2])
> +{
> +  int n = 1;
> +  int wstatus;
> +  pid_t r;
> +  pid_t killtarget = child;
> +
> +  close(outpipe[1]);
> +  close(errpipe[1]);
> +
> +  open_watchdog_dev(timeout);
> +
> +  while (n > 0) {
> +    wd_heartbeat();
> +    n = pipe_output(timeout, outpipe[0], errpipe[0]);
> +  }
> +
> +  wd_heartbeat();
> +
> +  close(outpipe[0]);
> +  close(errpipe[0]);
> +
> +  if (n == 0) {
> +    printf("owatch: TIMEOUT!\n");
> +
> +    /* Hack: If we have a hw watchdog, don't bother killing children. Just stop the heartbeat. */
> +    if (watchdogfd >= 0) {
> +      wd_settimeout(1);
> +      sleep(3);
> +    }
> +
> +    printf("owatch: Killing children\n");
> +
> +    if (!kill(-child, 0)) {
> +      /* Child was able to setsid, process group exists.
> +       * Use process group as kill target.
> +       */
> +      killtarget = -child;
> +    }
> +
> +    kill(killtarget, 15);
> +  }
> +
> +  r = waitpid(child, &wstatus, WNOHANG);
> +
> +  if (r == 0) {
> +    wd_settimeout(30);
> +    wd_heartbeat();
> +    kill(killtarget, 9);
> +    r = waitpid(child, &wstatus, 0);
> +    wd_heartbeat();
> +  }
> +
> +  wd_close();
> +
> +  if (r != child) {
> +    printf("Child turned undead, hire a priest\n");
> +    exit(1);
> +  }
> +
> +  if (n == -1) {
> +    /* normal termination */
> +    if (WIFEXITED(wstatus)) {
> +      exit(WEXITSTATUS(wstatus));
> +    } else {
> +      exit(1);
> +    }
> +  }
> +
> +  if (n == -2) {
> +    /* error occurred */
> +    exit(1);
> +  }
> +
> +  /* shouldn't be reached */
> +  exit(7);
> +}
> +
> +void launch_child(int outpipe[2], int errpipe[2], char** argv)
> +{
> +  pid_t sid;
> +
> +  close(outpipe[0]);
> +  close(errpipe[0]);
> +
> +  sid = setsid();
> +  if (sid < 0) {
> +    perror("setsid");
> +    /* continue anyway */
> +  }
> +
> +  if (dup2(outpipe[1], STDOUT_FILENO) < 0 ||
> +      dup2(errpipe[1], STDERR_FILENO) < 0) {
> +    perror("dup2");
> +    exit(1);
> +  }
> +
> +  close(outpipe[1]);
> +  close(errpipe[1]);
> +
> +  execvp(argv[0], argv);
> +
> +  perror("execvp");
> +  exit(1);
> +}
> +
> +int main(int argc, char** argv)
> +{
> +  int outpipe[2];
> +  int errpipe[2];
> +  int timeout;
> +  pid_t child;
> +
> +  if (argc < 3) {
> +    exit(usage(argv[0]));
> +  }
> +
> +  timeout = atoi(argv[1]);
> +  if (timeout <= 0) {
> +    fprintf(stderr, "Error: timeout must be positive and non-zero\n");
> +    exit(1);
> +  }
> +
> +  if (pipe(outpipe) || pipe(errpipe)) {
> +    perror("pipe");
> +    exit(1);
> +  }
> +
> +  if ((child = fork())) {
> +    if (child < 0) {
> +      perror("fork");
> +      exit(1);
> +    }
> +
> +    overwatch(child, timeout, outpipe, errpipe);
> +  } else {
> +    launch_child(outpipe, errpipe, &argv[2]);
> +  }
> +
> +  __builtin_unreachable();
> +  return 255;
> +}




More information about the Ezbench-dev mailing list