[Ezbench-dev] [PATCH 21/25] utils/owatch: Overwatch, a watchdog wrapper

Petri Latvala petri.latvala at intel.com
Fri Feb 24 11:19:22 UTC 2017


Overwatch will execute a program given as its parameters, quite like
`timeout'. It will then monitor the stdout and stderr output of the
child, and any activity on either will act as a heartbeat. If there is
no heartbeat in the given timeout period, the child process (group)
will be killed.

If Overwatch is run as root (or can otherwise open the device), it will
also use a hardware watchdog (/dev/watchdog[0-9]*). It pings the hardware
watchdog on every child process heartbeat and simply stops pinging it
(thus causing a reboot) once the child process stops producing heartbeats.

Signed-off-by: Petri Latvala <petri.latvala at intel.com>
---
 utils/owatch/.gitignore |   2 +
 utils/owatch/Makefile   |   8 ++
 utils/owatch/owatch.c   | 258 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 268 insertions(+)
 create mode 100644 utils/owatch/.gitignore
 create mode 100644 utils/owatch/Makefile
 create mode 100644 utils/owatch/owatch.c

diff --git a/utils/owatch/.gitignore b/utils/owatch/.gitignore
new file mode 100644
index 0000000..209df90
--- /dev/null
+++ b/utils/owatch/.gitignore
@@ -0,0 +1,2 @@
+owatch.o
+owatch
diff --git a/utils/owatch/Makefile b/utils/owatch/Makefile
new file mode 100644
index 0000000..c78666c
--- /dev/null
+++ b/utils/owatch/Makefile
@@ -0,0 +1,8 @@
+
+.PHONY: all
+all: owatch
+# CFLAGS/LDFLAGS from the caller are honoured; -Wall is always on.
+owatch.o: owatch.c
+	$(CC) $(CFLAGS) -c -Wall -o $@ $<
+owatch: owatch.o
+	$(CC) $(LDFLAGS) -o $@ $<
diff --git a/utils/owatch/owatch.c b/utils/owatch/owatch.c
new file mode 100644
index 0000000..488341c
--- /dev/null
+++ b/utils/owatch/owatch.c
@@ -0,0 +1,258 @@
+#include <fcntl.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <linux/watchdog.h>
+#include <sys/ioctl.h>
+#include <sys/select.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+int usage(const char* exe)
+{
+  printf("Usage: %s timeout command [parameters]\n"
+         " Executes command and watches for output.\n"
+         " timeout - maximum time to wait for output before the process is killed.\n", exe);
+  return 1; /* synopsis went to stdout; main() hands this constant to exit() */
+}
+
+int watchdogfd = -1; /* fd of the hardware watchdog device; -1 while none is open */
+
+void wd_settimeout(int timeout)
+{
+  if (watchdogfd < 0) return; /* no hardware watchdog open: nothing to configure */
+  ioctl(watchdogfd, WDIOC_SETTIMEOUT, &timeout); /* request new timeout; result is not checked */
+}
+
+void wd_heartbeat()
+{
+  if (watchdogfd < 0) return; /* no hardware watchdog open: nothing to ping */
+  ioctl(watchdogfd, WDIOC_KEEPALIVE, 0); /* keepalive ping; result is not checked */
+}
+
+void wd_close()
+{
+  if (watchdogfd < 0) return; /* nothing was opened */
+  /* Disarm and release the hw watchdog: "V" is the magic-close character, see kernel watchdog API. */
+  if (write(watchdogfd, "V", 1) != 1)
+    perror("owatch: failed to disarm watchdog"); /* it may stay armed after close() */
+  close(watchdogfd);
+  watchdogfd = -1;
+}
+
+void open_watchdog_dev(int timeout)
+{
+  /* Probe /dev/watchdog0 .. /dev/watchdog24 and adopt the first node that opens for writing. */
+  char path[255];
+  int idx = 0;
+
+  while (idx < 25) {
+    snprintf(path, sizeof(path), "/dev/watchdog%d", idx++);
+    int candidate = open(path, O_WRONLY);
+    if (candidate < 0)
+      continue; /* could not be opened: try the next index */
+    printf("owatch: Using watchdog device %s\n", path);
+    watchdogfd = candidate;
+    wd_settimeout(timeout);
+    break;
+  }
+}
+
+/* Relay one chunk from `from` to `to`: returns 1 if data was read, 0 on EOF, -2 if read() failed. */
+static int relay_chunk(int from, int to)
+{
+  char buf[512];
+  ssize_t done = 0, w;
+  ssize_t got = read(from, buf, sizeof(buf));
+  if (got < 0) {
+    perror("read");
+    return -2;
+  }
+  /* write() may accept fewer bytes than asked; loop so that no output is silently dropped. */
+  for (; done < got; done += w) {
+    w = write(to, buf + done, got - done);
+    if (w <= 0) {
+      perror("write"); /* report, drop the rest of this chunk, keep watching the child */
+      break;
+    }
+  }
+  return got > 0;
+}
+
+/*
+ * return 0 if no output occurred within `timeout` seconds, 1 if it did.
+ * -1 if every fd that became readable was at EOF, -2 for other errors
+ */
+int pipe_output(int timeout, int out, int err)
+{
+  struct timeval tv = { .tv_sec = timeout };
+  fd_set set;
+  int status, ret = -1;
+
+  FD_ZERO(&set);
+  FD_SET(out, &set);
+  FD_SET(err, &set);
+  status = select(out > err ? out + 1 : err + 1, &set, NULL, NULL, &tv);
+  if (status < 0) {
+    perror("select");
+    return -2;
+  }
+  if (!status) return 0;
+
+  if (FD_ISSET(out, &set)) {
+    status = relay_chunk(out, STDOUT_FILENO);
+    if (status < 0) return status;
+    if (status) ret = 1;
+  }
+  if (FD_ISSET(err, &set)) {
+    status = relay_chunk(err, STDERR_FILENO);
+    if (status < 0) return status;
+    if (status) ret = 1;
+  }
+  return ret;
+}
+
+/*
+ * Parent side: relay the child's output until it goes quiet or ends, then
+ * reap (and if necessary kill) the child. Never returns; exits with the
+ * child's exit status after EOF (1 if it did not exit normally), 1 on
+ * errors, and 7 after a timeout.
+ */
+void overwatch(pid_t child, int timeout, int outpipe[2], int errpipe[2])
+{
+  int n = 1;
+  int wstatus;
+  pid_t r;
+  pid_t killtarget = child;
+
+  /* The write ends belong to the child only; keeping them open here would mask EOF. */
+  close(outpipe[1]);
+  close(errpipe[1]);
+
+  open_watchdog_dev(timeout);
+
+  /* Every chunk of child output counts as a heartbeat for the hw watchdog. */
+  while (n > 0) {
+    wd_heartbeat();
+    n = pipe_output(timeout, outpipe[0], errpipe[0]);
+  }
+  wd_heartbeat();
+
+  close(outpipe[0]);
+  close(errpipe[0]);
+
+  if (n == 0) {
+    printf("owatch: TIMEOUT!\n");
+    /* Flush now: if the watchdog reboots the machine below, exit() never runs. */
+    fflush(stdout);
+
+    /* Hack: If we have a hw watchdog, don't bother killing children. Just stop the heartbeat. */
+    if (watchdogfd >= 0) {
+      wd_settimeout(1);
+      sleep(3);
+    }
+
+    printf("owatch: Killing children\n");
+
+    /* If the child managed to setsid(), its process group exists: target the whole group. */
+    if (!kill(-child, 0))
+      killtarget = -child;
+
+    kill(killtarget, SIGTERM);
+  }
+
+  /* NOTE(review): there is no grace period; a child still alive here is SIGKILLed at once. */
+  r = waitpid(child, &wstatus, WNOHANG);
+  if (r == 0) {
+    wd_settimeout(30);
+    wd_heartbeat();
+    kill(killtarget, SIGKILL);
+    r = waitpid(child, &wstatus, 0);
+    wd_heartbeat();
+  }
+
+  wd_close();
+
+  if (r != child) {
+    printf("Child turned undead, hire a priest\n");
+    exit(1);
+  }
+
+  /* Pipes hit EOF: pass the child's exit status through, or 1 if it did not exit normally. */
+  if (n == -1)
+    exit(WIFEXITED(wstatus) ? WEXITSTATUS(wstatus) : 1);
+
+  /* pipe_output() reported an error. */
+  if (n == -2)
+    exit(1);
+
+  /* Only n == 0 gets here: a timed-out child is reported with exit status 7. */
+  exit(7);
+}
+
+void launch_child(int outpipe[2], int errpipe[2], char** argv)
+{
+  /* Child side: only the write ends of the pipes are used here. */
+  close(outpipe[0]);
+  close(errpipe[0]);
+
+  /* Start a new session; a failure is reported but is not fatal. */
+  if (setsid() < 0)
+    perror("setsid");
+
+  /* Point stdout first, then stderr, at the write ends of the pipes. */
+  if (dup2(outpipe[1], STDOUT_FILENO) < 0) {
+    perror("dup2");
+    exit(1);
+  }
+  if (dup2(errpipe[1], STDERR_FILENO) < 0) {
+    perror("dup2");
+    exit(1);
+  }
+  close(outpipe[1]);
+  close(errpipe[1]);
+
+  /* execvp() only comes back when it failed. */
+  execvp(argv[0], argv);
+  perror("execvp");
+  exit(1);
+}
+
+/* Entry point: owatch <timeout> <command> [args...]. Forks; the child execs the command
+ * while the parent supervises it. Neither launch_child() nor overwatch() returns. */
+int main(int argc, char** argv)
+{
+  int outpipe[2];
+  int errpipe[2];
+  long secs;
+  char* end;
+  pid_t child;
+
+  if (argc < 3)
+    exit(usage(argv[0]));
+
+  /* strtol rather than atoi: reject trailing junk ("10x") and values that do not fit an int. */
+  secs = strtol(argv[1], &end, 10);
+  if (end == argv[1] || *end != '\0' || secs <= 0 || (long)(int)secs != secs) {
+    fprintf(stderr, "Error: timeout must be positive and non-zero\n");
+    exit(1);
+  }
+
+  if (pipe(outpipe) || pipe(errpipe)) {
+    perror("pipe");
+    exit(1);
+  }
+
+  child = fork();
+  if (child < 0) {
+    perror("fork");
+    exit(1);
+  }
+  if (child == 0)
+    launch_child(outpipe, errpipe, &argv[2]);
+  else
+    overwatch(child, (int)secs, outpipe, errpipe);
+  return 255; /* not reached */
+}
-- 
2.9.3



More information about the Ezbench-dev mailing list