[Ezbench-dev] [PATCH 21/25] utils/owatch: Overwatch, a watchdog wrapper
Petri Latvala
petri.latvala at intel.com
Fri Feb 24 11:19:22 UTC 2017
Overwatch will execute a program given as its parameters, quite like
`timeout'. It will then monitor the stdout and stderr output of the
child, and any activity on either will act as a heartbeat. If there is
no heartbeat in the given timeout period, the child process (group)
will be killed.
If Overwatch is run as root, it will also use a hardware watchdog
(/dev/watchdog[0-9]*) if it can open one. It will signal the hardware
watchdog on child process heartbeat and will just stop pinging it
(thus causing a reboot) if the child process does not heartbeat.
Signed-off-by: Petri Latvala <petri.latvala at intel.com>
---
utils/owatch/.gitignore | 2 +
utils/owatch/Makefile | 8 ++
utils/owatch/owatch.c | 258 ++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 268 insertions(+)
create mode 100644 utils/owatch/.gitignore
create mode 100644 utils/owatch/Makefile
create mode 100644 utils/owatch/owatch.c
diff --git a/utils/owatch/.gitignore b/utils/owatch/.gitignore
new file mode 100644
index 0000000..209df90
--- /dev/null
+++ b/utils/owatch/.gitignore
@@ -0,0 +1,2 @@
+owatch.o
+owatch
diff --git a/utils/owatch/Makefile b/utils/owatch/Makefile
new file mode 100644
index 0000000..c78666c
--- /dev/null
+++ b/utils/owatch/Makefile
@@ -0,0 +1,8 @@
+# Build the owatch watchdog wrapper (recipe lines must start with a TAB).
+all: owatch
+
+owatch.o: owatch.c
+	$(CC) -c -Wall -o $@ $<
+
+owatch: owatch.o
+	$(CC) -o $@ $<
diff --git a/utils/owatch/owatch.c b/utils/owatch/owatch.c
new file mode 100644
index 0000000..488341c
--- /dev/null
+++ b/utils/owatch/owatch.c
@@ -0,0 +1,258 @@
+#include <fcntl.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <linux/watchdog.h>
+#include <sys/ioctl.h>
+#include <sys/select.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+int usage(const char* exe) /* print the help text to stdout; always returns 1 */
+{
+	printf("Usage: %s timeout command [parameters]\n", exe);
+	fputs(" Executes command and watches for output.\n"
+	      " timeout - maximum time to wait for output before the process is killed.\n", stdout);
+	return 1;
+}
+
+int watchdogfd = -1; /* fd of the opened hardware watchdog device; negative while none is in use */
+
+void wd_settimeout(int timeout) /* hand `timeout` to the hw watchdog via WDIOC_SETTIMEOUT */
+{
+	if (watchdogfd < 0) return; /* no watchdog device in use */
+	ioctl(watchdogfd, WDIOC_SETTIMEOUT, &timeout);
+}
+
+void wd_heartbeat() /* ping the hardware watchdog via WDIOC_KEEPALIVE */
+{
+	if (watchdogfd < 0) return; /* no watchdog device in use */
+	ioctl(watchdogfd, WDIOC_KEEPALIVE, 0);
+}
+
+/* Write "V" to the watchdog device (presumably the Linux "magic close" character - confirm), close it and reset watchdogfd; no-op when none is open. */
+void wd_close()
+{
+	if (watchdogfd >= 0) {
+		write(watchdogfd, "V", 1);
+		close(watchdogfd);
+		watchdogfd = -1;
+	}
+}
+
+/* Probe /dev/watchdog0 .. /dev/watchdog24 and keep the first one that opens
+ * for writing in watchdogfd, passing `timeout` on to wd_settimeout(). */
+void open_watchdog_dev(int timeout)
+{
+	char path[255];
+	int idx, fd;
+	for (idx = 0; idx < 25; idx++) {
+		snprintf(path, sizeof(path), "/dev/watchdog%d", idx);
+		fd = open(path, O_WRONLY);
+		if (fd < 0)
+			continue; /* could not open this node: try the next one */
+		printf("owatch: Using watchdog device %s\n", path);
+		watchdogfd = fd;
+		wd_settimeout(timeout);
+		break;
+	}
+}
+
+/*
+ * Copy one chunk from `from` to `to`, retrying short writes so that no
+ * output is dropped.  Returns 1 if data was read, 0 on EOF, -2 if read() failed.
+ */
+static int forward_chunk(int from, int to)
+{
+	char buf[512];
+	ssize_t off = 0, w, len = read(from, buf, sizeof(buf));
+	if (len < 0) {
+		perror("read");
+		return -2;
+	}
+	for (; off < len; off += w) {
+		w = write(to, buf + off, len - off);
+		if (w <= 0)
+			break; /* best effort: a failing write only loses output */
+	}
+	return len > 0;
+}
+
+/*
+ * Wait up to `timeout` seconds for child output on `out`/`err` and relay it.
+ * Returns 1 if data was relayed, 0 on timeout, -1 if the ready fd(s) only
+ * signalled EOF, -2 if select() or read() failed.
+ */
+int pipe_output(int timeout, int out, int err)
+{
+	struct timeval tv = { .tv_sec = timeout };
+	fd_set set;
+	int src[2] = { out, err }, dst[2] = { STDOUT_FILENO, STDERR_FILENO };
+	int nfds = out > err ? out + 1 : err + 1;
+	int i, n, r, ret = -1;
+
+	FD_ZERO(&set);
+	FD_SET(out, &set);
+	FD_SET(err, &set);
+	n = select(nfds, &set, NULL, NULL, &tv);
+	if (n < 0) {
+		perror("select");
+		return -2;
+	}
+	if (!n)
+		return 0;
+
+	for (i = 0; i < 2; i++) {
+		r = FD_ISSET(src[i], &set) ? forward_chunk(src[i], dst[i]) : 0;
+		if (r < 0)
+			return r;
+		if (r > 0)
+			ret = 1;
+	}
+	return ret;
+}
+
+/*
+ * Parent side: relay the child's output until it stays silent for `timeout`
+ * seconds, reaches EOF or an error occurs; then reap the child and exit().
+ * Exit status: the child's own after EOF (1 if it did not exit normally),
+ * 1 on a relay error or failed reap, 7 after a timeout.  Never returns.
+ */
+void overwatch(pid_t child, int timeout, int outpipe[2], int errpipe[2])
+{
+	int n = 1;
+	int wstatus;
+	pid_t r;
+	pid_t killtarget = child;
+
+	close(outpipe[1]);
+	close(errpipe[1]);
+
+	open_watchdog_dev(timeout);
+
+	while (n > 0) {
+		wd_heartbeat();
+		n = pipe_output(timeout, outpipe[0], errpipe[0]);
+	}
+
+	wd_heartbeat();
+
+	close(outpipe[0]);
+	close(errpipe[0]);
+
+	if (n == 0) {
+		printf("owatch: TIMEOUT!\n");
+		/* stdout may be block-buffered: flush now, a watchdog reboot would lose it */
+		fflush(stdout);
+
+		/* Hack: If we have a hw watchdog, don't bother killing children. Just stop the heartbeat. */
+		if (watchdogfd >= 0) {
+			wd_settimeout(1);
+			sleep(3);
+		}
+
+		printf("owatch: Killing children\n");
+
+		if (!kill(-child, 0)) {
+			/* Child was able to setsid, process group exists.
+			 * Use process group as kill target.
+			 */
+			killtarget = -child;
+		}
+
+		kill(killtarget, SIGTERM);
+	}
+
+	r = waitpid(child, &wstatus, WNOHANG);
+
+	if (r == 0) {
+		/* still running: set the watchdog timeout to 30, then SIGKILL and reap */
+		wd_settimeout(30);
+		wd_heartbeat();
+		kill(killtarget, SIGKILL);
+		r = waitpid(child, &wstatus, 0);
+		wd_heartbeat();
+	}
+
+	wd_close();
+
+	if (r != child) {
+		printf("Child turned undead, hire a priest\n");
+		exit(1);
+	}
+
+	if (n == -1 && WIFEXITED(wstatus))
+		exit(WEXITSTATUS(wstatus)); /* normal termination */
+	if (n < 0)
+		exit(1); /* abnormal child exit, or relay error (-2) */
+
+	/* n == 0: timeout path */
+	exit(7);
+}
+
+/*
+ * Child side: start a new session with setsid(), route stdout/stderr into
+ * the write ends of the two pipes and execvp() argv[0].  Never returns;
+ * exits with 1 if dup2() or execvp() fails.
+ */
+void launch_child(int outpipe[2], int errpipe[2], char** argv)
+{
+	close(outpipe[0]);
+	close(errpipe[0]);
+
+	if (setsid() < 0)
+		perror("setsid"); /* continue anyway */
+
+	if (dup2(outpipe[1], STDOUT_FILENO) < 0 ||
+	    dup2(errpipe[1], STDERR_FILENO) < 0) {
+		perror("dup2");
+		exit(1);
+	}
+
+	close(outpipe[1]);
+	close(errpipe[1]);
+
+	execvp(argv[0], argv);
+
+	perror("execvp");
+	exit(1);
+}
+
+/* owatch <timeout> <command> [parameters]: fork, exec the command in the
+ * child and supervise it from the parent through two pipes. */
+int main(int argc, char** argv)
+{
+	int outpipe[2], errpipe[2], timeout;
+	long secs;
+	char *end;
+	pid_t child;
+
+	if (argc < 3)
+		exit(usage(argv[0]));
+
+	/* strtol rather than atoi so trailing junk ("10s") is rejected */
+	secs = strtol(argv[1], &end, 10);
+	timeout = (int)secs;
+	if (end == argv[1] || *end != '\0' || secs <= 0 || timeout != secs) {
+		fprintf(stderr, "Error: timeout must be positive and non-zero\n");
+		exit(1);
+	}
+
+	if (pipe(outpipe) || pipe(errpipe)) {
+		perror("pipe");
+		exit(1);
+	}
+
+	child = fork();
+	if (child < 0) {
+		perror("fork");
+		exit(1);
+	}
+	if (child == 0)
+		launch_child(outpipe, errpipe, &argv[2]);
+	else
+		overwatch(child, timeout, outpipe, errpipe);
+	return 255; /* not reached: neither helper returns */
+}
--
2.9.3
More information about the Ezbench-dev
mailing list