[Ezbench-dev] [PATCH 21/25] utils/owatch: Overwatch, a watchdog wrapper
Martin Peres
martin.peres at free.fr
Fri Feb 24 11:57:15 UTC 2017
On 24/02/17 13:19, Petri Latvala wrote:
> Overwatch will execute a program given as its parameters, quite like
> `timeout'. It will then monitor the stdout and stderr output of the
> child, and any activity on either will act as a heartbeat. If there is
> no heartbeat in the given timeout period, the child process (group)
> will be killed.
>
> If Overwatch is run as root, it will also use a hardware watchdog
> (/dev/watchdog[0-9]*) if it can open one. It will signal the hardware
> watchdog on child process heartbeat and will just stop pinging it
> (thus causing a reboot) if the child process does not heartbeat.
Can we get a description of this in the top-level README?
I would like it to list the dependencies and to state that this will only
be used by the tests that invoke it themselves.
> Signed-off-by: Petri Latvala <petri.latvala at intel.com>
> ---
> utils/owatch/.gitignore | 2 +
> utils/owatch/Makefile | 8 ++
> utils/owatch/owatch.c | 258 ++++++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 268 insertions(+)
> create mode 100644 utils/owatch/.gitignore
> create mode 100644 utils/owatch/Makefile
> create mode 100644 utils/owatch/owatch.c
>
> diff --git a/utils/owatch/.gitignore b/utils/owatch/.gitignore
> new file mode 100644
> index 0000000..209df90
> --- /dev/null
> +++ b/utils/owatch/.gitignore
> @@ -0,0 +1,2 @@
> +owatch.o
> +owatch
> diff --git a/utils/owatch/Makefile b/utils/owatch/Makefile
> new file mode 100644
> index 0000000..c78666c
> --- /dev/null
> +++ b/utils/owatch/Makefile
> @@ -0,0 +1,8 @@
> +
> +all: owatch
> +
> +owatch.o: owatch.c
> + $(CC) -c -Wall -o $@ $<
> +
> +owatch: owatch.o
> + $(CC) -o $@ $<
> diff --git a/utils/owatch/owatch.c b/utils/owatch/owatch.c
> new file mode 100644
> index 0000000..488341c
> --- /dev/null
> +++ b/utils/owatch/owatch.c
> @@ -0,0 +1,258 @@
> +#include <fcntl.h>
> +#include <signal.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <unistd.h>
> +#include <linux/watchdog.h>
> +#include <sys/ioctl.h>
> +#include <sys/select.h>
> +#include <sys/types.h>
> +#include <sys/wait.h>
> +
> +int usage(const char* exe)
> +{
> + printf("Usage: %s timeout command [parameters]\n", exe);
> + printf(" Executes command and watches for output.\n");
> + printf(" timeout - maximum time to wait for output before the process is killed.\n");
> + return 1;
> +}
> +
> +int watchdogfd = -1;
> +
> +void wd_settimeout(int timeout)
> +{
> + if (watchdogfd >= 0)
> + ioctl(watchdogfd, WDIOC_SETTIMEOUT, &timeout);
> +}
> +
> +void wd_heartbeat()
> +{
> + if (watchdogfd >= 0)
> + ioctl(watchdogfd, WDIOC_KEEPALIVE, 0);
> +}
> +
> +void wd_close()
> +{
> + if (watchdogfd < 0)
> + return;
> +
> + write(watchdogfd, "V", 1);
> + close(watchdogfd);
> + watchdogfd = -1;
> +}
> +
> +void open_watchdog_dev(int timeout)
> +{
> + int fd;
> + char buf[255];
> + int i;
> +
> + for (i = 0; i < 25; ++i) {
> + snprintf(buf, 255, "/dev/watchdog%d", i);
> + fd = open(buf, O_WRONLY);
> + if (fd >= 0) {
> + printf("owatch: Using watchdog device %s\n", buf);
> + watchdogfd = fd;
> + wd_settimeout(timeout);
> + return;
> + }
> + }
> +}
> +
> +/*
> + * return 0 if no output occurred, 1 if it did.
> + * -1 on eof on either fd, -2 for other errors
> + */
> +int pipe_output(int timeout, int out, int err)
> +{
> + struct timeval tv = { .tv_sec = timeout };
> + fd_set set;
> + int nfds = out > err ? out + 1 : err + 1;
> + int n, ret;
> + char buf[512];
> +
> + FD_ZERO(&set);
> + FD_SET(out, &set);
> + FD_SET(err, &set);
> +
> + n = select(nfds, &set, NULL, NULL, &tv);
> + if (n < 0) {
> + perror("select");
> + return -2;
> + }
> + if (!n) {
> + return 0;
> + }
> +
> + ret = -1;
> + if (FD_ISSET(out, &set)) {
> + ssize_t s = read(out, buf, sizeof(buf));
> + if (s < 0) {
> + perror("read");
> + return -2;
> + }
> +
> + if (s > 0) {
> + write(STDOUT_FILENO, buf, s);
> + ret = 1;
> + }
> + }
> + if (FD_ISSET(err, &set)) {
> + ssize_t s = read(err, buf, sizeof(buf));
> + if (s < 0) {
> + perror("read");
> + return -2;
> + }
> +
> + if (s > 0) {
> + write(STDERR_FILENO, buf, s);
> + ret = 1;
> + }
> + }
> +
> + return ret;
> +}
> +
> +void overwatch(pid_t child, int timeout, int outpipe[2], int errpipe[2])
> +{
> + int n = 1;
> + int wstatus;
> + pid_t r;
> + pid_t killtarget = child;
> +
> + close(outpipe[1]);
> + close(errpipe[1]);
> +
> + open_watchdog_dev(timeout);
> +
> + while (n > 0) {
> + wd_heartbeat();
> + n = pipe_output(timeout, outpipe[0], errpipe[0]);
> + }
> +
> + wd_heartbeat();
> +
> + close(outpipe[0]);
> + close(errpipe[0]);
> +
> + if (n == 0) {
> + printf("owatch: TIMEOUT!\n");
> +
> + /* Hack: If we have a hw watchdog, don't bother killing children. Just stop the heartbeat. */
> + if (watchdogfd >= 0) {
> + wd_settimeout(1);
> + sleep(3);
> + }
> +
> + printf("owatch: Killing children\n");
> +
> + if (!kill(-child, 0)) {
> + /* Child was able to setsid, process group exists.
> + * Use process group as kill target.
> + */
> + killtarget = -child;
> + }
> +
> + kill(killtarget, 15);
> + }
> +
> + r = waitpid(child, &wstatus, WNOHANG);
> +
> + if (r == 0) {
> + wd_settimeout(30);
> + wd_heartbeat();
> + kill(killtarget, 9);
> + r = waitpid(child, &wstatus, 0);
> + wd_heartbeat();
> + }
> +
> + wd_close();
> +
> + if (r != child) {
> + printf("Child turned undead, hire a priest\n");
> + exit(1);
> + }
> +
> + if (n == -1) {
> + /* normal termination */
> + if (WIFEXITED(wstatus)) {
> + exit(WEXITSTATUS(wstatus));
> + } else {
> + exit(1);
> + }
> + }
> +
> + if (n == -2) {
> + /* error occurred */
> + exit(1);
> + }
> +
> + /* shouldn't be reached */
> + exit(7);
> +}
> +
> +void launch_child(int outpipe[2], int errpipe[2], char** argv)
> +{
> + pid_t sid;
> +
> + close(outpipe[0]);
> + close(errpipe[0]);
> +
> + sid = setsid();
> + if (sid < 0) {
> + perror("setsid");
> + /* continue anyway */
> + }
> +
> + if (dup2(outpipe[1], STDOUT_FILENO) < 0 ||
> + dup2(errpipe[1], STDERR_FILENO) < 0) {
> + perror("dup2");
> + exit(1);
> + }
> +
> + close(outpipe[1]);
> + close(errpipe[1]);
> +
> + execvp(argv[0], argv);
> +
> + perror("execvp");
> + exit(1);
> +}
> +
> +int main(int argc, char** argv)
> +{
> + int outpipe[2];
> + int errpipe[2];
> + int timeout;
> + pid_t child;
> +
> + if (argc < 3) {
> + exit(usage(argv[0]));
> + }
> +
> + timeout = atoi(argv[1]);
> + if (timeout <= 0) {
> + fprintf(stderr, "Error: timeout must be positive and non-zero\n");
> + exit(1);
> + }
> +
> + if (pipe(outpipe) || pipe(errpipe)) {
> + perror("pipe");
> + exit(1);
> + }
> +
> + if ((child = fork())) {
> + if (child < 0) {
> + perror("fork");
> + exit(1);
> + }
> +
> + overwatch(child, timeout, outpipe, errpipe);
> + } else {
> + launch_child(outpipe, errpipe, &argv[2]);
> + }
> +
> + __builtin_unreachable();
> + return 255;
> +}
More information about the Ezbench-dev
mailing list