[systemd-commits] 2 commits - Makefile.am src/nspawn src/shared

Lennart Poettering lennart at kemper.freedesktop.org
Sat May 24 20:25:57 PDT 2014


 Makefile.am               |    4 
 src/nspawn/nspawn.c       |  207 ++++++++++++++++++++++++++++++++--------------
 src/shared/eventfd-util.c |  169 +++++++++++++++++++++++++++++++++++++
 src/shared/eventfd-util.h |   43 +++++++++
 4 files changed, 360 insertions(+), 63 deletions(-)

New commits:
commit e866af3acc30fcd1183a028ea3ef552b7237cc55
Author: Djalal Harouni <tixxdz at opendz.org>
Date:   Sat May 24 14:58:55 2014 +0100

    nspawn: make nspawn robust to container failure
    
    nspawn and the container child use eventfd to wait and notify each other
    that they are ready so the container setup can be completed.
    
    However in its current form the wait/notify event ignore errors that
    may especially affect the child (container).
    
    On errors the child will jump to the "child_fail" label and terminate
    with _exit(EXIT_FAILURE) without notifying the parent. Since the eventfd
    is created without the "EFD_NONBLOCK" flag, this leaves the parent
    blocking on the eventfd_read() call. The container can also be killed
    at any moment before execv() and the parent will not receive
    notifications.
    
    We can fix this by using cheap mechanisms, the new high level eventfd
    API and handle SIGCHLD signals:
    
    * Keep the cheap eventfd and EFD_NONBLOCK flag.
    
    * Introduce eventfd states for parent and child to sync.
    Child notifies parent with EVENTFD_CHILD_SUCCEEDED on success or
    EVENTFD_CHILD_FAILED on failure and before _exit(). This prevents the
    parent from waiting on an event that will never come.
    
    * If the child is killed before execv() or before notifying the parent,
    we install a NOP handler for SIGCHLD which will interrupt blocking calls
    with EINTR. This gives a chance to the parent to call wait() and
    terminate in main().
    
    * If there are no errors, parent will block SIGCHLD, restore default
    handler and notify child which will do execv(), then parent will pass
    control to process_pty() to do its magic.
    
    This was exposed in part by:
    https://bugs.freedesktop.org/show_bug.cgi?id=76193
    
    Reported-by: Tobias Hunger tobias.hunger at gmail.com

diff --git a/Makefile.am b/Makefile.am
index f517f19..4b60735 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -805,7 +805,9 @@ libsystemd_shared_la_SOURCES = \
 	src/shared/ring.c \
 	src/shared/ring.h \
 	src/shared/async.c \
-	src/shared/async.h
+	src/shared/async.h \
+	src/shared/eventfd-util.c \
+	src/shared/eventfd-util.h
 
 nodist_libsystemd_shared_la_SOURCES = \
 	src/shared/errno-from-name.h \
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index 0cd476c..a1d7724 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -84,6 +84,7 @@
 #include "def.h"
 #include "rtnl-util.h"
 #include "udev-util.h"
+#include "eventfd-util.h"
 #include "blkid-util.h"
 #include "gpt.h"
 #include "siphash24.h"
@@ -2642,6 +2643,8 @@ static int wait_for_container(pid_t pid, ContainerStatus *container) {
         return r;
 }
 
+static void nop_handler(int sig) {}
+
 int main(int argc, char *argv[]) {
 
         _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
@@ -2653,8 +2656,8 @@ int main(int argc, char *argv[]) {
         const char *console = NULL;
         char veth_name[IFNAMSIZ];
         bool secondary = false;
+        sigset_t mask, mask_chld;
         pid_t pid = 0;
-        sigset_t mask;
 
         log_parse_environment();
         log_open();
@@ -2816,36 +2819,44 @@ int main(int argc, char *argv[]) {
         sd_notify(0, "READY=1");
 
         assert_se(sigemptyset(&mask) == 0);
+        assert_se(sigemptyset(&mask_chld) == 0);
+        sigaddset(&mask_chld, SIGCHLD);
         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
 
         for (;;) {
                 ContainerStatus container_status;
-                int parent_ready_fd = -1, child_ready_fd = -1;
-                eventfd_t x;
-
-                parent_ready_fd = eventfd(0, EFD_CLOEXEC);
-                if (parent_ready_fd < 0) {
-                        log_error("Failed to create event fd: %m");
+                int eventfds[2] = { -1, -1 };
+                struct sigaction sa = {
+                        .sa_handler = nop_handler,
+                        .sa_flags = SA_NOCLDSTOP,
+                };
+
+                /* Child can be killed before execv(), so handle SIGCHLD
+                 * in order to interrupt parent's blocking calls and
+                 * give it a chance to call wait() and terminate. */
+                r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
+                if (r < 0) {
+                        log_error("Failed to change the signal mask: %m");
                         goto finish;
                 }
 
-                child_ready_fd = eventfd(0, EFD_CLOEXEC);
-                if (child_ready_fd < 0) {
-                        log_error("Failed to create event fd: %m");
+                r = sigaction(SIGCHLD, &sa, NULL);
+                if (r < 0) {
+                        log_error("Failed to install SIGCHLD handler: %m");
                         goto finish;
                 }
 
-                pid = syscall(__NR_clone,
-                              SIGCHLD|CLONE_NEWNS|
-                              (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
-                              (arg_private_network ? CLONE_NEWNET : 0), NULL);
+                pid = clone_with_eventfd(SIGCHLD|CLONE_NEWNS|
+                                         (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
+                                         (arg_private_network ? CLONE_NEWNET : 0), eventfds);
                 if (pid < 0) {
                         if (errno == EINVAL)
                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
                         else
                                 log_error("clone() failed: %m");
 
+                        r = pid;
                         goto finish;
                 }
 
@@ -2986,8 +2997,11 @@ int main(int argc, char *argv[]) {
                         /* Tell the parent that we are ready, and that
                          * it can cgroupify us to that we lack access
                          * to certain devices and resources. */
-                        eventfd_write(child_ready_fd, 1);
-                        child_ready_fd = safe_close(child_ready_fd);
+                        r = eventfd_send_state(eventfds[1],
+                                               EVENTFD_CHILD_SUCCEEDED);
+                        eventfds[1] = safe_close(eventfds[1]);
+                        if (r < 0)
+                                goto child_fail;
 
                         if (chdir(arg_directory) < 0) {
                                 log_error("chdir(%s) failed: %m", arg_directory);
@@ -3089,8 +3103,10 @@ int main(int argc, char *argv[]) {
                                 env_use = (char**) envp;
 
                         /* Wait until the parent is ready with the setup, too... */
-                        eventfd_read(parent_ready_fd, &x);
-                        parent_ready_fd = safe_close(parent_ready_fd);
+                        r = eventfd_parent_succeeded(eventfds[0]);
+                        eventfds[0] = safe_close(eventfds[0]);
+                        if (r < 0)
+                                goto child_fail;
 
                         if (arg_boot) {
                                 char **a;
@@ -3121,17 +3137,27 @@ int main(int argc, char *argv[]) {
                         log_error("execv() failed: %m");
 
                 child_fail:
+                        /* Tell the parent that the setup failed, so he
+                         * can clean up resources and terminate. */
+                        if (eventfds[1] != -1)
+                                eventfd_send_state(eventfds[1],
+                                                   EVENTFD_CHILD_FAILED);
                         _exit(EXIT_FAILURE);
                 }
 
                 fdset_free(fds);
                 fds = NULL;
 
-                /* Wait until the child reported that it is ready with
-                 * all it needs to do with privileges. After we got
-                 * the notification we can make the process join its
-                 * cgroup which might limit what it can do */
-                eventfd_read(child_ready_fd, &x);
+                /* Wait for the child event:
+                 * If EVENTFD_CHILD_FAILED, the child will terminate soon.
+                 * If EVENTFD_CHILD_SUCCEEDED, the child is reporting that
+                 * it is ready with all it needs to do with priviliges.
+                 * After we got the notification we can make the process
+                 * join its cgroup which might limit what it can do */
+                r = eventfd_child_succeeded(eventfds[1]);
+                eventfds[1] = safe_close(eventfds[1]);
+                if (r < 0)
+                        goto check_container_status;
 
                 r = register_machine(pid);
                 if (r < 0)
@@ -3153,10 +3179,25 @@ int main(int argc, char *argv[]) {
                 if (r < 0)
                         goto finish;
 
+                /* Block SIGCHLD here, before notifying child.
+                 * process_pty() will handle it with the other signals. */
+                r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
+                if (r < 0)
+                        goto finish;
+
+                /* Reset signal to default */
+                r = default_signals(SIGCHLD, -1);
+                if (r < 0)
+                        goto finish;
+
                 /* Notify the child that the parent is ready with all
-                 * its setup, and thtat the child can now hand over
+                 * its setup, and that the child can now hand over
                  * control to the code to run inside the container. */
-                eventfd_write(parent_ready_fd, 1);
+                r = eventfd_send_state(eventfds[0],
+                                       EVENTFD_PARENT_SUCCEEDED);
+                eventfds[0] = safe_close(eventfds[0]);
+                if (r < 0)
+                        goto finish;
 
                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
                 if (k < 0) {
@@ -3170,6 +3211,7 @@ int main(int argc, char *argv[]) {
                 /* Kill if it is not dead yet anyway */
                 terminate_machine(pid);
 
+check_container_status:
                 /* Redundant, but better safe than sorry */
                 kill(pid, SIGKILL);
 
diff --git a/src/shared/eventfd-util.c b/src/shared/eventfd-util.c
new file mode 100644
index 0000000..27b7cf7
--- /dev/null
+++ b/src/shared/eventfd-util.c
@@ -0,0 +1,169 @@
+/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
+
+/***
+  This file is part of systemd.
+
+  Copyright 2014 Djalal Harouni
+
+  systemd is free software; you can redistribute it and/or modify it
+  under the terms of the GNU Lesser General Public License as published by
+  the Free Software Foundation; either version 2.1 of the License, or
+  (at your option) any later version.
+
+  systemd is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with systemd; If not, see <http://www.gnu.org/licenses/>.
+***/
+
+#include <assert.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/eventfd.h>
+#include <sys/syscall.h>
+
+#include "eventfd-util.h"
+#include "util.h"
+
+
+/*
+ * Use this to create processes that need to setup a full context
+ * and sync it with their parents using cheap mechanisms.
+ *
+ * This will create two blocking eventfd(s). A pair for the parent and
+ * the other for the child so they can be used as a notify mechanism.
+ * Each process will gets its copy of the parent and child eventfds.
+ *
+ * This is useful in case:
+ * 1) If the parent fails or dies, the child must die.
+ * 2) Child will install PR_SET_PDEATHSIG as soon as possible.
+ * 3) Parent and child need to sync using less resources.
+ * 4) If parent is not able to install a SIGCHLD handler:
+ *    parent will wait using a blocking eventfd_read() or
+ *    eventfd_child_succeeded() call on the child eventfd.
+ *
+ *    * If the child setup succeeded, child should notify with an
+ *      EVENTFD_CHILD_SUCCEEDED, parent will continue.
+ *    * If the child setup failed, child should notify with an
+ *      EVENTFD_CHILD_FAILED before any _exit(). This avoids blocking
+ *      the parent.
+ *
+ * 5) If parent is able to install a SIGCHLD handler:
+ *    An empty signal handler without SA_RESTART will do it, since the
+ *    blocking eventfd_read() or eventfd_parent_succeeded() of the
+ *    parent will be interrupted by SIGCHLD and the call will fail with
+ *    EINTR. This is useful in case the child dies abnormaly and did
+ *    not have a chance to notify its parent using EVENTFD_CHILD_FAILED.
+ *
+ * 6) Call wait*() in the main instead of the signal handler in order
+ *    to: 1) reduce side effects and 2) have a better handling for
+ *    child termination in order to reduce various race conditions.
+ *
+ *
+ * The return value of clone_with_eventfd() is the same of clone().
+ * On success the eventfds[] will contain the two eventfd(s). These
+ * file descriptors can be closed later with safe_close(). On failure,
+ * a negative value is returned in the caller's context, and errno will
+ * be set appropriately.
+ *
+ * Extra preliminary work:
+ * 1) Child can wait before starting its setup by using the
+ *    eventfd_recv_start() call on the parent eventfd, in that case the
+ *    parent must notify with EVENTFD_START, after doing any preliminary
+ *    work.
+ *
+ * Note: this function depends on systemd internal functions
+ * safe_close() and it should be used only by direct binaries, no
+ * libraries.
+ */
+pid_t clone_with_eventfd(int flags, int eventfds[2]) {
+        pid_t pid;
+
+        assert(eventfds);
+
+        eventfds[0] = eventfd(EVENTFD_INIT, EFD_CLOEXEC);
+        if (eventfds[0] < 0)
+                return -1;
+
+        eventfds[1] = eventfd(EVENTFD_INIT, EFD_CLOEXEC);
+        if (eventfds[1] < 0)
+                goto err_eventfd0;
+
+        pid = syscall(__NR_clone, flags, NULL);
+        if (pid < 0)
+                goto err_eventfd1;
+
+        return pid;
+
+err_eventfd1:
+        eventfds[1] = safe_close(eventfds[1]);
+err_eventfd0:
+        eventfds[0] = safe_close(eventfds[0]);
+        return -1;
+}
+
+int eventfd_send_state(int efd, eventfd_t s) {
+        return eventfd_write(efd, s);
+}
+
+/*
+ * Receive an eventfd state on the eventfd file descriptor.
+ *
+ * If the third argument is set to a value other than zero, then this
+ * function will compare the received value with this argument and set
+ * the return value.
+ *
+ * On success return 0. On error, -1 will be returned, and errno will
+ * be set appropriately.
+ */
+int eventfd_recv_state(int efd, eventfd_t *e, eventfd_t s) {
+        int ret;
+
+        ret = eventfd_read(efd, e);
+        if (ret < 0)
+                return ret;
+        else if (s != 0 && *e != s) {
+                errno = EINVAL;
+                return -1;
+        }
+
+        return 0;
+}
+
+/*
+ * Receive the EVENTFD_START state on the eventfd file descriptor.
+ *
+ * On Success return 0. On error, -1 will be returned, and errno will
+ * be set appropriately.
+ */
+int eventfd_recv_start(int efd) {
+        eventfd_t e = EVENTFD_INIT;
+        return eventfd_recv_state(efd, &e, EVENTFD_START);
+}
+
+/*
+ * Receive the EVENTFD_PARENT_SUCCEEDED state on the eventfd file
+ * descriptor.
+ *
+ * On Success return 0. On error, -1 will be returned, and errno will
+ * be set appropriately.
+ */
+int eventfd_parent_succeeded(int efd) {
+        eventfd_t e = EVENTFD_INIT;
+        return eventfd_recv_state(efd, &e, EVENTFD_PARENT_SUCCEEDED);
+}
+
+/*
+ * Receive the EVENTFD_CHILD_SUCCEEDED state on the eventfd file
+ * descriptor.
+ *
+ * On Success return 0. On error, -1 will be returned, and errno will
+ * be set appropriately.
+ */
+int eventfd_child_succeeded(int efd) {
+        eventfd_t e = EVENTFD_INIT;
+        return eventfd_recv_state(efd, &e, EVENTFD_CHILD_SUCCEEDED);
+}
diff --git a/src/shared/eventfd-util.h b/src/shared/eventfd-util.h
new file mode 100644
index 0000000..0120f04
--- /dev/null
+++ b/src/shared/eventfd-util.h
@@ -0,0 +1,43 @@
+/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
+
+#pragma once
+
+/***
+  This file is part of systemd.
+
+  Copyright 2014 Djalal Harouni
+
+  systemd is free software; you can redistribute it and/or modify it
+  under the terms of the GNU Lesser General Public License as published by
+  the Free Software Foundation; either version 2.1 of the License, or
+  (at your option) any later version.
+
+  systemd is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with systemd; If not, see <http://www.gnu.org/licenses/>.
+***/
+
+#include <sys/types.h>
+#include <sys/eventfd.h>
+
+enum {
+        EVENTFD_INIT,
+        EVENTFD_START,
+        EVENTFD_PARENT_SUCCEEDED,
+        EVENTFD_PARENT_FAILED,
+        EVENTFD_CHILD_SUCCEEDED,
+        EVENTFD_CHILD_FAILED,
+};
+
+pid_t clone_with_eventfd(int flags, int eventfds[2]);
+
+int eventfd_send_state(int efd, eventfd_t s);
+int eventfd_recv_state(int efd, eventfd_t *e, eventfd_t s);
+
+int eventfd_recv_start(int efd);
+int eventfd_parent_succeeded(int efd);
+int eventfd_child_succeeded(int efd);

commit 113cea802db444beab4783538d39966f707be788
Author: Djalal Harouni <tixxdz at opendz.org>
Date:   Sat May 24 14:58:54 2014 +0100

    nspawn: move container wait logic into wait_for_container()
    
    Move the container wait logic into its own wait_for_container() function
    and add two status codes: CONTAINER_TERMINATED or CONTAINER_REBOOTED.
    The status will be stored in its argument, this way we handle:
    a) Return negative on failures.
    b) Return zero on success and set the status to either
       CONTAINER_REBOOTED or CONTAINER_TERMINATED.
    
    These status codes are used to terminate nspawn or loop again in case of
    CONTAINER_REBOOTED.

diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index 73158a0..0cd476c 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -92,6 +92,11 @@
 #include "seccomp-util.h"
 #endif
 
+typedef enum ContainerStatus {
+        CONTAINER_TERMINATED,
+        CONTAINER_REBOOTED
+} ContainerStatus;
+
 typedef enum LinkJournal {
         LINK_NO,
         LINK_AUTO,
@@ -2569,6 +2574,74 @@ static int change_uid_gid(char **_home) {
         return 0;
 }
 
+/*
+ * Return 0 in case the container is being rebooted, has been shut
+ * down or exited successfully. On failures a negative value is
+ * returned.
+ *
+ * The status of the container "CONTAINER_TERMINATED" or
+ * "CONTAINER_REBOOTED" will be saved in the container argument
+ */
+static int wait_for_container(pid_t pid, ContainerStatus *container) {
+        int r;
+        siginfo_t status;
+
+        r = wait_for_terminate(pid, &status);
+        if (r < 0)
+                return r;
+
+        switch (status.si_code) {
+        case CLD_EXITED:
+                r = status.si_status;
+                if (r == 0) {
+                        if (!arg_quiet)
+                                log_debug("Container %s exited successfully.",
+                                          arg_machine);
+
+                        *container = CONTAINER_TERMINATED;
+                } else {
+                        log_error("Container %s failed with error code %i.",
+                                  arg_machine, status.si_status);
+                        r = -1;
+                }
+                break;
+
+        case CLD_KILLED:
+                if (status.si_status == SIGINT) {
+                        if (!arg_quiet)
+                                log_info("Container %s has been shut down.",
+                                         arg_machine);
+
+                        *container = CONTAINER_TERMINATED;
+                        r = 0;
+                        break;
+                } else if (status.si_status == SIGHUP) {
+                        if (!arg_quiet)
+                                log_info("Container %s is being rebooted.",
+                                         arg_machine);
+
+                        *container = CONTAINER_REBOOTED;
+                        r = 0;
+                        break;
+                }
+                /* CLD_KILLED fallthrough */
+
+        case CLD_DUMPED:
+                log_error("Container %s terminated by signal %s.",
+                          arg_machine, signal_to_string(status.si_status));
+                r = -1;
+                break;
+
+        default:
+                log_error("Container %s failed due to unknown reason.",
+                          arg_machine);
+                r = -1;
+                break;
+        }
+
+        return r;
+}
+
 int main(int argc, char *argv[]) {
 
         _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
@@ -2747,8 +2820,8 @@ int main(int argc, char *argv[]) {
         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
 
         for (;;) {
+                ContainerStatus container_status;
                 int parent_ready_fd = -1, child_ready_fd = -1;
-                siginfo_t status;
                 eventfd_t x;
 
                 parent_ready_fd = eventfd(0, EFD_CLOEXEC);
@@ -3100,48 +3173,16 @@ int main(int argc, char *argv[]) {
                 /* Redundant, but better safe than sorry */
                 kill(pid, SIGKILL);
 
-                k = wait_for_terminate(pid, &status);
+                r = wait_for_container(pid, &container_status);
                 pid = 0;
 
-                if (k < 0) {
+                if (r < 0) {
                         r = EXIT_FAILURE;
                         break;
-                }
-
-                if (status.si_code == CLD_EXITED) {
-                        r = status.si_status;
-                        if (status.si_status != 0) {
-                                log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
-                                break;
-                        }
-
-                        if (!arg_quiet)
-                                log_debug("Container %s exited successfully.", arg_machine);
-                        break;
-                } else if (status.si_code == CLD_KILLED &&
-                           status.si_status == SIGINT) {
-
-                        if (!arg_quiet)
-                                log_info("Container %s has been shut down.", arg_machine);
-                        r = 0;
+                } else if (container_status == CONTAINER_TERMINATED)
                         break;
-                } else if (status.si_code == CLD_KILLED &&
-                           status.si_status == SIGHUP) {
-
-                        if (!arg_quiet)
-                                log_info("Container %s is being rebooted.", arg_machine);
-                        continue;
-                } else if (status.si_code == CLD_KILLED ||
-                           status.si_code == CLD_DUMPED) {
 
-                        log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
-                        r = EXIT_FAILURE;
-                        break;
-                } else {
-                        log_error("Container %s failed due to unknown reason.", arg_machine);
-                        r = EXIT_FAILURE;
-                        break;
-                }
+                /* CONTAINER_REBOOTED, loop again */
         }
 
 finish:



More information about the systemd-commits mailing list