[systemd-commits] 3 commits - .gitignore Makefile.am src/nspawn src/shared src/test

Thu Jul 17 02:43:12 PDT 2014

.gitignore                |    2 
 Makefile.am               |   20 +
 src/nspawn/nspawn.c       |  136 ++++-----
 src/shared/barrier.c      |  440 +++++++++++++++++++++++++++++++
 src/shared/barrier.h      |   92 ++++++
 src/shared/eventfd-util.c |  169 ------------
 src/shared/eventfd-util.h |   43 ---
 src/shared/pty.c          |  640 ++++++++++++++++++++++++++++++++++++++++++++++
 src/shared/pty.h          |   77 +++++
 src/test/test-barrier.c   |  460 +++++++++++++++++++++++++++++++++
 src/test/test-pty.c       |  143 ++++++++++
 11 files changed, 1932 insertions(+), 290 deletions(-)

New commits:
commit a47d1dfd0823cd3978dd10e217dadcee7e01b265
Author: David Herrmann <dh.herrmann at gmail.com>
Date:   Fri Jul 11 16:29:56 2014 +0200

    shared: add PTY helper
    
    This Pty API wraps the ugliness that is POSIX PTY. It takes care of:
      - edge-triggered HUP handling (avoid heavy CPU-usage on vhangup)
      - HUP vs. input-queue draining (handle HUP _after_ draining the whole
        input queue)
      - SIGCHLD vs. HUP (HUP is no reliable way to catch PTY deaths, always
        use SIGCHLD. Otherwise, vhangup() and friends will break.)
      - Output queue buffering (async EPOLLOUT handling)
      - synchronous setup (via Barrier API)
    
    At the same time, the PTY API does not execve(). It simply fork()s and
    leaves everything else to the caller. Usually, they execve() but we
    support other setups, too.
    
    This will be needed by multiple UI binaries (systemd-console, systemd-er,
    ...) so it's placed in src/shared/. It's not strictly related to
    libsystemd-terminal, so it's not included there.

diff --git a/.gitignore b/.gitignore
index 4860631..aecc6ae 100644
--- a/.gitignore
+++ b/.gitignore
@@ -204,6 +204,7 @@
 /test-path-util
 /test-prioq
 /test-ratelimit
+/test-pty
 /test-replace-var
 /test-resolve
 /test-ring
diff --git a/Makefile.am b/Makefile.am
index fe680b0..a9ee8b0 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -843,6 +843,8 @@ libsystemd_shared_la_SOURCES = \
 	src/shared/ring.h \
 	src/shared/barrier.c \
 	src/shared/barrier.h \
+	src/shared/pty.c \
+	src/shared/pty.h \
 	src/shared/async.c \
 	src/shared/async.h \
 	src/shared/copy.c \
@@ -1252,6 +1254,7 @@ tests += \
 	test-util \
 	test-ring \
 	test-barrier \
+	test-pty \
 	test-tmpfiles \
 	test-namespace \
 	test-date \
@@ -1428,6 +1431,12 @@ test_barrier_SOURCES = \
 test_barrier_LDADD = \
 	libsystemd-core.la
 
+test_pty_SOURCES = \
+	src/test/test-pty.c
+
+test_pty_LDADD = \
+	libsystemd-core.la
+
 test_tmpfiles_SOURCES = \
 	src/test/test-tmpfiles.c
 
diff --git a/src/shared/pty.c b/src/shared/pty.c
new file mode 100644
index 0000000..11d76f8
--- /dev/null
+++ b/src/shared/pty.c
@@ -0,0 +1,640 @@
+/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
+
+/***
+  This file is part of systemd.
+
+  Copyright 2014 David Herrmann <dh.herrmann at gmail.com>
+
+  systemd is free software; you can redistribute it and/or modify it
+  under the terms of the GNU Lesser General Public License as published by
+  the Free Software Foundation; either version 2.1 of the License, or
+  (at your option) any later version.
+
+  systemd is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with systemd; If not, see <http://www.gnu.org/licenses/>.
+***/
+
+/*
+ * PTY
+ * A PTY object represents a single PTY connection between a master and a
+ * child. The child process is fork()ed so the caller controls what program
+ * will be run.
+ *
+ * Programs like /bin/login tend to perform a vhangup() on their TTY
+ * before running the login procedure. This also causes the pty master
+ * to get an EPOLLHUP event as long as no client has the TTY opened.
+ * This means we cannot use the TTY connection as a reliable way to track
+ * the client. Instead, we _must_ rely on the PID of the client to track
+ * them.
+ * However, this has the side effect that if the client forks and the
+ * parent exits, we lose them and restart the client. But this seems to
+ * be the expected behavior so we implement it here.
+ *
+ * Unfortunately, epoll always polls for EPOLLHUP so as long as the
+ * vhangup() is ongoing, we will _always_ get EPOLLHUP and cannot sleep.
+ * This gets worse if the client closes the TTY but doesn't exit.
+ * Therefore, the fd must be edge-triggered in the epoll-set so we
+ * only get the events once they change.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <pty.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/epoll.h>
+#include <sys/eventfd.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <sys/wait.h>
+#include <termios.h>
+#include <unistd.h>
+
+#include "barrier.h"
+#include "macro.h"
+#include "pty.h"
+#include "ring.h"
+#include "util.h"
+
+#define PTY_BUFSIZE 16384
+
+enum {
+        PTY_ROLE_UNKNOWN,
+        PTY_ROLE_PARENT,
+        PTY_ROLE_CHILD,
+};
+
+struct Pty {
+        unsigned long ref;
+        Barrier barrier;
+        int fd;
+        pid_t child;
+        sd_event_source *fd_source;
+        sd_event_source *child_source;
+
+        char in_buf[PTY_BUFSIZE];
+        Ring out_buf;
+
+        pty_event_t event_fn;
+        void *event_fn_userdata;
+
+        bool needs_requeue : 1;
+        unsigned int role : 2;
+};
+
+int pty_new(Pty **out) {
+        _pty_unref_ Pty *pty = NULL;
+        int r;
+
+        assert_return(out, -EINVAL);
+
+        pty = new0(Pty, 1);
+        if (!pty)
+                return -ENOMEM;
+
+        pty->ref = 1;
+        pty->fd = -1;
+
+        pty->fd = posix_openpt(O_RDWR | O_NOCTTY | O_CLOEXEC | O_NONBLOCK);
+        if (pty->fd < 0)
+                return -errno;
+
+        /*
+         * The slave-node is initialized to uid/gid of the caller of
+         * posix_openpt(). Only if devpts is mounted with fixed uid/gid this is
+         * skipped. In that case, grantpt() can overwrite these, but then you
+         * have to be root to use chown() (or a pt_chown helper has to be
+         * present). In those cases grantpt() really does something,
+         * otherwise it's a no-op. We call grantpt() here to try supporting
+         * those cases, even though no-one uses that, I guess. If you need other
+         * access-rights, set them yourself after this call returns (no, this is
+         * not racy, it looks racy, but races regarding your own UID are never
+         * important as an attacker could ptrace you; and the slave-pty is also
+         * still locked).
+         */
+        r = grantpt(pty->fd);
+        if (r < 0)
+                return -errno;
+
+        r = barrier_init(&pty->barrier);
+        if (r < 0)
+                return r;
+
+        *out = pty;
+        pty = NULL;
+        return 0;
+}
+
+Pty *pty_ref(Pty *pty) {
+        if (!pty || pty->ref < 1)
+                return NULL;
+
+        ++pty->ref;
+        return pty;
+}
+
+Pty *pty_unref(Pty *pty) {
+        if (!pty || pty->ref < 1 || --pty->ref > 0)
+                return NULL;
+
+        pty_close(pty);
+        pty->child_source = sd_event_source_unref(pty->child_source);
+        barrier_destroy(&pty->barrier);
+        ring_clear(&pty->out_buf);
+        free(pty);
+
+        return NULL;
+}
+
+Barrier *pty_get_barrier(Pty *pty) {
+        assert(pty);
+        return &pty->barrier;
+}
+
+bool pty_is_unknown(Pty *pty) {
+        return pty && pty->role == PTY_ROLE_UNKNOWN;
+}
+
+bool pty_is_parent(Pty *pty) {
+        return pty && pty->role == PTY_ROLE_PARENT;
+}
+
+bool pty_is_child(Pty *pty) {
+        return pty && pty->role == PTY_ROLE_CHILD;
+}
+
+bool pty_has_child(Pty *pty) {
+        return pty_is_parent(pty) && pty->child > 0;
+}
+
+pid_t pty_get_child(Pty *pty) {
+        return pty_has_child(pty) ? pty->child : -ECHILD;
+}
+
+bool pty_is_open(Pty *pty) {
+        return pty && pty->fd >= 0;
+}
+
+int pty_get_fd(Pty *pty) {
+        assert_return(pty, -EINVAL);
+
+        return pty_is_open(pty) ? pty->fd : -EPIPE;
+}
+
+int pty_make_child(Pty *pty) {
+        char slave_name[1024];
+        int r, fd;
+        /* Become the child side: replace the master fd with the slave node. */
+        assert_return(pty, -EINVAL);
+        assert_return(pty_is_unknown(pty), -EALREADY);
+        /* ptsname_r() returns a positive error number on failure, not -1. */
+        r = ptsname_r(pty->fd, slave_name, sizeof(slave_name));
+        if (r != 0)
+                return r > 0 ? -r : -errno;
+
+        fd = open(slave_name, O_RDWR | O_CLOEXEC | O_NOCTTY);
+        if (fd < 0)
+                return -errno;
+
+        safe_close(pty->fd);
+        pty->fd = fd;
+        pty->child = getpid();
+        pty->role = PTY_ROLE_CHILD;
+        barrier_set_role(&pty->barrier, BARRIER_CHILD);
+
+        return 0;
+}
+
+int pty_make_parent(Pty *pty, pid_t child) {
+        assert_return(pty, -EINVAL);
+        assert_return(pty_is_unknown(pty), -EALREADY);
+
+        pty->child = child;
+        pty->role = PTY_ROLE_PARENT;
+
+        return 0;
+}
+
+int pty_unlock(Pty *pty) {
+        assert_return(pty, -EINVAL);
+        assert_return(pty_is_unknown(pty) || pty_is_parent(pty), -EINVAL);
+        assert_return(pty_is_open(pty), -ENODEV);
+
+        return unlockpt(pty->fd) < 0 ? -errno : 0;
+}
+
+int pty_setup_child(Pty *pty) {
+        struct termios attr;
+        pid_t pid;
+        int r;
+
+        assert_return(pty, -EINVAL);
+        assert_return(pty_is_child(pty), -EINVAL);
+        assert_return(pty_is_open(pty), -EALREADY);
+
+        r = sigprocmask_many(SIG_SETMASK, -1);
+        if (r < 0)
+                return r;
+
+        r = reset_all_signal_handlers();
+        if (r < 0)
+                return r;
+
+        pid = setsid();
+        if (pid < 0 && errno != EPERM)
+                return -errno;
+
+        r = ioctl(pty->fd, TIOCSCTTY, 0);
+        if (r < 0)
+                return -errno;
+
+        r = tcgetattr(pty->fd, &attr);
+        if (r < 0)
+                return -errno;
+
+        /* erase character should be normal backspace, PLEASEEE! */
+        attr.c_cc[VERASE] = 010;
+        /* always set UTF8 flag */
+        attr.c_iflag |= IUTF8;
+
+        r = tcsetattr(pty->fd, TCSANOW, &attr);
+        if (r < 0)
+                return -errno;
+
+        if (dup2(pty->fd, STDIN_FILENO) != STDIN_FILENO ||
+            dup2(pty->fd, STDOUT_FILENO) != STDOUT_FILENO ||
+            dup2(pty->fd, STDERR_FILENO) != STDERR_FILENO)
+                return -errno;
+
+        /* only close FD if it's not a std-fd */
+        pty->fd = (pty->fd > 2) ? safe_close(pty->fd) : -1;
+
+        return 0;
+}
+
+void pty_close(Pty *pty) {
+        if (!pty_is_open(pty))
+                return;
+
+        pty->fd_source = sd_event_source_unref(pty->fd_source);
+        pty->fd = safe_close(pty->fd);
+}
+
+/*
+ * Drain input-queue and dispatch data via the event-handler. Returns <0 on
+ * error, 0 if queue is empty and 1 if we couldn't empty the input queue fast
+ * enough and there's still data left.
+ */
+static int pty_dispatch_read(Pty *pty) {
+        unsigned int i;
+        ssize_t len;
+        int r;
+
+        /*
+         * We're edge-triggered, means we need to read the whole queue. This,
+         * however, might cause us to stall if the writer is faster than we
+         * are. Therefore, we read twice and if the second read still returned
+         * data, we reschedule.
+         */
+
+        for (i = 0; i < 2; ++i) {
+                len = read(pty->fd, pty->in_buf, sizeof(pty->in_buf) - 1);
+                if (len < 0) {
+                        if (errno == EINTR)
+                                continue;
+
+                        return (errno == EAGAIN) ? 0 : -errno;
+                } else if (len == 0) {
+                        continue;
+                }
+
+                /* set terminating zero for debugging safety */
+                pty->in_buf[len] = 0;
+                r = pty->event_fn(pty, pty->event_fn_userdata, PTY_DATA, pty->in_buf, len);
+                if (r < 0)
+                        return r;
+        }
+
+        /* still data left, make sure we're queued again */
+        pty->needs_requeue = true;
+
+        return 1;
+}
+
+/*
+ * Drain output-queue by writing data to the pty. Returns <0 on error, 0 if the
+ * output queue is empty now and 1 if we couldn't empty the output queue fast
+ * enough and there's still data left.
+ */
+static int pty_dispatch_write(Pty *pty) {
+        struct iovec vec[2];
+        unsigned int i;
+        ssize_t len;
+        size_t num;
+
+        /*
+         * Same as pty_dispatch_read(), we're edge-triggered so we need to call
+         * write() until either all data is written or it returns EAGAIN. We
+         * call it twice and if it still writes successfully, we reschedule.
+         */
+
+        for (i = 0; i < 2; ++i) {
+                num = ring_peek(&pty->out_buf, vec);
+                if (num < 1)
+                        return 0;
+
+                len = writev(pty->fd, vec, (int)num);
+                if (len < 0) {
+                        if (errno == EINTR)
+                                continue;
+
+                        return (errno == EAGAIN) ? 1 : -errno;
+                } else if (len == 0) {
+                        continue;
+                }
+
+                ring_pull(&pty->out_buf, (size_t)len);
+        }
+
+        /* still data left, make sure we're queued again */
+        if (ring_get_size(&pty->out_buf) > 0) {
+                pty->needs_requeue = true;
+                return 1;
+        }
+
+        return 0;
+}
+
+static int pty_fd_fn(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+        Pty *pty = userdata;
+        int r_hup = 0, r_write = 0, r_read = 0, r;
+
+        /*
+         * Whenever we encounter I/O errors, we have to make sure to drain the
+         * input queue first, before we handle any HUP. A child might send us
+         * a message and immediately close the queue. We must not handle the
+         * HUP first or we lose data.
+         * Therefore, if we read a message successfully, we always return
+         * success and wait for the next event-loop iteration. Furthermore,
+         * whenever there is a write-error, we must try reading from the input
+         * queue even if EPOLLIN is not set. The input might have arrived in
+         * between epoll_wait() and write(). Therefore, write-errors are only
+         * ever handled if the input-queue is empty. In all other cases they
+         * are ignored until either reading fails or the input queue is empty.
+         */
+
+        if (revents & (EPOLLHUP | EPOLLERR))
+                r_hup = -EPIPE;
+
+        if (revents & EPOLLOUT)
+                r_write = pty_dispatch_write(pty);
+
+        /* Awesome! Kernel signals HUP without IN but queues are not empty.. */
+        if ((revents & EPOLLIN) || r_hup < 0 || r_write < 0) {
+                r_read = pty_dispatch_read(pty);
+                if (r_read > 0)
+                        return 0; /* still data left to fetch next round */
+        }
+
+        if (r_hup < 0 || r_write < 0 || r_read < 0) {
+                /* PTY closed and input-queue drained */
+                pty_close(pty);
+                r = pty->event_fn(pty, pty->event_fn_userdata, PTY_HUP, NULL, 0);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+static int pty_fd_prepare_fn(sd_event_source *source, void *userdata) {
+        Pty *pty = userdata;
+        int r;
+
+        if (pty->needs_requeue) {
+                /*
+                 * We're edge-triggered. In case we couldn't handle all events
+                 * or in case new write-data is queued, we set needs_requeue.
+                 * Before going to sleep, we set the io-events *again*. sd-event
+                 * notices that we're edge-triggered and forwards the call to
+                 * the kernel even if the events didn't change. The kernel will
+                 * check the events and re-queue us on the ready queue in case
+                 * an event is pending.
+                 */
+                r = sd_event_source_set_io_events(source, EPOLLHUP | EPOLLERR | EPOLLIN | EPOLLOUT | EPOLLET);
+                if (r >= 0)
+                        pty->needs_requeue = false;
+        }
+
+        return 0;
+}
+
+static int pty_child_fn(sd_event_source *source, const siginfo_t *si, void *userdata) {
+        Pty *pty = userdata;
+        int r;
+
+        pty->child = 0;
+
+        r = pty->event_fn(pty, pty->event_fn_userdata, PTY_CHILD, si, sizeof(*si));
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+int pty_attach_event(Pty *pty, sd_event *event, pty_event_t event_fn, void *event_fn_userdata) {
+        int r;
+
+        assert_return(pty, -EINVAL);
+        assert_return(event, -EINVAL);
+        assert_return(event_fn, -EINVAL);
+        assert_return(pty_is_parent(pty), -EINVAL);
+
+        pty_detach_event(pty);
+
+        if (pty_is_open(pty)) {
+                r = sd_event_add_io(event,
+                                    &pty->fd_source,
+                                    pty->fd,
+                                    EPOLLHUP | EPOLLERR | EPOLLIN | EPOLLOUT | EPOLLET,
+                                    pty_fd_fn,
+                                    pty);
+                if (r < 0)
+                        goto error;
+
+                r = sd_event_source_set_prepare(pty->fd_source, pty_fd_prepare_fn);
+                if (r < 0)
+                        goto error;
+        }
+
+        if (pty_has_child(pty)) {
+                r = sd_event_add_child(event,
+                                       &pty->child_source,
+                                       pty->child,
+                                       WEXITED,
+                                       pty_child_fn,
+                                       pty);
+                if (r < 0)
+                        goto error;
+        }
+
+        pty->event_fn = event_fn;
+        pty->event_fn_userdata = event_fn_userdata;
+
+        return 0;
+
+error:
+        pty_detach_event(pty);
+        return r;
+}
+
+void pty_detach_event(Pty *pty) {
+        if (!pty)
+                return;
+
+        pty->child_source = sd_event_source_unref(pty->child_source);
+        pty->fd_source = sd_event_source_unref(pty->fd_source);
+        pty->event_fn = NULL;
+        pty->event_fn_userdata = NULL;
+}
+
+int pty_write(Pty *pty, const void *buf, size_t size) {
+        bool was_empty;
+        int r;
+
+        assert_return(pty, -EINVAL);
+        assert_return(pty_is_open(pty), -ENODEV);
+        assert_return(pty_is_parent(pty), -ENODEV);
+
+        if (size < 1)
+                return 0;
+
+        /*
+         * Push @buf[0..@size] into the output ring-buffer. In case the
+         * ring-buffer wasn't empty beforehand, we're already waiting for
+         * EPOLLOUT and we're done. If it was empty, we have to re-queue the
+         * FD for EPOLLOUT as we're edge-triggered and wouldn't get any new
+         * EPOLLOUT event.
+         */
+
+        was_empty = ring_get_size(&pty->out_buf) < 1;
+
+        r = ring_push(&pty->out_buf, buf, size);
+        if (r < 0)
+                return r;
+
+        if (was_empty)
+                pty->needs_requeue = true;
+
+        return 0;
+}
+
+int pty_signal(Pty *pty, int sig) {
+        assert_return(pty, -EINVAL);
+        assert_return(pty_is_open(pty), -ENODEV);
+        assert_return(pty_is_parent(pty), -ENODEV);
+
+        return ioctl(pty->fd, TIOCSIG, sig) < 0 ? -errno : 0;
+}
+
+int pty_resize(Pty *pty, unsigned short term_width, unsigned short term_height) {
+        struct winsize ws;
+
+        assert_return(pty, -EINVAL);
+        assert_return(pty_is_open(pty), -ENODEV);
+        assert_return(pty_is_parent(pty), -ENODEV);
+
+        zero(ws);
+        ws.ws_col = term_width;
+        ws.ws_row = term_height;
+
+        /*
+         * This will send SIGWINCH to the pty slave foreground process group.
+         * We will also get one, but we don't need it.
+         */
+        return ioctl(pty->fd, TIOCSWINSZ, &ws) < 0 ? -errno : 0;
+}
+
+pid_t pty_fork(Pty **out, sd_event *event, pty_event_t event_fn, void *event_fn_userdata, unsigned short initial_term_width, unsigned short initial_term_height) {
+        _pty_unref_ Pty *pty = NULL;
+        int r;
+        pid_t pid;
+
+        assert_return(out, -EINVAL);
+        assert_return((event && event_fn) || (!event && !event_fn), -EINVAL);
+
+        r = pty_new(&pty);
+        if (r < 0)
+                return r;
+
+        r = pty_unlock(pty);
+        if (r < 0)
+                return r;
+
+        pid = fork();
+        if (pid < 0)
+                return -errno;
+
+        if (pid == 0) {
+                /* child */
+
+                r = pty_make_child(pty);
+                if (r < 0)
+                        _exit(-r);
+
+                r = pty_setup_child(pty);
+                if (r < 0)
+                        _exit(-r);
+
+                /* sync with parent */
+                if (!barrier_place_and_sync(&pty->barrier))
+                        _exit(1);
+
+                /* fallthrough and return the child's PTY object */
+        } else {
+                /* parent */
+
+                r = pty_make_parent(pty, pid);
+                if (r < 0)
+                        goto parent_error;
+
+                r = pty_resize(pty, initial_term_width, initial_term_height);
+                if (r < 0)
+                        goto parent_error;
+
+                if (event) {
+                        r = pty_attach_event(pty, event, event_fn, event_fn_userdata);
+                        if (r < 0)
+                                goto parent_error;
+                }
+
+                /* sync with child */
+                if (!barrier_place_and_sync(&pty->barrier)) {
+                        r = -ECHILD;
+                        goto parent_error;
+                }
+
+                /* fallthrough and return the parent's PTY object */
+        }
+
+        *out = pty;
+        pty = NULL;
+        return pid;
+
+parent_error:
+        barrier_abort(&pty->barrier);
+        waitpid(pty->child, NULL, 0);
+        pty->child = 0;
+        return r;
+}
diff --git a/src/shared/pty.h b/src/shared/pty.h
new file mode 100644
index 0000000..a87ceb5
--- /dev/null
+++ b/src/shared/pty.h
@@ -0,0 +1,77 @@
+/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
+
+#pragma once
+
+/***
+  This file is part of systemd.
+
+  Copyright 2014 David Herrmann <dh.herrmann at gmail.com>
+
+  systemd is free software; you can redistribute it and/or modify it
+  under the terms of the GNU Lesser General Public License as published by
+  the Free Software Foundation; either version 2.1 of the License, or
+  (at your option) any later version.
+
+  systemd is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with systemd; If not, see <http://www.gnu.org/licenses/>.
+***/
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "barrier.h"
+#include "macro.h"
+#include "sd-event.h"
+#include "util.h"
+
+typedef struct Pty Pty;
+
+enum {
+        PTY_CHILD,
+        PTY_HUP,
+        PTY_DATA,
+};
+
+typedef int (*pty_event_t) (Pty *pty, void *userdata, unsigned int event, const void *ptr, size_t size);
+
+int pty_new(Pty **out);
+Pty *pty_ref(Pty *pty);
+Pty *pty_unref(Pty *pty);
+
+#define _pty_unref_ _cleanup_(pty_unrefp)
+DEFINE_TRIVIAL_CLEANUP_FUNC(Pty*, pty_unref);
+
+Barrier *pty_get_barrier(Pty *pty);
+
+bool pty_is_unknown(Pty *pty);
+bool pty_is_parent(Pty *pty);
+bool pty_is_child(Pty *pty);
+bool pty_has_child(Pty *pty);
+pid_t pty_get_child(Pty *pty);
+
+bool pty_is_open(Pty *pty);
+int pty_get_fd(Pty *pty);
+
+int pty_make_child(Pty *pty);
+int pty_make_parent(Pty *pty, pid_t child);
+int pty_unlock(Pty *pty);
+int pty_setup_child(Pty *pty);
+void pty_close(Pty *pty);
+
+int pty_attach_event(Pty *pty, sd_event *event, pty_event_t event_fn, void *event_fn_userdata);
+void pty_detach_event(Pty *pty);
+
+int pty_write(Pty *pty, const void *buf, size_t size);
+int pty_signal(Pty *pty, int sig);
+int pty_resize(Pty *pty, unsigned short term_width, unsigned short term_height);
+
+pid_t pty_fork(Pty **out, sd_event *event, pty_event_t event_fn, void *event_fn_userdata, unsigned short initial_term_width, unsigned short initial_term_height);
diff --git a/src/test/test-pty.c b/src/test/test-pty.c
new file mode 100644
index 0000000..73c5c85
--- /dev/null
+++ b/src/test/test-pty.c
@@ -0,0 +1,143 @@
+/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
+
+/***
+  This file is part of systemd.
+
+  Copyright 2014 David Herrmann <dh.herrmann at gmail.com>
+
+  systemd is free software; you can redistribute it and/or modify it
+  under the terms of the GNU Lesser General Public License as published by
+  the Free Software Foundation; either version 2.1 of the License, or
+  (at your option) any later version.
+
+  systemd is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with systemd; If not, see <http://www.gnu.org/licenses/>.
+***/
+
+#include <errno.h>
+#include <fcntl.h>
+#include <locale.h>
+#include <string.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "def.h"
+#include "pty.h"
+#include "util.h"
+
+static const char sndmsg[] = "message\n";
+static const char rcvmsg[] = "message\r\n";
+static char rcvbuf[128];
+static size_t rcvsiz = 0;
+static sd_event *event;
+
+static void run_child(Pty *pty) {
+        int r, l;
+        char buf[512];
+
+        r = read(0, buf, sizeof(buf));
+        assert_se(r == strlen(sndmsg));
+        assert_se(!strncmp(buf, sndmsg, r));
+
+        l = write(1, buf, r);
+        assert_se(l == r);
+}
+
+static int pty_fn(Pty *pty, void *userdata, unsigned int ev, const void *ptr, size_t size) {
+        switch (ev) {
+        case PTY_DATA:
+                assert_se(rcvsiz < strlen(rcvmsg) * 2);
+                assert_se(rcvsiz + size < sizeof(rcvbuf));
+
+                memcpy(&rcvbuf[rcvsiz], ptr, size);
+                rcvsiz += size;
+
+                if (rcvsiz >= strlen(rcvmsg) * 2) {
+                        assert_se(rcvsiz == strlen(rcvmsg) * 2);
+                        assert_se(!memcmp(rcvbuf, rcvmsg, strlen(rcvmsg)));
+                        assert_se(!memcmp(&rcvbuf[strlen(rcvmsg)], rcvmsg, strlen(rcvmsg)));
+                }
+
+                break;
+        case PTY_HUP:
+                /* This is guaranteed to appear _after_ the input queues are
+                 * drained! */
+                assert_se(rcvsiz == strlen(rcvmsg) * 2);
+                break;
+        case PTY_CHILD:
+                /* this may appear at any time */
+                break;
+        default:
+                assert_se(0);
+                break;
+        }
+
+        /* if we got HUP _and_ CHILD, exit */
+        if (pty_get_fd(pty) < 0 && pty_get_child(pty) < 0)
+                sd_event_exit(event, 0);
+
+        return 0;
+}
+
+static void run_parent(Pty *pty) {
+        int r;
+
+        /* write message to pty, ECHO mode guarantees that we get it back
+         * twice: once via ECHO, once from the run_child() fn */
+        assert_se(pty_write(pty, sndmsg, strlen(sndmsg)) >= 0);
+
+        r = sd_event_loop(event);
+        assert_se(r >= 0);
+}
+
+static void test_pty(void) {
+        pid_t pid;
+        Pty *pty;
+
+        rcvsiz = 0;
+        memset(rcvbuf, 0, sizeof(rcvbuf));
+
+        assert_se(sd_event_default(&event) >= 0);
+
+        pid = pty_fork(&pty, event, pty_fn, NULL, 80, 25);
+        assert_se(pid >= 0);
+
+        if (pid == 0) {
+                /* child */
+                run_child(pty);
+                exit(0);
+        }
+
+        /* parent */
+        run_parent(pty);
+
+        /* Make sure the PTY recycled the child; yeah, this is racy if the
+         * PID was already reused; but that seems fine for a test. */
+        assert_se(waitpid(pid, NULL, WNOHANG) < 0 && errno == ECHILD);
+
+        pty_unref(pty);
+        sd_event_unref(event);
+}
+
+int main(int argc, char *argv[]) {
+        unsigned int i;
+
+        log_parse_environment();
+        log_open();
+
+        assert_se(sigprocmask_many(SIG_BLOCK, SIGCHLD, -1) >= 0);
+
+        /* Oh, there're ugly races in the TTY layer regarding HUP vs IN. Turns
+         * out they appear only 10% of the time. I fixed all of them and
+         * don't see them anymore. But let's be safe and run this 1000 times
+         * so we catch any new ones, in case they appear again. */
+        for (i = 0; i < 1000; ++i)
+                test_pty();
+
+        return 0;
+}

commit a2da110b78abe4e4b1b6d8ae4ef78b087c4dcc8b
Author: David Herrmann <dh.herrmann at gmail.com>
Date:   Sun Jul 13 12:14:45 2014 +0200

    nspawn: use Barrier API instead of eventfd-util
    
    The Barrier-API simplifies cross-fork() synchronization a lot. Replace the
    hard-coded eventfd-util implementation and drop it.
    
    Compared to the old API, Barriers also handle exit() of the remote side as
    abortion. This way, segfaults will not cause the parent to deadlock.
    
    EINTR handling is currently ignored for any barrier-waits. This can easily
    be added, but it isn't needed so far so I dropped it. EINTR handling in
    general is ugly, anyway. You need to deal with pselect/ppoll/... variants
    and make sure not to unblock signals at the wrong times. So generally,
    there's little use in adding it.

diff --git a/Makefile.am b/Makefile.am
index f0d80ba..fe680b0 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -845,8 +845,6 @@ libsystemd_shared_la_SOURCES = \
 	src/shared/barrier.h \
 	src/shared/async.c \
 	src/shared/async.h \
-	src/shared/eventfd-util.c \
-	src/shared/eventfd-util.h \
 	src/shared/copy.c \
 	src/shared/copy.h \
 	src/shared/base-filesystem.c \
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index bad93a5..e75cc28 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -40,7 +40,6 @@
 #include <sys/un.h>
 #include <sys/socket.h>
 #include <linux/netlink.h>
-#include <sys/eventfd.h>
 #include <net/if.h>
 #include <linux/veth.h>
 #include <sys/personality.h>
@@ -84,12 +83,12 @@
 #include "def.h"
 #include "rtnl-util.h"
 #include "udev-util.h"
-#include "eventfd-util.h"
 #include "blkid-util.h"
 #include "gpt.h"
 #include "siphash24.h"
 #include "copy.h"
 #include "base-filesystem.h"
+#include "barrier.h"
 
 #ifdef HAVE_SECCOMP
 #include "seccomp-util.h"
@@ -3074,12 +3073,18 @@ int main(int argc, char *argv[]) {
 
         for (;;) {
                 ContainerStatus container_status;
-                int eventfds[2] = { -1, -1 };
+                _barrier_destroy_ Barrier barrier = { };
                 struct sigaction sa = {
                         .sa_handler = nop_handler,
                         .sa_flags = SA_NOCLDSTOP,
                 };
 
+                r = barrier_init(&barrier);
+                if (r < 0) {
+                        log_error("Cannot initialize IPC barrier: %s", strerror(-r));
+                        goto finish;
+                }
+
                 /* Child can be killed before execv(), so handle SIGCHLD
                  * in order to interrupt parent's blocking calls and
                  * give it a chance to call wait() and terminate. */
@@ -3095,9 +3100,9 @@ int main(int argc, char *argv[]) {
                         goto finish;
                 }
 
-                pid = clone_with_eventfd(SIGCHLD|CLONE_NEWNS|
-                                         (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
-                                         (arg_private_network ? CLONE_NEWNET : 0), eventfds);
+                pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWNS|
+                                          (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
+                                          (arg_private_network ? CLONE_NEWNET : 0), NULL);
                 if (pid < 0) {
                         if (errno == EINVAL)
                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
@@ -3126,6 +3131,8 @@ int main(int argc, char *argv[]) {
                         };
                         char **env_use;
 
+                        barrier_set_role(&barrier, BARRIER_CHILD);
+
                         envp[n_env] = strv_find_prefix(environ, "TERM=");
                         if (envp[n_env])
                                 n_env ++;
@@ -3151,26 +3158,26 @@ int main(int argc, char *argv[]) {
                                 }
 
                                 log_error("Failed to open console: %s", strerror(-k));
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
                         }
 
                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
                                 log_error("Failed to duplicate console: %m");
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
                         }
 
                         if (setsid() < 0) {
                                 log_error("setsid() failed: %m");
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
                         }
 
                         if (reset_audit_loginuid() < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
                                 log_error("PR_SET_PDEATHSIG failed: %m");
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
                         }
 
                         /* Mark everything as slave, so that we still
@@ -3178,113 +3185,109 @@ int main(int argc, char *argv[]) {
                          * propagate mounts to the real root. */
                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
                                 log_error("MS_SLAVE|MS_REC failed: %m");
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
                         }
 
                         if (mount_devices(arg_directory,
                                           root_device, root_device_rw,
                                           home_device, home_device_rw,
                                           srv_device, srv_device_rw) < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         /* Turn directory into bind mount */
                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
                                 log_error("Failed to make bind mount: %m");
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
                         }
 
                         r = setup_volatile(arg_directory);
                         if (r < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         if (setup_volatile_state(arg_directory) < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         r = base_filesystem_create(arg_directory);
                         if (r < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         if (arg_read_only) {
                                 k = bind_remount_recursive(arg_directory, true);
                                 if (k < 0) {
                                         log_error("Failed to make tree read-only: %s", strerror(-k));
-                                        goto child_fail;
+                                        _exit(EXIT_FAILURE);
                                 }
                         }
 
                         if (mount_all(arg_directory) < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         if (copy_devnodes(arg_directory) < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         if (setup_ptmx(arg_directory) < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         dev_setup(arg_directory);
 
                         if (setup_seccomp() < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         if (setup_dev_console(arg_directory, console) < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
 
                         if (setup_boot_id(arg_directory) < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         if (setup_timezone(arg_directory) < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         if (setup_resolv_conf(arg_directory) < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         if (setup_journal(arg_directory) < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         if (mount_binds(arg_directory, arg_bind, false) < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         if (mount_tmpfs(arg_directory) < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         /* Tell the parent that we are ready, and that
                          * it can cgroupify us to that we lack access
                          * to certain devices and resources. */
-                        r = eventfd_send_state(eventfds[1],
-                                               EVENTFD_CHILD_SUCCEEDED);
-                        eventfds[1] = safe_close(eventfds[1]);
-                        if (r < 0)
-                                goto child_fail;
+                        barrier_place(&barrier);
 
                         if (chdir(arg_directory) < 0) {
                                 log_error("chdir(%s) failed: %m", arg_directory);
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
                         }
 
                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
                                 log_error("mount(MS_MOVE) failed: %m");
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
                         }
 
                         if (chroot(".") < 0) {
                                 log_error("chroot() failed: %m");
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
                         }
 
                         if (chdir("/") < 0) {
                                 log_error("chdir() failed: %m");
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
                         }
 
                         umask(0022);
@@ -3294,18 +3297,18 @@ int main(int argc, char *argv[]) {
 
                         if (drop_capabilities() < 0) {
                                 log_error("drop_capabilities() failed: %m");
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
                         }
 
                         r = change_uid_gid(&home);
                         if (r < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
                                 log_oom();
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
                         }
 
                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
@@ -3313,7 +3316,7 @@ int main(int argc, char *argv[]) {
 
                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
                                         log_oom();
-                                        goto child_fail;
+                                        _exit(EXIT_FAILURE);
                                 }
                         }
 
@@ -3321,13 +3324,13 @@ int main(int argc, char *argv[]) {
                                 k = fdset_cloexec(fds, false);
                                 if (k < 0) {
                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
-                                        goto child_fail;
+                                        _exit(EXIT_FAILURE);
                                 }
 
                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
                                         log_oom();
-                                        goto child_fail;
+                                        _exit(EXIT_FAILURE);
                                 }
                         }
 
@@ -3336,12 +3339,12 @@ int main(int argc, char *argv[]) {
                         if (arg_personality != 0xffffffffLU) {
                                 if (personality(arg_personality) < 0) {
                                         log_error("personality() failed: %m");
-                                        goto child_fail;
+                                        _exit(EXIT_FAILURE);
                                 }
                         } else if (secondary) {
                                 if (personality(PER_LINUX32) < 0) {
                                         log_error("personality() failed: %m");
-                                        goto child_fail;
+                                        _exit(EXIT_FAILURE);
                                 }
                         }
 
@@ -3349,7 +3352,7 @@ int main(int argc, char *argv[]) {
                         if (arg_selinux_context)
                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
-                                        goto child_fail;
+                                        _exit(EXIT_FAILURE);
                                 }
 #endif
 
@@ -3359,7 +3362,7 @@ int main(int argc, char *argv[]) {
                                 n = strv_env_merge(2, envp, arg_setenv);
                                 if (!n) {
                                         log_oom();
-                                        goto child_fail;
+                                        _exit(EXIT_FAILURE);
                                 }
 
                                 env_use = n;
@@ -3367,10 +3370,8 @@ int main(int argc, char *argv[]) {
                                 env_use = (char**) envp;
 
                         /* Wait until the parent is ready with the setup, too... */
-                        r = eventfd_parent_succeeded(eventfds[0]);
-                        eventfds[0] = safe_close(eventfds[0]);
-                        if (r < 0)
-                                goto child_fail;
+                        if (!barrier_place_and_sync(&barrier))
+                                _exit(EXIT_FAILURE);
 
                         if (arg_boot) {
                                 char **a;
@@ -3399,29 +3400,15 @@ int main(int argc, char *argv[]) {
                         }
 
                         log_error("execv() failed: %m");
-
-                child_fail:
-                        /* Tell the parent that the setup failed, so he
-                         * can clean up resources and terminate. */
-                        if (eventfds[1] != -1)
-                                eventfd_send_state(eventfds[1],
-                                                   EVENTFD_CHILD_FAILED);
                         _exit(EXIT_FAILURE);
                 }
 
+                barrier_set_role(&barrier, BARRIER_PARENT);
                 fdset_free(fds);
                 fds = NULL;
 
-                /* Wait for the child event:
-                 * If EVENTFD_CHILD_FAILED, the child will terminate soon.
-                 * If EVENTFD_CHILD_SUCCEEDED, the child is reporting that
-                 * it is ready with all it needs to do with priviliges.
-                 * After we got the notification we can make the process
-                 * join its cgroup which might limit what it can do */
-                r = eventfd_child_succeeded(eventfds[1]);
-                eventfds[1] = safe_close(eventfds[1]);
-
-                if (r >= 0) {
+                /* wait for child-setup to be done */
+                if (barrier_place_and_sync(&barrier)) {
                         int ifi = 0;
 
                         r = move_network_interfaces(pid);
@@ -3458,10 +3445,7 @@ int main(int argc, char *argv[]) {
                         /* Notify the child that the parent is ready with all
                          * its setup, and that the child can now hand over
                          * control to the code to run inside the container. */
-                        r = eventfd_send_state(eventfds[0], EVENTFD_PARENT_SUCCEEDED);
-                        eventfds[0] = safe_close(eventfds[0]);
-                        if (r < 0)
-                                goto finish;
+                        barrier_place(&barrier);
 
                         k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
                         if (k < 0) {
diff --git a/src/shared/eventfd-util.c b/src/shared/eventfd-util.c
deleted file mode 100644
index 27b7cf7..0000000
--- a/src/shared/eventfd-util.c
+++ /dev/null
@@ -1,169 +0,0 @@
-/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
-
-/***
-  This file is part of systemd.
-
-  Copyright 2014 Djalal Harouni
-
-  systemd is free software; you can redistribute it and/or modify it
-  under the terms of the GNU Lesser General Public License as published by
-  the Free Software Foundation; either version 2.1 of the License, or
-  (at your option) any later version.
-
-  systemd is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-  Lesser General Public License for more details.
-
-  You should have received a copy of the GNU Lesser General Public License
-  along with systemd; If not, see <http://www.gnu.org/licenses/>.
-***/
-
-#include <assert.h>
-#include <errno.h>
-#include <unistd.h>
-#include <sys/eventfd.h>
-#include <sys/syscall.h>
-
-#include "eventfd-util.h"
-#include "util.h"
-
-
-/*
- * Use this to create processes that need to setup a full context
- * and sync it with their parents using cheap mechanisms.
- *
- * This will create two blocking eventfd(s). A pair for the parent and
- * the other for the child so they can be used as a notify mechanism.
- * Each process will gets its copy of the parent and child eventfds.
- *
- * This is useful in case:
- * 1) If the parent fails or dies, the child must die.
- * 2) Child will install PR_SET_PDEATHSIG as soon as possible.
- * 3) Parent and child need to sync using less resources.
- * 4) If parent is not able to install a SIGCHLD handler:
- *    parent will wait using a blocking eventfd_read() or
- *    eventfd_child_succeeded() call on the child eventfd.
- *
- *    * If the child setup succeeded, child should notify with an
- *      EVENTFD_CHILD_SUCCEEDED, parent will continue.
- *    * If the child setup failed, child should notify with an
- *      EVENTFD_CHILD_FAILED before any _exit(). This avoids blocking
- *      the parent.
- *
- * 5) If parent is able to install a SIGCHLD handler:
- *    An empty signal handler without SA_RESTART will do it, since the
- *    blocking eventfd_read() or eventfd_parent_succeeded() of the
- *    parent will be interrupted by SIGCHLD and the call will fail with
- *    EINTR. This is useful in case the child dies abnormaly and did
- *    not have a chance to notify its parent using EVENTFD_CHILD_FAILED.
- *
- * 6) Call wait*() in the main instead of the signal handler in order
- *    to: 1) reduce side effects and 2) have a better handling for
- *    child termination in order to reduce various race conditions.
- *
- *
- * The return value of clone_with_eventfd() is the same of clone().
- * On success the eventfds[] will contain the two eventfd(s). These
- * file descriptors can be closed later with safe_close(). On failure,
- * a negative value is returned in the caller's context, and errno will
- * be set appropriately.
- *
- * Extra preliminary work:
- * 1) Child can wait before starting its setup by using the
- *    eventfd_recv_start() call on the parent eventfd, in that case the
- *    parent must notify with EVENTFD_START, after doing any preliminary
- *    work.
- *
- * Note: this function depends on systemd internal functions
- * safe_close() and it should be used only by direct binaries, no
- * libraries.
- */
-pid_t clone_with_eventfd(int flags, int eventfds[2]) {
-        pid_t pid;
-
-        assert(eventfds);
-
-        eventfds[0] = eventfd(EVENTFD_INIT, EFD_CLOEXEC);
-        if (eventfds[0] < 0)
-                return -1;
-
-        eventfds[1] = eventfd(EVENTFD_INIT, EFD_CLOEXEC);
-        if (eventfds[1] < 0)
-                goto err_eventfd0;
-
-        pid = syscall(__NR_clone, flags, NULL);
-        if (pid < 0)
-                goto err_eventfd1;
-
-        return pid;
-
-err_eventfd1:
-        eventfds[1] = safe_close(eventfds[1]);
-err_eventfd0:
-        eventfds[0] = safe_close(eventfds[0]);
-        return -1;
-}
-
-int eventfd_send_state(int efd, eventfd_t s) {
-        return eventfd_write(efd, s);
-}
-
-/*
- * Receive an eventfd state on the eventfd file descriptor.
- *
- * If the third argument is set to a value other than zero, then this
- * function will compare the received value with this argument and set
- * the return value.
- *
- * On success return 0. On error, -1 will be returned, and errno will
- * be set appropriately.
- */
-int eventfd_recv_state(int efd, eventfd_t *e, eventfd_t s) {
-        int ret;
-
-        ret = eventfd_read(efd, e);
-        if (ret < 0)
-                return ret;
-        else if (s != 0 && *e != s) {
-                errno = EINVAL;
-                return -1;
-        }
-
-        return 0;
-}
-
-/*
- * Receive the EVENTFD_START state on the eventfd file descriptor.
- *
- * On Success return 0. On error, -1 will be returned, and errno will
- * be set appropriately.
- */
-int eventfd_recv_start(int efd) {
-        eventfd_t e = EVENTFD_INIT;
-        return eventfd_recv_state(efd, &e, EVENTFD_START);
-}
-
-/*
- * Receive the EVENTFD_PARENT_SUCCEEDED state on the eventfd file
- * descriptor.
- *
- * On Success return 0. On error, -1 will be returned, and errno will
- * be set appropriately.
- */
-int eventfd_parent_succeeded(int efd) {
-        eventfd_t e = EVENTFD_INIT;
-        return eventfd_recv_state(efd, &e, EVENTFD_PARENT_SUCCEEDED);
-}
-
-/*
- * Receive the EVENTFD_CHILD_SUCCEEDED state on the eventfd file
- * descriptor.
- *
- * On Success return 0. On error, -1 will be returned, and errno will
- * be set appropriately.
- */
-int eventfd_child_succeeded(int efd) {
-        eventfd_t e = EVENTFD_INIT;
-        return eventfd_recv_state(efd, &e, EVENTFD_CHILD_SUCCEEDED);
-}
diff --git a/src/shared/eventfd-util.h b/src/shared/eventfd-util.h
deleted file mode 100644
index 0120f04..0000000
--- a/src/shared/eventfd-util.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
-
-#pragma once
-
-/***
-  This file is part of systemd.
-
-  Copyright 2014 Djalal Harouni
-
-  systemd is free software; you can redistribute it and/or modify it
-  under the terms of the GNU Lesser General Public License as published by
-  the Free Software Foundation; either version 2.1 of the License, or
-  (at your option) any later version.
-
-  systemd is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-  Lesser General Public License for more details.
-
-  You should have received a copy of the GNU Lesser General Public License
-  along with systemd; If not, see <http://www.gnu.org/licenses/>.
-***/
-
-#include <sys/types.h>
-#include <sys/eventfd.h>
-
-enum {
-        EVENTFD_INIT,
-        EVENTFD_START,
-        EVENTFD_PARENT_SUCCEEDED,
-        EVENTFD_PARENT_FAILED,
-        EVENTFD_CHILD_SUCCEEDED,
-        EVENTFD_CHILD_FAILED,
-};
-
-pid_t clone_with_eventfd(int flags, int eventfds[2]);
-
-int eventfd_send_state(int efd, eventfd_t s);
-int eventfd_recv_state(int efd, eventfd_t *e, eventfd_t s);
-
-int eventfd_recv_start(int efd);
-int eventfd_parent_succeeded(int efd);
-int eventfd_child_succeeded(int efd);

commit 279da1e3f99b9c767a69849b5445e3cfd8d83376
Author: David Herrmann <dh.herrmann at gmail.com>
Date:   Thu Jul 10 15:25:47 2014 +0200

    shared: add generic IPC barrier
    
    The "Barrier" object is a simple inter-process barrier implementation. It
    allows placing synchronization points and waiting for the other side to
    reach it. Additionally, it has an abortion-mechanism as second-layer
    synchronization to send abortion-events asynchronously to the other side.
    
    The API is usually used to synchronize processes during fork(). However,
    it can be extended to pass state through execve() so you could synchronize
    beyond execve().
    
    Usually, it's used like this (error-handling replaced by assert() for
    simplicity):
    
        Barrier b;
    
        r = barrier_init(&b);
        assert_se(r >= 0);
    
        pid = fork();
        assert_se(pid >= 0);
        if (pid == 0) {
                barrier_set_role(&b, BARRIER_CHILD);
    
                ...do child post-setup...
                if (CHILD_SETUP_FAILED)
                           exit(1);
                ...child setup done...
    
                barrier_place(&b);
                if (!barrier_sync(&b)) {
                        /* parent setup failed */
                        exit(1);
                }
    
                barrier_destroy(&b); /* redundant as execve() and exit() imply this */
    
                /* parent & child setup successful */
                execve(...);
        }
    
        barrier_set_role(&b, BARRIER_PARENT);
    
        ...do parent post-setup...
        if (PARENT_SETUP_FAILED) {
                barrier_abort(&b);          /* send abortion event */
                barrier_wait_abortion(&b);  /* wait for child to abort (exit() implies abortion) */
                barrier_destroy(&b);
               ...bail out...
        }
        ...parent setup done...
    
        barrier_place(&b);
        if (!barrier_sync(&b)) {
                ...child setup failed... ;
                barrier_destroy(&b);
                ...bail out...
        }
    
        barrier_destroy(&b);
    
        ...child setup successful...
    
    This is the most basic API. Using barrier_place() to place barriers and
    barrier_sync() to perform a full synchronization between both processes.
    barrier_abort() places an abortion barrier which supersedes any other
    barriers, exit() (or barrier_destroy()) places an abortion-barrier that
    queues behind existing barriers (thus *not* replacing existing barriers
    unlike barrier_abort()).
    
    This example uses hard-synchronization with wait_abortion(), sync() and
    friends. These are all optional. Barriers are highly dynamic and can be
    used for one-way synchronization or even no synchronization at all
    (postponing it for later). The sync() call performs a full two-way
    synchronization.
    
    The API is documented and should be fairly self-explanatory. A test-suite
    shows some special semantics regarding abortion, wait_next() and exit().
    
    Internally, barriers use two eventfds and a pipe. The pipe is used to
    detect exit()s of the remote side as eventfds do not allow that. The
    eventfds are used to place barriers, one for each side. Barriers themselves
    are numbered, but the numbers are reused once both sides reached the same
    barrier, thus you cannot address barriers by the index. Moreover, the
    numbering is implicit and we only store a counter. This makes the
    implementation itself very lightweight, which is probably negligible
    considering that we need 3 FDs for a barrier.
    
    Last but not least: This barrier implementation is quite heavy. It's
    definitely not meant for fast IPC synchronization. However, it's very easy
    to use. And given the *HUGE* overhead of fork(), the barrier-overhead
    should be negligible.

diff --git a/.gitignore b/.gitignore
index bf53064..4860631 100644
--- a/.gitignore
+++ b/.gitignore
@@ -124,6 +124,7 @@
 /tags
 /test-architecture
 /test-async
+/test-barrier
 /test-boot-timestamp
 /test-bus-chat
 /test-bus-cleanup
diff --git a/Makefile.am b/Makefile.am
index 7025137..f0d80ba 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -841,6 +841,8 @@ libsystemd_shared_la_SOURCES = \
 	src/shared/login-shared.h \
 	src/shared/ring.c \
 	src/shared/ring.h \
+	src/shared/barrier.c \
+	src/shared/barrier.h \
 	src/shared/async.c \
 	src/shared/async.h \
 	src/shared/eventfd-util.c \
@@ -1251,6 +1253,7 @@ tests += \
 	test-ellipsize \
 	test-util \
 	test-ring \
+	test-barrier \
 	test-tmpfiles \
 	test-namespace \
 	test-date \
@@ -1421,6 +1424,12 @@ test_ring_SOURCES = \
 test_ring_LDADD = \
 	libsystemd-core.la
 
+test_barrier_SOURCES = \
+	src/test/test-barrier.c
+
+test_barrier_LDADD = \
+	libsystemd-core.la
+
 test_tmpfiles_SOURCES = \
 	src/test/test-tmpfiles.c
 
diff --git a/src/shared/barrier.c b/src/shared/barrier.c
new file mode 100644
index 0000000..c198329
--- /dev/null
+++ b/src/shared/barrier.c
@@ -0,0 +1,440 @@
+/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
+
+/***
+  This file is part of systemd.
+
+  Copyright 2014 David Herrmann <dh.herrmann at gmail.com>
+
+  systemd is free software; you can redistribute it and/or modify it
+  under the terms of the GNU Lesser General Public License as published by
+  the Free Software Foundation; either version 2.1 of the License, or
+  (at your option) any later version.
+
+  systemd is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with systemd; If not, see <http://www.gnu.org/licenses/>.
+***/
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <poll.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/eventfd.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "barrier.h"
+#include "macro.h"
+#include "util.h"
+
+/**
+ * Barriers
+ * This barrier implementation provides a simple synchronization method based
+ * on file-descriptors that can safely be used between threads and processes. A
+ * barrier object contains 2 shared counters based on eventfd. Both processes
+ * can now place barriers and wait for the other end to reach a random or
+ * specific barrier.
+ * Barriers are numbered, so you can either wait for the other end to reach any
+ * barrier or the last barrier that you placed. This way, you can use barriers
+ * for one-way *and* full synchronization. Note that even though barriers are
+ * numbered, these numbers are internal and recycled once both sides reached the
+ * same barrier (implemented as a simple signed counter). It is thus not
+ * possible to address barriers by their ID.
+ *
+ * Barrier-API: Both ends can place as many barriers via barrier_place() as
+ * they want and each pair of barriers on both sides will be implicitly linked.
+ * Each side can use the barrier_wait/sync_*() family of calls to wait for the
+ * other side to place a specific barrier. barrier_wait_next() waits until the
+ * other side calls barrier_place(). No links between the barriers are
+ * considered and this simply serves as most basic asynchronous barrier.
+ * barrier_sync_next() is like barrier_wait_next() and waits for the other side
+ * to place their next barrier via barrier_place(). However, it only waits for
+ * barriers that are linked to a barrier we already placed. If the other side
+ * already placed more barriers than we did, barrier_sync_next() returns
+ * immediately.
+ * barrier_sync() extends barrier_sync_next() and waits until the other end
+ * placed as many barriers via barrier_place() as we did. If they already placed
+ * as many as we did (or more), it returns immediately.
+ *
+ * Additionally to basic barriers, an abortion event is available.
+ * barrier_abort() places an abortion event that cannot be undone. An abortion
+ * immediately cancels all placed barriers and replaces them. Any running and
+ * following wait/sync call besides barrier_wait_abortion() will immediately
+ * return false on both sides (otherwise, they always return true).
+ * barrier_abort() can be called multiple times on both ends and will be a
+ * no-op if already called on this side.
+ * barrier_wait_abortion() can be used to wait for the other side to call
+ * barrier_abort() and is the only wait/sync call that does not return
+ * immediately if we aborted ourselves. It only returns once the other side
+ * called barrier_abort().
+ *
+ * Barriers can be used for in-process and inter-process synchronization.
+ * However, for in-process synchronization you could just use mutexes.
+ * Therefore, main target is IPC and we require both sides to *not* share the FD
+ * table. If that's given, barriers provide target tracking: If the remote side
+ * exit()s, an abortion event is implicitly queued on the other side. This way,
+ * a sync/wait call will be woken up if the remote side crashed or exited
+ * unexpectedly. However, note that these abortion events are only queued if the
+ * barrier-queue has been drained. Therefore, it is safe to place a barrier and
+ * exit. The other side can safely wait on the barrier even though the exit
+ * queued an abortion event. Usually, the abortion event would overwrite the
+ * barrier, however, that's not true for exit-abortion events. Those are only
+ * queued if the barrier-queue is drained (thus, the receiving side has placed
+ * more barriers than the remote side).
+ */
+
+/**
+ * barrier_init() - Initialize a barrier object
+ * @obj: barrier to initialize
+ *
+ * This initializes a barrier object. The caller is responsible for allocating
+ * the memory and keeping it valid. The memory does not have to be zeroed
+ * beforehand.
+ * Two eventfd objects and a pipe are allocated for each barrier. If allocation
+ * fails, an error is returned.
+ *
+ * If this function fails, the barrier is reset to an invalid state so it is
+ * safe to call barrier_destroy() on the object regardless whether the
+ * initialization succeeded or not.
+ *
+ * The caller is responsible to destroy the object via barrier_destroy() before
+ * releasing the underlying memory.
+ *
+ * Returns: 0 on success, negative error code on failure.
+ */
+int barrier_init(Barrier *obj) {
+        /* start with invalid FDs so the cleanup never closes FD 0 on failure */
+        _cleanup_(barrier_destroy) Barrier b = { .me = -1, .them = -1, .pipe = { -1, -1 } };
+
+        assert_return(obj, -EINVAL);
+
+        zero(*obj); /* @obj stays in the invalid (zeroed) state if we fail */
+
+        b.me = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
+        if (b.me < 0)
+                return -errno;
+
+        b.them = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
+        if (b.them < 0)
+                return -errno;
+
+        if (pipe2(b.pipe, O_CLOEXEC | O_NONBLOCK) < 0)
+                return -errno;
+
+        memcpy(obj, &b, sizeof(b));
+        zero(b);
+        return 0;
+}
+
+/**
+ * barrier_destroy() - Destroy a barrier object
+ * @b: barrier to destroy or NULL
+ *
+ * This destroys a barrier object that has previously been initialized via
+ * barrier_init(). The object is released and reset to invalid state.
+ * Therefore, it is safe to call barrier_destroy() multiple times or even if
+ * barrier_init() failed. However, you must not call barrier_destroy() if you
+ * never called barrier_init() on the object before.
+ *
+ * It is safe to initialize a barrier via zero() / memset(.., 0, ...). Even
+ * though it has embedded FDs, barrier_destroy() can deal with zeroed objects
+ * just fine.
+ *
+ * If @b is NULL, this is a no-op.
+ */
+void barrier_destroy(Barrier *b) {
+        if (!b)
+                return;
+
+        /* @me and @them cannot be both FD 0. Lets be pedantic and check the
+         * pipes and barriers, too. If all are 0, the object was zero()ed and
+         * is invalid. This allows users to use zero(barrier) to reset the
+         * backing memory. */
+        if (b->me == 0 &&
+            b->them == 0 &&
+            b->pipe[0] == 0 &&
+            b->pipe[1] == 0 &&
+            b->barriers == 0)
+                return;
+
+        b->me = safe_close(b->me);
+        b->them = safe_close(b->them);
+        b->pipe[0] = safe_close(b->pipe[0]);
+        b->pipe[1] = safe_close(b->pipe[1]);
+        b->barriers = 0;
+}
+
+/**
+ * barrier_set_role() - Set the local role of the barrier
+ * @b: barrier to operate on
+ * @role: role to set on the barrier
+ *
+ * This sets the roles on a barrier object. This is needed to know which
+ * side of the barrier you're on. Usually, the parent creates the barrier via
+ * barrier_init() and then calls fork() or clone(). Therefore, the FDs are
+ * duplicated and the child retains the same barrier object.
+ *
+ * Both sides need to call barrier_set_role() after fork() or clone() are done.
+ * If this is not done, barriers will not work correctly.
+ *
+ * Note that barriers could be supported without fork() or clone(). However,
+ * this is currently not needed so it hasn't been implemented.
+ */
+void barrier_set_role(Barrier *b, unsigned int role) {
+        int fd;
+
+        assert(b);
+        assert(role == BARRIER_PARENT || role == BARRIER_CHILD);
+        /* make sure this is only called once; both pipe ends must be open */
+        assert(b->pipe[0] >= 0 && b->pipe[1] >= 0);
+
+        if (role == BARRIER_PARENT) {
+                b->pipe[1] = safe_close(b->pipe[1]);
+        } else {
+                b->pipe[0] = safe_close(b->pipe[0]);
+
+                /* swap me/them for children */
+                fd = b->me;
+                b->me = b->them;
+                b->them = fd;
+        }
+}
+
+/* places barrier; returns false if we aborted, otherwise true */
+static bool barrier_write(Barrier *b, uint64_t buf) {
+        ssize_t len;
+
+        /* prevent new sync-points if we already aborted */
+        if (barrier_i_aborted(b))
+                return false;
+
+        do {
+                len = write(b->me, &buf, sizeof(buf));
+        } while (len < 0 && (errno == EAGAIN || errno == EINTR));
+
+        if (len != sizeof(buf))
+                goto error;
+
+        /* lock if we aborted */
+        if (buf >= (uint64_t)BARRIER_ABORTION) {
+                if (barrier_they_aborted(b))
+                        b->barriers = BARRIER_WE_ABORTED;
+                else
+                        b->barriers = BARRIER_I_ABORTED;
+        } else if (!barrier_is_aborted(b)) {
+                b->barriers += buf;
+        }
+
+        return !barrier_i_aborted(b);
+
+error:
+        /* If there is an unexpected error, we have to make this fatal. There
+         * is no way we can recover from sync-errors. Therefore, we close the
+         * pipe-ends and treat this as abortion. The other end will notice the
+         * pipe-close and treat it as abortion, too. */
+
+        b->pipe[0] = safe_close(b->pipe[0]);
+        b->pipe[1] = safe_close(b->pipe[1]);
+        b->barriers = BARRIER_WE_ABORTED;
+        return false;
+}
+
+/* waits for barriers; returns false if they aborted, otherwise true */
+static bool barrier_read(Barrier *b, int64_t comp) {
+        uint64_t buf;
+        ssize_t len;
+        struct pollfd pfd[2] = { };
+        int r;
+
+        if (barrier_they_aborted(b))
+                return false;
+
+        while (b->barriers > comp) {
+                pfd[0].fd = (b->pipe[0] >= 0) ? b->pipe[0] : b->pipe[1];
+                pfd[0].events = POLLHUP;
+                pfd[0].revents = 0;
+                pfd[1].fd = b->them;
+                pfd[1].events = POLLIN;
+                pfd[1].revents = 0;
+
+                r = poll(pfd, 2, -1);
+                if (r < 0 && (errno == EAGAIN || errno == EINTR))
+                        continue;
+                else if (r < 0)
+                        goto error;
+
+                if (pfd[1].revents) {
+                        /* events on @them signal us new data */
+                        len = read(b->them, &buf, sizeof(buf));
+                        if (len < 0 && (errno == EAGAIN || errno == EINTR))
+                                continue;
+
+                        if (len != sizeof(buf))
+                                goto error;
+                } else if (pfd[0].revents & (POLLHUP | POLLERR | POLLNVAL)) {
+                        /* POLLHUP on the pipe tells us the other side exited.
+                         * We treat this as implicit abortion, but only if
+                         * there's no event on the eventfd. This guarantees that
+                         * exit-abortions do not overwrite real barriers. */
+                        buf = BARRIER_ABORTION;
+                } else
+                        continue; /* no event to handle; @buf would be unset */
+
+                /* lock if they aborted */
+                if (buf >= (uint64_t)BARRIER_ABORTION) {
+                        if (barrier_i_aborted(b))
+                                b->barriers = BARRIER_WE_ABORTED;
+                        else
+                                b->barriers = BARRIER_THEY_ABORTED;
+                } else if (!barrier_is_aborted(b)) {
+                        b->barriers -= buf;
+                }
+        }
+
+        return !barrier_they_aborted(b);
+
+error:
+        /* If there is an unexpected error, we have to make this fatal. There
+         * is no way we can recover from sync-errors. Therefore, we close the
+         * pipe-ends and treat this as abortion. The other end will notice the
+         * pipe-close and treat it as abortion, too. */
+
+        b->pipe[0] = safe_close(b->pipe[0]);
+        b->pipe[1] = safe_close(b->pipe[1]);
+        b->barriers = BARRIER_WE_ABORTED;
+        return false;
+}
+
+/**
+ * barrier_place() - Place a new barrier
+ * @b: barrier object
+ *
+ * This places a new barrier on the barrier object. If either side already
+ * aborted, this is a no-op and returns "false". Otherwise, the barrier is
+ * placed and this returns "true".
+ *
+ * Returns: true if barrier was placed, false if either side aborted.
+ */
+bool barrier_place(Barrier *b) {
+        assert(b);
+
+        if (barrier_is_aborted(b))
+                return false;
+
+        /* a failed write is turned into an abortion; report it as "false" */
+        return barrier_write(b, BARRIER_SINGLE);
+}
+
+/**
+ * barrier_abort() - Abort the synchronization
+ * @b: barrier object to abort
+ *
+ * This aborts the barrier-synchronization. If barrier_abort() was already
+ * called on this side, this is a no-op. Otherwise, the barrier is put into the
+ * ABORT-state and will stay there. The other side is notified about the
+ * abortion. Any following attempt to place normal barriers or to wait on normal
+ * barriers will return immediately as "false".
+ *
+ * You can wait for the other side to call barrier_abort(), too. Use
+ * barrier_wait_abortion() for that.
+ *
+ * Returns: false if the other side already aborted, true otherwise.
+ */
+bool barrier_abort(Barrier *b) {
+        assert(b);
+
+        barrier_write(b, BARRIER_ABORTION);
+        return !barrier_they_aborted(b);
+}
+
+/**
+ * barrier_wait_next() - Wait for the next barrier of the other side
+ * @b: barrier to operate on
+ *
+ * This waits until the other side places its next barrier. This is independent
+ * of any barrier-links and just waits for any next barrier of the other side.
+ *
+ * If either side aborted, this returns false.
+ *
+ * Returns: false if either side aborted, true otherwise.
+ */
+bool barrier_wait_next(Barrier *b) {
+        assert(b);
+
+        if (barrier_is_aborted(b))
+                return false;
+
+        barrier_read(b, b->barriers - 1);
+        return !barrier_is_aborted(b);
+}
+
+/**
+ * barrier_wait_abortion() - Wait for the other side to abort
+ * @b: barrier to operate on
+ *
+ * This waits until the other side called barrier_abort(). This can be called
+ * regardless whether the local side already called barrier_abort() or not.
+ *
+ * If the other side has already aborted, this returns immediately.
+ *
+ * Returns: false if the local side aborted, true otherwise.
+ */
+bool barrier_wait_abortion(Barrier *b) {
+        assert(b);
+
+        barrier_read(b, BARRIER_THEY_ABORTED);
+        return !barrier_i_aborted(b);
+}
+
+/**
+ * barrier_sync_next() - Wait for the other side to place a next linked barrier
+ * @b: barrier to operate on
+ *
+ * This is like barrier_wait_next() and waits for the other side to call
+ * barrier_place(). However, this only waits for linked barriers. That means, if
+ * the other side already placed more barriers than (or as much as) we did, this
+ * returns immediately instead of waiting.
+ *
+ * If either side aborted, this returns false.
+ *
+ * Returns: false if either side aborted, true otherwise.
+ */
+bool barrier_sync_next(Barrier *b) {
+        assert(b);
+
+        if (barrier_is_aborted(b))
+                return false;
+
+        barrier_read(b, MAX((int64_t)0, b->barriers - 1));
+        return !barrier_is_aborted(b);
+}
+
+/**
+ * barrier_sync() - Wait for the other side to place as many barriers as we did
+ * @b: barrier to operate on
+ *
+ * This is like barrier_sync_next() but waits for the other side to call
+ * barrier_place() as often as we did (in total). If they already placed as much
+ * as we did (or more), this returns immediately instead of waiting.
+ *
+ * If either side aborted, this returns false.
+ *
+ * Returns: false if either side aborted, true otherwise.
+ */
+bool barrier_sync(Barrier *b) {
+        assert(b);
+
+        if (barrier_is_aborted(b))
+                return false;
+
+        barrier_read(b, 0);
+        return !barrier_is_aborted(b);
+}
diff --git a/src/shared/barrier.h b/src/shared/barrier.h
new file mode 100644
index 0000000..7f76ec7
--- /dev/null
+++ b/src/shared/barrier.h
@@ -0,0 +1,92 @@
+/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
+
+#pragma once
+
+/***
+  This file is part of systemd.
+
+  Copyright 2014 David Herrmann <dh.herrmann at gmail.com>
+
+  systemd is free software; you can redistribute it and/or modify it
+  under the terms of the GNU Lesser General Public License as published by
+  the Free Software Foundation; either version 2.1 of the License, or
+  (at your option) any later version.
+
+  systemd is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with systemd; If not, see <http://www.gnu.org/licenses/>.
+***/
+
+#include <errno.h>
+#include <inttypes.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+
+#include "macro.h"
+#include "util.h"
+
+/* See source file for an API description. */
+
+typedef struct Barrier Barrier;
+
+enum {
+        BARRIER_SINGLE                  = 1LL,
+        BARRIER_ABORTION                = INT64_MAX,
+
+        /* bias values to store state; keep @WE < @THEY < @I */
+        BARRIER_BIAS                    = INT64_MIN,
+        BARRIER_WE_ABORTED              = BARRIER_BIAS + 1LL,
+        BARRIER_THEY_ABORTED            = BARRIER_BIAS + 2LL,
+        BARRIER_I_ABORTED               = BARRIER_BIAS + 3LL,
+};
+
+enum {
+        BARRIER_PARENT,
+        BARRIER_CHILD,
+};
+
+struct Barrier {
+        int me;
+        int them;
+        int pipe[2];
+        int64_t barriers;
+};
+
+int barrier_init(Barrier *obj);
+void barrier_destroy(Barrier *b);
+
+void barrier_set_role(Barrier *b, unsigned int role);
+
+bool barrier_place(Barrier *b);
+bool barrier_abort(Barrier *b);
+
+bool barrier_wait_next(Barrier *b);
+bool barrier_wait_abortion(Barrier *b);
+bool barrier_sync_next(Barrier *b);
+bool barrier_sync(Barrier *b);
+
+static inline bool barrier_i_aborted(Barrier *b) {
+        return b->barriers == BARRIER_I_ABORTED || b->barriers == BARRIER_WE_ABORTED;
+}
+
+static inline bool barrier_they_aborted(Barrier *b) {
+        return b->barriers == BARRIER_THEY_ABORTED || b->barriers == BARRIER_WE_ABORTED;
+}
+
+static inline bool barrier_we_aborted(Barrier *b) {
+        return b->barriers == BARRIER_WE_ABORTED;
+}
+
+static inline bool barrier_is_aborted(Barrier *b) {
+        return b->barriers == BARRIER_I_ABORTED || b->barriers == BARRIER_THEY_ABORTED || b->barriers == BARRIER_WE_ABORTED;
+}
+
+static inline bool barrier_place_and_sync(Barrier *b) {
+        barrier_place(b);
+        return barrier_sync(b);
+}
diff --git a/src/test/test-barrier.c b/src/test/test-barrier.c
new file mode 100644
index 0000000..640e508
--- /dev/null
+++ b/src/test/test-barrier.c
@@ -0,0 +1,460 @@
+/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
+
+/***
+  This file is part of systemd.
+
+  Copyright 2014 David Herrmann <dh.herrmann at gmail.com>
+
+  systemd is free software; you can redistribute it and/or modify it
+  under the terms of the GNU Lesser General Public License as published by
+  the Free Software Foundation; either version 2.1 of the License, or
+  (at your option) any later version.
+
+  systemd is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with systemd; If not, see <http://www.gnu.org/licenses/>.
+***/
+
+/*
+ * IPC barrier tests
+ * These tests verify the correct behavior of the IPC Barrier implementation.
+ * Note that the tests use alarm-timers to verify dead-locks and timeouts. These
+ * might not work on slow machines where 20ms are too short to perform specific
+ * operations (though, very unlikely). In case that turns out true, we have to
+ * increase it at the slight cost of a longer test duration on other machines.
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "barrier.h"
+#include "def.h"
+#include "util.h"
+
+/* 20ms to test deadlocks; All timings use multiples of this constant as
+ * alarm/sleep timers. If this timeout is too small for slow machines to perform
+ * the requested operations, we have to increase it. On an i7 this works fine
+ * with 1ms base-time, so 20ms should be just fine for everyone. */
+#define BASE_TIME 20
+
+static void malarm(unsigned long msecs) {
+        struct itimerval v = { };
+
+        timeval_store(&v.it_value, msecs * USEC_PER_MSEC);
+        assert_se(setitimer(ITIMER_REAL, &v, NULL) >= 0);
+}
+
+static void msleep(unsigned long msecs) {
+        assert_se(msecs < MSEC_PER_SEC);
+        usleep(msecs * USEC_PER_MSEC);
+}
+
+#define TEST_BARRIER(_FUNCTION, _CHILD_CODE, _WAIT_CHILD, _PARENT_CODE, _WAIT_PARENT)  \
+        static void _FUNCTION(void) {                                   \
+                Barrier b;                                              \
+                pid_t pid1, pid2;                                       \
+                                                                        \
+                assert_se(barrier_init(&b) >= 0);                       \
+                                                                        \
+                pid1 = fork();                                          \
+                assert_se(pid1 >= 0);                                   \
+                if (pid1 == 0) {                                        \
+                        barrier_set_role(&b, BARRIER_CHILD);            \
+                        { _CHILD_CODE; }                                \
+                        exit(42);                                       \
+                }                                                       \
+                                                                        \
+                pid2 = fork();                                          \
+                assert_se(pid2 >= 0);                                   \
+                if (pid2 == 0) {                                        \
+                        barrier_set_role(&b, BARRIER_PARENT);           \
+                        { _PARENT_CODE; }                               \
+                        exit(42);                                       \
+                }                                                       \
+                                                                        \
+                barrier_destroy(&b);                                    \
+                malarm(999);                                            \
+                { _WAIT_CHILD; }                                        \
+                { _WAIT_PARENT; }                                       \
+                malarm(0);                                              \
+        }
+
+#define TEST_BARRIER_WAIT_SUCCESS(_pid) \
+                ({                                                      \
+                        int pidr, status;                               \
+                        pidr = waitpid(_pid, &status, 0);               \
+                        assert_se(pidr == _pid);                        \
+                        assert_se(WIFEXITED(status));                   \
+                        assert_se(WEXITSTATUS(status) == 42);           \
+                })
+
+#define TEST_BARRIER_WAIT_ALARM(_pid) \
+                ({                                                      \
+                        int pidr, status;                               \
+                        pidr = waitpid(_pid, &status, 0);               \
+                        assert_se(pidr == _pid);                        \
+                        assert_se(WIFSIGNALED(status));                 \
+                        assert_se(WTERMSIG(status) == SIGALRM);         \
+                })
+
+/*
+ * Test basic sync points
+ * This places a barrier in both processes and waits synchronously for them.
+ * The timeout makes sure the sync works as expected. The msleep() on one side
+ * makes sure the exit of the parent does not overwrite previous barriers. Due
+ * to the msleep(), we know that the parent already exited, thus there's a
+ * pending HUP on the pipe. However, the barrier_sync() prefers reads on the
+ * eventfd, thus we can safely wait on the barrier.
+ */
+TEST_BARRIER(test_barrier_sync,
+        ({
+                malarm(BASE_TIME * 10);
+                assert_se(barrier_place(&b));
+                msleep(BASE_TIME * 2);
+                assert_se(barrier_sync(&b));
+        }),
+        TEST_BARRIER_WAIT_SUCCESS(pid1),
+        ({
+                malarm(BASE_TIME * 10);
+                assert_se(barrier_place(&b));
+                assert_se(barrier_sync(&b));
+        }),
+        TEST_BARRIER_WAIT_SUCCESS(pid2));
+
+/*
+ * Test wait_next()
+ * This places a barrier in the parent and syncs on it. The child sleeps while
+ * the parent places the barrier and then waits for a barrier. The wait will
+ * succeed as the child hasn't read the parent's barrier, yet. The following
+ * barrier and sync synchronize the exit.
+ */
+TEST_BARRIER(test_barrier_wait_next,
+        ({
+                msleep(100);
+                malarm(BASE_TIME * 10);
+                assert_se(barrier_wait_next(&b));
+                assert_se(barrier_place(&b));
+                assert_se(barrier_sync(&b));
+        }),
+        TEST_BARRIER_WAIT_SUCCESS(pid1),
+        ({
+                malarm(400);
+                assert_se(barrier_place(&b));
+                assert_se(barrier_sync(&b));
+        }),
+        TEST_BARRIER_WAIT_SUCCESS(pid2));
+
+/*
+ * Test wait_next() multiple times
+ * This places two barriers in the parent and waits for the child to exit. The
+ * child sleeps 20ms so both barriers _should_ be in place. It then waits for
+ * the parent to place the next barrier twice. The first call will fetch both
+ * barriers and return. However, the second call will stall as the parent does
+ * not place a 3rd barrier (the sleep caught two barriers). wait_next() does
+ * not look at barrier-links so this stall is expected. Thus this test times
+ * out.
+ */
+TEST_BARRIER(test_barrier_wait_next_twice,
+        ({
+                msleep(BASE_TIME);
+                malarm(BASE_TIME);
+                assert_se(barrier_wait_next(&b));
+                assert_se(barrier_wait_next(&b));
+                assert_se(0);
+        }),
+        TEST_BARRIER_WAIT_ALARM(pid1),
+        ({
+                malarm(BASE_TIME * 10);
+                assert_se(barrier_place(&b));
+                assert_se(barrier_place(&b));
+                msleep(BASE_TIME * 2);
+        }),
+        TEST_BARRIER_WAIT_SUCCESS(pid2));
+
+/*
+ * Test wait_next() with local barriers
+ * This is the same as test_barrier_wait_next_twice, but places local barriers
+ * between both waits. This does not have any effect on the wait so it times out
+ * like the other test.
+ */
+TEST_BARRIER(test_barrier_wait_next_twice_local,
+        ({
+                msleep(BASE_TIME);
+                malarm(BASE_TIME);
+                assert_se(barrier_wait_next(&b));
+                assert_se(barrier_place(&b));
+                assert_se(barrier_place(&b));
+                assert_se(barrier_wait_next(&b));
+                assert_se(0);
+        }),
+        TEST_BARRIER_WAIT_ALARM(pid1),
+        ({
+                malarm(BASE_TIME * 10);
+                assert_se(barrier_place(&b));
+                assert_se(barrier_place(&b));
+                msleep(BASE_TIME * 2);
+        }),
+        TEST_BARRIER_WAIT_SUCCESS(pid2));
+
+/*
+ * Test wait_next() with sync_next()
+ * This is again the same as test_barrier_wait_next_twice but uses a
+ * synced wait as the second wait. This works just fine because the local state
+ * has no barriers placed, therefore, the remote is always in sync.
+ */
+TEST_BARRIER(test_barrier_wait_next_twice_sync,
+        ({
+                msleep(BASE_TIME);
+                malarm(BASE_TIME);
+                assert_se(barrier_wait_next(&b));
+                assert_se(barrier_sync_next(&b));
+        }),
+        TEST_BARRIER_WAIT_SUCCESS(pid1),
+        ({
+                malarm(BASE_TIME * 10);
+                assert_se(barrier_place(&b));
+                assert_se(barrier_place(&b));
+        }),
+        TEST_BARRIER_WAIT_SUCCESS(pid2));
+
+/*
+ * Test wait_next() with sync_next() and local barriers
+ * This is again the same as test_barrier_wait_next_twice_local but uses a
+ * synced wait as the second wait. This works just fine because the local state
+ * is in sync with the remote.
+ */
+TEST_BARRIER(test_barrier_wait_next_twice_local_sync,
+        ({
+                msleep(BASE_TIME);
+                malarm(BASE_TIME);
+                assert_se(barrier_wait_next(&b));
+                assert_se(barrier_place(&b));
+                assert_se(barrier_place(&b));
+                assert_se(barrier_sync_next(&b));
+        }),
+        TEST_BARRIER_WAIT_SUCCESS(pid1),
+        ({
+                malarm(BASE_TIME * 10);
+                assert_se(barrier_place(&b));
+                assert_se(barrier_place(&b));
+        }),
+        TEST_BARRIER_WAIT_SUCCESS(pid2));
+
+/*
+ * Test sync_next() and sync()
+ * This tests sync_*() synchronizations and makes sure they work fine if the
+ * local state is behind the remote state.
+ */
+TEST_BARRIER(test_barrier_sync_next,
+        ({
+                malarm(BASE_TIME * 10);
+                assert_se(barrier_sync_next(&b));
+                assert_se(barrier_sync(&b));
+                assert_se(barrier_place(&b));
+                assert_se(barrier_place(&b));
+                assert_se(barrier_sync_next(&b));
+                assert_se(barrier_sync_next(&b));
+                assert_se(barrier_sync(&b));
+        }),
+        TEST_BARRIER_WAIT_SUCCESS(pid1),
+        ({
+                malarm(BASE_TIME * 10);
+                msleep(BASE_TIME);
+                assert_se(barrier_place(&b));
+                assert_se(barrier_place(&b));
+                assert_se(barrier_sync(&b));
+        }),
+        TEST_BARRIER_WAIT_SUCCESS(pid2));
+
+/*
+ * Test sync_next() and sync() with local barriers
+ * This tests timeouts if sync_*() is used if local barriers are placed but the
+ * remote didn't place any.
+ */
+TEST_BARRIER(test_barrier_sync_next_local,
+        ({
+                malarm(BASE_TIME);
+                assert_se(barrier_place(&b));
+                assert_se(barrier_sync_next(&b));
+                assert_se(0);
+        }),
+        TEST_BARRIER_WAIT_ALARM(pid1),
+        ({
+                msleep(BASE_TIME * 2);
+        }),
+        TEST_BARRIER_WAIT_SUCCESS(pid2));
+
+/*
+ * Test sync_next() and sync() with local barriers and abortion
+ * This is the same as test_barrier_sync_next_local but aborts the sync in the
+ * parent. Therefore, the sync_next() returns false instead of timing out.
+ */
+TEST_BARRIER(test_barrier_sync_next_local_abort,
+        ({
+                malarm(BASE_TIME * 10);
+                assert_se(barrier_place(&b));
+                assert_se(!barrier_sync_next(&b)); /* false: the remote side aborted */
+        }),
+        TEST_BARRIER_WAIT_SUCCESS(pid1),
+        ({
+                assert_se(barrier_abort(&b)); /* abort without placing any barrier */
+        }),
+        TEST_BARRIER_WAIT_SUCCESS(pid2));
+
+/*
+ * Test matched wait_abortion()
+ * This runs wait_abortion() with remote abortion; it must return true.
+ */
+TEST_BARRIER(test_barrier_wait_abortion,
+        ({
+                malarm(BASE_TIME * 10); /* alarm as safety net; this side is expected to finish */
+                assert_se(barrier_wait_abortion(&b));
+        }),
+        TEST_BARRIER_WAIT_SUCCESS(pid1),
+        ({
+                assert_se(barrier_abort(&b)); /* the abortion the other side waits for */
+        }),
+        TEST_BARRIER_WAIT_SUCCESS(pid2));
+
+/*
+ * Test unmatched wait_abortion()
+ * This runs wait_abortion() without any remote abortion going on. It thus must
+ * time out.
+ */
+TEST_BARRIER(test_barrier_wait_abortion_unmatched,
+        ({
+                malarm(BASE_TIME);
+                assert_se(barrier_wait_abortion(&b)); /* expected to block until the alarm fires */
+                assert_se(0); /* must not be reached */
+        }),
+        TEST_BARRIER_WAIT_ALARM(pid1),
+        ({
+                msleep(BASE_TIME * 2); /* never aborts; just outlasts the BASE_TIME alarm */
+        }),
+        TEST_BARRIER_WAIT_SUCCESS(pid2));
+
+/*
+ * Test matched wait_abortion() with local abortion
+ * This runs wait_abortion() with local and remote abortion.
+ */
+TEST_BARRIER(test_barrier_wait_abortion_local,
+        ({
+                malarm(BASE_TIME * 10);
+                assert_se(barrier_abort(&b)); /* local abortion first */
+                assert_se(!barrier_wait_abortion(&b)); /* false is expected after a local abort */
+        }),
+        TEST_BARRIER_WAIT_SUCCESS(pid1),
+        ({
+                assert_se(barrier_abort(&b)); /* matching remote abortion */
+        }),
+        TEST_BARRIER_WAIT_SUCCESS(pid2));
+
+/*
+ * Test unmatched wait_abortion() with local abortion
+ * This runs wait_abortion() with only local abortion. This must time out.
+ */
+TEST_BARRIER(test_barrier_wait_abortion_local_unmatched,
+        ({
+                malarm(BASE_TIME);
+                assert_se(barrier_abort(&b));
+                assert_se(!barrier_wait_abortion(&b)); /* expected to block until the alarm fires */
+                assert_se(0); /* must not be reached */
+        }),
+        TEST_BARRIER_WAIT_ALARM(pid1),
+        ({
+                msleep(BASE_TIME * 2); /* never aborts; just outlasts the BASE_TIME alarm */
+        }),
+        TEST_BARRIER_WAIT_SUCCESS(pid2));
+
+/*
+ * Test child exit
+ * Place barrier and sync with the child. The child only exit()s, which should
+ * cause an implicit abortion and wake the parent.
+ */
+TEST_BARRIER(test_barrier_exit,
+        ({ /* intentionally empty: the child has nothing to do */
+        }),
+        TEST_BARRIER_WAIT_SUCCESS(pid1),
+        ({
+                malarm(BASE_TIME * 10); /* alarm as safety net; the sync must return before it */
+                assert_se(barrier_place(&b));
+                assert_se(!barrier_sync(&b)); /* false: ended by the implicit abortion */
+        }),
+        TEST_BARRIER_WAIT_SUCCESS(pid2));
+
+/*
+ * Test child exit with sleep
+ * Same as test_barrier_exit but verifies the test really works due to the
+ * child-exit. We add an msleep() to the child so the alarm in the parent
+ * fires first and the parent times out.
+ */
+TEST_BARRIER(test_barrier_no_exit,
+        ({
+                msleep(BASE_TIME * 2); /* stay alive longer than the parent's BASE_TIME alarm */
+        }),
+        TEST_BARRIER_WAIT_SUCCESS(pid1),
+        ({
+                malarm(BASE_TIME);
+                assert_se(barrier_place(&b));
+                assert_se(!barrier_sync(&b)); /* expected to block until the alarm fires */
+        }),
+        TEST_BARRIER_WAIT_ALARM(pid2));
+
+/*
+ * Test pending exit against sync
+ * The parent places a barrier *and* exits. The msleep() in the child
+ * guarantees both are pending. However, our logic prefers pending barriers over
+ * pending exit-abortions (unlike normal abortions), thus the wait_next() must
+ * succeed, same for the sync_next() as our local barrier-count is smaller than
+ * the remote. Once we place a barrier our count is equal, so the sync still
+ * succeeds. Only if we place one more barrier, we're ahead of the remote, thus
+ * we will fail due to HUP on the pipe.
+ */
+TEST_BARRIER(test_barrier_pending_exit,
+        ({
+                malarm(BASE_TIME * 4);
+                msleep(BASE_TIME * 2); /* let the parent place its barrier and exit first */
+                assert_se(barrier_wait_next(&b)); /* pending barrier wins over pending exit */
+                assert_se(barrier_sync_next(&b)); /* local count is behind the remote */
+                assert_se(barrier_place(&b)); /* counts are now equal */
+                assert_se(barrier_sync_next(&b));
+                assert_se(barrier_place(&b)); /* now ahead of the remote */
+                assert_se(!barrier_sync_next(&b)); /* false: fails due to HUP */
+        }),
+        TEST_BARRIER_WAIT_SUCCESS(pid1),
+        ({
+                assert_se(barrier_place(&b)); /* single barrier, then the parent block ends */
+        }),
+        TEST_BARRIER_WAIT_SUCCESS(pid2));
+
+int main(int argc, char *argv[]) { /* test driver; argc and argv are not used */
+        log_parse_environment();
+        log_open(); /* logging is set up before the first test case runs */
+
+        test_barrier_sync(); /* test cases run one after another, in this fixed order */
+        test_barrier_wait_next();
+        test_barrier_wait_next_twice();
+        test_barrier_wait_next_twice_sync();
+        test_barrier_wait_next_twice_local();
+        test_barrier_wait_next_twice_local_sync();
+        test_barrier_sync_next();
+        test_barrier_sync_next_local();
+        test_barrier_sync_next_local_abort();
+        test_barrier_wait_abortion();
+        test_barrier_wait_abortion_unmatched();
+        test_barrier_wait_abortion_local();
+        test_barrier_wait_abortion_local_unmatched();
+        test_barrier_exit();
+        test_barrier_no_exit();
+        test_barrier_pending_exit();
+
+        return 0; /* unconditional; failures presumably surface via assert_se() in the tests */
+}