[systemd-commits] 4 commits - Makefile-man.am man/sd_listen_fds.xml man/sd_notify.xml man/systemd.service.xml src/core src/journal src/libsystemd src/machine src/shared src/systemd src/udev units/systemd-journald.service.in

Lennart Poettering lennart at kemper.freedesktop.org
Mon Jan 5 18:16:51 PST 2015


 Makefile-man.am                       |   31 ++
 man/sd_listen_fds.xml                 |   21 +
 man/sd_notify.xml                     |  122 ++++++++++-
 man/systemd.service.xml               |   29 ++
 src/core/dbus-service.c               |    1 
 src/core/load-fragment-gperf.gperf.m4 |    1 
 src/core/manager.c                    |   78 ++++---
 src/core/service.c                    |  166 +++++++++++++--
 src/core/service.h                    |   14 +
 src/core/unit.c                       |   21 +
 src/core/unit.h                       |    6 
 src/journal/journald-server.c         |   30 +-
 src/journal/journald-stream.c         |  376 +++++++++++++++++++++++++++++-----
 src/journal/journald-stream.h         |    3 
 src/libsystemd/libsystemd.sym.m4      |    5 
 src/libsystemd/sd-daemon/sd-daemon.c  |   94 +++++---
 src/machine/machine.c                 |   10 
 src/shared/fdset.c                    |   54 ++++
 src/shared/fdset.h                    |   11 
 src/shared/util.c                     |   25 ++
 src/shared/util.h                     |    2 
 src/systemd/sd-daemon.h               |   13 +
 src/udev/collect/Makefile             |    1 
 src/udev/v4l_id/Makefile              |    1 
 units/systemd-journald.service.in     |    1 
 25 files changed, 951 insertions(+), 165 deletions(-)

New commits:
commit 15f7a395c8f0b493c5ab8f57e6af7db7ad14688a
Author: Lennart Poettering <lennart at poettering.net>
Date:   Tue Jan 6 03:16:07 2015 +0100

    build-sys: add two more missing makefile links

diff --git a/src/udev/collect/Makefile b/src/udev/collect/Makefile
new file mode 120000
index 0000000..d0b0e8e
--- /dev/null
+++ b/src/udev/collect/Makefile
@@ -0,0 +1 @@
+../Makefile
\ No newline at end of file
diff --git a/src/udev/v4l_id/Makefile b/src/udev/v4l_id/Makefile
new file mode 120000
index 0000000..d0b0e8e
--- /dev/null
+++ b/src/udev/v4l_id/Makefile
@@ -0,0 +1 @@
+../Makefile
\ No newline at end of file

commit 17a20d64912e95ea90380381f85e9ef7fd56ff67
Author: Lennart Poettering <lennart at poettering.net>
Date:   Tue Jan 6 00:31:38 2015 +0100

    machined: simplification

diff --git a/src/machine/machine.c b/src/machine/machine.c
index b283118..6c01617 100644
--- a/src/machine/machine.c
+++ b/src/machine/machine.c
@@ -202,6 +202,9 @@ int machine_save(Machine *m) {
                 goto finish;
         }
 
+        free(temp_path);
+        temp_path = NULL;
+
         if (m->unit) {
                 char *sl;
 
@@ -213,12 +216,11 @@ int machine_save(Machine *m) {
         }
 
 finish:
-        if (r < 0) {
-                if (temp_path)
-                        unlink(temp_path);
+        if (temp_path)
+                unlink(temp_path);
 
+        if (r < 0)
                 log_error_errno(r, "Failed to save machine data %s: %m", m->state_file);
-        }
 
         return r;
 }

commit 13790add4bf648fed816361794d8277a75253410
Author: Lennart Poettering <lennart at poettering.net>
Date:   Tue Jan 6 00:30:25 2015 +0100

    journald: allow restarting journald without losing stream connections
    
    Making use of the fd storage capability of the previous commit, allow
    restarting journald by serializing stream state to /run, and pushing open
    fds to PID 1.

diff --git a/src/journal/journald-server.c b/src/journal/journald-server.c
index 4613550..e0a078e 100644
--- a/src/journal/journald-server.c
+++ b/src/journal/journald-server.c
@@ -1182,6 +1182,10 @@ int server_process_datagram(sd_event_source *es, int fd, uint32_t revents, void
                         log_error_errno(errno, "recvmsg() failed: %m");
                         return -errno;
                 }
+                if (n == 0) {
+                        log_error("Got EOF on socket.");
+                        return -ECONNRESET;
+                }
 
                 for (cmsg = CMSG_FIRSTHDR(&msghdr); cmsg; cmsg = CMSG_NXTHDR(&msghdr, cmsg)) {
 
@@ -1462,6 +1466,7 @@ static int server_open_hostname(Server *s) {
 }
 
 int server_init(Server *s) {
+        _cleanup_fdset_free_ FDSet *fds = NULL;
         int n, r, fd;
 
         assert(s);
@@ -1558,26 +1563,33 @@ int server_init(Server *s) {
                         s->audit_fd = fd;
 
                 } else {
-                        log_warning("Unknown socket passed as file descriptor %d, ignoring.", fd);
 
-                        /* Let's close the fd, better be safe than
-                           sorry. The fd might reference some resource
-                           that we really want to release if we don't
-                           make use of it. */
+                        if (!fds) {
+                                fds = fdset_new();
+                                if (!fds)
+                                        return log_oom();
+                        }
 
-                        safe_close(fd);
+                        r = fdset_put(fds, fd);
+                        if (r < 0)
+                                return log_oom();
                 }
         }
 
-        r = server_open_syslog_socket(s);
+        r = server_open_stdout_socket(s, fds);
         if (r < 0)
                 return r;
 
-        r = server_open_native_socket(s);
+        if (fdset_size(fds) > 0) {
+                log_warning("%u unknown file descriptors passed, closing.", fdset_size(fds));
+                fds = fdset_free(fds);
+        }
+
+        r = server_open_syslog_socket(s);
         if (r < 0)
                 return r;
 
-        r = server_open_stdout_socket(s);
+        r = server_open_native_socket(s);
         if (r < 0)
                 return r;
 
diff --git a/src/journal/journald-stream.c b/src/journal/journald-stream.c
index be498d4..eabe019 100644
--- a/src/journal/journald-stream.c
+++ b/src/journal/journald-stream.c
@@ -28,8 +28,11 @@
 #endif
 
 #include "sd-event.h"
+#include "sd-daemon.h"
 #include "socket-util.h"
 #include "selinux-util.h"
+#include "mkdir.h"
+#include "fileio.h"
 #include "journald-server.h"
 #include "journald-stream.h"
 #include "journald-syslog.h"
@@ -69,14 +72,153 @@ struct StdoutStream {
         bool forward_to_kmsg:1;
         bool forward_to_console:1;
 
+        bool fdstore:1;
+
         char buffer[LINE_MAX+1];
         size_t length;
 
         sd_event_source *event_source;
 
+        char *state_file;
+
         LIST_FIELDS(StdoutStream, stdout_stream);
 };
 
+void stdout_stream_free(StdoutStream *s) {
+        if (!s)
+                return;
+
+        if (s->server) {
+                assert(s->server->n_stdout_streams > 0);
+                s->server->n_stdout_streams --;
+                LIST_REMOVE(stdout_stream, s->server->stdout_streams, s);
+        }
+
+        if (s->event_source) {
+                sd_event_source_set_enabled(s->event_source, SD_EVENT_OFF);
+                s->event_source = sd_event_source_unref(s->event_source);
+        }
+
+        safe_close(s->fd);
+
+#ifdef HAVE_SELINUX
+        if (s->security_context)
+                freecon(s->security_context);
+#endif
+
+        free(s->identifier);
+        free(s->unit_id);
+        free(s->state_file);
+
+        free(s);
+}
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(StdoutStream*, stdout_stream_free);
+
+static void stdout_stream_destroy(StdoutStream *s) {
+        if (!s)
+                return;
+
+        if (s->state_file)
+                unlink(s->state_file);
+
+        stdout_stream_free(s);
+}
+
+static int stdout_stream_save(StdoutStream *s) {
+        _cleanup_free_ char *temp_path = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        int r;
+
+        assert(s);
+
+        if (s->state != STDOUT_STREAM_RUNNING)
+                return 0;
+
+        if (!s->state_file) {
+                struct stat st;
+
+                r = fstat(s->fd, &st);
+                if (r < 0)
+                        return log_warning_errno(errno, "Failed to stat connected stream: %m");
+
+                /* We use device and inode numbers as identifier for the stream */
+                if (asprintf(&s->state_file, "/run/systemd/journal/streams/%lu:%lu", (unsigned long) st.st_dev, (unsigned long) st.st_ino) < 0)
+                        return log_oom();
+        }
+
+        mkdir_p("/run/systemd/journal/streams", 0755);
+
+        r = fopen_temporary(s->state_file, &f, &temp_path);
+        if (r < 0)
+                goto finish;
+
+        fprintf(f,
+                "# This is private data. Do not parse\n"
+                "PRIORITY=%i\n"
+                "LEVEL_PREFIX=%i\n"
+                "FORWARD_TO_SYSLOG=%i\n"
+                "FORWARD_TO_KMSG=%i\n"
+                "FORWARD_TO_CONSOLE=%i\n",
+                s->priority,
+                s->level_prefix,
+                s->forward_to_syslog,
+                s->forward_to_kmsg,
+                s->forward_to_console);
+
+        if (!isempty(s->identifier)) {
+                _cleanup_free_ char *escaped;
+
+                escaped = cescape(s->identifier);
+                if (!escaped) {
+                        r = -ENOMEM;
+                        goto finish;
+                }
+
+                fprintf(f, "IDENTIFIER=%s\n", escaped);
+        }
+
+        if (!isempty(s->unit_id)) {
+                _cleanup_free_ char *escaped;
+
+                escaped = cescape(s->unit_id);
+                if (!escaped) {
+                        r = -ENOMEM;
+                        goto finish;
+                }
+
+                fprintf(f, "UNIT=%s\n", escaped);
+        }
+
+        r = fflush_and_check(f);
+        if (r < 0)
+                goto finish;
+
+        if (rename(temp_path, s->state_file) < 0) {
+                r = -errno;
+                goto finish;
+        }
+
+        free(temp_path);
+        temp_path = NULL;
+
+        /* Store the connection fd in PID 1, so that we get it passed
+         * in again on next start */
+        if (!s->fdstore) {
+                sd_pid_notify_with_fds(0, false, "FDSTORE=1", &s->fd, 1);
+                s->fdstore = true;
+        }
+
+finish:
+        if (temp_path)
+                unlink(temp_path);
+
+        if (r < 0)
+                log_error_errno(r, "Failed to save stream data %s: %m", s->state_file);
+
+        return r;
+}
+
 static int stdout_stream_log(StdoutStream *s, const char *p) {
         struct iovec iovec[N_IOVEC_META_FIELDS + 5];
         int priority;
@@ -229,6 +371,9 @@ static int stdout_stream_line(StdoutStream *s, char *p) {
 
                 s->forward_to_console = !!r;
                 s->state = STDOUT_STREAM_RUNNING;
+
+                /* Try to save the stream, so that journald can be restarted and we can recover */
+                (void) stdout_stream_save(s);
                 return 0;
 
         case STDOUT_STREAM_RUNNING:
@@ -323,40 +468,63 @@ static int stdout_stream_process(sd_event_source *es, int fd, uint32_t revents,
         return 1;
 
 terminate:
-        stdout_stream_free(s);
+        stdout_stream_destroy(s);
         return 0;
 }
 
-void stdout_stream_free(StdoutStream *s) {
+static int stdout_stream_install(Server *s, int fd, StdoutStream **ret) {
+        _cleanup_(stdout_stream_freep) StdoutStream *stream = NULL;
+        int r;
+
         assert(s);
+        assert(fd >= 0);
 
-        if (s->server) {
-                assert(s->server->n_stdout_streams > 0);
-                s->server->n_stdout_streams --;
-                LIST_REMOVE(stdout_stream, s->server->stdout_streams, s);
-        }
+        stream = new0(StdoutStream, 1);
+        if (!stream)
+                return log_oom();
 
-        if (s->event_source) {
-                sd_event_source_set_enabled(s->event_source, SD_EVENT_OFF);
-                s->event_source = sd_event_source_unref(s->event_source);
-        }
+        stream->fd = -1;
+        stream->priority = LOG_INFO;
 
-        safe_close(s->fd);
+        r = getpeercred(fd, &stream->ucred);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine peer credentials: %m");
 
 #ifdef HAVE_SELINUX
-        if (s->security_context)
-                freecon(s->security_context);
+        if (mac_selinux_use()) {
+                if (getpeercon(fd, &stream->security_context) < 0 && errno != ENOPROTOOPT)
+                        log_error_errno(errno, "Failed to determine peer security context: %m");
+        }
 #endif
 
-        free(s->identifier);
-        free(s->unit_id);
-        free(s);
+        (void) shutdown(fd, SHUT_WR);
+
+        r = sd_event_add_io(s->event, &stream->event_source, fd, EPOLLIN, stdout_stream_process, stream);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add stream to event loop: %m");
+
+        r = sd_event_source_set_priority(stream->event_source, SD_EVENT_PRIORITY_NORMAL+5);
+        if (r < 0)
+                return log_error_errno(r, "Failed to adjust stdout event source priority: %m");
+
+        stream->fd = fd;
+
+        stream->server = s;
+        LIST_PREPEND(stdout_stream, s->stdout_streams, stream);
+        s->n_stdout_streams ++;
+
+        if (ret)
+                *ret = stream;
+
+        stream = NULL;
+
+        return 0;
 }
 
 static int stdout_stream_new(sd_event_source *es, int listen_fd, uint32_t revents, void *userdata) {
+        _cleanup_close_ int fd = -1;
         Server *s = userdata;
-        StdoutStream *stream;
-        int fd, r;
+        int r;
 
         assert(s);
 
@@ -376,60 +544,163 @@ static int stdout_stream_new(sd_event_source *es, int listen_fd, uint32_t revent
 
         if (s->n_stdout_streams >= STDOUT_STREAMS_MAX) {
                 log_warning("Too many stdout streams, refusing connection.");
-                safe_close(fd);
                 return 0;
         }
 
-        stream = new0(StdoutStream, 1);
-        if (!stream) {
-                safe_close(fd);
-                return log_oom();
+        r = stdout_stream_install(s, fd, NULL);
+        if (r < 0)
+                return r;
+
+        fd = -1;
+        return 0;
+}
+
+static int stdout_stream_load(StdoutStream *stream, const char *fname) {
+        _cleanup_free_ char
+                *priority = NULL,
+                *level_prefix = NULL,
+                *forward_to_syslog = NULL,
+                *forward_to_kmsg = NULL,
+                *forward_to_console = NULL;
+        int r;
+
+        assert(stream);
+        assert(fname);
+
+        if (!stream->state_file) {
+                stream->state_file = strappend("/run/systemd/journal/streams/", fname);
+                if (!stream->state_file)
+                        return log_oom();
         }
 
-        stream->fd = fd;
+        r = parse_env_file(stream->state_file, NEWLINE,
+                           "PRIORITY", &priority,
+                           "LEVEL_PREFIX", &level_prefix,
+                           "FORWARD_TO_SYSLOG", &forward_to_syslog,
+                           "FORWARD_TO_KMSG", &forward_to_kmsg,
+                           "FORWARD_TO_CONSOLE", &forward_to_console,
+                           "IDENTIFIER", &stream->identifier,
+                           "UNIT", &stream->unit_id,
+                           NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to read: %s", stream->state_file);
 
-        r = getpeercred(fd, &stream->ucred);
-        if (r < 0) {
-                log_error_errno(errno, "Failed to determine peer credentials: %m");
-                goto fail;
+        if (priority) {
+                int p;
+
+                p = log_level_from_string(priority);
+                if (p >= 0)
+                        stream->priority = p;
         }
 
-#ifdef HAVE_SELINUX
-        if (mac_selinux_use()) {
-                if (getpeercon(fd, &stream->security_context) < 0 && errno != ENOPROTOOPT)
-                        log_error_errno(errno, "Failed to determine peer security context: %m");
+        if (level_prefix) {
+                r = parse_boolean(level_prefix);
+                if (r >= 0)
+                        stream->level_prefix = r;
         }
-#endif
 
-        if (shutdown(fd, SHUT_WR) < 0) {
-                log_error_errno(errno, "Failed to shutdown writing side of socket: %m");
-                goto fail;
+        if (forward_to_syslog) {
+                r = parse_boolean(forward_to_syslog);
+                if (r >= 0)
+                        stream->forward_to_syslog = r;
         }
 
-        r = sd_event_add_io(s->event, &stream->event_source, fd, EPOLLIN, stdout_stream_process, stream);
-        if (r < 0) {
-                log_error_errno(r, "Failed to add stream to event loop: %m");
-                goto fail;
+        if (forward_to_kmsg) {
+                r = parse_boolean(forward_to_kmsg);
+                if (r >= 0)
+                        stream->forward_to_kmsg = r;
         }
 
-        r = sd_event_source_set_priority(stream->event_source, SD_EVENT_PRIORITY_NORMAL+5);
-        if (r < 0) {
-                log_error_errno(r, "Failed to adjust stdout event source priority: %m");
-                goto fail;
+        if (forward_to_console) {
+                r = parse_boolean(forward_to_console);
+                if (r >= 0)
+                        stream->forward_to_console = r;
         }
 
-        stream->server = s;
-        LIST_PREPEND(stdout_stream, s->stdout_streams, stream);
-        s->n_stdout_streams ++;
+        return 0;
+}
+
+static int stdout_stream_restore(Server *s, const char *fname, int fd) {
+        StdoutStream *stream;
+        int r;
+
+        assert(s);
+        assert(fname);
+        assert(fd >= 0);
+
+        if (s->n_stdout_streams >= STDOUT_STREAMS_MAX) {
+                log_warning("Too many stdout streams, refusing restoring of stream.");
+                return -ENOBUFS;
+        }
+
+        r = stdout_stream_install(s, fd, &stream);
+        if (r < 0)
+                return r;
+
+        stream->state = STDOUT_STREAM_RUNNING;
+        stream->fdstore = true;
+
+        /* Ignore all parsing errors */
+        (void) stdout_stream_load(stream, fname);
 
         return 0;
+}
+
+static int server_restore_streams(Server *s, FDSet *fds) {
+        _cleanup_closedir_ DIR *d = NULL;
+        struct dirent *de;
+        int r;
+
+        d = opendir("/run/systemd/journal/streams");
+        if (!d) {
+                if (errno == ENOENT)
+                        return 0;
+
+                return log_warning_errno(errno, "Failed to enumerate /run/systemd/journal/streams: %m");
+        }
+
+        FOREACH_DIRENT(de, d, goto fail) {
+                unsigned long st_dev, st_ino;
+                bool found = false;
+                Iterator i;
+                int fd;
+
+                if (sscanf(de->d_name, "%lu:%lu", &st_dev, &st_ino) != 2)
+                        continue;
+
+                FDSET_FOREACH(fd, fds, i) {
+                        struct stat st;
+
+                        if (fstat(fd, &st) < 0)
+                                return log_error_errno(errno, "Failed to stat %s: %m", de->d_name);
+
+                        if (S_ISSOCK(st.st_mode) && st.st_dev == st_dev && st.st_ino == st_ino) {
+                                found = true;
+                                break;
+                        }
+                }
+
+                if (!found) {
+                        /* No file descriptor? Then let's delete the state file */
+                        log_debug("Cannot restore stream file %s", de->d_name);
+                        unlinkat(dirfd(d), de->d_name, 0);
+                        continue;
+                }
+
+                fdset_remove(fds, fd);
+
+                r = stdout_stream_restore(s, de->d_name, fd);
+                if (r < 0)
+                        safe_close(fd);
+        }
 
-fail:
-        stdout_stream_free(stream);
         return 0;
+
+fail:
+        return log_error_errno(errno, "Failed to read streams directory: %m");
 }
 
-int server_open_stdout_socket(Server *s) {
+int server_open_stdout_socket(Server *s, FDSet *fds) {
         int r;
 
         assert(s);
@@ -465,5 +736,8 @@ int server_open_stdout_socket(Server *s) {
         if (r < 0)
                 return log_error_errno(r, "Failed to adjust priority of stdout server event source: %m");
 
+        /* Try to restore streams, but don't bother if this fails */
+        (void) server_restore_streams(s, fds);
+
         return 0;
 }
diff --git a/src/journal/journald-stream.h b/src/journal/journald-stream.h
index 8cad012..94bf955 100644
--- a/src/journal/journald-stream.h
+++ b/src/journal/journald-stream.h
@@ -21,8 +21,9 @@
   along with systemd; If not, see <http://www.gnu.org/licenses/>.
 ***/
 
+#include "fdset.h"
 #include "journald-server.h"
 
-int server_open_stdout_socket(Server *s);
+int server_open_stdout_socket(Server *s, FDSet *fds);
 
 void stdout_stream_free(StdoutStream *s);
diff --git a/units/systemd-journald.service.in b/units/systemd-journald.service.in
index 8d380c8..e3eea25 100644
--- a/units/systemd-journald.service.in
+++ b/units/systemd-journald.service.in
@@ -23,6 +23,7 @@ NotifyAccess=all
 StandardOutput=null
 CapabilityBoundingSet=CAP_SYS_ADMIN CAP_DAC_OVERRIDE CAP_SYS_PTRACE CAP_SYSLOG CAP_AUDIT_CONTROL CAP_AUDIT_READ CAP_CHOWN CAP_DAC_READ_SEARCH CAP_FOWNER CAP_SETUID CAP_SETGID CAP_MAC_OVERRIDE
 WatchdogSec=1min
+FileDescriptorStoreMax=1024
 
 # Increase the default a bit in order to allow many simultaneous
 # services being run since we keep one fd open per service.

commit a354329f724d6ce913d2ccffb2be8f3327a67faa
Author: Lennart Poettering <lennart at poettering.net>
Date:   Tue Jan 6 00:26:25 2015 +0100

    core: add new logic for services to store file descriptors in PID 1
    
    With this change it is possible to send file descriptors to PID 1, via
    sd_pid_notify_with_fds() which PID 1 will store individually for each
    service, and pass via the usual fd passing logic on next invocation.
    This is useful for enabling daemon reload schemes where daemons serialize
    their state to /run, push their fds into PID 1 and terminate, restoring
    their state on next start from the data in /run and passed in from PID
    1.
    
    The fds are kept by PID 1 as long as no POLLHUP or POLLERR is seen on
    them, and the service they belong to is either not dead or failed, or
    has a job queued.

diff --git a/Makefile-man.am b/Makefile-man.am
index 68e7483..8dc8feb 100644
--- a/Makefile-man.am
+++ b/Makefile-man.am
@@ -198,6 +198,9 @@ MANPAGES_ALIAS += \
 	man/sd_journal_wait.3 \
 	man/sd_machine_get_ifindices.3 \
 	man/sd_notifyf.3 \
+	man/sd_pid_notify.3 \
+	man/sd_pid_notify_with_fds.3 \
+	man/sd_pid_notifyf.3 \
 	man/sleep.conf.d.5 \
 	man/system.conf.d.5 \
 	man/systemd-ask-password-console.path.8 \
@@ -308,6 +311,9 @@ man/sd_journal_test_cursor.3: man/sd_journal_get_cursor.3
 man/sd_journal_wait.3: man/sd_journal_get_fd.3
 man/sd_machine_get_ifindices.3: man/sd_machine_get_class.3
 man/sd_notifyf.3: man/sd_notify.3
+man/sd_pid_notify.3: man/sd_notify.3
+man/sd_pid_notify_with_fds.3: man/sd_notify.3
+man/sd_pid_notifyf.3: man/sd_notify.3
 man/sleep.conf.d.5: man/systemd-sleep.conf.5
 man/system.conf.d.5: man/systemd-system.conf.5
 man/systemd-ask-password-console.path.8: man/systemd-ask-password-console.service.8
@@ -566,6 +572,15 @@ man/sd_machine_get_ifindices.html: man/sd_machine_get_class.html
 man/sd_notifyf.html: man/sd_notify.html
 	$(html-alias)
 
+man/sd_pid_notify.html: man/sd_notify.html
+	$(html-alias)
+
+man/sd_pid_notify_with_fds.html: man/sd_notify.html
+	$(html-alias)
+
+man/sd_pid_notifyf.html: man/sd_notify.html
+	$(html-alias)
+
 man/sleep.conf.d.html: man/systemd-sleep.conf.html
 	$(html-alias)
 
@@ -674,12 +689,6 @@ man/systemd-user.conf.html: man/systemd-system.conf.html
 man/user.conf.d.html: man/systemd-system.conf.html
 	$(html-alias)
 
-if ENABLE_HWDB
-MANPAGES += \
-        man/hwdb.7 \
-        man/systemd-hwdb.8
-
-endif
 
 if ENABLE_BACKLIGHT
 MANPAGES += \
@@ -750,6 +759,16 @@ man/systemd-hostnamed.html: man/systemd-hostnamed.service.html
 
 endif
 
+if ENABLE_HWDB
+MANPAGES += \
+	man/hwdb.7 \
+	man/systemd-hwdb.8
+MANPAGES_ALIAS += \
+	#
+
+
+endif
+
 if ENABLE_KDBUS
 MANPAGES += \
 	man/sd_bus_creds_get_pid.3 \
diff --git a/man/sd_listen_fds.xml b/man/sd_listen_fds.xml
index 6999db9..4377745 100644
--- a/man/sd_listen_fds.xml
+++ b/man/sd_listen_fds.xml
@@ -73,7 +73,7 @@
                 <para>If the <parameter>unset_environment</parameter>
                 parameter is non-zero,
                 <function>sd_listen_fds()</function> will unset the
-                <varname>$LISTEN_FDS</varname>/<varname>$LISTEN_PID</varname>
+                <varname>$LISTEN_FDS</varname> and <varname>$LISTEN_PID</varname>
                 environment variables before returning (regardless of
                 whether the function call itself succeeded or
                 not). Further calls to
@@ -83,10 +83,11 @@
 
                 <para>If a daemon receives more than one file
                 descriptor, they will be passed in the same order as
-                configured in the systemd socket definition
-                file. Nonetheless, it is recommended to verify the
-                correct socket types before using them. To simplify
-                this checking, the functions
+                configured in the systemd socket unit file (see
+                <citerefentry><refentrytitle>systemd.socket</refentrytitle><manvolnum>5</manvolnum></citerefentry>
+                for details). Nonetheless, it is recommended to verify
+                the correct socket types before using them. To
+                simplify this checking, the functions
                 <citerefentry><refentrytitle>sd_is_fifo</refentrytitle><manvolnum>3</manvolnum></citerefentry>,
                 <citerefentry><refentrytitle>sd_is_socket</refentrytitle><manvolnum>3</manvolnum></citerefentry>,
                 <citerefentry><refentrytitle>sd_is_socket_inet</refentrytitle><manvolnum>3</manvolnum></citerefentry>,
@@ -103,6 +104,16 @@
                 <para>This function call will set the FD_CLOEXEC flag
                 for all passed file descriptors to avoid further
                 inheritance to children of the calling process.</para>
+
+                <para>If multiple socket units activate the same
+                service the order of the file descriptors passed to
+                its main process is undefined. If additional file
+                descriptors have been passed to the service manager
+                using
+                <citerefentry><refentrytitle>sd_pid_notify_with_fds</refentrytitle><manvolnum>3</manvolnum></citerefentry>'s
+                <literal>FDSTORE=1</literal> messages, these file
+                descriptors are passed last, in arbitrary order, and
+                with duplicates removed.</para>
         </refsect1>
 
         <refsect1>
diff --git a/man/sd_notify.xml b/man/sd_notify.xml
index 35f6f71..2bf3383 100644
--- a/man/sd_notify.xml
+++ b/man/sd_notify.xml
@@ -46,6 +46,9 @@
         <refnamediv>
                 <refname>sd_notify</refname>
                 <refname>sd_notifyf</refname>
+                <refname>sd_pid_notify</refname>
+                <refname>sd_pid_notifyf</refname>
+                <refname>sd_pid_notify_with_fds</refname>
                 <refpurpose>Notify service manager about start-up completion and other service status changes</refpurpose>
         </refnamediv>
 
@@ -65,6 +68,30 @@
                                 <paramdef>const char *<parameter>format</parameter></paramdef>
                                 <paramdef>...</paramdef>
                         </funcprototype>
+
+                        <funcprototype>
+                                <funcdef>int <function>sd_pid_notify</function></funcdef>
+                                <paramdef>pid_t <parameter>pid</parameter></paramdef>
+                                <paramdef>int <parameter>unset_environment</parameter></paramdef>
+                                <paramdef>const char *<parameter>state</parameter></paramdef>
+                        </funcprototype>
+
+                        <funcprototype>
+                                <funcdef>int <function>sd_pid_notifyf</function></funcdef>
+                                <paramdef>pid_t <parameter>pid</parameter></paramdef>
+                                <paramdef>int <parameter>unset_environment</parameter></paramdef>
+                                <paramdef>const char *<parameter>format</parameter></paramdef>
+                                <paramdef>...</paramdef>
+                        </funcprototype>
+
+                        <funcprototype>
+                                <funcdef>int <function>sd_pid_notify_with_fds</function></funcdef>
+                                <paramdef>pid_t <parameter>pid</parameter></paramdef>
+                                <paramdef>int <parameter>unset_environment</parameter></paramdef>
+                                <paramdef>const char *<parameter>state</parameter></paramdef>
+                                <paramdef>const int *<parameter>fds</parameter></paramdef>
+                                <paramdef>unsigned <parameter>n_fds</parameter></paramdef>
+                        </funcprototype>
                 </funcsynopsis>
         </refsynopsisdiv>
 
@@ -175,7 +202,7 @@
                         <varlistentry>
                                 <term>MAINPID=...</term>
 
-                                <listitem><para>The main pid of the
+                                <listitem><para>The main process ID (PID) of the
                                 service, in case the service manager did
                                 not fork off the process
                                 itself. Example:
@@ -185,7 +212,7 @@
                         <varlistentry>
                                 <term>WATCHDOG=1</term>
 
-                                <listitem><para>Tells systemd to
+                                <listitem><para>Tells the service manager to
                                 update the watchdog timestamp. This is
                                 the keep-alive ping that services need
                                 to issue in regular intervals if
@@ -199,12 +226,53 @@
                                 check if the the watchdog is enabled.
                                 </para></listitem>
                         </varlistentry>
+
+
+                        <varlistentry>
+                                <term>FDSTORE=1</term>
+
+                                <listitem><para>Stores additional file
+                                descriptors in the service
+                                manager. File descriptors sent this
+                                way will be maintained per-service by
+                                the service manager and be passed
+                                again using the usual file descriptor
+                                passing logic on the next invocation
+                                of the service (see
+                                <citerefentry><refentrytitle>sd_listen_fds</refentrytitle><manvolnum>3</manvolnum></citerefentry>). This
+                                is useful for implementing service
+                                restart schemes where services
+                                serialize their state to
+                                <filename>/run</filename>, push their
+                                file descriptors to the system
+                                manager, and are then restarted,
+                                retrieving their state again via
+                                socket passing and
+                                <filename>/run</filename>. Note that
+                                the service manager will accept
+                                messages for a service only if
+                                <varname>FileDescriptorStoreMax=</varname>
+                                is set to non-zero for it (defaults to
+                                zero). See
+                                <citerefentry><refentrytitle>systemd.service</refentrytitle><manvolnum>5</manvolnum></citerefentry>
+                                for details. Multiple arrays of file
+                                descriptors may be sent in separate
+                                messages, in which case the arrays are
+                                combined. Note that the service
+                                manager removes duplicate file
+                                descriptors before passing them to the
+                                service. Use
+                                <function>sd_pid_notify_with_fds()</function>
+                                to send messages with
+                                <literal>FDSTORE=1</literal>, see
+                                below.</para></listitem>
+                        </varlistentry>
+
                 </variablelist>
 
                 <para>It is recommended to prefix variable names that
-                are not shown in the list above with
-                <varname>X_</varname> to avoid namespace
-                clashes.</para>
+                are not listed above with <varname>X_</varname> to
+                avoid namespace clashes.</para>
 
                 <para>Note that systemd will accept status data sent
                 from a service only if the
@@ -217,6 +285,36 @@
                 <function>sd_notify()</function> but takes a
                 <function>printf()</function>-like format string plus
                 arguments.</para>
+
+                <para><function>sd_pid_notify()</function> and
+                <function>sd_pid_notifyf()</function> are similar to
+                <function>sd_notify()</function> and
+                <function>sd_notifyf()</function> but take a process
+                ID (PID) to use as originating PID for the message as
+                first argument. This is useful to send notification
+                messages on behalf of other processes, provided the
+                appropriate privileges are available. If the PID
+                argument is specified as 0 the process ID of the
+                calling process is used, in which case the calls are
+                fully equivalent to <function>sd_notify()</function>
+                and <function>sd_notifyf()</function>.</para>
+
+                <para><function>sd_pid_notify_with_fds()</function> is
+                similar to <function>sd_pid_notify()</function> but
+                takes an additional array of file descriptors. These
+                file descriptors are sent along with the notification
+                message to the service manager. This is particularly
+                useful for sending <literal>FDSTORE=1</literal>
+                messages, as described above. The additional arguments
+                are a pointer to the file descriptor array plus the
+                number of file descriptors in the array. If the number
+                of file descriptors is passed as 0, the call is fully
+                equivalent to <function>sd_pid_notify()</function>,
+                i.e. no file descriptors are passed. Note that file
+                descriptors sent to the service manager on messages
+                that do not expect them (i.e. without
+                <literal>FDSTORE=1</literal>) are immediately
+                closed on reception.</para>
         </refsect1>
 
         <refsect1>
@@ -295,13 +393,25 @@
                 <example>
                         <title>Error Cause Notification</title>
 
-                        <para>A service could send the following shortly before exiting, on failure</para>
+                        <para>A service could send the following shortly before exiting, on failure:</para>
 
                         <programlisting>sd_notifyf(0, "STATUS=Failed to start up: %s\n"
               "ERRNO=%i",
               strerror(errno),
               errno);</programlisting>
                 </example>
+
+                <example>
+                        <title>Store a File Descriptor in the Service Manager</title>
+
+                        <para>To store an open file descriptor in the
+                        service manager, in order to continue
+                        operation after a service restart without
+                        losing state, use
+                        <literal>FDSTORE=1</literal>:</para>
+
+                        <programlisting>sd_pid_notify_with_fds(0, 0, "FDSTORE=1", &fd, 1);</programlisting>
+                </example>
         </refsect1>
 
         <refsect1>
diff --git a/man/systemd.service.xml b/man/systemd.service.xml
index 0b68aa0..4c890df 100644
--- a/man/systemd.service.xml
+++ b/man/systemd.service.xml
@@ -1117,6 +1117,35 @@
                                 command.</para></listitem>
                         </varlistentry>
 
+                        <varlistentry>
+                                <term><varname>FileDescriptorStoreMax=</varname></term>
+                                <listitem><para>Configure how many
+                                file descriptors may be stored in the
+                                service manager for the service using
+                                <citerefentry><refentrytitle>sd_pid_notify_with_fds</refentrytitle><manvolnum>3</manvolnum></citerefentry>'s
+                                <literal>FDSTORE=1</literal>
+                                messages. This is useful for
+                                implementing service restart schemes
+                                where the state is serialized to
+                                <filename>/run</filename> and the file
+                                descriptors passed to the service
+                                manager, to allow restarts without
+                                losing state. Defaults to 0, i.e. no
+                                file descriptors may be stored in the
+                                service manager by default. All file
+                                descriptors passed to the service
+                                manager from a specific service are
+                                passed back to the service's main
+                                process on the next service
+                                restart. Any file descriptors passed
+                                to the service manager are
+                                automatically closed when POLLHUP or
+                                POLLERR is seen on them, or when the
+                                service is fully stopped and no job is
+                                queued or being executed for
+                                it.</para></listitem>
+                        </varlistentry>
+
                 </variablelist>
 
                 <para>Check
diff --git a/src/core/dbus-service.c b/src/core/dbus-service.c
index 2b50ac9..6d4713b 100644
--- a/src/core/dbus-service.c
+++ b/src/core/dbus-service.c
@@ -59,6 +59,7 @@ const sd_bus_vtable bus_service_vtable[] = {
         SD_BUS_PROPERTY("MainPID", "u", bus_property_get_pid, offsetof(Service, main_pid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
         SD_BUS_PROPERTY("ControlPID", "u", bus_property_get_pid, offsetof(Service, control_pid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
         SD_BUS_PROPERTY("BusName", "s", NULL, offsetof(Service, bus_name), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("FileDescriptorStoreMax", "u", NULL, offsetof(Service, n_fd_store_max), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("StatusText", "s", NULL, offsetof(Service, status_text), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
         SD_BUS_PROPERTY("StatusErrno", "i", NULL, offsetof(Service, status_errno), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
         SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Service, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4
index e0ffaa6..9e87d91 100644
--- a/src/core/load-fragment-gperf.gperf.m4
+++ b/src/core/load-fragment-gperf.gperf.m4
@@ -226,6 +226,7 @@ Service.SuccessExitStatus,       config_parse_set_status,            0,
 Service.SysVStartPriority,       config_parse_warn_compat,           DISABLED_LEGACY,               0
 Service.NonBlocking,             config_parse_bool,                  0,                             offsetof(Service, exec_context.non_blocking)
 Service.BusName,                 config_parse_unit_string_printf,    0,                             offsetof(Service, bus_name)
+Service.FileDescriptorStoreMax,  config_parse_unsigned,              0,                             offsetof(Service, n_fd_store_max)
 Service.NotifyAccess,            config_parse_notify_access,         0,                             offsetof(Service, notify_access)
 Service.Sockets,                 config_parse_service_sockets,       0,                             0
 m4_ifdef(`ENABLE_KDBUS',
diff --git a/src/core/manager.c b/src/core/manager.c
index 519b374..c18312a 100644
--- a/src/core/manager.c
+++ b/src/core/manager.c
@@ -84,6 +84,9 @@
 #define JOBS_IN_PROGRESS_PERIOD_USEC (USEC_PER_SEC / 3)
 #define JOBS_IN_PROGRESS_PERIOD_DIVISOR 3
 
+#define NOTIFY_FD_MAX 768
+#define NOTIFY_BUFFER_MAX PIPE_BUF
+
 static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
 static int manager_dispatch_signal_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
 static int manager_dispatch_time_change_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
@@ -1449,7 +1452,7 @@ static unsigned manager_dispatch_dbus_queue(Manager *m) {
         return n;
 }
 
-static void manager_invoke_notify_message(Manager *m, Unit *u, pid_t pid, char *buf, size_t n) {
+static void manager_invoke_notify_message(Manager *m, Unit *u, pid_t pid, char *buf, size_t n, FDSet *fds) {
         _cleanup_strv_free_ char **tags = NULL;
 
         assert(m);
@@ -1466,12 +1469,13 @@ static void manager_invoke_notify_message(Manager *m, Unit *u, pid_t pid, char *
         log_unit_debug(u->id, "Got notification message for unit %s", u->id);
 
         if (UNIT_VTABLE(u)->notify_message)
-                UNIT_VTABLE(u)->notify_message(u, pid, tags);
+                UNIT_VTABLE(u)->notify_message(u, pid, tags, fds);
 }
 
 static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
         Manager *m = userdata;
         ssize_t n;
+        int r;
 
         assert(m);
         assert(m->notify_fd == fd);
@@ -1482,73 +1486,101 @@ static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t
         }
 
         for (;;) {
-                char buf[4096];
+                _cleanup_fdset_free_ FDSet *fds = NULL;
+                char buf[NOTIFY_BUFFER_MAX+1];
                 struct iovec iovec = {
                         .iov_base = buf,
                         .iov_len = sizeof(buf)-1,
                 };
-                bool found = false;
-
                 union {
                         struct cmsghdr cmsghdr;
-                        uint8_t buf[CMSG_SPACE(sizeof(struct ucred))];
+                        uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
+                                    CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)];
                 } control = {};
-
                 struct msghdr msghdr = {
                         .msg_iov = &iovec,
                         .msg_iovlen = 1,
                         .msg_control = &control,
                         .msg_controllen = sizeof(control),
                 };
-                struct ucred *ucred;
+                struct cmsghdr *cmsg;
+                struct ucred *ucred = NULL;
+                bool found = false;
                 Unit *u1, *u2, *u3;
+                int *fd_array = NULL;
+                unsigned n_fds = 0;
 
-                n = recvmsg(m->notify_fd, &msghdr, MSG_DONTWAIT);
-                if (n <= 0) {
-                        if (n == 0)
-                                return -EIO;
-
+                n = recvmsg(m->notify_fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
+                if (n < 0) {
                         if (errno == EAGAIN || errno == EINTR)
                                 break;
 
                         return -errno;
                 }
+                if (n == 0)
+                        return -ECONNRESET;
+
+                for (cmsg = CMSG_FIRSTHDR(&msghdr); cmsg; cmsg = CMSG_NXTHDR(&msghdr, cmsg)) {
+                        if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
+
+                                fd_array = (int*) CMSG_DATA(cmsg);
+                                n_fds = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
+
+                        } else if (cmsg->cmsg_level == SOL_SOCKET &&
+                                   cmsg->cmsg_type == SCM_CREDENTIALS &&
+                                   cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
 
-                if (msghdr.msg_controllen < CMSG_LEN(sizeof(struct ucred)) ||
-                    control.cmsghdr.cmsg_level != SOL_SOCKET ||
-                    control.cmsghdr.cmsg_type != SCM_CREDENTIALS ||
-                    control.cmsghdr.cmsg_len != CMSG_LEN(sizeof(struct ucred))) {
-                        log_warning("Received notify message without credentials. Ignoring.");
+                                ucred = (struct ucred*) CMSG_DATA(cmsg);
+                        }
+                }
+
+                if (n_fds > 0) {
+                        assert(fd_array);
+
+                        r = fdset_new_array(&fds, fd_array, n_fds);
+                        if (r < 0) {
+                                close_many(fd_array, n_fds);
+                                return log_oom();
+                        }
+                }
+
+                if (!ucred || ucred->pid <= 0) {
+                        log_warning("Received notify message without valid credentials. Ignoring.");
                         continue;
                 }
 
-                ucred = (struct ucred*) CMSG_DATA(&control.cmsghdr);
+                if ((size_t) n >= sizeof(buf)) {
+                        log_warning("Received notify message exceeded maximum size. Ignoring.");
+                        continue;
+                }
 
-                assert((size_t) n < sizeof(buf));
                 buf[n] = 0;
 
                 /* Notify every unit that might be interested, but try
                  * to avoid notifying the same one multiple times. */
                 u1 = manager_get_unit_by_pid(m, ucred->pid);
                 if (u1) {
-                        manager_invoke_notify_message(m, u1, ucred->pid, buf, n);
+                        manager_invoke_notify_message(m, u1, ucred->pid, buf, n, fds);
                         found = true;
                 }
 
                 u2 = hashmap_get(m->watch_pids1, LONG_TO_PTR(ucred->pid));
                 if (u2 && u2 != u1) {
-                        manager_invoke_notify_message(m, u2, ucred->pid, buf, n);
+                        manager_invoke_notify_message(m, u2, ucred->pid, buf, n, fds);
                         found = true;
                 }
 
                 u3 = hashmap_get(m->watch_pids2, LONG_TO_PTR(ucred->pid));
                 if (u3 && u3 != u2 && u3 != u1) {
-                        manager_invoke_notify_message(m, u3, ucred->pid, buf, n);
+                        manager_invoke_notify_message(m, u3, ucred->pid, buf, n, fds);
                         found = true;
                 }
 
                 if (!found)
                         log_warning("Cannot find unit for notify message of PID "PID_FMT".", ucred->pid);
+
+                if (fdset_size(fds) > 0)
+                        log_warning("Got auxiliary fds with notification message, closing all.");
         }
 
         return 0;
diff --git a/src/core/service.c b/src/core/service.c
index bfbe959..78232ee 100644
--- a/src/core/service.c
+++ b/src/core/service.c
@@ -242,6 +242,42 @@ static void service_reset_watchdog(Service *s) {
         service_start_watchdog(s);
 }
 
+static void service_fd_store_unlink(ServiceFDStore *fs) {
+
+        if (!fs)
+                return;
+
+        if (fs->service) {
+                assert(fs->service->n_fd_store > 0);
+                LIST_REMOVE(fd_store, fs->service->fd_store, fs);
+                fs->service->n_fd_store--;
+        }
+
+        if (fs->event_source) {
+                sd_event_source_set_enabled(fs->event_source, SD_EVENT_OFF);
+                sd_event_source_unref(fs->event_source);
+        }
+
+        safe_close(fs->fd);
+        free(fs);
+}
+
+static void service_release_resources(Unit *u) {
+        Service *s = SERVICE(u);
+
+        assert(s);
+
+        if (!s->fd_store)
+                return;
+
+        log_debug("Releasing all resources for %s", u->id);
+
+        while (s->fd_store)
+                service_fd_store_unlink(s->fd_store);
+
+        assert(s->n_fd_store == 0);
+}
+
 static void service_done(Unit *u) {
         Service *s = SERVICE(u);
 
@@ -286,6 +322,8 @@ static void service_done(Unit *u) {
         service_stop_watchdog(s);
 
         s->timer_event_source = sd_event_source_unref(s->timer_event_source);
+
+        service_release_resources(u);
 }
 
 static int service_arm_timer(Service *s, usec_t usec) {
@@ -549,6 +587,14 @@ static void service_dump(Unit *u, FILE *f, const char *prefix) {
         if (s->status_text)
                 fprintf(f, "%sStatus Text: %s\n",
                         prefix, s->status_text);
+
+        if (s->n_fd_store_max > 0) {
+                fprintf(f,
+                        "%sFile Descriptor Store Max: %u\n"
+                        "%sFile Descriptor Store Current: %u\n",
+                        prefix, s->n_fd_store_max,
+                        prefix, s->n_fd_store);
+        }
 }
 
 static int service_load_pid_file(Service *s, bool may_warn) {
@@ -806,10 +852,10 @@ static int service_coldplug(Unit *u) {
 }
 
 static int service_collect_fds(Service *s, int **fds, unsigned *n_fds) {
+        _cleanup_free_ int *rfds = NULL;
+        unsigned rn_fds = 0;
         Iterator i;
         int r;
-        int *rfds = NULL;
-        unsigned rn_fds = 0;
         Unit *u;
 
         assert(s);
@@ -831,10 +877,12 @@ static int service_collect_fds(Service *s, int **fds, unsigned *n_fds) {
 
                 r = socket_collect_fds(sock, &cfds, &cn_fds);
                 if (r < 0)
-                        goto fail;
+                        return r;
 
-                if (!cfds)
+                if (cn_fds <= 0) {
+                        free(cfds);
                         continue;
+                }
 
                 if (!rfds) {
                         rfds = cfds;
@@ -842,32 +890,39 @@ static int service_collect_fds(Service *s, int **fds, unsigned *n_fds) {
                 } else {
                         int *t;
 
-                        t = new(int, rn_fds+cn_fds);
+                        t = realloc(rfds, (rn_fds + cn_fds) * sizeof(int));
                         if (!t) {
                                 free(cfds);
-                                r = -ENOMEM;
-                                goto fail;
+                                return -ENOMEM;
                         }
 
-                        memcpy(t, rfds, rn_fds * sizeof(int));
-                        memcpy(t+rn_fds, cfds, cn_fds * sizeof(int));
-                        free(rfds);
+                        memcpy(t + rn_fds, cfds, cn_fds * sizeof(int));
+                        rfds = t;
+                        rn_fds += cn_fds;
+
                         free(cfds);
 
-                        rfds = t;
-                        rn_fds = rn_fds+cn_fds;
                 }
         }
 
+        if (s->n_fd_store > 0) {
+                ServiceFDStore *fs;
+                int *t;
+
+                t = realloc(rfds, (rn_fds + s->n_fd_store) * sizeof(int));
+                if (!t)
+                        return -ENOMEM;
+
+                rfds = t;
+                LIST_FOREACH(fd_store, fs, s->fd_store)
+                        rfds[rn_fds++] = fs->fd;
+        }
+
         *fds = rfds;
         *n_fds = rn_fds;
 
+        rfds = NULL;
         return 0;
-
-fail:
-        free(rfds);
-
-        return r;
 }
 
 static int service_spawn(
@@ -2543,7 +2598,75 @@ static int service_dispatch_watchdog(sd_event_source *source, usec_t usec, void
         return 0;
 }
 
-static void service_notify_message(Unit *u, pid_t pid, char **tags) {
+static int on_fd_store_io(sd_event_source *e, int fd, uint32_t revents, void *userdata) {
+        ServiceFDStore *fs = userdata;
+
+        assert(e);
+        assert(fs);
+
+        /* If we get either EPOLLHUP or EPOLLERR, it's time to remove this entry from the fd store */
+        service_fd_store_unlink(fs);
+        return 0;
+}
+
+static int service_add_fd_set(Service *s, FDSet *fds) {
+        int r;
+
+        assert(s);
+
+        if (fdset_size(fds) <= 0)
+                return 0;
+
+        while (s->n_fd_store < s->n_fd_store_max) {
+                _cleanup_close_ int fd = -1;
+                ServiceFDStore *fs;
+                bool same = false;
+
+                fd = fdset_steal_first(fds);
+                if (fd < 0)
+                        break;
+
+                LIST_FOREACH(fd_store, fs, s->fd_store) {
+                        r = same_fd(fs->fd, fd);
+                        if (r < 0)
+                                return log_unit_error_errno(UNIT(s)->id, r, "%s: Couldn't check if same fd: %m", UNIT(s)->id);
+                        if (r > 0) {
+                                same = true;
+                                break;
+                        }
+                }
+
+                if (same)
+                        continue;
+
+                fs = new0(ServiceFDStore, 1);
+                if (!fs)
+                        return log_oom();
+
+                fs->fd = fd;
+                fs->service = s;
+
+                r = sd_event_add_io(UNIT(s)->manager->event, &fs->event_source, fd, 0, on_fd_store_io, fs);
+                if (r < 0) {
+                        free(fs);
+                        return log_unit_error_errno(UNIT(s)->id, r, "%s: Failed to add even source: %m", UNIT(s)->id);
+                }
+
+                LIST_PREPEND(fd_store, s->fd_store, fs);
+                s->n_fd_store++;
+
+                fd = -1;
+
+                log_unit_debug(UNIT(s)->id, "%s: added fd to fd store.", UNIT(s)->id);
+        }
+
+        if (fdset_size(fds) > 0)
+                log_unit_warning(UNIT(s)->id, "%s: tried to store more fds than FDStoreMax=%u allows, closing remaining.", UNIT(s)->id, s->n_fd_store_max);
+
+        return 0;
+}
+
+static void service_notify_message(Unit *u, pid_t pid, char **tags, FDSet *fds) {
         Service *s = SERVICE(u);
         _cleanup_free_ char *cc = NULL;
         bool notify_dbus = false;
@@ -2675,6 +2798,12 @@ static void service_notify_message(Unit *u, pid_t pid, char **tags) {
                 service_reset_watchdog(s);
         }
 
+        /* Add the passed fds to the fd store */
+        if (strv_find(tags, "FDSTORE=1")) {
+                log_unit_debug(u->id, "%s: got FDSTORE=1", u->id);
+                service_add_fd_set(s, fds);
+        }
+
         /* Notify clients about changed status or main pid */
         if (notify_dbus)
                 unit_add_to_dbus_queue(u);
@@ -2917,6 +3046,7 @@ const UnitVTable service_vtable = {
         .init = service_init,
         .done = service_done,
         .load = service_load,
+        .release_resources = service_release_resources,
 
         .coldplug = service_coldplug,
 
diff --git a/src/core/service.h b/src/core/service.h
index f6a78c4..dfeee6a 100644
--- a/src/core/service.h
+++ b/src/core/service.h
@@ -22,6 +22,7 @@
 ***/
 
 typedef struct Service Service;
+typedef struct ServiceFDStore ServiceFDStore;
 
 #include "unit.h"
 #include "path.h"
@@ -115,6 +116,15 @@ typedef enum ServiceResult {
         _SERVICE_RESULT_INVALID = -1
 } ServiceResult;
 
+struct ServiceFDStore {
+        Service *service;
+
+        int fd;
+        sd_event_source *event_source;
+
+        LIST_FIELDS(ServiceFDStore, fd_store);
+};
+
 struct Service {
         Unit meta;
 
@@ -198,6 +208,10 @@ struct Service {
 
         NotifyAccess notify_access;
         NotifyState notify_state;
+
+        ServiceFDStore *fd_store;
+        unsigned n_fd_store;
+        unsigned n_fd_store_max;
 };
 
 extern const UnitVTable service_vtable;
diff --git a/src/core/unit.c b/src/core/unit.c
index 229bd0f..7311c58 100644
--- a/src/core/unit.c
+++ b/src/core/unit.c
@@ -278,21 +278,32 @@ int unit_set_description(Unit *u, const char *description) {
 }
 
 bool unit_check_gc(Unit *u) {
+        UnitActiveState state;
         assert(u);
 
-        if (UNIT_VTABLE(u)->no_gc)
+        if (u->job)
                 return true;
 
-        if (u->no_gc)
+        if (u->nop_job)
                 return true;
 
-        if (u->job)
+        state = unit_active_state(u);
+
+        /* If the unit is inactive or failed and no job is queued for
+         * it, then release its runtime resources */
+        if (UNIT_IS_INACTIVE_OR_FAILED(state) &&
+            UNIT_VTABLE(u)->release_resources)
+                UNIT_VTABLE(u)->release_resources(u);
+
+        /* But we keep the unit object around for longer when it is
+         * referenced or configured to not be gc'ed */
+        if (state != UNIT_INACTIVE)
                 return true;
 
-        if (u->nop_job)
+        if (UNIT_VTABLE(u)->no_gc)
                 return true;
 
-        if (unit_active_state(u) != UNIT_INACTIVE)
+        if (u->no_gc)
                 return true;
 
         if (u->refs)
diff --git a/src/core/unit.h b/src/core/unit.h
index 19fa2f0..53b8a7f 100644
--- a/src/core/unit.h
+++ b/src/core/unit.h
@@ -345,6 +345,10 @@ struct UnitVTable {
          * way */
         bool (*check_gc)(Unit *u);
 
+        /* When the unit is not running and no job for it is queued we
+         * shall release its runtime resources */
+        void (*release_resources)(Unit *u);
+
         /* Return true when this unit is suitable for snapshotting */
         bool (*check_snapshot)(Unit *u);
 
@@ -359,7 +363,7 @@ struct UnitVTable {
         void (*notify_cgroup_empty)(Unit *u);
 
         /* Called whenever a process of this unit sends us a message */
-        void (*notify_message)(Unit *u, pid_t pid, char **tags);
+        void (*notify_message)(Unit *u, pid_t pid, char **tags, FDSet *fds);
 
         /* Called whenever a name this Unit registered for comes or
          * goes away. */
diff --git a/src/libsystemd/libsystemd.sym.m4 b/src/libsystemd/libsystemd.sym.m4
index 80a61ba..19a49f4 100644
--- a/src/libsystemd/libsystemd.sym.m4
+++ b/src/libsystemd/libsystemd.sym.m4
@@ -158,6 +158,11 @@ global:
         sd_session_get_desktop;
 } LIBSYSTEMD_216;
 
+LIBSYSTEMD_219 {
+global:
+        sd_pid_notify_with_fds;
+} LIBSYSTEMD_217;
+
 m4_ifdef(`ENABLE_KDBUS',
 LIBSYSTEMD_FUTURE {
 global:
diff --git a/src/libsystemd/sd-daemon/sd-daemon.c b/src/libsystemd/sd-daemon/sd-daemon.c
index 1f2a533..028c2a7 100644
--- a/src/libsystemd/sd-daemon/sd-daemon.c
+++ b/src/libsystemd/sd-daemon/sd-daemon.c
@@ -340,16 +340,28 @@ _public_ int sd_is_mq(int fd, const char *path) {
         return 1;
 }
 
-_public_ int sd_pid_notify(pid_t pid, int unset_environment, const char *state) {
-        union sockaddr_union sockaddr = {};
-        _cleanup_close_ int fd = -1;
-        struct msghdr msghdr = {};
-        struct iovec iovec = {};
-        const char *e;
+_public_ int sd_pid_notify_with_fds(pid_t pid, int unset_environment, const char *state, const int *fds, unsigned n_fds) {
+        union sockaddr_union sockaddr = {
+                .sa.sa_family = AF_UNIX,
+        };
+        struct iovec iovec = {
+                .iov_base = (char*) state,
+        };
+        struct msghdr msghdr = {
+                .msg_iov = &iovec,
+                .msg_iovlen = 1,
+                .msg_name = &sockaddr,
+        };
         union {
                 struct cmsghdr cmsghdr;
-                uint8_t buf[CMSG_SPACE(sizeof(struct ucred))];
-        } control = {};
+                uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
+                            CMSG_SPACE(sizeof(int) * n_fds)];
+        } control;
+        _cleanup_close_ int fd = -1;
+        struct cmsghdr *cmsg = NULL;
+        const char *e;
+        size_t controllen_without_ucred = 0;
+        bool try_without_ucred = false;
         int r;
 
         if (!state) {
@@ -357,6 +369,11 @@ _public_ int sd_pid_notify(pid_t pid, int unset_environment, const char *state)
                 goto finish;
         }
 
+        if (n_fds > 0 && !fds) {
+                r = -EINVAL;
+                goto finish;
+        }
+
         e = getenv("NOTIFY_SOCKET");
         if (!e)
                 return 0;
@@ -373,42 +390,50 @@ _public_ int sd_pid_notify(pid_t pid, int unset_environment, const char *state)
                 goto finish;
         }
 
-        sockaddr.sa.sa_family = AF_UNIX;
-        strncpy(sockaddr.un.sun_path, e, sizeof(sockaddr.un.sun_path));
+        iovec.iov_len = strlen(state);
 
+        strncpy(sockaddr.un.sun_path, e, sizeof(sockaddr.un.sun_path));
         if (sockaddr.un.sun_path[0] == '@')
                 sockaddr.un.sun_path[0] = 0;
 
-        iovec.iov_base = (char*) state;
-        iovec.iov_len = strlen(state);
-
-        msghdr.msg_name = &sockaddr;
         msghdr.msg_namelen = offsetof(struct sockaddr_un, sun_path) + strlen(e);
-
         if (msghdr.msg_namelen > sizeof(struct sockaddr_un))
                 msghdr.msg_namelen = sizeof(struct sockaddr_un);
 
-        msghdr.msg_iov = &iovec;
-        msghdr.msg_iovlen = 1;
+        if (n_fds > 0) {
+                msghdr.msg_control = &control;
+                msghdr.msg_controllen = CMSG_LEN(sizeof(int) * n_fds);
+
+                cmsg = CMSG_FIRSTHDR(&msghdr);
+                cmsg->cmsg_level = SOL_SOCKET;
+                cmsg->cmsg_type = SCM_RIGHTS;
+                cmsg->cmsg_len = CMSG_LEN(sizeof(int) * n_fds);
+
+                memcpy(CMSG_DATA(cmsg), fds, sizeof(int) * n_fds);
+        }
 
         if (pid != 0 && pid != getpid()) {
-                struct cmsghdr *cmsg;
-                struct ucred ucred = {};
+                struct ucred *ucred;
+
+                try_without_ucred = true;
+                controllen_without_ucred = msghdr.msg_controllen;
 
                 msghdr.msg_control = &control;
-                msghdr.msg_controllen = sizeof(control);
+                msghdr.msg_controllen += CMSG_LEN(sizeof(struct ucred));
+
+                if (cmsg)
+                        cmsg = CMSG_NXTHDR(&msghdr, cmsg);
+                else
+                        cmsg = CMSG_FIRSTHDR(&msghdr);
 
-                cmsg = CMSG_FIRSTHDR(&msghdr);
                 cmsg->cmsg_level = SOL_SOCKET;
                 cmsg->cmsg_type = SCM_CREDENTIALS;
                 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
 
-                ucred.pid = pid;
-                ucred.uid = getuid();
-                ucred.gid = getgid();
-
-                memcpy(CMSG_DATA(cmsg), &ucred, sizeof(struct ucred));
-                msghdr.msg_controllen = cmsg->cmsg_len;
+                ucred = (struct ucred*) CMSG_DATA(cmsg);
+                ucred->pid = pid;
+                ucred->uid = getuid();
+                ucred->gid = getgid();
         }
 
         /* First try with fake ucred data, as requested */
@@ -417,10 +442,11 @@ _public_ int sd_pid_notify(pid_t pid, int unset_environment, const char *state)
                 goto finish;
         }
 
-        /* If that failed, try with our own instead */
-        if (msghdr.msg_control) {
-                msghdr.msg_control = NULL;
-                msghdr.msg_controllen = 0;
+        /* If that failed, try with our own ucred instead */
+        if (try_without_ucred) {
+                if (controllen_without_ucred <= 0)
+                        msghdr.msg_control = NULL;
+                msghdr.msg_controllen = controllen_without_ucred;
 
                 if (sendmsg(fd, &msghdr, MSG_NOSIGNAL) >= 0) {
                         r = 1;
@@ -437,8 +463,12 @@ finish:
         return r;
 }
 
+_public_ int sd_pid_notify(pid_t pid, int unset_environment, const char *state) {
+        return sd_pid_notify_with_fds(pid, unset_environment, state, NULL, 0); /* same call, with an empty fd array */
+}
+
 _public_ int sd_notify(int unset_environment, const char *state) {
-        return sd_pid_notify(0, unset_environment, state);
+        return sd_pid_notify_with_fds(0, unset_environment, state, NULL, 0); /* pid 0 and no fds: no ancillary data gets attached */
 }
 
 _public_ int sd_pid_notifyf(pid_t pid, int unset_environment, const char *format, ...) {
diff --git a/src/shared/fdset.c b/src/shared/fdset.c
index 46f7773..9e35ce5 100644
--- a/src/shared/fdset.c
+++ b/src/shared/fdset.c
@@ -41,7 +41,31 @@ FDSet *fdset_new(void) {
         return MAKE_FDSET(set_new(NULL));
 }
 
-void fdset_free(FDSet *s) {
+int fdset_new_array(FDSet **ret, int *fds, unsigned n_fds) {
+        FDSet *set;
+        unsigned k;
+
+        assert(ret);
+
+        set = fdset_new();
+        if (!set)
+                return -ENOMEM;
+
+        /* Add the fds in order; *ret is written only after every one of them has been stored. */
+        for (k = 0; k < n_fds; k++) {
+                int q = fdset_put(set, fds[k]);
+
+                if (q >= 0)
+                        continue;
+                set_free(MAKE_SET(set));
+                return q;
+        }
+
+        *ret = set;
+        return 0;
+}
+
+FDSet* fdset_free(FDSet *s) {
         void *p;
 
         while ((p = set_steal_first(MAKE_SET(s)))) {
@@ -61,6 +85,7 @@ void fdset_free(FDSet *s) {
         }
 
         set_free(MAKE_SET(s));
+        return NULL;
 }
 
 int fdset_put(FDSet *s, int fd) {
@@ -70,6 +95,19 @@ int fdset_put(FDSet *s, int fd) {
         return set_put(MAKE_SET(s), FD_TO_PTR(fd));
 }
 
+int fdset_consume(FDSet *s, int fd) {
+        int r;
+
+        assert(s);
+        assert(fd >= 0);
+        /* Takes ownership of fd: it either ends up in the set, or is closed here if storing it fails. */
+        r = fdset_put(s, fd);
+        if (r < 0) /* 0 is not an error (presumably: fd is already stored), closing would leave a stale entry */
+                safe_close(fd);
+
+        return r;
+}
+
 int fdset_put_dup(FDSet *s, int fd) {
         int copy, r;
 
@@ -223,6 +261,10 @@ unsigned fdset_size(FDSet *fds) {
         return set_size(MAKE_SET(fds));
 }
 
+bool fdset_isempty(FDSet *fds) {
+        return set_isempty(MAKE_SET(fds)); /* forwards the question to the Set behind the FDSet */
+}
+
 int fdset_iterate(FDSet *s, Iterator *i) {
         void *p;
 
@@ -232,3 +274,13 @@ int fdset_iterate(FDSet *s, Iterator *i) {
 
         return PTR_TO_FD(p);
 }
+
+int fdset_steal_first(FDSet *fds) {
+        void *entry = set_steal_first(MAKE_SET(fds));
+
+        /* A NULL entry means there was nothing to take out of the set. */
+        if (entry)
+                return PTR_TO_FD(entry);
+
+        return -ENOENT;
+}
diff --git a/src/shared/fdset.h b/src/shared/fdset.h
index 907acd7..c3c5e52 100644
--- a/src/shared/fdset.h
+++ b/src/shared/fdset.h
@@ -27,25 +27,30 @@
 typedef struct FDSet FDSet;
 
 FDSet* fdset_new(void);
-void fdset_free(FDSet *s);
+FDSet* fdset_free(FDSet *s);
 
 int fdset_put(FDSet *s, int fd);
 int fdset_put_dup(FDSet *s, int fd);
+int fdset_consume(FDSet *s, int fd);
 
 bool fdset_contains(FDSet *s, int fd);
 int fdset_remove(FDSet *s, int fd);
 
-int fdset_new_fill(FDSet **_s);
-int fdset_new_listen_fds(FDSet **_s, bool unset);
+int fdset_new_array(FDSet **ret, int *fds, unsigned n_fds);
+int fdset_new_fill(FDSet **ret);
+int fdset_new_listen_fds(FDSet **ret, bool unset);
 
 int fdset_cloexec(FDSet *fds, bool b);
 
 int fdset_close_others(FDSet *fds);
 
 unsigned fdset_size(FDSet *fds);
+bool fdset_isempty(FDSet *fds);
 
 int fdset_iterate(FDSet *s, Iterator *i);
 
+int fdset_steal_first(FDSet *fds);
+
 #define FDSET_FOREACH(fd, fds, i) \
         for ((i) = ITERATOR_FIRST, (fd) = fdset_iterate((fds), &(i)); (fd) >= 0; (fd) = fdset_iterate((fds), &(i)))
 
diff --git a/src/shared/util.c b/src/shared/util.c
index bda3c93..f01022e 100644
--- a/src/shared/util.c
+++ b/src/shared/util.c
@@ -7672,3 +7672,28 @@ int fd_setcrtime(int fd, usec_t usec) {
 
         return 0;
 }
+
+int same_fd(int a, int b) {
+        struct stat first, second;
+        mode_t kind;
+
+        assert(a >= 0);
+        assert(b >= 0);
+
+        /* Identical descriptor numbers trivially match, no need to stat anything. */
+        if (a == b)
+                return true;
+
+        if (fstat(a, &first) < 0 || fstat(b, &second) < 0)
+                return -errno;
+
+        kind = first.st_mode & S_IFMT;
+        if (kind != (second.st_mode & S_IFMT))
+                return false;
+
+        /* Only these file types are compared by device and inode; anything else (device fds) never matches. */
+        if (!S_ISREG(kind) && !S_ISDIR(kind) && !S_ISFIFO(kind) && !S_ISSOCK(kind) && !S_ISLNK(kind))
+                return false;
+
+        return first.st_dev == second.st_dev && first.st_ino == second.st_ino;
+}
diff --git a/src/shared/util.h b/src/shared/util.h
index a131a3c..4b7e12e 100644
--- a/src/shared/util.h
+++ b/src/shared/util.h
@@ -1069,3 +1069,5 @@ int fd_setcrtime(int fd, usec_t usec);
 int fd_getcrtime(int fd, usec_t *usec);
 int path_getcrtime(const char *p, usec_t *usec);
 int fd_getcrtime_at(int dirfd, const char *name, usec_t *usec, int flags);
+
+int same_fd(int a, int b);
diff --git a/src/systemd/sd-daemon.h b/src/systemd/sd-daemon.h
index 351b4e5..b878b4d 100644
--- a/src/systemd/sd-daemon.h
+++ b/src/systemd/sd-daemon.h
@@ -190,6 +190,12 @@ int sd_is_mq(int fd, const char *path);
                   timestamps to detect failed services. Also see
                   sd_watchdog_enabled() below.
 
+     FDSTORE=1    Store the file descriptors passed along with the
+                  message in the per-service file descriptor store,
+                  and pass them to the main process again on the next
+                  invocation. This variable is only supported with
+                  sd_pid_notify_with_fds().
+
   Daemons can choose to send additional variables. However, it is
   recommended to prefix variable names not listed above with X_.
 
@@ -243,6 +249,13 @@ int sd_pid_notify(pid_t pid, int unset_environment, const char *state);
 int sd_pid_notifyf(pid_t pid, int unset_environment, const char *format, ...) _sd_printf_(3,4);
 
 /*
+  Similar to sd_pid_notify(), but also passes the specified fd array
+  to the service manager for storage. This is particularly useful for
+  FDSTORE=1 messages.
+*/
+int sd_pid_notify_with_fds(pid_t pid, int unset_environment, const char *state, const int *fds, unsigned n_fds);
+
+/*
   Returns > 0 if the system was booted with systemd. Returns < 0 on
   error. Returns 0 if the system was not booted with systemd. Note
   that all of the functions above handle non-systemd boots just



More information about the systemd-commits mailing list