[systemd-commits] 3 commits - CODING_STYLE TODO src/journal

Lennart Poettering lennart at kemper.freedesktop.org
Thu Oct 30 09:43:27 PDT 2014


 CODING_STYLE                  |    5 +++
 TODO                          |    4 --
 src/journal/journal-send.c    |   36 ++++++++++++++++------
 src/journal/journald-native.c |   68 ++++++++++++++++++++++++++++++------------
 4 files changed, 81 insertions(+), 32 deletions(-)

New commits:
commit f6721176284fe49b1beb146c70379cd1f2b17852
Author: Lennart Poettering <lennart at poettering.net>
Date:   Thu Oct 30 17:39:29 2014 +0100

    update TODO

diff --git a/TODO b/TODO
index d11d73f..c86d6ab 100644
--- a/TODO
+++ b/TODO
@@ -60,8 +60,6 @@ Features:
 
 * PID 1 doesn't apply nspawns devices cgroup policy
 
-* rework journal-send.c to use memfds for large blobs if they are available instead of unlinked files in /tmp. Also, if we detect that the kernel knows memfds, refuse anything but sealed memfds.
-
 * maybe support a new very "soft" reboot mode, that simply kills all processes, disassembles everything, flushes /run and sysvipc, and then reexecs systemd again
 
 * man: document that corrupted journal files is nothing to act on
@@ -71,8 +69,6 @@ Features:
   cannot pass into sendmsg() of the AF_UNIX sokcet (which only accepts
   253 messages)
 
-* busctl: add a pcap writer, using LINKTYPE_DBUS/231
-
 * man: maybe use the word "inspect" rather than "introspect"?
 
 * introduce machines.target to order after all nspawn instances

commit c79e98eadd3056a36a662699fa650db5b1bca0c3
Author: Lennart Poettering <lennart at poettering.net>
Date:   Thu Oct 30 17:36:02 2014 +0100

    journal: when sending huge log messages prefer memfds over temporary files in /dev/shm
    
    Previously when a log message grew beyond the maximum AF_UNIX/SOCK_DGRAM
    datagram limit we'd send an fd to a deleted file in /dev/shm instead.
    Because the sender could still modify the file after delivery we had to
    immediately copy the data on the receiving side.
    
    With memfds we can optimize this logic, and also remove the dependency
    on /dev/shm: simply send a sealed memfd around, and if we detect the
    seal memory map the fd and use it directly.

diff --git a/src/journal/journal-send.c b/src/journal/journal-send.c
index bb1ef66..bfc404e 100644
--- a/src/journal/journal-send.c
+++ b/src/journal/journal-send.c
@@ -198,7 +198,7 @@ finish:
 
 _public_ int sd_journal_sendv(const struct iovec *iov, int n) {
         PROTECT_ERRNO;
-        int fd;
+        int fd, r;
         _cleanup_close_ int buffer_fd = -1;
         struct iovec *w;
         uint64_t *l;
@@ -218,6 +218,7 @@ _public_ int sd_journal_sendv(const struct iovec *iov, int n) {
         } control;
         struct cmsghdr *cmsg;
         bool have_syslog_identifier = false;
+        bool seal = true;
 
         assert_return(iov, -EINVAL);
         assert_return(n > 0, -EINVAL);
@@ -304,21 +305,36 @@ _public_ int sd_journal_sendv(const struct iovec *iov, int n) {
         if (errno != EMSGSIZE && errno != ENOBUFS)
                 return -errno;
 
-        /* Message doesn't fit... Let's dump the data in a temporary
-         * file and just pass a file descriptor of it to the other
-         * side.
+        /* Message doesn't fit... Let's dump the data in a memfd or
+         * temporary file and just pass a file descriptor of it to the
+         * other side.
          *
-         * We use /dev/shm instead of /tmp here, since we want this to
-         * be a tmpfs, and one that is available from early boot on
-         * and where unprivileged users can create files. */
-        buffer_fd = open_tmpfile("/dev/shm", O_RDWR | O_CLOEXEC);
-        if (buffer_fd < 0)
-                return buffer_fd;
+         * For the temporary files we use /dev/shm instead of /tmp
+         * here, since we want this to be a tmpfs, and one that is
+         * available from early boot on and where unprivileged users
+         * can create files. */
+        buffer_fd = memfd_create("journal-message", MFD_ALLOW_SEALING | MFD_CLOEXEC);
+        if (buffer_fd < 0) {
+                if (errno == ENOSYS) {
+                        buffer_fd = open_tmpfile("/dev/shm", O_RDWR | O_CLOEXEC);
+                        if (buffer_fd < 0)
+                                return buffer_fd;
+
+                        seal = false;
+                } else
+                        return -errno;
+        }
 
         n = writev(buffer_fd, w, j);
         if (n < 0)
                 return -errno;
 
+        if (seal) {
+                r = fcntl(buffer_fd, F_ADD_SEALS, F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE | F_SEAL_SEAL);
+                if (r < 0)
+                        return -errno;
+        }
+
         mh.msg_iov = NULL;
         mh.msg_iovlen = 0;
 
diff --git a/src/journal/journald-native.c b/src/journal/journald-native.c
index a635202..6a9ae8a 100644
--- a/src/journal/journald-native.c
+++ b/src/journal/journald-native.c
@@ -22,6 +22,7 @@
 #include <unistd.h>
 #include <stddef.h>
 #include <sys/epoll.h>
+#include <sys/mman.h>
 
 #include "socket-util.h"
 #include "path-util.h"
@@ -306,17 +307,29 @@ void server_process_native_file(
                 const char *label, size_t label_len) {
 
         struct stat st;
-        _cleanup_free_ void *p = NULL;
-        ssize_t n;
+        bool sealed;
         int r;
 
+        /* Data is in the passed fd, since it didn't fit in a
+         * datagram. */
+
         assert(s);
         assert(fd >= 0);
 
-        if (!ucred || ucred->uid != 0) {
+        /* If it's a memfd, check if it is sealed. If so, we can just
+         * use map it and use it, and do not need to copy the data
+         * out. */
+        r = fcntl(fd, F_GET_SEALS);
+        sealed = r >= 0 &&
+                (r & (F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_WRITE|F_SEAL_SEAL)) == (F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_WRITE|F_SEAL_SEAL);
+
+        if (!sealed && (!ucred || ucred->uid != 0)) {
                 _cleanup_free_ char *sl = NULL, *k = NULL;
                 const char *e;
 
+                /* If this is not a sealed memfd, and the peer is unknown or
+                 * unprivileged, then verify the path. */
+
                 if (asprintf(&sl, "/proc/self/fd/%i", fd) < 0) {
                         log_oom();
                         return;
@@ -344,11 +357,6 @@ void server_process_native_file(
                 }
         }
 
-        /* Data is in the passed file, since it didn't fit in a
-         * datagram. We can't map the file here, since clients might
-         * then truncate it and trigger a SIGBUS for us. So let's
-         * stupidly read it */
-
         if (fstat(fd, &st) < 0) {
                 log_error("Failed to stat passed file, ignoring: %m");
                 return;
@@ -367,17 +375,41 @@ void server_process_native_file(
                 return;
         }
 
-        p = malloc(st.st_size);
-        if (!p) {
-                log_oom();
-                return;
-        }
+        if (sealed) {
+                void *p;
+                size_t ps;
+
+                /* The file is sealed, we can just map it and use it. */
 
-        n = pread(fd, p, st.st_size, 0);
-        if (n < 0)
-                log_error("Failed to read file, ignoring: %s", strerror(-n));
-        else if (n > 0)
-                server_process_native_message(s, p, n, ucred, tv, label, label_len);
+                ps = PAGE_ALIGN(st.st_size);
+                p = mmap(NULL, ps, PROT_READ, MAP_PRIVATE, fd, 0);
+                if (p == MAP_FAILED) {
+                        log_error("Failed to map memfd, ignoring: %m");
+                        return;
+                }
+
+                server_process_native_message(s, p, st.st_size, ucred, tv, label, label_len);
+                assert_se(munmap(p, ps) >= 0);
+        } else {
+                _cleanup_free_ void *p = NULL;
+                ssize_t n;
+
+                /* The file is not sealed, we can't map the file here, since
+                 * clients might then truncate it and trigger a SIGBUS for
+                 * us. So let's stupidly read it */
+
+                p = malloc(st.st_size);
+                if (!p) {
+                        log_oom();
+                        return;
+                }
+
+                n = pread(fd, p, st.st_size, 0);
+                if (n < 0)
+                        log_error("Failed to read file, ignoring: %s", strerror(-n));
+                else if (n > 0)
+                        server_process_native_message(s, p, n, ucred, tv, label, label_len);
+        }
 }
 
 int server_open_native_socket(Server*s) {

commit dd4540da0e1f983540d862cc657df7161a3bdd06
Author: Lennart Poettering <lennart at poettering.net>
Date:   Thu Oct 30 17:05:25 2014 +0100

    CODING_STYLE: clarify that we really should use O_CLOEXEC everywhere

diff --git a/CODING_STYLE b/CODING_STYLE
index 4439ee6..0b1f809 100644
--- a/CODING_STYLE
+++ b/CODING_STYLE
@@ -190,3 +190,8 @@
 - Do not write functions that clobber call-by-reference variables on
   failure. Use temporary variables for these cases and change the
   passed in variables only on success.
+
+- When you allocate a file descriptor, it should be made O_CLOEXEC
+  right from the beginning, as none of our files should leak to forked
+  binaries by default. Hence, whenever you open a file, O_CLOEXEC must
+  be specified, right from the beginning.



More information about the systemd-commits mailing list