[systemd-commits] 11 commits - .gitignore Makefile.am TODO src/bus-proxyd src/core src/journal src/journal-remote src/libsystemd src/machine src/nspawn src/shared src/test
Lennart Poettering
lennart at kemper.freedesktop.org
Sun Jan 4 16:42:32 PST 2015
.gitignore | 1
Makefile.am | 11 ++
TODO | 1
src/bus-proxyd/bus-proxyd.c | 4
src/core/cgroup.c | 38 ++++++--
src/core/mount-setup.c | 41 +--------
src/journal-remote/journal-gatewayd.c | 49 +++++-----
src/journal-remote/journal-upload.c | 12 +-
src/journal/coredumpctl.c | 25 ++---
src/journal/journal-file.c | 100 ++++++++++++++++------
src/journal/journal-file.h | 2
src/journal/journalctl.c | 2
src/journal/journald-audit.c | 2
src/journal/journald-native.c | 2
src/journal/journald-server.c | 35 +++++--
src/journal/journald-server.h | 3
src/journal/journald-syslog.c | 2
src/journal/journald.c | 4
src/journal/mmap-cache.c | 126 ++++++++++++++++++++++++++--
src/journal/mmap-cache.h | 5 -
src/libsystemd/sd-bus/bus-util.c | 8 +
src/libsystemd/sd-bus/sd-bus.c | 4
src/machine/machine-dbus.c | 4
src/nspawn/nspawn.c | 130 +++++++++++++++++++++++++++--
src/shared/cgroup-util.c | 61 ++++++++++++-
src/shared/cgroup-util.h | 2
src/shared/sigbus.c | 152 ++++++++++++++++++++++++++++++++++
src/shared/sigbus.h | 25 +++++
src/test/test-sigbus.c | 62 +++++++++++++
29 files changed, 752 insertions(+), 161 deletions(-)
New commits:
commit 805d14864f8d6936202b85730a7d8a77317d1202
Author: Lennart Poettering <lennart at poettering.net>
Date: Mon Jan 5 01:20:44 2015 +0100
journald: add some additional checks before we divide by values read from journal file headers
Since the file headers might now be replaced by zeroed pages after a
SIGBUS, we must make sure we don't end up dividing by zero: re-validate
values read from journal file headers before using them as divisors.
diff --git a/src/journal/journal-file.c b/src/journal/journal-file.c
index 44a9692..1736ff5 100644
--- a/src/journal/journal-file.c
+++ b/src/journal/journal-file.c
@@ -658,7 +658,7 @@ static int journal_file_link_field(
uint64_t offset,
uint64_t hash) {
- uint64_t p, h;
+ uint64_t p, h, m;
int r;
assert(f);
@@ -668,11 +668,14 @@ static int journal_file_link_field(
if (o->object.type != OBJECT_FIELD)
return -EINVAL;
- /* This might alter the window we are looking at */
+ m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
+ if (m <= 0)
+ return -EBADMSG;
+ /* This might alter the window we are looking at */
o->field.next_hash_offset = o->field.head_data_offset = 0;
- h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
+ h = hash % m;
p = le64toh(f->field_hash_table[h].tail_hash_offset);
if (p == 0)
f->field_hash_table[h].head_hash_offset = htole64(offset);
@@ -698,7 +701,7 @@ static int journal_file_link_data(
uint64_t offset,
uint64_t hash) {
- uint64_t p, h;
+ uint64_t p, h, m;
int r;
assert(f);
@@ -708,13 +711,16 @@ static int journal_file_link_data(
if (o->object.type != OBJECT_DATA)
return -EINVAL;
- /* This might alter the window we are looking at */
+ m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
+ if (m <= 0)
+ return -EBADMSG;
+ /* This might alter the window we are looking at */
o->data.next_hash_offset = o->data.next_field_offset = 0;
o->data.entry_offset = o->data.entry_array_offset = 0;
o->data.n_entries = 0;
- h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
+ h = hash % m;
p = le64toh(f->data_hash_table[h].tail_hash_offset);
if (p == 0)
/* Only entry in the hash table is easy */
@@ -743,7 +749,7 @@ int journal_file_find_field_object_with_hash(
const void *field, uint64_t size, uint64_t hash,
Object **ret, uint64_t *offset) {
- uint64_t p, osize, h;
+ uint64_t p, osize, h, m;
int r;
assert(f);
@@ -751,10 +757,12 @@ int journal_file_find_field_object_with_hash(
osize = offsetof(Object, field.payload) + size;
- if (f->header->field_hash_table_size == 0)
+ m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
+
+ if (m <= 0)
return -EBADMSG;
- h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
+ h = hash % m;
p = le64toh(f->field_hash_table[h].head_hash_offset);
while (p > 0) {
@@ -804,7 +812,7 @@ int journal_file_find_data_object_with_hash(
const void *data, uint64_t size, uint64_t hash,
Object **ret, uint64_t *offset) {
- uint64_t p, osize, h;
+ uint64_t p, osize, h, m;
int r;
assert(f);
@@ -812,10 +820,11 @@ int journal_file_find_data_object_with_hash(
osize = offsetof(Object, data.payload) + size;
- if (f->header->data_hash_table_size == 0)
+ m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
+ if (m <= 0)
return -EBADMSG;
- h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
+ h = hash % m;
p = le64toh(f->data_hash_table[h].head_hash_offset);
while (p > 0) {
commit 3774cf57081b595003e9236602d049b1011b6e97
Author: Lennart Poettering <lennart at poettering.net>
Date: Mon Jan 5 01:08:51 2015 +0100
journalctl: static variables immediately configured via command line arguments should be prefixed with "arg_"
diff --git a/src/journal/coredumpctl.c b/src/journal/coredumpctl.c
index 8ebca4f..756e793 100644
--- a/src/journal/coredumpctl.c
+++ b/src/journal/coredumpctl.c
@@ -50,8 +50,7 @@ static const char* arg_field = NULL;
static int arg_no_pager = false;
static int arg_no_legend = false;
static int arg_one = false;
-
-static FILE* output = NULL;
+static FILE* arg_output = NULL;
static Set *new_matches(void) {
Set *set;
@@ -184,13 +183,13 @@ static int parse_argv(int argc, char *argv[], Set *matches) {
break;
case 'o':
- if (output) {
+ if (arg_output) {
log_error("cannot set output more than once");
return -EINVAL;
}
- output = fopen(optarg, "we");
- if (!output)
+ arg_output = fopen(optarg, "we");
+ if (!arg_output)
return log_error_errno(errno, "writing to '%s': %m", optarg);
break;
@@ -687,14 +686,14 @@ static int dump_core(sd_journal* j) {
if (r < 0)
return r;
- print_info(output ? stdout : stderr, j, false);
+ print_info(arg_output ? stdout : stderr, j, false);
- if (on_tty() && !output) {
+ if (on_tty() && !arg_output) {
log_error("Refusing to dump core to tty.");
return -ENOTTY;
}
- r = save_core(j, output ? fileno(output) : STDOUT_FILENO, NULL, NULL);
+ r = save_core(j, arg_output ? fileno(arg_output) : STDOUT_FILENO, NULL, NULL);
if (r < 0)
return log_error_errno(r, "Coredump retrieval failed: %m");
@@ -855,8 +854,8 @@ int main(int argc, char *argv[]) {
end:
pager_close();
- if (output)
- fclose(output);
+ if (arg_output)
+ fclose(arg_output);
return r >= 0 ? r : EXIT_FAILURE;
}
commit 2cf4172a71860c6e44edd27a3b68047ae062d7fc
Author: Lennart Poettering <lennart at poettering.net>
Date: Mon Jan 5 00:52:47 2015 +0100
journal: install sigbus handler for journal tools too
This makes them robust regarding truncation. Ideally, we'd export this
as an API, but given how messy SIGBUS handling is, and the uncertain
ownership logic of signal handlers, we should not do this (unless libc
one day invents a scheme for sanely installing SIGBUS handlers for
specific memory areas only). However, for now we can still make all our
own tools robust.
Note that external tools will only have read-access to the journal
anyway, where SIGBUS is much more unlikely, given that only writes are
subject to disk full problems.
diff --git a/src/journal-remote/journal-gatewayd.c b/src/journal-remote/journal-gatewayd.c
index 7a99430..576f7ca 100644
--- a/src/journal-remote/journal-gatewayd.c
+++ b/src/journal-remote/journal-gatewayd.c
@@ -31,20 +31,21 @@
#include <gnutls/gnutls.h>
#endif
-#include "log.h"
-#include "util.h"
#include "sd-journal.h"
#include "sd-daemon.h"
#include "sd-bus.h"
+#include "log.h"
+#include "util.h"
#include "bus-util.h"
#include "logs-show.h"
#include "microhttpd-util.h"
#include "build.h"
#include "fileio.h"
+#include "sigbus.h"
-static char *key_pem = NULL;
-static char *cert_pem = NULL;
-static char *trust_pem = NULL;
+static char *arg_key_pem = NULL;
+static char *arg_cert_pem = NULL;
+static char *arg_trust_pem = NULL;
typedef struct RequestMeta {
sd_journal *journal;
@@ -833,7 +834,7 @@ static int request_handler(
return MHD_YES;
}
- if (trust_pem) {
+ if (arg_trust_pem) {
r = check_permissions(connection, &code, NULL);
if (r < 0)
return code;
@@ -904,37 +905,37 @@ static int parse_argv(int argc, char *argv[]) {
return 0;
case ARG_KEY:
- if (key_pem) {
+ if (arg_key_pem) {
log_error("Key file specified twice");
return -EINVAL;
}
- r = read_full_file(optarg, &key_pem, NULL);
+ r = read_full_file(optarg, &arg_key_pem, NULL);
if (r < 0)
return log_error_errno(r, "Failed to read key file: %m");
- assert(key_pem);
+ assert(arg_key_pem);
break;
case ARG_CERT:
- if (cert_pem) {
+ if (arg_cert_pem) {
log_error("Certificate file specified twice");
return -EINVAL;
}
- r = read_full_file(optarg, &cert_pem, NULL);
+ r = read_full_file(optarg, &arg_cert_pem, NULL);
if (r < 0)
return log_error_errno(r, "Failed to read certificate file: %m");
- assert(cert_pem);
+ assert(arg_cert_pem);
break;
case ARG_TRUST:
#ifdef HAVE_GNUTLS
- if (trust_pem) {
+ if (arg_trust_pem) {
log_error("CA certificate file specified twice");
return -EINVAL;
}
- r = read_full_file(optarg, &trust_pem, NULL);
+ r = read_full_file(optarg, &arg_trust_pem, NULL);
if (r < 0)
return log_error_errno(r, "Failed to read CA certificate file: %m");
- assert(trust_pem);
+ assert(arg_trust_pem);
break;
#else
log_error("Option --trust is not available.");
@@ -952,12 +953,12 @@ static int parse_argv(int argc, char *argv[]) {
return -EINVAL;
}
- if (!!key_pem != !!cert_pem) {
+ if (!!arg_key_pem != !!arg_cert_pem) {
log_error("Certificate and key files must be specified together");
return -EINVAL;
}
- if (trust_pem && !key_pem) {
+ if (arg_trust_pem && !arg_key_pem) {
log_error("CA certificate can only be used with certificate file");
return -EINVAL;
}
@@ -979,6 +980,8 @@ int main(int argc, char *argv[]) {
if (r == 0)
return EXIT_SUCCESS;
+ sigbus_install();
+
#ifdef HAVE_GNUTLS
gnutls_global_set_log_function(log_func_gnutls);
log_reset_gnutls_level();
@@ -1008,18 +1011,18 @@ int main(int argc, char *argv[]) {
if (n > 0)
opts[opts_pos++] = (struct MHD_OptionItem)
{MHD_OPTION_LISTEN_SOCKET, SD_LISTEN_FDS_START};
- if (key_pem) {
- assert(cert_pem);
+ if (arg_key_pem) {
+ assert(arg_cert_pem);
opts[opts_pos++] = (struct MHD_OptionItem)
- {MHD_OPTION_HTTPS_MEM_KEY, 0, key_pem};
+ {MHD_OPTION_HTTPS_MEM_KEY, 0, arg_key_pem};
opts[opts_pos++] = (struct MHD_OptionItem)
- {MHD_OPTION_HTTPS_MEM_CERT, 0, cert_pem};
+ {MHD_OPTION_HTTPS_MEM_CERT, 0, arg_cert_pem};
flags |= MHD_USE_SSL;
}
- if (trust_pem) {
+ if (arg_trust_pem) {
assert(flags & MHD_USE_SSL);
opts[opts_pos++] = (struct MHD_OptionItem)
- {MHD_OPTION_HTTPS_MEM_TRUST, 0, trust_pem};
+ {MHD_OPTION_HTTPS_MEM_TRUST, 0, arg_trust_pem};
}
d = MHD_start_daemon(flags, 19531,
diff --git a/src/journal-remote/journal-upload.c b/src/journal-remote/journal-upload.c
index 62853b6..5b25da5 100644
--- a/src/journal-remote/journal-upload.c
+++ b/src/journal-remote/journal-upload.c
@@ -26,13 +26,13 @@
#include <getopt.h>
#include "sd-daemon.h"
-
#include "log.h"
#include "util.h"
#include "build.h"
#include "fileio.h"
#include "mkdir.h"
#include "conf-parser.h"
+#include "sigbus.h"
#include "journal-upload.h"
#define PRIV_KEY_FILE CERTIFICATE_ROOT "/private/journal-upload.pem"
@@ -40,14 +40,10 @@
#define TRUST_FILE CERTIFICATE_ROOT "/ca/trusted.pem"
#define DEFAULT_PORT 19532
-static const char* arg_url;
-
-static void close_fd_input(Uploader *u);
-
+static const char* arg_url = NULL;
static const char *arg_key = NULL;
static const char *arg_cert = NULL;
static const char *arg_trust = NULL;
-
static const char *arg_directory = NULL;
static char **arg_file = NULL;
static const char *arg_cursor = NULL;
@@ -58,6 +54,8 @@ static bool arg_merge = false;
static int arg_follow = -1;
static const char *arg_save_state = NULL;
+static void close_fd_input(Uploader *u);
+
#define SERVER_ANSWER_KEEP 2048
#define STATE_FILE "/var/lib/systemd/journal-upload/state"
@@ -792,6 +790,8 @@ int main(int argc, char **argv) {
if (r <= 0)
goto finish;
+ sigbus_install();
+
r = setup_uploader(&u, arg_url, arg_save_state);
if (r < 0)
goto cleanup;
diff --git a/src/journal/coredumpctl.c b/src/journal/coredumpctl.c
index a6551ac..8ebca4f 100644
--- a/src/journal/coredumpctl.c
+++ b/src/journal/coredumpctl.c
@@ -26,8 +26,7 @@
#include <fcntl.h>
#include <unistd.h>
-#include "systemd/sd-journal.h"
-
+#include "sd-journal.h"
#include "build.h"
#include "set.h"
#include "util.h"
@@ -38,6 +37,7 @@
#include "journal-internal.h"
#include "copy.h"
#include "compress.h"
+#include "sigbus.h"
static enum {
ACTION_NONE,
@@ -803,6 +803,8 @@ int main(int argc, char *argv[]) {
if (arg_action == ACTION_NONE)
goto end;
+ sigbus_install();
+
r = sd_journal_open(&j, SD_JOURNAL_LOCAL_ONLY);
if (r < 0) {
log_error_errno(r, "Failed to open journal: %m");
diff --git a/src/journal/journalctl.c b/src/journal/journalctl.c
index 7b67bc1..c91f2cf 100644
--- a/src/journal/journalctl.c
+++ b/src/journal/journalctl.c
@@ -54,6 +54,7 @@
#include "pager.h"
#include "strv.h"
#include "set.h"
+#include "sigbus.h"
#include "journal-internal.h"
#include "journal-def.h"
#include "journal-verify.h"
@@ -1723,6 +1724,7 @@ int main(int argc, char *argv[]) {
goto finish;
signal(SIGWINCH, columns_lines_cache_reset);
+ sigbus_install();
if (arg_action == ACTION_NEW_ID128) {
r = generate_new_id128();
commit b798e7baa5dc3ca6199bbff69f71a08d94299fdd
Author: Lennart Poettering <lennart at poettering.net>
Date: Mon Jan 5 00:38:31 2015 +0100
systemctl: fix waiting for jobs when using direct connections to PID 1 for dbus
diff --git a/src/libsystemd/sd-bus/bus-util.c b/src/libsystemd/sd-bus/bus-util.c
index 29520f9..86b83db 100644
--- a/src/libsystemd/sd-bus/bus-util.c
+++ b/src/libsystemd/sd-bus/bus-util.c
@@ -1638,13 +1638,21 @@ int bus_wait_for_jobs_new(sd_bus *bus, BusWaitForJobs **ret) {
d->bus = sd_bus_ref(bus);
+ /* When we are a bus client we match by sender. Direct
+ * connections OTOH have no initialized sender field, and
+ * hence we ignore the sender then */
r = sd_bus_add_match(
bus,
&d->slot_job_removed,
+ bus->bus_client ?
"type='signal',"
"sender='org.freedesktop.systemd1',"
"interface='org.freedesktop.systemd1.Manager',"
"member='JobRemoved',"
+ "path='/org/freedesktop/systemd1'" :
+ "type='signal',"
+ "interface='org.freedesktop.systemd1.Manager',"
+ "member='JobRemoved',"
"path='/org/freedesktop/systemd1'",
match_job_removed, d);
if (r < 0)
commit ea69bd41c5923f4f278a09bb7d8cb1abcfa122e1
Author: Lennart Poettering <lennart at poettering.net>
Date: Mon Jan 5 00:13:26 2015 +0100
journald: constify all things
diff --git a/src/journal/journald-server.c b/src/journal/journald-server.c
index d987d8f..019c3a6 100644
--- a/src/journal/journald-server.c
+++ b/src/journal/journald-server.c
@@ -297,8 +297,13 @@ static JournalFile* find_journal(Server *s, uid_t uid) {
return f;
}
-static int do_rotate(Server *s, JournalFile **f, const char* name,
- bool seal, uint32_t uid) {
+static int do_rotate(
+ Server *s,
+ JournalFile **f,
+ const char* name,
+ bool seal,
+ uint32_t uid) {
+
int r;
assert(s);
@@ -308,11 +313,9 @@ static int do_rotate(Server *s, JournalFile **f, const char* name,
r = journal_file_rotate(f, s->compress, seal);
if (r < 0)
if (*f)
- log_error_errno(r, "Failed to rotate %s: %m",
- (*f)->path);
+ log_error_errno(r, "Failed to rotate %s: %m", (*f)->path);
else
- log_error_errno(r, "Failed to create new %s journal: %m",
- name);
+ log_error_errno(r, "Failed to create new %s journal: %m", name);
else
server_fix_perms(s, *f, uid);
return r;
@@ -366,15 +369,20 @@ void server_sync(Server *s) {
s->sync_scheduled = false;
}
-static void do_vacuum(Server *s, char *ids, JournalFile *f, const char* path,
- JournalMetrics *metrics) {
- char *p;
+static void do_vacuum(
+ Server *s,
+ const char *id,
+ JournalFile *f,
+ const char* path,
+ JournalMetrics *metrics) {
+
+ const char *p;
int r;
if (!f)
return;
- p = strappenda(path, ids);
+ p = strappenda(path, id);
r = journal_directory_vacuum(p, metrics->max_use, s->max_retention_usec, &s->oldest_file_usec, false);
if (r < 0 && r != -ENOENT)
log_error_errno(r, "Failed to vacuum %s: %m", p);
commit 146d47736780e06f618379a6c9f46edcf46803a7
Author: Lennart Poettering <lennart at poettering.net>
Date: Sun Jan 4 22:20:22 2015 +0100
machined,bus-proxy: fix connecting to containers
diff --git a/src/bus-proxyd/bus-proxyd.c b/src/bus-proxyd/bus-proxyd.c
index 6d9e1a0..a7818f5 100644
--- a/src/bus-proxyd/bus-proxyd.c
+++ b/src/bus-proxyd/bus-proxyd.c
@@ -139,9 +139,9 @@ static int parse_argv(int argc, char *argv[]) {
return log_oom();
#ifdef ENABLE_KDBUS
- a = strjoin("x-container-kernel:machine=", e, ";x-container-unix:machine=", e, NULL);
+ a = strjoin("x-machine-kernel:machine=", e, ";x-machine-unix:machine=", e, NULL);
#else
- a = strjoin("x-container-unix:machine=", e, NULL);
+ a = strjoin("x-machine-unix:machine=", e, NULL);
#endif
if (!a)
return log_oom();
diff --git a/src/libsystemd/sd-bus/sd-bus.c b/src/libsystemd/sd-bus/sd-bus.c
index f87c316..a8d03b8 100644
--- a/src/libsystemd/sd-bus/sd-bus.c
+++ b/src/libsystemd/sd-bus/sd-bus.c
@@ -952,7 +952,7 @@ static int bus_parse_next_address(sd_bus *b) {
break;
} else if (startswith(a, "x-machine-unix:")) {
- a += 17;
+ a += 15;
r = parse_container_unix_address(b, &a, &guid);
if (r < 0)
return r;
@@ -960,7 +960,7 @@ static int bus_parse_next_address(sd_bus *b) {
break;
} else if (startswith(a, "x-machine-kernel:")) {
- a += 19;
+ a += 17;
r = parse_container_kernel_address(b, &a, &guid);
if (r < 0)
return r;
diff --git a/src/machine/machine-dbus.c b/src/machine/machine-dbus.c
index cbdbc52..e7d4a3b 100644
--- a/src/machine/machine-dbus.c
+++ b/src/machine/machine-dbus.c
@@ -463,9 +463,9 @@ int bus_machine_method_open_login(sd_bus *bus, sd_bus_message *message, void *us
return r;
#ifdef ENABLE_KDBUS
- asprintf(&container_bus->address, "x-container-kernel:pid=" PID_FMT ";x-container-unix:pid=" PID_FMT, m->leader, m->leader);
+ asprintf(&container_bus->address, "x-machine-kernel:pid=" PID_FMT ";x-machine-unix:pid=" PID_FMT, m->leader, m->leader);
#else
- asprintf(&container_bus->address, "x-container-kernel:pid=" PID_FMT, m->leader);
+ asprintf(&container_bus->address, "x-machine-kernel:pid=" PID_FMT, m->leader);
#endif
if (!container_bus->address)
return -ENOMEM;
commit 8531ae707d4d0203e83304d4af948b8169a5fce1
Author: Lennart Poettering <lennart at poettering.net>
Date: Sun Jan 4 22:09:07 2015 +0100
journald: prefix exported calls with "server_", unexport unnecessary calls
diff --git a/src/journal/journald-audit.c b/src/journal/journald-audit.c
index 20936f4..9d21738 100644
--- a/src/journal/journald-audit.c
+++ b/src/journal/journald-audit.c
@@ -538,7 +538,7 @@ int server_open_audit(Server *s) {
if (r < 0)
return log_error_errno(errno, "Failed to set SO_PASSCRED on audit socket: %m");
- r = sd_event_add_io(s->event, &s->audit_event_source, s->audit_fd, EPOLLIN, process_datagram, s);
+ r = sd_event_add_io(s->event, &s->audit_event_source, s->audit_fd, EPOLLIN, server_process_datagram, s);
if (r < 0)
return log_error_errno(r, "Failed to add audit fd to event loop: %m");
diff --git a/src/journal/journald-native.c b/src/journal/journald-native.c
index b0120ef..851625d 100644
--- a/src/journal/journald-native.c
+++ b/src/journal/journald-native.c
@@ -453,7 +453,7 @@ int server_open_native_socket(Server*s) {
if (r < 0)
return log_error_errno(errno, "SO_TIMESTAMP failed: %m");
- r = sd_event_add_io(s->event, &s->native_event_source, s->native_fd, EPOLLIN, process_datagram, s);
+ r = sd_event_add_io(s->event, &s->native_event_source, s->native_fd, EPOLLIN, server_process_datagram, s);
if (r < 0)
return log_error_errno(r, "Failed to add native server fd to event loop: %m");
diff --git a/src/journal/journald-server.c b/src/journal/journald-server.c
index 6d037cf..d987d8f 100644
--- a/src/journal/journald-server.c
+++ b/src/journal/journald-server.c
@@ -446,7 +446,7 @@ static void server_cache_hostname(Server *s) {
s->hostname_field = x;
}
-bool shall_try_append_again(JournalFile *f, int r) {
+static bool shall_try_append_again(JournalFile *f, int r) {
/* -E2BIG Hit configured limit
-EFBIG Hit fs limit
@@ -1106,7 +1106,7 @@ finish:
return r;
}
-int process_datagram(sd_event_source *es, int fd, uint32_t revents, void *userdata) {
+int server_process_datagram(sd_event_source *es, int fd, uint32_t revents, void *userdata) {
Server *s = userdata;
assert(s);
diff --git a/src/journal/journald-server.h b/src/journal/journald-server.h
index 9c7fa50..c96877c 100644
--- a/src/journal/journald-server.h
+++ b/src/journal/journald-server.h
@@ -167,7 +167,6 @@ const char *split_mode_to_string(SplitMode s) _const_;
SplitMode split_mode_from_string(const char *s) _pure_;
void server_fix_perms(Server *s, JournalFile *f, uid_t uid);
-bool shall_try_append_again(JournalFile *f, int r);
int server_init(Server *s);
void server_done(Server *s);
void server_sync(Server *s);
@@ -176,4 +175,4 @@ void server_rotate(Server *s);
int server_schedule_sync(Server *s, int priority);
int server_flush_to_var(Server *s);
void server_maybe_append_tags(Server *s);
-int process_datagram(sd_event_source *es, int fd, uint32_t revents, void *userdata);
+int server_process_datagram(sd_event_source *es, int fd, uint32_t revents, void *userdata);
diff --git a/src/journal/journald-syslog.c b/src/journal/journald-syslog.c
index cc44d45..6f43fd4 100644
--- a/src/journal/journald-syslog.c
+++ b/src/journal/journald-syslog.c
@@ -457,7 +457,7 @@ int server_open_syslog_socket(Server *s) {
if (r < 0)
return log_error_errno(errno, "SO_TIMESTAMP failed: %m");
- r = sd_event_add_io(s->event, &s->syslog_event_source, s->syslog_fd, EPOLLIN, process_datagram, s);
+ r = sd_event_add_io(s->event, &s->syslog_event_source, s->syslog_fd, EPOLLIN, server_process_datagram, s);
if (r < 0)
return log_error_errno(r, "Failed to add syslog server fd to event loop: %m");
commit fa6ac76083b8ffc1309876459f54f9f0e2843731
Author: Lennart Poettering <lennart at poettering.net>
Date: Tue Dec 30 20:57:53 2014 +0100
journald: process SIGBUS for the memory maps we set up
Even though we use fallocate() it appears that file systems like btrfs
will trigger SIGBUS in certain low-disk-space situations. We should
handle that, hence catch the signal, add it to a list of invalidated
pages, and replace the page with an empty memory area. After each write
check if SIGBUS was triggered, and consider the write invalid if it was.
This should make journald a lot more robust with file systems where
fallocate() is not reliable, for example all CoW file systems
(btrfs...), where changing written data can fail with disk full errors.
https://bugzilla.redhat.com/show_bug.cgi?id=1045810
diff --git a/.gitignore b/.gitignore
index 078fd9a..70ee4f6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -233,6 +233,7 @@
/test-rtnl-manual
/test-sched-prio
/test-set
+/test-sigbus
/test-sleep
/test-socket-util
/test-ssd
diff --git a/Makefile.am b/Makefile.am
index 73e911f..10fc8a9 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -901,6 +901,8 @@ libsystemd_shared_la_SOURCES = \
src/shared/verbs.h \
src/shared/machine-image.c \
src/shared/machine-image.h \
+ src/shared/sigbus.c \
+ src/shared/sigbus.h \
src/shared/build.h
if HAVE_UTMP
@@ -1386,7 +1388,8 @@ tests += \
test-locale-util \
test-execute \
test-copy \
- test-cap-list
+ test-cap-list \
+ test-sigbus
EXTRA_DIST += \
test/a.service \
@@ -1580,6 +1583,12 @@ test_copy_SOURCES = \
test_copy_LDADD = \
libsystemd-shared.la
+test_sigbus_SOURCES = \
+ src/test/test-sigbus.c
+
+test_sigbus_LDADD = \
+ libsystemd-shared.la
+
test_condition_SOURCES = \
src/test/test-condition.c
diff --git a/src/journal/journal-file.c b/src/journal/journal-file.c
index 48c27ee..44a9692 100644
--- a/src/journal/journal-file.c
+++ b/src/journal/journal-file.c
@@ -67,6 +67,9 @@
/* How much to increase the journal file size at once each time we allocate something new. */
#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
+/* The mmap context to use for the header; we pick one above the last defined type */
+#define CONTEXT_HEADER _OBJECT_TYPE_MAX
+
static int journal_file_set_online(JournalFile *f) {
assert(f);
@@ -76,6 +79,9 @@ static int journal_file_set_online(JournalFile *f) {
if (!(f->fd >= 0 && f->header))
return -EINVAL;
+ if (mmap_cache_got_sigbus(f->mmap, f->fd))
+ return -EIO;
+
switch(f->header->state) {
case STATE_ONLINE:
return 0;
@@ -104,8 +110,14 @@ int journal_file_set_offline(JournalFile *f) {
fsync(f->fd);
+ if (mmap_cache_got_sigbus(f->mmap, f->fd))
+ return -EIO;
+
f->header->state = STATE_OFFLINE;
+ if (mmap_cache_got_sigbus(f->mmap, f->fd))
+ return -EIO;
+
fsync(f->fd);
return 0;
@@ -120,14 +132,10 @@ void journal_file_close(JournalFile *f) {
journal_file_append_tag(f);
#endif
- /* Sync everything to disk, before we mark the file offline */
- if (f->mmap && f->fd >= 0)
- mmap_cache_close_fd(f->mmap, f->fd);
-
journal_file_set_offline(f);
- if (f->header)
- munmap(f->header, PAGE_ALIGN(sizeof(Header)));
+ if (f->mmap && f->fd >= 0)
+ mmap_cache_close_fd(f->mmap, f->fd);
safe_close(f->fd);
free(f->path);
@@ -194,8 +202,8 @@ static int journal_file_init_header(JournalFile *f, JournalFile *template) {
}
static int journal_file_refresh_header(JournalFile *f) {
- int r;
sd_id128_t boot_id;
+ int r;
assert(f);
@@ -212,12 +220,12 @@ static int journal_file_refresh_header(JournalFile *f) {
f->header->boot_id = boot_id;
- journal_file_set_online(f);
+ r = journal_file_set_online(f);
/* Sync the online state to disk */
fsync(f->fd);
- return 0;
+ return r;
}
static int journal_file_verify_header(JournalFile *f) {
@@ -321,6 +329,9 @@ static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size)
* for sure, since we always call posix_fallocate()
* ourselves */
+ if (mmap_cache_got_sigbus(f->mmap, f->fd))
+ return -EIO;
+
old_size =
le64toh(f->header->header_size) +
le64toh(f->header->arena_size);
@@ -376,6 +387,7 @@ static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size)
static unsigned type_to_context(ObjectType type) {
/* One context for each type, plus one catch-all for the rest */
assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
+ assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
}
@@ -1357,6 +1369,14 @@ int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const st
r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
+ /* If the memory mapping triggered a SIGBUS then we return an
+ * IO error and ignore the error code passed down to us, since
+ * it is very likely just an effect of a nullified replacement
+ * mapping page */
+
+ if (mmap_cache_got_sigbus(f->mmap, f->fd))
+ r = -EIO;
+
journal_file_post_change(f);
return r;
@@ -1712,7 +1732,6 @@ found:
return 1;
}
-
static int generic_array_bisect_plus_one(
JournalFile *f,
uint64_t extra,
@@ -2457,9 +2476,10 @@ int journal_file_open(
JournalFile *template,
JournalFile **ret) {
+ bool newly_created = false;
JournalFile *f;
+ void *h;
int r;
- bool newly_created = false;
assert(fname);
assert(ret);
@@ -2564,13 +2584,14 @@ int journal_file_open(
goto fail;
}
- f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
- if (f->header == MAP_FAILED) {
- f->header = NULL;
+ r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
+ if (r < 0) {
r = -errno;
goto fail;
}
+ f->header = h;
+
if (!newly_created) {
r = journal_file_verify_header(f);
if (r < 0)
@@ -2627,10 +2648,18 @@ int journal_file_open(
if (r < 0)
goto fail;
+ if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
+ r = -EIO;
+ goto fail;
+ }
+
*ret = f;
return 0;
fail:
+ if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
+ r = -EIO;
+
journal_file_close(f);
return r;
@@ -2697,7 +2726,8 @@ int journal_file_open_reliably(
r != -EHOSTDOWN && /* other machine */
r != -EPROTONOSUPPORT && /* incompatible feature */
r != -EBUSY && /* unclean shutdown */
- r != -ESHUTDOWN /* already archived */)
+ r != -ESHUTDOWN && /* already archived */
+ r != -EIO /* IO error, including SIGBUS on mmap */)
return r;
if ((flags & O_ACCMODE) == O_RDONLY)
@@ -2804,7 +2834,12 @@ int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint6
return r;
}
- return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
+ r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
+
+ if (mmap_cache_got_sigbus(to->mmap, to->fd))
+ return -EIO;
+
+ return r;
}
void journal_default_metrics(JournalMetrics *m, int fd) {
diff --git a/src/journal/journal-file.h b/src/journal/journal-file.h
index 01bb4e0..19fd725 100644
--- a/src/journal/journal-file.h
+++ b/src/journal/journal-file.h
@@ -27,7 +27,7 @@
#include <gcrypt.h>
#endif
-#include "systemd/sd-id128.h"
+#include "sd-id128.h"
#include "sparse-endian.h"
#include "journal-def.h"
diff --git a/src/journal/journald-server.c b/src/journal/journald-server.c
index a2a2e19..6d037cf 100644
--- a/src/journal/journald-server.c
+++ b/src/journal/journald-server.c
@@ -452,6 +452,7 @@ bool shall_try_append_again(JournalFile *f, int r) {
-EFBIG Hit fs limit
-EDQUOT Quota limit hit
-ENOSPC Disk full
+ -EIO I/O error of some kind (mmap)
-EHOSTDOWN Other machine
-EBUSY Unclean shutdown
-EPROTONOSUPPORT Unsupported feature
@@ -469,6 +470,8 @@ bool shall_try_append_again(JournalFile *f, int r) {
log_info("%s: Unsupported feature, rotating.", f->path);
else if (r == -EBADMSG || r == -ENODATA || r == ESHUTDOWN)
log_warning("%s: Journal file corrupted, rotating.", f->path);
+ else if (r == -EIO)
+ log_warning("%s: IO error, rotating.", f->path);
else
return false;
diff --git a/src/journal/journald.c b/src/journal/journald.c
index 604c861..80f4634 100644
--- a/src/journal/journald.c
+++ b/src/journal/journald.c
@@ -33,6 +33,8 @@
#include "journald-kmsg.h"
#include "journald-syslog.h"
+#include "sigbus.h"
+
int main(int argc, char *argv[]) {
Server server;
int r;
@@ -49,6 +51,8 @@ int main(int argc, char *argv[]) {
umask(0022);
+ sigbus_install();
+
r = server_init(&server);
if (r < 0)
goto finish;
diff --git a/src/journal/mmap-cache.c b/src/journal/mmap-cache.c
index 4c940aa..ab21cdc 100644
--- a/src/journal/mmap-cache.c
+++ b/src/journal/mmap-cache.c
@@ -29,6 +29,7 @@
#include "log.h"
#include "util.h"
#include "macro.h"
+#include "sigbus.h"
#include "mmap-cache.h"
typedef struct Window Window;
@@ -38,6 +39,7 @@ typedef struct FileDescriptor FileDescriptor;
struct Window {
MMapCache *cache;
+ bool invalidated;
bool keep_always;
bool in_unused;
@@ -65,6 +67,7 @@ struct Context {
struct FileDescriptor {
MMapCache *cache;
int fd;
+ bool sigbus;
LIST_HEAD(Window, windows);
};
@@ -134,6 +137,21 @@ static void window_unlink(Window *w) {
}
}
+static void window_invalidate(Window *w) {
+ assert(w);
+
+ if (w->invalidated)
+ return;
+
+ /* Replace the window with anonymous pages. This is useful
+ * when we hit a SIGBUS and want to make sure the file cannot
+ * trigger any further SIGBUS, possibly overrunning the sigbus
+ * queue. */
+
+ assert_se(mmap(w->ptr, w->size, w->prot, MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0) == w->ptr);
+ w->invalidated = true;
+}
+
static void window_free(Window *w) {
assert(w);
@@ -383,6 +401,9 @@ static int try_context(
return 0;
}
+ if (c->window->fd->sigbus)
+ return -EIO;
+
c->window->keep_always |= keep_always;
*ret = (uint8_t*) c->window->ptr + (offset - c->window->offset);
@@ -414,6 +435,9 @@ static int find_mmap(
assert(f->fd == fd);
+ if (f->sigbus)
+ return -EIO;
+
LIST_FOREACH(by_fd, w, f->windows)
if (window_matches(w, fd, prot, offset, size))
break;
@@ -572,27 +596,111 @@ int mmap_cache_get(
return add_mmap(m, fd, prot, context, keep_always, offset, size, st, ret);
}
-void mmap_cache_close_fd(MMapCache *m, int fd) {
+unsigned mmap_cache_get_hit(MMapCache *m) {
+ assert(m);
+
+ return m->n_hit;
+}
+
+unsigned mmap_cache_get_missed(MMapCache *m) {
+ assert(m);
+
+ return m->n_missed;
+}
+
+static void mmap_cache_process_sigbus(MMapCache *m) {
+ bool found = false;
FileDescriptor *f;
+ Iterator i;
+ int r;
assert(m);
- assert(fd >= 0);
- f = hashmap_get(m->fds, INT_TO_PTR(fd + 1));
- if (!f)
+ /* Iterate through all triggered pages and mark their files as
+ * invalidated */
+ for (;;) {
+ bool ours;
+ void *addr;
+
+ r = sigbus_pop(&addr);
+ if (_likely_(r == 0))
+ break;
+ if (r < 0) {
+ log_error_errno(r, "SIGBUS handling failed: %m");
+ abort();
+ }
+
+ ours = false;
+ HASHMAP_FOREACH(f, m->fds, i) {
+ Window *w;
+
+ LIST_FOREACH(by_fd, w, f->windows) {
+ if ((uint8_t*) addr >= (uint8_t*) w->ptr &&
+ (uint8_t*) addr < (uint8_t*) w->ptr + w->size) {
+ found = ours = f->sigbus = true;
+ break;
+ }
+ }
+
+ if (ours)
+ break;
+ }
+
+ /* Didn't find a matching window, give up */
+ if (!ours) {
+ log_error("Unknown SIGBUS page, aborting.");
+ abort();
+ }
+ }
+
+ /* The list of triggered pages is now empty. Now, let's remap
+ * all windows of the triggered file to anonymous maps, so
+ * that no page of the file in question is triggered again, so
+ * that we can be sure not to hit the queue size limit. */
+ if (_likely_(!found))
return;
- fd_free(f);
+ HASHMAP_FOREACH(f, m->fds, i) {
+ Window *w;
+
+ if (!f->sigbus)
+ continue;
+
+ LIST_FOREACH(by_fd, w, f->windows)
+ window_invalidate(w);
+ }
}
-unsigned mmap_cache_get_hit(MMapCache *m) {
+bool mmap_cache_got_sigbus(MMapCache *m, int fd) {
+ FileDescriptor *f;
+
assert(m);
+ assert(fd >= 0);
- return m->n_hit;
+ mmap_cache_process_sigbus(m);
+
+ f = hashmap_get(m->fds, INT_TO_PTR(fd + 1));
+ if (!f)
+ return false;
+
+ return f->sigbus;
}
-unsigned mmap_cache_get_missed(MMapCache *m) {
+void mmap_cache_close_fd(MMapCache *m, int fd) {
+ FileDescriptor *f;
+
assert(m);
+ assert(fd >= 0);
- return m->n_missed;
+ /* Make sure that any queued SIGBUS are first dispatched, so
+ * that we don't end up with a SIGBUS entry we cannot relate
+ * to any existing memory map */
+
+ mmap_cache_process_sigbus(m);
+
+ f = hashmap_get(m->fds, INT_TO_PTR(fd + 1));
+ if (!f)
+ return;
+
+ fd_free(f);
}
diff --git a/src/journal/mmap-cache.h b/src/journal/mmap-cache.h
index fe2c83d..a85c2b6 100644
--- a/src/journal/mmap-cache.h
+++ b/src/journal/mmap-cache.h
@@ -25,7 +25,8 @@
#include <stdbool.h>
#include <sys/stat.h>
-#define MMAP_CACHE_MAX_CONTEXTS 8
+/* One context per object type, plus one for the header, plus one "additional" one */
+#define MMAP_CACHE_MAX_CONTEXTS 9
typedef struct MMapCache MMapCache;
@@ -47,3 +48,5 @@ void mmap_cache_close_fd(MMapCache *m, int fd);
unsigned mmap_cache_get_hit(MMapCache *m);
unsigned mmap_cache_get_missed(MMapCache *m);
+
+bool mmap_cache_got_sigbus(MMapCache *m, int fd);
diff --git a/src/shared/sigbus.c b/src/shared/sigbus.c
new file mode 100644
index 0000000..0108603
--- /dev/null
+++ b/src/shared/sigbus.c
@@ -0,0 +1,152 @@
+/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
+
+/***
+ This file is part of systemd.
+
+ Copyright 2014 Lennart Poettering
+
+ systemd is free software; you can redistribute it and/or modify it
+ under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or
+ (at your option) any later version.
+
+ systemd is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with systemd; If not, see <http://www.gnu.org/licenses/>.
+***/
+
+#include <signal.h>
+#include <sys/mman.h>
+
+#include "macro.h"
+#include "util.h"
+#include "sigbus.h"
+
+#define SIGBUS_QUEUE_MAX 64
+
+static struct sigaction old_sigaction;
+static unsigned n_installed = 0;
+
+/* We maintain a fixed size list of page addresses that triggered a
+ SIGBUS. We access this list with atomic operations, so that we
+ don't have to deal with locks between signal handler and main
+ programs in possibly multiple threads. */
+
+static void* volatile sigbus_queue[SIGBUS_QUEUE_MAX];
+static volatile sig_atomic_t n_sigbus_queue = 0;
+
+static void sigbus_push(void *addr) {
+ unsigned u;
+
+ assert(addr);
+
+ /* Find a free place, increase the number of entries and leave, if we can */
+ for (u = 0; u < SIGBUS_QUEUE_MAX; u++)
+ if (__sync_bool_compare_and_swap(&sigbus_queue[u], NULL, addr)) {
+ __sync_fetch_and_add(&n_sigbus_queue, 1);
+ return;
+ }
+
+ /* If we can't, make sure the queue size is out of bounds, to
+ * mark it as overflow */
+ for (;;) {
+ unsigned c;
+
+ __sync_synchronize();
+ c = n_sigbus_queue;
+
+ if (c > SIGBUS_QUEUE_MAX) /* already overflow */
+ return;
+
+ if (__sync_bool_compare_and_swap(&n_sigbus_queue, c, c + SIGBUS_QUEUE_MAX))
+ return;
+ }
+}
+
+int sigbus_pop(void **ret) {
+ assert(ret);
+
+ for (;;) {
+ unsigned u, c;
+
+ __sync_synchronize();
+ c = n_sigbus_queue;
+
+ if (_likely_(c == 0))
+ return 0;
+
+ if (_unlikely_(c >= SIGBUS_QUEUE_MAX))
+ return -EOVERFLOW;
+
+ for (u = 0; u < SIGBUS_QUEUE_MAX; u++) {
+ void *addr;
+
+ addr = sigbus_queue[u];
+ if (!addr)
+ continue;
+
+ if (__sync_bool_compare_and_swap(&sigbus_queue[u], addr, NULL)) {
+ __sync_fetch_and_sub(&n_sigbus_queue, 1);
+ *ret = addr;
+ return 1;
+ }
+ }
+ }
+}
+
+static void sigbus_handler(int sn, siginfo_t *si, void *data) {
+ unsigned long ul;
+ void *aligned;
+
+ assert(sn == SIGBUS);
+ assert(si);
+
+ if (si->si_code != BUS_ADRERR || !si->si_addr) {
+ assert_se(sigaction(SIGBUS, &old_sigaction, NULL) == 0);
+ raise(SIGBUS);
+ return;
+ }
+
+ ul = (unsigned long) si->si_addr;
+ ul = ul / page_size();
+ ul = ul * page_size();
+ aligned = (void*) ul;
+
+ /* Let's remember which address failed */
+ sigbus_push(aligned);
+
+ /* Replace mapping with an anonymous page, so that the
+ * execution can continue, however with a zeroed out page */
+ assert_se(mmap(aligned, page_size(), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0) == aligned);
+}
+
+void sigbus_install(void) {
+ struct sigaction sa = {
+ .sa_sigaction = sigbus_handler,
+ .sa_flags = SA_SIGINFO,
+ };
+
+ n_installed++;
+
+ if (n_installed == 1)
+ assert_se(sigaction(SIGBUS, &sa, &old_sigaction) == 0);
+
+ return;
+}
+
+void sigbus_reset(void) {
+
+ if (n_installed <= 0)
+ return;
+
+ n_installed--;
+
+ if (n_installed == 0)
+ assert_se(sigaction(SIGBUS, &old_sigaction, NULL) == 0);
+
+ return;
+}
diff --git a/src/shared/sigbus.h b/src/shared/sigbus.h
new file mode 100644
index 0000000..25593af
--- /dev/null
+++ b/src/shared/sigbus.h
@@ -0,0 +1,25 @@
+/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
+
+/***
+ This file is part of systemd.
+
+ Copyright 2014 Lennart Poettering
+
+ systemd is free software; you can redistribute it and/or modify it
+ under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or
+ (at your option) any later version.
+
+ systemd is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with systemd; If not, see <http://www.gnu.org/licenses/>.
+***/
+
+void sigbus_install(void);
+void sigbus_reset(void);
+
+int sigbus_pop(void **ret);
diff --git a/src/test/test-sigbus.c b/src/test/test-sigbus.c
new file mode 100644
index 0000000..39d0fec
--- /dev/null
+++ b/src/test/test-sigbus.c
@@ -0,0 +1,62 @@
+/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
+
+/***
+ This file is part of systemd.
+
+ Copyright 2014 Lennart Poettering
+
+ systemd is free software; you can redistribute it and/or modify it
+ under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or
+ (at your option) any later version.
+
+ systemd is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with systemd; If not, see <http://www.gnu.org/licenses/>.
+***/
+
+#include <sys/mman.h>
+
+#include "util.h"
+#include "sigbus.h"
+
+int main(int argc, char *argv[]) {
+ _cleanup_close_ int fd = -1;
+ char template[] = "/tmp/sigbus-test-XXXXXX";
+ void *addr = NULL;
+ uint8_t *p;
+
+ sigbus_install();
+
+ assert(sigbus_pop(&addr) == 0);
+
+ assert_se((fd = mkostemp(template, O_RDWR|O_CREAT|O_EXCL)) >= 0);
+ assert_se(unlink(template) >= 0);
+ assert_se(fallocate(fd, 0, 0, page_size() * 8) >= 0);
+
+ p = mmap(NULL, page_size() * 16, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+ assert_se(p != MAP_FAILED);
+
+ assert_se(sigbus_pop(&addr) == 0);
+
+ p[0] = 0xFF;
+ assert_se(sigbus_pop(&addr) == 0);
+
+ p[page_size()] = 0xFF;
+ assert_se(sigbus_pop(&addr) == 0);
+
+ p[page_size()*8] = 0xFF;
+ p[page_size()*8+1] = 0xFF;
+ p[page_size()*10] = 0xFF;
+ assert_se(sigbus_pop(&addr) > 0);
+ assert_se(addr == p + page_size() * 8);
+ assert_se(sigbus_pop(&addr) > 0);
+ assert_se(addr == p + page_size() * 10);
+ assert_se(sigbus_pop(&addr) == 0);
+
+ sigbus_reset();
+}
commit f93bf4363395018ef48d744c4624158623afd693
Author: Lennart Poettering <lennart at poettering.net>
Date: Tue Dec 30 02:19:04 2014 +0100
update TODO
diff --git a/TODO b/TODO
index 79d9be4..b031f4a 100644
--- a/TODO
+++ b/TODO
@@ -578,7 +578,6 @@ Features:
* currently x-systemd.timeout is lost in the initrd, since crypttab is copied into dracut, but fstab is not
* nspawn:
- - bind mount read-only the cgroup tree higher than nspawn
- refuses to boot containers without /etc/machine-id (OK?), and with empty /etc/machine-id (not OK).
* cryptsetup:
commit b12afc8c5c5c3ee5720780df9a602288bbcc24ea
Author: Lennart Poettering <lennart at poettering.net>
Date: Tue Dec 30 01:57:23 2014 +0100
nspawn: mount most of the cgroup tree read-only in nspawn containers except for the container's own subtree in the name=systemd hierarchy
More specifically mount all other hierarchies in their entirety and the
name=systemd above the container's subtree read-only.
diff --git a/src/core/mount-setup.c b/src/core/mount-setup.c
index 342f552..bd3a035 100644
--- a/src/core/mount-setup.c
+++ b/src/core/mount-setup.c
@@ -44,6 +44,7 @@
#include "efivars.h"
#include "smack-util.h"
#include "def.h"
+#include "cgroup-util.h"
typedef enum MountMode {
MNT_NONE = 0,
@@ -227,49 +228,17 @@ int mount_setup_early(void) {
int mount_cgroup_controllers(char ***join_controllers) {
_cleanup_set_free_free_ Set *controllers = NULL;
- _cleanup_fclose_ FILE *f;
- char buf[LINE_MAX];
int r;
/* Mount all available cgroup controllers that are built into the kernel. */
- f = fopen("/proc/cgroups", "re");
- if (!f) {
- log_error_errno(errno, "Failed to enumerate cgroup controllers: %m");
- return 0;
- }
-
controllers = set_new(&string_hash_ops);
if (!controllers)
return log_oom();
- /* Ignore the header line */
- (void) fgets(buf, sizeof(buf), f);
-
- for (;;) {
- char *controller;
- int enabled = 0;
-
- if (fscanf(f, "%ms %*i %*i %i", &controller, &enabled) != 2) {
-
- if (feof(f))
- break;
-
- log_error("Failed to parse /proc/cgroups.");
- return -EIO;
- }
-
- if (!enabled) {
- free(controller);
- continue;
- }
-
- r = set_consume(controllers, controller);
- if (r < 0) {
- log_error("Failed to add controller to set.");
- return r;
- }
- }
+ r = cg_kernel_controllers(controllers);
+ if (r < 0)
+ return log_error_errno(r, "Failed to enumerate cgroup controllers: %m");
for (;;) {
_cleanup_free_ char *options = NULL, *controller = NULL, *where = NULL;
@@ -348,7 +317,7 @@ int mount_cgroup_controllers(char ***join_controllers) {
/* Now that we mounted everything, let's make the tmpfs the
* cgroup file systems are mounted into read-only. */
- mount("tmpfs", "/sys/fs/cgroup", "tmpfs", MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
+ (void) mount("tmpfs", "/sys/fs/cgroup", "tmpfs", MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
return 0;
}
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index 6aaceac..1ac0a70 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -869,6 +869,112 @@ static int mount_binds(const char *dest, char **l, bool ro) {
return 0;
}
+static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
+ char *to;
+ int r;
+
+ to = strappenda(dest, "/sys/fs/cgroup/", hierarchy);
+
+ r = path_is_mount_point(to, false);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
+ if (r > 0)
+ return 0;
+
+ mkdir_p(to, 0755);
+
+ if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV|(read_only ? MS_RDONLY : 0), controller) < 0)
+ return log_error_errno(errno, "Failed to mount to %s: %m", to);
+
+ return 1;
+}
+
+static int mount_cgroup(const char *dest) {
+ _cleanup_set_free_free_ Set *controllers = NULL;
+ _cleanup_free_ char *own_cgroup_path = NULL;
+ const char *cgroup_root, *systemd_root, *systemd_own;
+ int r;
+
+ controllers = set_new(&string_hash_ops);
+ if (!controllers)
+ return log_oom();
+
+ r = cg_kernel_controllers(controllers);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine cgroup controllers: %m");
+
+ r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine our own cgroup path: %m");
+
+ cgroup_root = strappenda(dest, "/sys/fs/cgroup");
+ if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
+ return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
+
+ for (;;) {
+ _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
+
+ controller = set_steal_first(controllers);
+ if (!controller)
+ break;
+
+ origin = strappend("/sys/fs/cgroup/", controller);
+ if (!origin)
+ return log_oom();
+
+ r = readlink_malloc(origin, &combined);
+ if (r == -EINVAL) {
+ /* Not a symbolic link, but directly a single cgroup hierarchy */
+
+ r = mount_cgroup_hierarchy(dest, controller, controller, true);
+ if (r < 0)
+ return r;
+
+ } else if (r < 0)
+ return log_error_errno(r, "Failed to read link %s: %m", origin);
+ else {
+ _cleanup_free_ char *target = NULL;
+
+ target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
+ if (!target)
+ return log_oom();
+
+ /* A symbolic link, a combination of controllers in one hierarchy */
+
+ if (!filename_is_valid(combined)) {
+ log_warning("Ignoring invalid combined hierarchy %s.", combined);
+ continue;
+ }
+
+ r = mount_cgroup_hierarchy(dest, combined, combined, true);
+ if (r < 0)
+ return r;
+
+ if (symlink(combined, target) < 0)
+ return log_error_errno(errno, "Failed to create symlink for combined hiearchy: %m");
+ }
+ }
+
+ r = mount_cgroup_hierarchy(dest, "name=systemd", "systemd", false);
+ if (r < 0)
+ return r;
+
+ /* Make our own cgroup a (writable) bind mount */
+ systemd_own = strappenda(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
+ if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
+ return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
+
+ /* And then remount the systemd cgroup root read-only */
+ systemd_root = strappenda(dest, "/sys/fs/cgroup/systemd");
+ if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
+ return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
+
+ if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
+ return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
+
+ return 0;
+}
+
static int mount_tmpfs(const char *dest) {
char **i, **o;
@@ -3309,6 +3415,11 @@ int main(int argc, char *argv[]) {
kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
+ /* Tell the parent that we are ready, and that
+ * it can cgroupify us so that we lack access
+ * to certain devices and resources. */
+ (void) barrier_place(&barrier);
+
if (setup_boot_id(arg_directory) < 0)
_exit(EXIT_FAILURE);
@@ -3330,10 +3441,12 @@ int main(int argc, char *argv[]) {
if (mount_tmpfs(arg_directory) < 0)
_exit(EXIT_FAILURE);
- /* Tell the parent that we are ready, and that
- * it can cgroupify us to that we lack access
- * to certain devices and resources. */
- (void)barrier_place(&barrier);
+ /* Wait until we are cgroup-ified, so that we
+ * can mount the right cgroup path writable */
+ (void) barrier_sync_next(&barrier);
+
+ if (mount_cgroup(arg_directory) < 0)
+ _exit(EXIT_FAILURE);
if (chdir(arg_directory) < 0) {
log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
@@ -3472,8 +3585,10 @@ int main(int argc, char *argv[]) {
fdset_free(fds);
fds = NULL;
- /* wait for child-setup to be done */
- if (barrier_place_and_sync(&barrier)) {
+ /* Wait for the most basic Child-setup to be done,
+ * before we add hardware to it, and place it in a
+ * cgroup. */
+ if (barrier_sync_next(&barrier)) {
_cleanup_event_unref_ sd_event *event = NULL;
_cleanup_(pty_forward_freep) PTYForward *forward = NULL;
char last_char = 0;
@@ -3515,6 +3630,9 @@ int main(int argc, char *argv[]) {
* control to the code to run inside the container. */
(void) barrier_place(&barrier);
+ /* And wait until the child is completely ready now. */
+ (void) barrier_place_and_sync(&barrier);
+
sd_notify(false,
"READY=1\n"
"STATUS=Container running.");
diff --git a/src/shared/cgroup-util.c b/src/shared/cgroup-util.c
index 1bcba01..86729f1 100644
--- a/src/shared/cgroup-util.c
+++ b/src/shared/cgroup-util.c
@@ -502,14 +502,16 @@ int cg_get_path(const char *controller, const char *path, const char *suffix, ch
}
static int check_hierarchy(const char *p) {
- char *cc;
+ const char *cc;
assert(p);
+ if (!filename_is_valid(p))
+ return 0;
+
/* Check if this controller actually really exists */
- cc = alloca(strlen("/sys/fs/cgroup/") + strlen(p) + 1);
- strcpy(stpcpy(cc, "/sys/fs/cgroup/"), p);
- if (access(cc, F_OK) < 0)
+ cc = strappenda("/sys/fs/cgroup/", p);
+ if (laccess(cc, F_OK) < 0)
return -errno;
return 0;
@@ -1732,3 +1734,54 @@ CGroupControllerMask cg_mask_supported(void) {
return mask;
}
+
+int cg_kernel_controllers(Set *controllers) {
+ _cleanup_fclose_ FILE *f = NULL;
+ char buf[LINE_MAX];
+ int r;
+
+ assert(controllers);
+
+ f = fopen("/proc/cgroups", "re");
+ if (!f) {
+ if (errno == ENOENT)
+ return 0;
+ return -errno;
+ }
+
+ /* Ignore the header line */
+ (void) fgets(buf, sizeof(buf), f);
+
+ for (;;) {
+ char *controller;
+ int enabled = 0;
+
+ errno = 0;
+ if (fscanf(f, "%ms %*i %*i %i", &controller, &enabled) != 2) {
+
+ if (feof(f))
+ break;
+
+ if (ferror(f) && errno)
+ return -errno;
+
+ return -EBADMSG;
+ }
+
+ if (!enabled) {
+ free(controller);
+ continue;
+ }
+
+ if (!filename_is_valid(controller)) {
+ free(controller);
+ return -EBADMSG;
+ }
+
+ r = set_consume(controllers, controller);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
diff --git a/src/shared/cgroup-util.h b/src/shared/cgroup-util.h
index 5e1e445..89dc2b1 100644
--- a/src/shared/cgroup-util.h
+++ b/src/shared/cgroup-util.h
@@ -132,3 +132,5 @@ int cg_migrate_everywhere(CGroupControllerMask supported, const char *from, cons
int cg_trim_everywhere(CGroupControllerMask supported, const char *path, bool delete_root);
CGroupControllerMask cg_mask_supported(void);
+
+int cg_kernel_controllers(Set *controllers);
commit 714e2e1d56b97dcf2ebae2d0447b48f21e38a600
Author: Lennart Poettering <lennart at poettering.net>
Date: Tue Dec 30 01:56:42 2014 +0100
cgroup: downgrade log messages when we cannot write to cgroup trees that are mounted read-only
diff --git a/src/core/cgroup.c b/src/core/cgroup.c
index 35b862d..3d5d889 100644
--- a/src/core/cgroup.c
+++ b/src/core/cgroup.c
@@ -200,7 +200,8 @@ static int whitelist_device(const char *path, const char *node, const char *acc)
r = cg_set_attribute("devices", path, "devices.allow", buf);
if (r < 0)
- log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set devices.allow on %s: %s", path, strerror(-r));
+ log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
+ "Failed to set devices.allow on %s: %m", path);
return r;
}
@@ -270,7 +271,8 @@ static int whitelist_major(const char *path, const char *name, char type, const
r = cg_set_attribute("devices", path, "devices.allow", buf);
if (r < 0)
- log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set devices.allow on %s: %s", path, strerror(-r));
+ log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
+ "Failed to set devices.allow on %s: %m", path);
}
return 0;
@@ -294,6 +296,10 @@ void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const cha
* hence silently ignore */
is_root = isempty(path) || path_equal(path, "/");
+ /* We generally ignore errors caused by read-only mounted
+ * cgroup trees (assuming we are running in a container then),
+ * and missing cgroups, i.e. EROFS and ENOENT. */
+
if ((mask & CGROUP_CPU) && !is_root) {
char buf[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t)) + 1];
@@ -302,12 +308,14 @@ void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const cha
c->cpu_shares != (unsigned long) -1 ? c->cpu_shares : 1024);
r = cg_set_attribute("cpu", path, "cpu.shares", buf);
if (r < 0)
- log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set cpu.shares on %s: %s", path, strerror(-r));
+ log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
+ "Failed to set cpu.shares on %s: %m", path);
sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
if (r < 0)
- log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set cpu.cfs_period_us on %s: %s", path, strerror(-r));
+ log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
+ "Failed to set cpu.cfs_period_us on %s: %m", path);
if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
@@ -315,7 +323,8 @@ void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const cha
} else
r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
if (r < 0)
- log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set cpu.cfs_quota_us on %s: %s", path, strerror(-r));
+ log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
+ "Failed to set cpu.cfs_quota_us on %s: %m", path);
}
if (mask & CGROUP_BLKIO) {
@@ -330,7 +339,8 @@ void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const cha
c->blockio_weight != (unsigned long) -1 ? c->blockio_weight : 1000);
r = cg_set_attribute("blkio", path, "blkio.weight", buf);
if (r < 0)
- log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set blkio.weight on %s: %s", path, strerror(-r));
+ log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
+ "Failed to set blkio.weight on %s: %m", path);
/* FIXME: no way to reset this list */
LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
@@ -343,7 +353,8 @@ void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const cha
sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
if (r < 0)
- log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set blkio.weight_device on %s: %s", path, strerror(-r));
+ log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
+ "Failed to set blkio.weight_device on %s: %m", path);
}
}
@@ -361,7 +372,8 @@ void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const cha
sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
r = cg_set_attribute("blkio", path, a, buf);
if (r < 0)
- log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set %s on %s: %s", a, path, strerror(-r));
+ log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
+ "Failed to set %s on %s: %m", a, path);
}
}
@@ -375,18 +387,24 @@ void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const cha
r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
if (r < 0)
- log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set memory.limit_in_bytes on %s: %s", path, strerror(-r));
+ log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
+ "Failed to set memory.limit_in_bytes on %s: %m", path);
}
if ((mask & CGROUP_DEVICE) && !is_root) {
CGroupDeviceAllow *a;
+ /* Changing the devices list of a populated cgroup
+ * might result in EINVAL, hence ignore EINVAL
+ * here. */
+
if (c->device_allow || c->device_policy != CGROUP_AUTO)
r = cg_set_attribute("devices", path, "devices.deny", "a");
else
r = cg_set_attribute("devices", path, "devices.allow", "a");
if (r < 0)
- log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to reset devices.list on %s: %s", path, strerror(-r));
+ log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL) ? LOG_DEBUG : LOG_WARNING, r,
+ "Failed to reset devices.list on %s: %m", path);
if (c->device_policy == CGROUP_CLOSED ||
(c->device_policy == CGROUP_AUTO && c->device_allow)) {
More information about the systemd-commits
mailing list