[systemd-commits] 6 commits - src/core src/login src/shared

Lennart Poettering lennart at kemper.freedesktop.org
Fri Feb 7 06:16:36 PST 2014


 src/core/dbus-scope.c      |   17 +++--
 src/core/manager.c         |  120 ++++++++++++++++++++++--------------
 src/core/manager.h         |    9 ++
 src/core/scope.c           |   98 ++++++++++++++++++++----------
 src/core/scope.h           |    5 -
 src/core/service.c         |  134 ++++++++++++++++++++++++-----------------
 src/core/unit.c            |  146 ++++++++++++++++++++++++++++++++++++++++-----
 src/core/unit.h            |    9 ++
 src/login/logind-dbus.c    |   55 ++++++++++------
 src/login/logind-session.c |  100 ++++++++++++++++++++----------
 src/login/logind-session.h |    5 +
 src/login/logind-user.c    |   23 ++++---
 src/login/logind-user.h    |    1 
 src/login/logind.c         |   14 +++-
 src/login/logind.h         |    3 
 src/login/pam-module.c     |   28 +++-----
 src/shared/cgroup-util.c   |    6 -
 17 files changed, 529 insertions(+), 244 deletions(-)

New commits:
commit a50df72b37ce2a7caf7775c70d18c3f9504b9e80
Author: Lennart Poettering <lennart at poettering.net>
Date:   Thu Feb 6 19:04:51 2014 +0100

    logind: given that we can now relatively safely shutdown sessions copes
    without working cgroup empty notifications there's no need to set the
    stop timeout of sessions scopes low

diff --git a/src/login/logind-dbus.c b/src/login/logind-dbus.c
index 4088555..642ba07 100644
--- a/src/login/logind-dbus.c
+++ b/src/login/logind-dbus.c
@@ -2228,10 +2228,6 @@ int manager_start_scope(
          * stop timeout for sessions, so that we don't wait
          * forever. */
 
-        r = sd_bus_message_append(m, "(sv)", "TimeoutStopUSec", "t", 500 * USEC_PER_MSEC);
-        if (r < 0)
-                return r;
-
         /* Make sure that the session shells are terminated with
          * SIGHUP since bash and friends tend to ignore SIGTERM */
         r = sd_bus_message_append(m, "(sv)", "SendSIGHUP", "b", true);

commit 5ba6985b6c8ef85a8bcfeb1b65239c863436e75b
Author: Lennart Poettering <lennart at poettering.net>
Date:   Fri Feb 7 11:58:25 2014 +0100

    core: allow PIDs to be watched by two units at the same time
    
    In some cases it is interesting to map a PID to two units at the same
    time. For example, when a user logs in via a getty, which is reexeced to
    /sbin/login that binary will be explicitly referenced as main pid of the
    getty service, as well as implicitly referenced as part of the session
    scope.

diff --git a/src/core/manager.c b/src/core/manager.c
index c0bcc79..b58b98c 100644
--- a/src/core/manager.c
+++ b/src/core/manager.c
@@ -447,10 +447,6 @@ int manager_new(SystemdRunningAs running_as, Manager **_m) {
         if (r < 0)
                 goto fail;
 
-        r = hashmap_ensure_allocated(&m->watch_pids, trivial_hash_func, trivial_compare_func);
-        if (r < 0)
-                goto fail;
-
         r = hashmap_ensure_allocated(&m->watch_bus, string_hash_func, string_compare_func);
         if (r < 0)
                 goto fail;
@@ -778,7 +774,8 @@ void manager_free(Manager *m) {
 
         hashmap_free(m->units);
         hashmap_free(m->jobs);
-        hashmap_free(m->watch_pids);
+        hashmap_free(m->watch_pids1);
+        hashmap_free(m->watch_pids2);
         hashmap_free(m->watch_bus);
 
         sd_event_source_unref(m->signal_event_source);
@@ -1319,6 +1316,26 @@ static unsigned manager_dispatch_dbus_queue(Manager *m) {
         return n;
 }
 
+static void manager_invoke_notify_message(Manager *m, Unit *u, pid_t pid, char *buf, size_t n) {
+        _cleanup_strv_free_ char **tags = NULL;
+
+        assert(m);
+        assert(u);
+        assert(buf);
+        assert(n > 0);
+
+        tags = strv_split(buf, "\n\r");
+        if (!tags) {
+                log_oom();
+                return;
+        }
+
+        log_debug_unit(u->id, "Got notification message for unit %s", u->id);
+
+        if (UNIT_VTABLE(u)->notify_message)
+                UNIT_VTABLE(u)->notify_message(u, pid, tags);
+}
+
 static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
         Manager *m = userdata;
         ssize_t n;
@@ -1337,6 +1354,7 @@ static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t
                         .iov_base = buf,
                         .iov_len = sizeof(buf)-1,
                 };
+                bool found = false;
 
                 union {
                         struct cmsghdr cmsghdr;
@@ -1351,7 +1369,6 @@ static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t
                 };
                 struct ucred *ucred;
                 Unit *u;
-                _cleanup_strv_free_ char **tags = NULL;
 
                 n = recvmsg(m->notify_fd, &msghdr, MSG_DONTWAIT);
                 if (n <= 0) {
@@ -1374,36 +1391,50 @@ static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t
 
                 ucred = (struct ucred*) CMSG_DATA(&control.cmsghdr);
 
-                u = hashmap_get(m->watch_pids, LONG_TO_PTR(ucred->pid));
-                if (!u) {
-                        u = manager_get_unit_by_pid(m, ucred->pid);
-                        if (!u) {
-                                log_warning("Cannot find unit for notify message of PID "PID_FMT".", ucred->pid);
-                                continue;
-                        }
-                }
-
                 assert((size_t) n < sizeof(buf));
                 buf[n] = 0;
-                tags = strv_split(buf, "\n\r");
-                if (!tags)
-                        return log_oom();
 
-                log_debug_unit(u->id, "Got notification message for unit %s", u->id);
+                u = manager_get_unit_by_pid(m, ucred->pid);
+                if (u) {
+                        manager_invoke_notify_message(m, u, ucred->pid, buf, n);
+                        found = true;
+                }
+
+                u = hashmap_get(m->watch_pids1, LONG_TO_PTR(ucred->pid));
+                if (u) {
+                        manager_invoke_notify_message(m, u, ucred->pid, buf, n);
+                        found = true;
+                }
+
+                u = hashmap_get(m->watch_pids2, LONG_TO_PTR(ucred->pid));
+                if (u) {
+                        manager_invoke_notify_message(m, u, ucred->pid, buf, n);
+                        found = true;
+                }
 
-                if (UNIT_VTABLE(u)->notify_message)
-                        UNIT_VTABLE(u)->notify_message(u, ucred->pid, tags);
+                if (!found)
+                        log_warning("Cannot find unit for notify message of PID "PID_FMT".", ucred->pid);
         }
 
         return 0;
 }
 
+static void invoke_sigchld_event(Manager *m, Unit *u, siginfo_t *si) {
+        assert(m);
+        assert(u);
+        assert(si);
+
+        log_debug_unit(u->id, "Child "PID_FMT" belongs to %s", si->si_pid, u->id);
+
+        unit_unwatch_pid(u, si->si_pid);
+        UNIT_VTABLE(u)->sigchld_event(u, si->si_pid, si->si_code, si->si_status);
+}
+
 static int manager_dispatch_sigchld(Manager *m) {
         assert(m);
 
         for (;;) {
                 siginfo_t si = {};
-                Unit *u;
 
                 /* First we call waitd() for a PID and do not reap the
                  * zombie. That way we can still access /proc/$PID for
@@ -1424,15 +1455,30 @@ static int manager_dispatch_sigchld(Manager *m) {
 
                 if (si.si_code == CLD_EXITED || si.si_code == CLD_KILLED || si.si_code == CLD_DUMPED) {
                         _cleanup_free_ char *name = NULL;
+                        Unit *u;
 
                         get_process_comm(si.si_pid, &name);
-                        log_debug("Got SIGCHLD for process "PID_FMT" (%s)", si.si_pid, strna(name));
-                }
 
-                /* And now figure out the unit this belongs to */
-                u = hashmap_get(m->watch_pids, LONG_TO_PTR(si.si_pid));
-                if (!u)
+                        log_debug("Child "PID_FMT" (%s) died (code=%s, status=%i/%s)",
+                                  si.si_pid, strna(name),
+                                  sigchld_code_to_string(si.si_code),
+                                  si.si_status,
+                                  strna(si.si_code == CLD_EXITED
+                                        ? exit_status_to_string(si.si_status, EXIT_STATUS_FULL)
+                                        : signal_to_string(si.si_status)));
+
+                        /* And now figure out the unit this belongs
+                         * to, it might be multiple... */
                         u = manager_get_unit_by_pid(m, si.si_pid);
+                        if (u)
+                                invoke_sigchld_event(m, u, &si);
+                        u = hashmap_get(m->watch_pids1, LONG_TO_PTR(si.si_pid));
+                        if (u)
+                                invoke_sigchld_event(m, u, &si);
+                        u = hashmap_get(m->watch_pids2, LONG_TO_PTR(si.si_pid));
+                        if (u)
+                                invoke_sigchld_event(m, u, &si);
+                }
 
                 /* And now, we actually reap the zombie. */
                 if (waitid(P_PID, si.si_pid, &si, WEXITED) < 0) {
@@ -1441,26 +1487,6 @@ static int manager_dispatch_sigchld(Manager *m) {
 
                         return -errno;
                 }
-
-                if (si.si_code != CLD_EXITED && si.si_code != CLD_KILLED && si.si_code != CLD_DUMPED)
-                        continue;
-
-                log_debug("Child %lu died (code=%s, status=%i/%s)",
-                          (long unsigned) si.si_pid,
-                          sigchld_code_to_string(si.si_code),
-                          si.si_status,
-                          strna(si.si_code == CLD_EXITED
-                                ? exit_status_to_string(si.si_status, EXIT_STATUS_FULL)
-                                : signal_to_string(si.si_status)));
-
-                if (!u)
-                        continue;
-
-                log_debug_unit(u->id,
-                               "Child %lu belongs to %s", (long unsigned) si.si_pid, u->id);
-
-                unit_unwatch_pid(u, si.si_pid);
-                UNIT_VTABLE(u)->sigchld_event(u, si.si_pid, si.si_code, si.si_status);
         }
 
         return 0;
diff --git a/src/core/manager.h b/src/core/manager.h
index 358aba7..948ea98 100644
--- a/src/core/manager.h
+++ b/src/core/manager.h
@@ -96,7 +96,14 @@ struct Manager {
 
         sd_event *event;
 
-        Hashmap *watch_pids;  /* pid => Unit object n:1 */
+        /* We use two hash tables here, since the same PID might be
+         * watched by two different units: once the unit that forked
+         * it off, and possibly a different unit to which it was
+         * joined as cgroup member. Since we know that it is either
+         * one or two units for each PID we just use to hashmaps
+         * here. */
+        Hashmap *watch_pids1;  /* pid => Unit object n:1 */
+        Hashmap *watch_pids2;  /* pid => Unit object n:1 */
 
         sd_event_source *run_queue_event_source;
 
diff --git a/src/core/unit.c b/src/core/unit.c
index 6e1c112..b0bb026 100644
--- a/src/core/unit.c
+++ b/src/core/unit.c
@@ -1704,16 +1704,27 @@ int unit_watch_pid(Unit *u, pid_t pid) {
         assert(u);
         assert(pid >= 1);
 
+        /* Watch a specific PID. We only support one or two units
+         * watching each PID for now, not more. */
+
+        r = hashmap_ensure_allocated(&u->manager->watch_pids1, trivial_hash_func, trivial_compare_func);
+        if (r < 0)
+                return r;
+
         r = set_ensure_allocated(&u->pids, trivial_hash_func, trivial_compare_func);
         if (r < 0)
                 return r;
 
-        /* Watch a specific PID. We only support one unit watching
-         * each PID for now. */
+        r = hashmap_put(u->manager->watch_pids1, LONG_TO_PTR(pid), u);
+        if (r == -EEXIST) {
+                r = hashmap_ensure_allocated(&u->manager->watch_pids2, trivial_hash_func, trivial_compare_func);
+                if (r < 0)
+                        return r;
 
-        r = set_put(u->pids, LONG_TO_PTR(pid));
+                r = hashmap_put(u->manager->watch_pids2, LONG_TO_PTR(pid), u);
+        }
 
-        q = hashmap_put(u->manager->watch_pids, LONG_TO_PTR(pid), u);
+        q = set_put(u->pids, LONG_TO_PTR(pid));
         if (q < 0)
                 return q;
 
@@ -1724,7 +1735,8 @@ void unit_unwatch_pid(Unit *u, pid_t pid) {
         assert(u);
         assert(pid >= 1);
 
-        hashmap_remove_value(u->manager->watch_pids, LONG_TO_PTR(pid), u);
+        hashmap_remove_value(u->manager->watch_pids1, LONG_TO_PTR(pid), u);
+        hashmap_remove_value(u->manager->watch_pids2, LONG_TO_PTR(pid), u);
         set_remove(u->pids, LONG_TO_PTR(pid));
 }
 
@@ -1797,8 +1809,10 @@ void unit_unwatch_all_pids(Unit *u) {
 
         assert(u);
 
-        SET_FOREACH(e, u->pids, i)
-                hashmap_remove_value(u->manager->watch_pids, e, u);
+        SET_FOREACH(e, u->pids, i) {
+                hashmap_remove_value(u->manager->watch_pids1, e, u);
+                hashmap_remove_value(u->manager->watch_pids2, e, u);
+        }
 
         set_free(u->pids);
         u->pids = NULL;

commit 8190da36f71a887945297cd65d6426c78b466a50
Author: Lennart Poettering <lennart at poettering.net>
Date:   Thu Feb 6 19:46:46 2014 +0100

    core: don't send duplicate SIGCONT when killing units

diff --git a/src/core/unit.c b/src/core/unit.c
index 07eedcd..6e1c112 100644
--- a/src/core/unit.c
+++ b/src/core/unit.c
@@ -3098,7 +3098,7 @@ int unit_kill_context(
                                 if (!pid_set)
                                         return -ENOMEM;
 
-                                cg_kill_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, SIGHUP, true, true, false, pid_set);
+                                cg_kill_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, SIGHUP, false, true, false, pid_set);
                         }
                 }
         }

commit 6e8314c420eb375847c9e526745c2caec802399d
Author: Lennart Poettering <lennart at poettering.net>
Date:   Thu Feb 6 19:27:59 2014 +0100

    cgroup: make sure to properly send SIGCONT to all processes of a cgroup if that's requested

diff --git a/src/shared/cgroup-util.c b/src/shared/cgroup-util.c
index 4ce8856..e6ceb99 100644
--- a/src/shared/cgroup-util.c
+++ b/src/shared/cgroup-util.c
@@ -194,12 +194,12 @@ int cg_kill(const char *controller, const char *path, int sig, bool sigcont, boo
                         if (kill(pid, sig) < 0) {
                                 if (ret >= 0 && errno != ESRCH)
                                         ret = -errno;
-                        } else if (ret == 0) {
-
+                        } else {
                                 if (sigcont)
                                         kill(pid, SIGCONT);
 
-                                ret = 1;
+                                if (ret == 0)
+                                        ret = 1;
                         }
 
                         done = false;

commit 5f41d1f10fd97e93517b6a762b1bec247f4d1171
Author: Lennart Poettering <lennart at poettering.net>
Date:   Thu Feb 6 18:32:14 2014 +0100

    logind: rework session shutdown logic
    
    Simplify the shutdown logic a bit:
    
    - Keep the session FIFO around in the PAM module, even after the session
      shutdown hook has been finished. This allows logind to track precisely
      when the PAM handler goes away.
    
    - In the ReleaseSession() call start a timer, that will stop terminate
      the session when elapsed.
    
    - Never fiddle with the KillMode of scopes to configure whether user
      processes should be killed or not. Instead, simply leave the scope
      units around when we terminate a session whose processes should not be
      killed.
    
    - When killing is enabled, stop the session scope on FIFO EOF or after
      the ReleaseSession() timeout. When killing is disabled, simply tell
      PID 1 to abandon the scope.
    
    Because the scopes stay around and hence all processes are always member
    of a scope, the system shutdown logic should be more robust, as the
    scopes can be shutdown as part of the usual shutdown logic.

diff --git a/src/login/logind-dbus.c b/src/login/logind-dbus.c
index 4745961..4088555 100644
--- a/src/login/logind-dbus.c
+++ b/src/login/logind-dbus.c
@@ -748,15 +748,7 @@ static int method_release_session(sd_bus *bus, sd_bus_message *message, void *us
         if (!session)
                 return sd_bus_error_setf(error, BUS_ERROR_NO_SUCH_SESSION, "No session '%s' known", name);
 
-        /* We use the FIFO to detect stray sessions where the process
-           invoking PAM dies abnormally. We need to make sure that
-           that process is not killed if at the clean end of the
-           session it closes the FIFO. Hence, with this call
-           explicitly turn off the FIFO logic, so that the PAM code
-           can finish clean up on its own */
-        session_remove_fifo(session);
-        session_save(session);
-        user_save(session->user);
+        session_release(session);
 
         return sd_bus_reply_method_return(message, NULL);
 }
@@ -2185,7 +2177,6 @@ int manager_start_scope(
                 const char *slice,
                 const char *description,
                 const char *after,
-                const char *kill_mode,
                 sd_bus_error *error,
                 char **job) {
 
@@ -2232,12 +2223,6 @@ int manager_start_scope(
                         return r;
         }
 
-        if (!isempty(kill_mode)) {
-                r = sd_bus_message_append(m, "(sv)", "KillMode", "s", kill_mode);
-                if (r < 0)
-                        return r;
-        }
-
         /* cgroup empty notification is not available in containers
          * currently. To make this less problematic, let's shorten the
          * stop timeout for sessions, so that we don't wait
@@ -2372,6 +2357,40 @@ int manager_stop_unit(Manager *manager, const char *unit, sd_bus_error *error, c
         return 1;
 }
 
+int manager_abandon_scope(Manager *manager, const char *scope, sd_bus_error *error) {
+        _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
+        _cleanup_free_ char *path = NULL;
+        int r;
+
+        assert(manager);
+        assert(scope);
+
+        path = unit_dbus_path_from_name(scope);
+        if (!path)
+                return -ENOMEM;
+
+        r = sd_bus_call_method(
+                        manager->bus,
+                        "org.freedesktop.systemd1",
+                        path,
+                        "org.freedesktop.systemd1.Scope",
+                        "Abandon",
+                        error,
+                        NULL,
+                        NULL);
+        if (r < 0) {
+                if (sd_bus_error_has_name(error, BUS_ERROR_NO_SUCH_UNIT) ||
+                    sd_bus_error_has_name(error, BUS_ERROR_LOAD_FAILED)) {
+                        sd_bus_error_free(error);
+                        return 0;
+                }
+
+                return r;
+        }
+
+        return 1;
+}
+
 int manager_kill_unit(Manager *manager, const char *unit, KillWho who, int signo, sd_bus_error *error) {
         assert(manager);
         assert(unit);
diff --git a/src/login/logind-session.c b/src/login/logind-session.c
index bec59c0..95105e5 100644
--- a/src/login/logind-session.c
+++ b/src/login/logind-session.c
@@ -40,6 +40,10 @@
 #include "bus-error.h"
 #include "logind-session.h"
 
+#define RELEASE_USEC (20*USEC_PER_SEC)
+
+static void session_remove_fifo(Session *s);
+
 static unsigned long devt_hash_func(const void *p, const uint8_t hash_key[HASH_KEY_SIZE]) {
         uint64_t u = *(const dev_t*)p;
 
@@ -103,6 +107,8 @@ void session_free(Session *s) {
         if (s->in_gc_queue)
                 LIST_REMOVE(gc_queue, s->manager->session_gc_queue, s);
 
+        s->timer_event_source = sd_event_source_unref(s->timer_event_source);
+
         session_remove_fifo(s);
 
         session_drop_controller(s);
@@ -147,6 +153,8 @@ void session_free(Session *s) {
 
         hashmap_remove(s->manager->sessions, s->id);
 
+        s->vt_source = sd_event_source_unref(s->vt_source);
+
         free(s->state_file);
         free(s);
 }
@@ -472,7 +480,6 @@ static int session_start_scope(Session *s) {
         if (!s->scope) {
                 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
                 _cleanup_free_ char *description = NULL;
-                const char *kill_mode;
                 char *scope, *job;
 
                 description = strjoin("Session ", s->id, " of user ", s->user->name, NULL);
@@ -483,9 +490,7 @@ static int session_start_scope(Session *s) {
                 if (!scope)
                         return log_oom();
 
-                kill_mode = manager_shall_kill(s->manager, s->user->name) ? "control-group" : "none";
-
-                r = manager_start_scope(s->manager, scope, s->leader, s->user->slice, description, "systemd-user-sessions.service", kill_mode, &error, &job);
+                r = manager_start_scope(s->manager, scope, s->leader, s->user->slice, description, "systemd-logind.service", &error, &job);
                 if (r < 0) {
                         log_error("Failed to start session scope %s: %s %s",
                                   scope, bus_error_message(&error, r), error.name);
@@ -541,23 +546,22 @@ int session_start(Session *s) {
 
         s->started = true;
 
-        /* Save session data */
+        /* Save data */
         session_save(s);
         user_save(s->user);
+        if (s->seat)
+                seat_save(s->seat);
 
+        /* Send signals */
         session_send_signal(s, true);
-
+        user_send_changed(s->user, "Sessions", NULL);
         if (s->seat) {
-                seat_save(s->seat);
-
                 if (s->seat->active == s)
                         seat_send_changed(s->seat, "Sessions", "ActiveSession", NULL);
                 else
                         seat_send_changed(s->seat, "Sessions", NULL);
         }
 
-        user_send_changed(s->user, "Sessions", NULL);
-
         return 0;
 }
 
@@ -571,14 +575,22 @@ static int session_stop_scope(Session *s) {
         if (!s->scope)
                 return 0;
 
-        r = manager_stop_unit(s->manager, s->scope, &error, &job);
-        if (r < 0) {
-                log_error("Failed to stop session scope: %s", bus_error_message(&error, r));
-                return r;
-        }
+        if (manager_shall_kill(s->manager, s->user->name)) {
+                r = manager_stop_unit(s->manager, s->scope, &error, &job);
+                if (r < 0) {
+                        log_error("Failed to stop session scope: %s", bus_error_message(&error, r));
+                        return r;
+                }
 
-        free(s->scope_job);
-        s->scope_job = job;
+                free(s->scope_job);
+                s->scope_job = job;
+        } else {
+                r = manager_abandon_scope(s->manager, s->scope, &error);
+                if (r < 0) {
+                        log_error("Failed to abandon session scope: %s", bus_error_message(&error, r));
+                        return r;
+                }
+        }
 
         return 0;
 }
@@ -591,9 +603,16 @@ int session_stop(Session *s) {
         if (!s->user)
                 return -ESTALE;
 
+        s->timer_event_source = sd_event_source_unref(s->timer_event_source);
+
+        /* We are going down, don't care about FIFOs anymore */
+        session_remove_fifo(s);
+
         /* Kill cgroup */
         r = session_stop_scope(s);
 
+        s->stopping = true;
+
         session_save(s);
         user_save(s->user);
 
@@ -618,6 +637,8 @@ int session_finalize(Session *s) {
                            "MESSAGE=Removed session %s.", s->id,
                            NULL);
 
+        s->timer_event_source = sd_event_source_unref(s->timer_event_source);
+
         /* Kill session devices */
         while ((sd = hashmap_first(s->devices)))
                 session_device_free(sd);
@@ -635,16 +656,36 @@ int session_finalize(Session *s) {
                 if (s->seat->active == s)
                         seat_set_active(s->seat, NULL);
 
-                seat_send_changed(s->seat, "Sessions", NULL);
                 seat_save(s->seat);
+                seat_send_changed(s->seat, "Sessions", NULL);
         }
 
-        user_send_changed(s->user, "Sessions", NULL);
         user_save(s->user);
+        user_send_changed(s->user, "Sessions", NULL);
 
         return r;
 }
 
+static int release_timeout_callback(sd_event_source *es, uint64_t usec, void *userdata) {
+        Session *s = userdata;
+
+        assert(es);
+        assert(s);
+
+        session_stop(s);
+        return 0;
+}
+
+void session_release(Session *s) {
+        assert(s);
+
+        if (!s->started || s->stopping)
+                return;
+
+        if (!s->timer_event_source)
+                sd_event_add_monotonic(s->manager->event, now(CLOCK_MONOTONIC) + RELEASE_USEC, 0, release_timeout_callback, s, &s->timer_event_source);
+}
+
 bool session_is_active(Session *s) {
         assert(s);
 
@@ -820,7 +861,7 @@ int session_create_fifo(Session *s) {
         return r;
 }
 
-void session_remove_fifo(Session *s) {
+static void session_remove_fifo(Session *s) {
         assert(s);
 
         if (s->fifo_event_source)
@@ -839,8 +880,6 @@ void session_remove_fifo(Session *s) {
 }
 
 bool session_check_gc(Session *s, bool drop_not_started) {
-        int r;
-
         assert(s);
 
         if (drop_not_started && !s->started)
@@ -850,11 +889,7 @@ bool session_check_gc(Session *s, bool drop_not_started) {
                 return false;
 
         if (s->fifo_fd >= 0) {
-                r = pipe_eof(s->fifo_fd);
-                if (r < 0)
-                        return true;
-
-                if (r == 0)
+                if (pipe_eof(s->fifo_fd) <= 0)
                         return true;
         }
 
@@ -880,12 +915,12 @@ void session_add_to_gc_queue(Session *s) {
 SessionState session_get_state(Session *s) {
         assert(s);
 
+        if (s->stopping || s->timer_event_source)
+                return SESSION_CLOSING;
+
         if (s->scope_job)
                 return SESSION_OPENING;
 
-        if (s->fifo_fd < 0)
-                return SESSION_CLOSING;
-
         if (session_is_active(s))
                 return SESSION_ACTIVE;
 
@@ -902,7 +937,7 @@ int session_kill(Session *s, KillWho who, int signo) {
 }
 
 static int session_open_vt(Session *s) {
-        char path[128];
+        char path[sizeof("/dev/tty") + DECIMAL_STR_MAX(s->vtnr)];
 
         if (!s->vtnr)
                 return -1;
@@ -980,8 +1015,7 @@ void session_restore_vt(Session *s) {
         if (vt < 0)
                 return;
 
-        sd_event_source_unref(s->vt_source);
-        s->vt_source = NULL;
+        s->vt_source = sd_event_source_unref(s->vt_source);
 
         ioctl(vt, KDSETMODE, KD_TEXT);
 
diff --git a/src/login/logind-session.h b/src/login/logind-session.h
index 7bf1932..42552bc 100644
--- a/src/login/logind-session.h
+++ b/src/login/logind-session.h
@@ -110,9 +110,12 @@ struct Session {
 
         bool in_gc_queue:1;
         bool started:1;
+        bool stopping:1;
 
         sd_bus_message *create_message;
 
+        sd_event_source *timer_event_source;
+
         char *controller;
         Hashmap *devices;
 
@@ -132,10 +135,10 @@ bool session_is_active(Session *s);
 int session_get_idle_hint(Session *s, dual_timestamp *t);
 void session_set_idle_hint(Session *s, bool b);
 int session_create_fifo(Session *s);
-void session_remove_fifo(Session *s);
 int session_start(Session *s);
 int session_stop(Session *s);
 int session_finalize(Session *s);
+void session_release(Session *s);
 int session_save(Session *s);
 int session_load(Session *s);
 int session_kill(Session *s, KillWho who, int signo);
diff --git a/src/login/logind-user.c b/src/login/logind-user.c
index bdb6915..fdbf6e3 100644
--- a/src/login/logind-user.c
+++ b/src/login/logind-user.c
@@ -515,6 +515,8 @@ int user_stop(User *u) {
         if (k < 0)
                 r = k;
 
+        u->stopping = true;
+
         user_save(u);
 
         return r;
@@ -633,22 +635,27 @@ void user_add_to_gc_queue(User *u) {
 
 UserState user_get_state(User *u) {
         Session *i;
-        bool all_closing = true;
 
         assert(u);
 
+        if (u->stopping)
+                return USER_CLOSING;
+
         if (u->slice_job || u->service_job)
                 return USER_OPENING;
 
-        LIST_FOREACH(sessions_by_user, i, u->sessions) {
-                if (session_is_active(i))
-                        return USER_ACTIVE;
-                if (session_get_state(i) != SESSION_CLOSING)
-                        all_closing = false;
-        }
+        if (u->sessions) {
+                bool all_closing = true;
+
+                LIST_FOREACH(sessions_by_user, i, u->sessions) {
+                        if (session_is_active(i))
+                                return USER_ACTIVE;
+                        if (session_get_state(i) != SESSION_CLOSING)
+                                all_closing = false;
+                }
 
-        if (u->sessions)
                 return all_closing ? USER_CLOSING : USER_ONLINE;
+        }
 
         if (user_check_linger_file(u) > 0)
                 return USER_LINGERING;
diff --git a/src/login/logind-user.h b/src/login/logind-user.h
index 0062880..b0fefe9 100644
--- a/src/login/logind-user.h
+++ b/src/login/logind-user.h
@@ -61,6 +61,7 @@ struct User {
 
         bool in_gc_queue:1;
         bool started:1;
+        bool stopping:1;
 
         LIST_HEAD(Session, sessions);
         LIST_FIELDS(User, gc_queue);
diff --git a/src/login/logind.c b/src/login/logind.c
index 48da7b1..a6f84e8 100644
--- a/src/login/logind.c
+++ b/src/login/logind.c
@@ -871,8 +871,15 @@ void manager_gc(Manager *m, bool drop_not_started) {
                 LIST_REMOVE(gc_queue, m->session_gc_queue, session);
                 session->in_gc_queue = false;
 
-                if (!session_check_gc(session, drop_not_started)) {
+                /* First, if we are not closing yet, initiate stopping */
+                if (!session_check_gc(session, drop_not_started) &&
+                    session_get_state(session) != SESSION_CLOSING)
                         session_stop(session);
+
+                /* Normally, this should make the session busy again,
+                 * if it doesn't then let's get rid of it
+                 * immediately */
+                if (!session_check_gc(session, drop_not_started)) {
                         session_finalize(session);
                         session_free(session);
                 }
@@ -882,8 +889,11 @@ void manager_gc(Manager *m, bool drop_not_started) {
                 LIST_REMOVE(gc_queue, m->user_gc_queue, user);
                 user->in_gc_queue = false;
 
-                if (!user_check_gc(user, drop_not_started)) {
+                if (!user_check_gc(user, drop_not_started) &&
+                    user_get_state(user) != USER_CLOSING)
                         user_stop(user);
+
+                if (!user_check_gc(user, drop_not_started)) {
                         user_finalize(user);
                         user_free(user);
                 }
diff --git a/src/login/logind.h b/src/login/logind.h
index b84137c..c1a5b6a 100644
--- a/src/login/logind.h
+++ b/src/login/logind.h
@@ -162,9 +162,10 @@ int manager_send_changed(Manager *manager, const char *property, ...) _sentinel_
 
 int manager_dispatch_delayed(Manager *manager);
 
-int manager_start_scope(Manager *manager, const char *scope, pid_t pid, const char *slice, const char *description, const char *after, const char *kill_mode, sd_bus_error *error, char **job);
+int manager_start_scope(Manager *manager, const char *scope, pid_t pid, const char *slice, const char *description, const char *after, sd_bus_error *error, char **job);
 int manager_start_unit(Manager *manager, const char *unit, sd_bus_error *error, char **job);
 int manager_stop_unit(Manager *manager, const char *unit, sd_bus_error *error, char **job);
+int manager_abandon_scope(Manager *manager, const char *scope, sd_bus_error *error);
 int manager_kill_unit(Manager *manager, const char *unit, KillWho who, int signo, sd_bus_error *error);
 int manager_unit_is_active(Manager *manager, const char *unit);
 int manager_job_is_active(Manager *manager, const char *path);
diff --git a/src/login/pam-module.c b/src/login/pam-module.c
index 3b2966b..79a9042 100644
--- a/src/login/pam-module.c
+++ b/src/login/pam-module.c
@@ -499,7 +499,7 @@ _public_ PAM_EXTERN int pam_sm_close_session(
 
         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
         _cleanup_bus_unref_ sd_bus *bus = NULL;
-        const void *p = NULL, *existing = NULL;
+        const void *existing = NULL;
         const char *id;
         int r;
 
@@ -519,10 +519,8 @@ _public_ PAM_EXTERN int pam_sm_close_session(
 
                 r = sd_bus_open_system(&bus);
                 if (r < 0) {
-                        pam_syslog(handle, LOG_ERR,
-                                  "Failed to connect to system bus: %s", strerror(-r));
-                        r = PAM_SESSION_ERR;
-                        goto finish;
+                        pam_syslog(handle, LOG_ERR, "Failed to connect to system bus: %s", strerror(-r));
+                        return PAM_SESSION_ERR;
                 }
 
                 r = sd_bus_call_method(bus,
@@ -535,20 +533,16 @@ _public_ PAM_EXTERN int pam_sm_close_session(
                                        "s",
                                        id);
                 if (r < 0) {
-                        pam_syslog(handle, LOG_ERR,
-                                   "Failed to release session: %s", bus_error_message(&error, r));
-
-                        r = PAM_SESSION_ERR;
-                        goto finish;
+                        pam_syslog(handle, LOG_ERR, "Failed to release session: %s", bus_error_message(&error, r));
+                        return PAM_SESSION_ERR;
                 }
         }
 
-        r = PAM_SUCCESS;
+        /* Note that we are knowingly leaking the FIFO fd here. This
+         * way, logind can watch us die. If we closed it here it would
+         * not have any clue when that is completed. Given that one
+         * cannot really have multiple PAM sessions open from the same
+         * process this means we will leak one FD at max. */
 
-finish:
-        pam_get_data(handle, "systemd.session-fd", &p);
-        if (p)
-                close_nointr(PTR_TO_INT(p) - 1);
-
-        return r;
+        return PAM_SUCCESS;
 }

commit a911bb9ab27ac0eb3bbf4e8b4109e5da9b88eee3
Author: Lennart Poettering <lennart at poettering.net>
Date:   Thu Feb 6 17:17:51 2014 +0100

    core: watch SIGCHLD more closely to track processes of units with no reliable cgroup empty notifier
    
    When a process dies that we can associate with a specific unit, start
    watching all other processes of that unit, so that we can associate
    those processes with the unit too.
    
    Also, for service units start doing this as soon as we get the first
    SIGCHLD for either control or main process, so that we can follow the
    processes of the service from one to the other, as long as process that
    remain are processes of the ones we watched that died and got reassigned
    to us as parent.
    
    Similar, for scope units start doing this as soon as the scope
    controller abandons the unit, and thus management entirely reverts to
    systemd. To abandon a unit introduce a new Abandon() scope unit method
    call.

diff --git a/src/core/dbus-scope.c b/src/core/dbus-scope.c
index d5a2048..c46c972 100644
--- a/src/core/dbus-scope.c
+++ b/src/core/dbus-scope.c
@@ -28,6 +28,16 @@
 #include "bus-util.h"
 #include "bus-internal.h"
 
+static int bus_scope_abandon(sd_bus *bus, sd_bus_message *message, void *userdata, sd_bus_error *error) {
+        Scope *s = userdata;
+
+        assert(bus);
+        assert(message);
+        assert(s);
+
+        return scope_abandon(s);
+}
+
 static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_result, scope_result, ScopeResult);
 
 const sd_bus_vtable bus_scope_vtable[] = {
@@ -36,6 +46,7 @@ const sd_bus_vtable bus_scope_vtable[] = {
         SD_BUS_PROPERTY("TimeoutStopUSec", "t", bus_property_get_usec, offsetof(Scope, timeout_stop_usec), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("Result", "s", property_get_result, offsetof(Scope, result), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
         SD_BUS_SIGNAL("RequestStop", NULL, 0),
+        SD_BUS_METHOD("Abandon", NULL, NULL, bus_scope_abandon, 0),
         SD_BUS_VTABLE_END
 };
 
@@ -56,10 +67,6 @@ static int bus_scope_set_transient_property(
                 unsigned n = 0;
                 uint32_t pid;
 
-                r = set_ensure_allocated(&s->pids, trivial_hash_func, trivial_compare_func);
-                if (r < 0)
-                        return r;
-
                 r = sd_bus_message_enter_container(message, 'a', "u");
                 if (r < 0)
                         return r;
@@ -70,7 +77,7 @@ static int bus_scope_set_transient_property(
                                 return -EINVAL;
 
                         if (mode != UNIT_CHECK) {
-                                r = set_put(s->pids, LONG_TO_PTR(pid));
+                                r = unit_watch_pid(UNIT(s), pid);
                                 if (r < 0 && r != -EEXIST)
                                         return r;
                         }
diff --git a/src/core/manager.c b/src/core/manager.c
index 45f5f70..c0bcc79 100644
--- a/src/core/manager.c
+++ b/src/core/manager.c
@@ -1459,7 +1459,7 @@ static int manager_dispatch_sigchld(Manager *m) {
                 log_debug_unit(u->id,
                                "Child %lu belongs to %s", (long unsigned) si.si_pid, u->id);
 
-                hashmap_remove(m->watch_pids, LONG_TO_PTR(si.si_pid));
+                unit_unwatch_pid(u, si.si_pid);
                 UNIT_VTABLE(u)->sigchld_event(u, si.si_pid, si.si_code, si.si_status);
         }
 
diff --git a/src/core/scope.c b/src/core/scope.c
index 0c1d17e..940e40d 100644
--- a/src/core/scope.c
+++ b/src/core/scope.c
@@ -35,6 +35,7 @@
 static const UnitActiveState state_translation_table[_SCOPE_STATE_MAX] = {
         [SCOPE_DEAD] = UNIT_INACTIVE,
         [SCOPE_RUNNING] = UNIT_ACTIVE,
+        [SCOPE_ABANDONED] = UNIT_ACTIVE,
         [SCOPE_STOP_SIGTERM] = UNIT_DEACTIVATING,
         [SCOPE_STOP_SIGKILL] = UNIT_DEACTIVATING,
         [SCOPE_FAILED] = UNIT_FAILED
@@ -66,9 +67,6 @@ static void scope_done(Unit *u) {
 
         free(s->controller);
 
-        set_free(s->pids);
-        s->pids = NULL;
-
         s->timer_event_source = sd_event_source_unref(s->timer_event_source);
 }
 
@@ -100,15 +98,14 @@ static void scope_set_state(Scope *s, ScopeState state) {
         old_state = s->state;
         s->state = state;
 
-        if (state != SCOPE_STOP_SIGTERM &&
-            state != SCOPE_STOP_SIGKILL)
+        if (!IN_SET(state, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL))
                 s->timer_event_source = sd_event_source_unref(s->timer_event_source);
 
+        if (IN_SET(state, SCOPE_DEAD, SCOPE_FAILED))
+                unit_unwatch_all_pids(UNIT(s));
+
         if (state != old_state)
-                log_debug("%s changed %s -> %s",
-                          UNIT(s)->id,
-                          scope_state_to_string(old_state),
-                          scope_state_to_string(state));
+                log_debug("%s changed %s -> %s", UNIT(s)->id, scope_state_to_string(old_state), scope_state_to_string(state));
 
         unit_notify(UNIT(s), state_translation_table[old_state], state_translation_table[state], true);
 }
@@ -135,7 +132,7 @@ static int scope_verify(Scope *s) {
         if (UNIT(s)->load_state != UNIT_LOADED)
                 return 0;
 
-        if (set_size(s->pids) <= 0 && UNIT(s)->manager->n_reloading <= 0) {
+        if (set_isempty(UNIT(s)->pids) && UNIT(s)->manager->n_reloading <= 0) {
                 log_error_unit(UNIT(s)->id, "Scope %s has no PIDs. Refusing.", UNIT(s)->id);
                 return -EINVAL;
         }
@@ -181,12 +178,15 @@ static int scope_coldplug(Unit *u) {
 
         if (s->deserialized_state != s->state) {
 
-                if (s->deserialized_state == SCOPE_STOP_SIGKILL || s->deserialized_state == SCOPE_STOP_SIGTERM) {
+                if (IN_SET(s->deserialized_state, SCOPE_STOP_SIGKILL, SCOPE_STOP_SIGTERM)) {
                         r = scope_arm_timer(s);
                         if (r < 0)
                                 return r;
                 }
 
+                if (!IN_SET(s->deserialized_state, SCOPE_DEAD, SCOPE_FAILED))
+                        unit_watch_all_pids(UNIT(s));
+
                 scope_set_state(s, s->deserialized_state);
         }
 
@@ -227,6 +227,8 @@ static void scope_enter_signal(Scope *s, ScopeState state, ScopeResult f) {
         if (f != SCOPE_SUCCESS)
                 s->result = f;
 
+        unit_watch_all_pids(UNIT(s));
+
         /* If we have a controller set let's ask the controller nicely
          * to terminate the scope, instead of us going directly into
          * SIGTERM beserk mode */
@@ -288,13 +290,10 @@ static int scope_start(Unit *u) {
                 return r;
         }
 
-        r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, s->pids);
+        r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, UNIT(s)->pids);
         if (r < 0)
                 return r;
 
-        set_free(s->pids);
-        s->pids = NULL;
-
         s->result = SCOPE_SUCCESS;
 
         scope_set_state(s, SCOPE_RUNNING);
@@ -305,13 +304,13 @@ static int scope_stop(Unit *u) {
         Scope *s = SCOPE(u);
 
         assert(s);
-        assert(s->state == SCOPE_RUNNING);
 
         if (s->state == SCOPE_STOP_SIGTERM ||
             s->state == SCOPE_STOP_SIGKILL)
                 return 0;
 
-        assert(s->state == SCOPE_RUNNING);
+        assert(s->state == SCOPE_RUNNING ||
+               s->state == SCOPE_ABANDONED);
 
         scope_enter_signal(s, SCOPE_STOP_SIGTERM, SCOPE_SUCCESS);
         return 0;
@@ -389,8 +388,8 @@ static bool scope_check_gc(Unit *u) {
         /* Never clean up scopes that still have a process around,
          * even if the scope is formally dead. */
 
-        if (UNIT(s)->cgroup_path) {
-                r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, UNIT(s)->cgroup_path, true);
+        if (u->cgroup_path) {
+                r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
                 if (r <= 0)
                         return true;
         }
@@ -398,6 +397,32 @@ static bool scope_check_gc(Unit *u) {
         return false;
 }
 
+static void scope_notify_cgroup_empty_event(Unit *u) {
+        Scope *s = SCOPE(u);
+        assert(u);
+
+        log_debug_unit(u->id, "%s: cgroup is empty", u->id);
+
+        if (IN_SET(s->state, SCOPE_RUNNING, SCOPE_ABANDONED, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL))
+                scope_enter_dead(s, SCOPE_SUCCESS);
+}
+
+static void scope_sigchld_event(Unit *u, pid_t pid, int code, int status) {
+
+        /* If we get a SIGCHLD event for one of the processes we were
+           interested in, then we look for others to watch, under the
+           assumption that we'll sooner or later get a SIGCHLD for
+           them, as the original process we watched was probably the
+           parent of them, and they are hence now our children. */
+
+        unit_tidy_watch_pids(u, 0, 0);
+        unit_watch_all_pids(u);
+
+        /* If the PID set is empty now, then let's finish this off */
+        if (set_isempty(u->pids))
+                scope_notify_cgroup_empty_event(u);
+}
+
 static int scope_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) {
         Scope *s = SCOPE(userdata);
 
@@ -429,24 +454,30 @@ static int scope_dispatch_timer(sd_event_source *source, usec_t usec, void *user
         return 0;
 }
 
-static void scope_notify_cgroup_empty_event(Unit *u) {
-        Scope *s = SCOPE(u);
-        assert(u);
+int scope_abandon(Scope *s) {
+        assert(s);
 
-        log_debug_unit(u->id, "%s: cgroup is empty", u->id);
+        if (!IN_SET(s->state, SCOPE_RUNNING, SCOPE_ABANDONED))
+                return -ESTALE;
 
-        switch (s->state) {
+        free(s->controller);
+        s->controller = NULL;
 
-        case SCOPE_RUNNING:
-        case SCOPE_STOP_SIGTERM:
-        case SCOPE_STOP_SIGKILL:
-                scope_enter_dead(s, SCOPE_SUCCESS);
+        /* The client is no longer watching the remaining processes,
+         * so let's step in here, under the assumption that the
+         * remaining processes will be sooner or later reassigned to
+         * us as parent. */
 
-                break;
+        unit_tidy_watch_pids(UNIT(s), 0, 0);
+        unit_watch_all_pids(UNIT(s));
 
-        default:
-                ;
-        }
+        /* If the PID set is empty now, then let's finish this off */
+        if (set_isempty(UNIT(s)->pids))
+                scope_notify_cgroup_empty_event(UNIT(s));
+        else
+                scope_set_state(s, SCOPE_ABANDONED);
+
+        return 0;
 }
 
 _pure_ static UnitActiveState scope_active_state(Unit *u) {
@@ -464,6 +495,7 @@ _pure_ static const char *scope_sub_state_to_string(Unit *u) {
 static const char* const scope_state_table[_SCOPE_STATE_MAX] = {
         [SCOPE_DEAD] = "dead",
         [SCOPE_RUNNING] = "running",
+        [SCOPE_ABANDONED] = "abandoned",
         [SCOPE_STOP_SIGTERM] = "stop-sigterm",
         [SCOPE_STOP_SIGKILL] = "stop-sigkill",
         [SCOPE_FAILED] = "failed",
@@ -516,6 +548,8 @@ const UnitVTable scope_vtable = {
 
         .check_gc = scope_check_gc,
 
+        .sigchld_event = scope_sigchld_event,
+
         .reset_failed = scope_reset_failed,
 
         .notify_cgroup_empty = scope_notify_cgroup_empty_event,
diff --git a/src/core/scope.h b/src/core/scope.h
index 014b50c..6c59126 100644
--- a/src/core/scope.h
+++ b/src/core/scope.h
@@ -29,6 +29,7 @@ typedef struct Scope Scope;
 typedef enum ScopeState {
         SCOPE_DEAD,
         SCOPE_RUNNING,
+        SCOPE_ABANDONED,
         SCOPE_STOP_SIGTERM,
         SCOPE_STOP_SIGKILL,
         SCOPE_FAILED,
@@ -57,13 +58,13 @@ struct Scope {
 
         char *controller;
 
-        Set *pids;
-
         sd_event_source *timer_event_source;
 };
 
 extern const UnitVTable scope_vtable;
 
+int scope_abandon(Scope *s);
+
 const char* scope_state_to_string(ScopeState i) _const_;
 ScopeState scope_state_from_string(const char *s) _pure_;
 
diff --git a/src/core/service.c b/src/core/service.c
index 0542eae..8c47e24 100644
--- a/src/core/service.c
+++ b/src/core/service.c
@@ -1485,6 +1485,9 @@ static void service_set_state(Service *s, ServiceState state) {
                 s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID;
         }
 
+        if (IN_SET(state, SERVICE_DEAD, SERVICE_FAILED, SERVICE_AUTO_RESTART))
+                unit_unwatch_all_pids(UNIT(s));
+
         if (!IN_SET(state,
                     SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST,
                     SERVICE_RUNNING, SERVICE_RELOAD,
@@ -1587,6 +1590,9 @@ static int service_coldplug(Unit *u) {
                                 return r;
                 }
 
+                if (!IN_SET(s->deserialized_state, SERVICE_DEAD, SERVICE_FAILED, SERVICE_AUTO_RESTART))
+                        unit_watch_all_pids(UNIT(s));
+
                 if (IN_SET(s->deserialized_state, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD))
                         service_start_watchdog(s);
 
@@ -1895,6 +1901,7 @@ static void service_enter_stop_post(Service *s, ServiceResult f) {
                 s->result = f;
 
         service_unwatch_control_pid(s);
+        unit_watch_all_pids(UNIT(s));
 
         s->control_command = s->exec_command[SERVICE_EXEC_STOP_POST];
         if (s->control_command) {
@@ -1934,6 +1941,8 @@ static void service_enter_signal(Service *s, ServiceState state, ServiceResult f
         if (f != SERVICE_SUCCESS)
                 s->result = f;
 
+        unit_watch_all_pids(UNIT(s));
+
         r = unit_kill_context(
                         UNIT(s),
                         &s->kill_context,
@@ -1983,6 +1992,7 @@ static void service_enter_stop(Service *s, ServiceResult f) {
                 s->result = f;
 
         service_unwatch_control_pid(s);
+        unit_watch_all_pids(UNIT(s));
 
         s->control_command = s->exec_command[SERVICE_EXEC_STOP];
         if (s->control_command) {
@@ -2865,6 +2875,62 @@ fail:
         return 0;
 }
 
+static void service_notify_cgroup_empty_event(Unit *u) {
+        Service *s = SERVICE(u);
+
+        assert(u);
+
+        log_debug_unit(u->id, "%s: cgroup is empty", u->id);
+
+        switch (s->state) {
+
+                /* Waiting for SIGCHLD is usually more interesting,
+                 * because it includes return codes/signals. Which is
+                 * why we ignore the cgroup events for most cases,
+                 * except when we don't know pid which to expect the
+                 * SIGCHLD for. */
+
+        case SERVICE_START:
+        case SERVICE_START_POST:
+                /* If we were hoping for the daemon to write its PID file,
+                 * we can give up now. */
+                if (s->pid_file_pathspec) {
+                        log_warning_unit(u->id,
+                                         "%s never wrote its PID file. Failing.", UNIT(s)->id);
+                        service_unwatch_pid_file(s);
+                        if (s->state == SERVICE_START)
+                                service_enter_signal(s, SERVICE_FINAL_SIGTERM, SERVICE_FAILURE_RESOURCES);
+                        else
+                                service_enter_stop(s, SERVICE_FAILURE_RESOURCES);
+                }
+                break;
+
+        case SERVICE_RUNNING:
+                /* service_enter_running() will figure out what to do */
+                service_enter_running(s, SERVICE_SUCCESS);
+                break;
+
+        case SERVICE_STOP_SIGTERM:
+        case SERVICE_STOP_SIGKILL:
+
+                if (main_pid_good(s) <= 0 && !control_pid_good(s))
+                        service_enter_stop_post(s, SERVICE_SUCCESS);
+
+                break;
+
+        case SERVICE_STOP_POST:
+        case SERVICE_FINAL_SIGTERM:
+        case SERVICE_FINAL_SIGKILL:
+                if (main_pid_good(s) <= 0 && !control_pid_good(s))
+                        service_enter_dead(s, SERVICE_SUCCESS, true);
+
+                break;
+
+        default:
+                ;
+        }
+}
+
 static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
         Service *s = SERVICE(u);
         ServiceResult f;
@@ -3142,6 +3208,18 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
 
         /* Notify clients about changed exit status */
         unit_add_to_dbus_queue(u);
+
+        /* We got one SIGCHLD for the service, let's watch all
+         * processes that are now running of the service, and watch
+         * that. Among the PIDs we then watch will be children
+         * reassigned to us, which hopefully allows us to identify
+         * when all children are gone */
+        unit_tidy_watch_pids(u, s->main_pid, s->control_pid);
+        unit_watch_all_pids(u);
+
+        /* If the PID set is empty now, then let's finish this off */
+        if (set_isempty(u->pids))
+                service_notify_cgroup_empty_event(u);
 }
 
 static int service_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) {
@@ -3254,62 +3332,6 @@ static int service_dispatch_watchdog(sd_event_source *source, usec_t usec, void
         return 0;
 }
 
-static void service_notify_cgroup_empty_event(Unit *u) {
-        Service *s = SERVICE(u);
-
-        assert(u);
-
-        log_debug_unit(u->id, "%s: cgroup is empty", u->id);
-
-        switch (s->state) {
-
-                /* Waiting for SIGCHLD is usually more interesting,
-                 * because it includes return codes/signals. Which is
-                 * why we ignore the cgroup events for most cases,
-                 * except when we don't know pid which to expect the
-                 * SIGCHLD for. */
-
-        case SERVICE_START:
-        case SERVICE_START_POST:
-                /* If we were hoping for the daemon to write its PID file,
-                 * we can give up now. */
-                if (s->pid_file_pathspec) {
-                        log_warning_unit(u->id,
-                                         "%s never wrote its PID file. Failing.", UNIT(s)->id);
-                        service_unwatch_pid_file(s);
-                        if (s->state == SERVICE_START)
-                                service_enter_signal(s, SERVICE_FINAL_SIGTERM, SERVICE_FAILURE_RESOURCES);
-                        else
-                                service_enter_stop(s, SERVICE_FAILURE_RESOURCES);
-                }
-                break;
-
-        case SERVICE_RUNNING:
-                /* service_enter_running() will figure out what to do */
-                service_enter_running(s, SERVICE_SUCCESS);
-                break;
-
-        case SERVICE_STOP_SIGTERM:
-        case SERVICE_STOP_SIGKILL:
-
-                if (main_pid_good(s) <= 0 && !control_pid_good(s))
-                        service_enter_stop_post(s, SERVICE_SUCCESS);
-
-                break;
-
-        case SERVICE_STOP_POST:
-        case SERVICE_FINAL_SIGTERM:
-        case SERVICE_FINAL_SIGKILL:
-                if (main_pid_good(s) <= 0 && !control_pid_good(s))
-                        service_enter_dead(s, SERVICE_SUCCESS, true);
-
-                break;
-
-        default:
-                ;
-        }
-}
-
 static void service_notify_message(Unit *u, pid_t pid, char **tags) {
         Service *s = SERVICE(u);
         const char *e;
diff --git a/src/core/unit.c b/src/core/unit.c
index 345521a..07eedcd 100644
--- a/src/core/unit.c
+++ b/src/core/unit.c
@@ -482,6 +482,8 @@ void unit_free(Unit *u) {
 
         set_free_free(u->names);
 
+        unit_unwatch_all_pids(u);
+
         condition_free_list(u->conditions);
 
         unit_ref_unset(&u->slice);
@@ -1697,13 +1699,25 @@ void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, bool reload_su
 }
 
 int unit_watch_pid(Unit *u, pid_t pid) {
+        int q, r;
+
         assert(u);
         assert(pid >= 1);
 
+        r = set_ensure_allocated(&u->pids, trivial_hash_func, trivial_compare_func);
+        if (r < 0)
+                return r;
+
         /* Watch a specific PID. We only support one unit watching
          * each PID for now. */
 
-        return hashmap_put(u->manager->watch_pids, LONG_TO_PTR(pid), u);
+        r = set_put(u->pids, LONG_TO_PTR(pid));
+
+        q = hashmap_put(u->manager->watch_pids, LONG_TO_PTR(pid), u);
+        if (q < 0)
+                return q;
+
+        return r;
 }
 
 void unit_unwatch_pid(Unit *u, pid_t pid) {
@@ -1711,6 +1725,102 @@ void unit_unwatch_pid(Unit *u, pid_t pid) {
         assert(pid >= 1);
 
         hashmap_remove_value(u->manager->watch_pids, LONG_TO_PTR(pid), u);
+        set_remove(u->pids, LONG_TO_PTR(pid));
+}
+
+static int watch_pids_in_path(Unit *u, const char *path) {
+        _cleanup_closedir_ DIR *d = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        int ret = 0, r;
+
+        assert(u);
+        assert(path);
+
+        /* Adds all PIDs from a specific cgroup path to the set of PIDs we watch. */
+
+        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
+        if (r >= 0) {
+                pid_t pid;
+
+                while ((r = cg_read_pid(f, &pid)) > 0) {
+                        r = unit_watch_pid(u, pid);
+                        if (r < 0 && ret >= 0)
+                                ret = r;
+                }
+                if (r < 0 && ret >= 0)
+                        ret = r;
+
+        } else if (ret >= 0)
+                ret = r;
+
+        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
+        if (r >= 0) {
+                char *fn;
+
+                while ((r = cg_read_subgroup(d, &fn)) > 0) {
+                        _cleanup_free_ char *p = NULL;
+
+                        p = strjoin(path, "/", fn, NULL);
+                        free(fn);
+
+                        if (!p)
+                                return -ENOMEM;
+
+                        r = watch_pids_in_path(u, p);
+                        if (r < 0 && ret >= 0)
+                                ret = r;
+                }
+                if (r < 0 && ret >= 0)
+                        ret = r;
+
+        } else if (ret >= 0)
+                ret = r;
+
+        return ret;
+}
+
+
+int unit_watch_all_pids(Unit *u) {
+        assert(u);
+
+        if (!u->cgroup_path)
+                return -ENOENT;
+
+        /* Adds all PIDs from our cgroup to the set of PIDs we watch */
+
+        return watch_pids_in_path(u, u->cgroup_path);
+}
+
+void unit_unwatch_all_pids(Unit *u) {
+        Iterator i;
+        void *e;
+
+        assert(u);
+
+        SET_FOREACH(e, u->pids, i)
+                hashmap_remove_value(u->manager->watch_pids, e, u);
+
+        set_free(u->pids);
+        u->pids = NULL;
+}
+
+void unit_tidy_watch_pids(Unit *u, pid_t except1, pid_t except2) {
+        Iterator i;
+        void *e;
+
+        assert(u);
+
+        /* Cleans dead PIDs from our list */
+
+        SET_FOREACH(e, u->pids, i) {
+                pid_t pid = PTR_TO_LONG(e);
+
+                if (pid == except1 || pid == except2)
+                        continue;
+
+                if (kill(pid, 0) < 0 && errno == ESRCH)
+                        set_remove(u->pids, e);
+        }
 }
 
 bool unit_job_is_applicable(Unit *u, JobType j) {
@@ -2979,17 +3089,7 @@ int unit_kill_context(
                                 log_warning_unit(u->id, "Failed to kill control group: %s", strerror(-r));
                 } else if (r > 0) {
 
-                        /* FIXME: Now, we don't actually wait for any
-                         * of the processes that are neither control
-                         * nor main process. We should wait for them
-                         * of course, but that's hard since the cgroup
-                         * notification logic is so unreliable. It is
-                         * not available at all in containers, and on
-                         * the host it gets confused by
-                         * subgroups. Hence, for now, let's not wait
-                         * for these processes -- but when the kernel
-                         * gets fixed we really should correct
-                         * that. */
+                        wait_for_exit = true;
 
                         if (c->send_sighup && !sigkill) {
                                 set_free(pid_set);
diff --git a/src/core/unit.h b/src/core/unit.h
index c686aec..c104a8a 100644
--- a/src/core/unit.h
+++ b/src/core/unit.h
@@ -203,6 +203,11 @@ struct Unit {
         /* CGroup realize members queue */
         LIST_FIELDS(Unit, cgroup_queue);
 
+        /* PIDs we keep an eye on. Note that a unit might have many
+         * more, but these are the ones we care enough about to
+         * process SIGCHLD for */
+        Set *pids;
+
         /* Used during GC sweeps */
         unsigned gc_marker;
 
@@ -538,6 +543,10 @@ void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, bool reload_su
 
 int unit_watch_pid(Unit *u, pid_t pid);
 void unit_unwatch_pid(Unit *u, pid_t pid);
+int unit_watch_all_pids(Unit *u);
+void unit_unwatch_all_pids(Unit *u);
+
+void unit_tidy_watch_pids(Unit *u, pid_t except1, pid_t except2);
 
 int unit_watch_bus_name(Unit *u, const char *name);
 void unit_unwatch_bus_name(Unit *u, const char *name);



More information about the systemd-commits mailing list