Blob Blame History Raw
From d9eb14931b12ded622f5aa4bd547e6075d10100f Mon Sep 17 00:00:00 2001
From: Michal Sekletar <msekleta@redhat.com>
Date: Thu, 27 Feb 2014 18:16:19 +0100
Subject: [PATCH] core: watch SIGCHLD more closely to track processes of units
 with no reliable cgroup empty notifier

When a process dies that we can associate with a specific unit, start
watching all other processes of that unit, so that we can associate
those processes with the unit too.

Also, for service units start doing this as soon as we get the first
SIGCHLD for either control or main process, so that we can follow the
processes of the service from one to the other, as long as process that
remain are processes of the ones we watched that died and got reassigned
to us as parent.

Similar, for scope units start doing this as soon as the scope
controller abandons the unit, and thus management entirely reverts to
systemd. To abandon a unit introduce a new Abandon() scope unit method
call.

Based-on: a911bb9ab27ac0eb3bbf4e8b4109e5da9b88eee3
---
 src/core/dbus-scope.c |  36 +++++++++----
 src/core/manager.c    |   2 +-
 src/core/scope.c      |  87 ++++++++++++++++++++++---------
 src/core/scope.h      |   5 +-
 src/core/service.c    | 140 ++++++++++++++++++++++++++++++--------------------
 src/core/unit.c       | 112 +++++++++++++++++++++++++++++++++++++++-
 src/core/unit.h       |   9 ++++
 7 files changed, 298 insertions(+), 93 deletions(-)

diff --git a/src/core/dbus-scope.c b/src/core/dbus-scope.c
index b576f760ef..58dd9ff702 100644
--- a/src/core/dbus-scope.c
+++ b/src/core/dbus-scope.c
@@ -30,6 +30,7 @@
 
 #define BUS_SCOPE_INTERFACE                                             \
         " <interface name=\"org.freedesktop.systemd1.Scope\">\n"        \
+        "  <method name=\"Abandon\"/>\n"                                \
         BUS_UNIT_CGROUP_INTERFACE                                       \
         "  <property name=\"Controller\" type=\"s\" access=\"read\"/>\n"\
         "  <property name=\"TimeoutStopUSec\" type=\"t\" access=\"read\"/>\n" \
@@ -66,19 +67,40 @@ static const BusProperty bus_scope_properties[] = {
 
 DBusHandlerResult bus_scope_message_handler(Unit *u, DBusConnection *c, DBusMessage *message) {
         Scope *s = SCOPE(u);
+        _cleanup_dbus_message_unref_ DBusMessage *reply = NULL;
 
-        const BusBoundProperties bps[] = {
+        SELINUX_UNIT_ACCESS_CHECK(u, c, message, "status");
+
+        if (dbus_message_is_method_call(message, "org.freedesktop.systemd1.Scope", "Abandon")) {
+                int r;
+
+                r = scope_abandon(s);
+                if (r < 0)
+                        log_error("Failed to mark scope %s as abandoned : %s", UNIT(s)->id, strerror(-r));
+
+                reply = dbus_message_new_method_return(message);
+                if (!reply)
+                        goto oom;
+        } else {
+                const BusBoundProperties bps[] = {
                 { "org.freedesktop.systemd1.Unit",  bus_unit_properties,           u },
                 { "org.freedesktop.systemd1.Scope", bus_unit_cgroup_properties,    u },
                 { "org.freedesktop.systemd1.Scope", bus_scope_properties,          s },
                 { "org.freedesktop.systemd1.Scope", bus_cgroup_context_properties, &s->cgroup_context },
                 { "org.freedesktop.systemd1.Scope", bus_kill_context_properties,   &s->kill_context   },
                 {}
-        };
+                };
 
-        SELINUX_UNIT_ACCESS_CHECK(u, c, message, "status");
+               return  bus_default_message_handler(c, message, INTROSPECTION, INTERFACES_LIST, bps);
+        }
+
+        if (reply)
+                if (!bus_maybe_send_reply(c, message, reply))
+                        goto oom;
 
-        return bus_default_message_handler(c, message, INTROSPECTION, INTERFACES_LIST, bps);
+        return DBUS_HANDLER_RESULT_HANDLED;
+oom:
+        return DBUS_HANDLER_RESULT_NEED_MEMORY;
 }
 
 static int bus_scope_set_transient_property(
@@ -102,10 +124,6 @@ static int bus_scope_set_transient_property(
                     dbus_message_iter_get_element_type(i) != DBUS_TYPE_UINT32)
                         return -EINVAL;
 
-                r = set_ensure_allocated(&s->pids, trivial_hash_func, trivial_compare_func);
-                if (r < 0)
-                        return r;
-
                 dbus_message_iter_recurse(i, &sub);
                 while (dbus_message_iter_get_arg_type(&sub) == DBUS_TYPE_UINT32) {
                         uint32_t pid;
@@ -116,7 +134,7 @@ static int bus_scope_set_transient_property(
                                 return -EINVAL;
 
                         if (mode != UNIT_CHECK) {
-                                r = set_put(s->pids, LONG_TO_PTR(pid));
+                                r = unit_watch_pid(UNIT(s), pid);
                                 if (r < 0 && r != -EEXIST)
                                         return r;
                         }
diff --git a/src/core/manager.c b/src/core/manager.c
index 69ad4b5010..e7b5234bb4 100644
--- a/src/core/manager.c
+++ b/src/core/manager.c
@@ -1389,7 +1389,7 @@ static int manager_dispatch_sigchld(Manager *m) {
                 log_debug_unit(u->id,
                                "Child %lu belongs to %s", (long unsigned) si.si_pid, u->id);
 
-                hashmap_remove(m->watch_pids, LONG_TO_PTR(si.si_pid));
+                unit_unwatch_pid(u, si.si_pid);
                 UNIT_VTABLE(u)->sigchld_event(u, si.si_pid, si.si_code, si.si_status);
         }
 
diff --git a/src/core/scope.c b/src/core/scope.c
index e75fc2b58b..22bdfb25d0 100644
--- a/src/core/scope.c
+++ b/src/core/scope.c
@@ -35,6 +35,7 @@
 static const UnitActiveState state_translation_table[_SCOPE_STATE_MAX] = {
         [SCOPE_DEAD] = UNIT_INACTIVE,
         [SCOPE_RUNNING] = UNIT_ACTIVE,
+        [SCOPE_ABANDONED] = UNIT_ACTIVE,
         [SCOPE_STOP_SIGTERM] = UNIT_DEACTIVATING,
         [SCOPE_STOP_SIGKILL] = UNIT_DEACTIVATING,
         [SCOPE_FAILED] = UNIT_FAILED
@@ -67,9 +68,6 @@ static void scope_done(Unit *u) {
         free(s->controller);
         s->controller = NULL;
 
-        set_free(s->pids);
-        s->pids = NULL;
-
         unit_unwatch_timer(u, &s->timer_watch);
 }
 
@@ -84,6 +82,9 @@ static void scope_set_state(Scope *s, ScopeState state) {
             state != SCOPE_STOP_SIGKILL)
                 unit_unwatch_timer(UNIT(s), &s->timer_watch);
 
+        if (state == SCOPE_DEAD || state == SCOPE_FAILED)
+                unit_unwatch_all_pids(UNIT(s));
+
         if (state != old_state)
                 log_debug("%s changed %s -> %s",
                           UNIT(s)->id,
@@ -115,7 +116,7 @@ static int scope_verify(Scope *s) {
         if (UNIT(s)->load_state != UNIT_LOADED)
                 return 0;
 
-        if (set_size(s->pids) <= 0 && UNIT(s)->manager->n_reloading <= 0) {
+        if (set_size(UNIT(s)->pids) <= 0 && UNIT(s)->manager->n_reloading <= 0) {
                 log_error_unit(UNIT(s)->id, "Scope %s has no PIDs. Refusing.", UNIT(s)->id);
                 return -EINVAL;
         }
@@ -169,6 +170,9 @@ static int scope_coldplug(Unit *u) {
                                 return r;
                 }
 
+                if (s->deserialized_state != SCOPE_DEAD && s->deserialized_state != SCOPE_FAILED)
+                        unit_watch_all_pids(UNIT(s));
+
                 scope_set_state(s, s->deserialized_state);
         }
 
@@ -209,6 +213,8 @@ static void scope_enter_signal(Scope *s, ScopeState state, ScopeResult f) {
         if (f != SCOPE_SUCCESS)
                 s->result = f;
 
+        unit_watch_all_pids(UNIT(s));
+
         /* If we have a controller set let's ask the controller nicely
          * to terminate the scope, instead of us going directly into
          * SIGTERM beserk mode */
@@ -271,13 +277,10 @@ static int scope_start(Unit *u) {
                 return r;
         }
 
-        r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, s->pids);
+        r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, UNIT(s)->pids);
         if (r < 0)
                 return r;
 
-        set_free(s->pids);
-        s->pids = NULL;
-
         s->result = SCOPE_SUCCESS;
 
         scope_set_state(s, SCOPE_RUNNING);
@@ -288,13 +291,13 @@ static int scope_stop(Unit *u) {
         Scope *s = SCOPE(u);
 
         assert(s);
-        assert(s->state == SCOPE_RUNNING);
 
         if (s->state == SCOPE_STOP_SIGTERM ||
             s->state == SCOPE_STOP_SIGKILL)
                 return 0;
 
-        assert(s->state == SCOPE_RUNNING);
+        assert(s->state == SCOPE_RUNNING ||
+               s->state == SCOPE_ABANDONED);
 
         scope_enter_signal(s, SCOPE_STOP_SIGTERM, SCOPE_SUCCESS);
         return 0;
@@ -358,7 +361,7 @@ static bool scope_check_gc(Unit *u) {
         /* Never clean up scopes that still have a process around,
          * even if the scope is formally dead. */
 
-        if (UNIT(s)->cgroup_path) {
+        if (u->cgroup_path) {
                 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, UNIT(s)->cgroup_path, true);
                 if (r <= 0)
                         return true;
@@ -367,6 +370,33 @@ static bool scope_check_gc(Unit *u) {
         return false;
 }
 
+static void scope_notify_cgroup_empty_event(Unit *u) {
+        Scope *s = SCOPE(u);
+
+        assert(u);
+
+        log_debug_unit(u->id, "%s: cgroup is empty", u->id);
+
+        if (s->state == SCOPE_RUNNING || s->state == SCOPE_ABANDONED ||
+            s->state == SCOPE_STOP_SIGTERM || SCOPE_STOP_SIGKILL)
+                scope_enter_dead(s, SCOPE_SUCCESS);
+}
+
+static void scope_sigchld_event(Unit *u, pid_t pid, int code, int status) {
+        /* If we get a SIGCHLD event for one of the processes we were
+           interested in, then we look for others to watch, under the
+           assumption that we'll sooner or later get a SIGCHLD for
+           them, as the original process we watched was probably the
+           parent of them, and they are hence now our children. */
+
+        unit_tidy_watch_pids(u, 0, 0);
+        unit_watch_all_pids(u);
+
+        /* If the PID set is empty now, then let's finish this off */
+        if (set_isempty(u->pids))
+                scope_notify_cgroup_empty_event(u);
+}
+
 static void scope_timer_event(Unit *u, uint64_t elapsed, Watch*w) {
         Scope *s = SCOPE(u);
 
@@ -397,24 +427,30 @@ static void scope_timer_event(Unit *u, uint64_t elapsed, Watch*w) {
         }
 }
 
-static void scope_notify_cgroup_empty_event(Unit *u) {
-        Scope *s = SCOPE(u);
-        assert(u);
+int scope_abandon(Scope *s) {
+        assert(s);
 
-        log_debug_unit(u->id, "%s: cgroup is empty", u->id);
+        if (s->state != SCOPE_RUNNING && s->state != SCOPE_ABANDONED)
+                return -ESTALE;
 
-        switch (s->state) {
+        free(s->controller);
+        s->controller = NULL;
 
-        case SCOPE_RUNNING:
-        case SCOPE_STOP_SIGTERM:
-        case SCOPE_STOP_SIGKILL:
-                scope_enter_dead(s, SCOPE_SUCCESS);
+        /* The client is no longer watching the remaining processes,
+         * so let's step in here, under the assumption that the
+         * remaining processes will be sooner or later reassigned to
+         * us as parent. */
 
-                break;
+        unit_tidy_watch_pids(UNIT(s), 0, 0);
+        unit_watch_all_pids(UNIT(s));
 
-        default:
-                ;
-        }
+        /* If the PID set is empty now, then let's finish this off */
+        if (set_isempty(UNIT(s)->pids))
+                scope_notify_cgroup_empty_event(UNIT(s));
+        else
+                scope_set_state(s, SCOPE_ABANDONED);
+
+        return 0;
 }
 
 _pure_ static UnitActiveState scope_active_state(Unit *u) {
@@ -432,6 +468,7 @@ _pure_ static const char *scope_sub_state_to_string(Unit *u) {
 static const char* const scope_state_table[_SCOPE_STATE_MAX] = {
         [SCOPE_DEAD] = "dead",
         [SCOPE_RUNNING] = "running",
+        [SCOPE_ABANDONED] = "abandoned",
         [SCOPE_STOP_SIGTERM] = "stop-sigterm",
         [SCOPE_STOP_SIGKILL] = "stop-sigkill",
         [SCOPE_FAILED] = "failed",
@@ -481,6 +518,8 @@ const UnitVTable scope_vtable = {
 
         .check_gc = scope_check_gc,
 
+        .sigchld_event = scope_sigchld_event,
+
         .timer_event = scope_timer_event,
 
         .reset_failed = scope_reset_failed,
diff --git a/src/core/scope.h b/src/core/scope.h
index b4bafa75bf..1e9f201c82 100644
--- a/src/core/scope.h
+++ b/src/core/scope.h
@@ -29,6 +29,7 @@ typedef struct Scope Scope;
 typedef enum ScopeState {
         SCOPE_DEAD,
         SCOPE_RUNNING,
+        SCOPE_ABANDONED,
         SCOPE_STOP_SIGTERM,
         SCOPE_STOP_SIGKILL,
         SCOPE_FAILED,
@@ -57,13 +58,13 @@ struct Scope {
 
         char *controller;
 
-        Set *pids;
-
         Watch timer_watch;
 };
 
 extern const UnitVTable scope_vtable;
 
+int scope_abandon(Scope *s);
+
 const char* scope_state_to_string(ScopeState i) _const_;
 ScopeState scope_state_from_string(const char *s) _pure_;
 
diff --git a/src/core/service.c b/src/core/service.c
index f0acda102b..41e5cb55f6 100644
--- a/src/core/service.c
+++ b/src/core/service.c
@@ -1546,6 +1546,11 @@ static void service_set_state(Service *s, ServiceState state) {
                 s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID;
         }
 
+        if (state == SERVICE_DEAD ||
+            state == SERVICE_FAILED ||
+            state == SERVICE_AUTO_RESTART)
+                unit_unwatch_all_pids(UNIT(s));
+
         if (state != SERVICE_START_PRE &&
             state != SERVICE_START &&
             state != SERVICE_START_POST &&
@@ -1661,8 +1666,14 @@ static int service_coldplug(Unit *u) {
                                         return r;
                         }
 
+                if (s->deserialized_state != SERVICE_DEAD &&
+                    s->deserialized_state != SERVICE_FAILED &&
+                    s->deserialized_state != SERVICE_AUTO_RESTART)
+                        unit_watch_all_pids(UNIT(s));
+
                 if (s->deserialized_state == SERVICE_START_POST ||
-                    s->deserialized_state == SERVICE_RUNNING)
+                    s->deserialized_state == SERVICE_RUNNING ||
+                    s->deserialized_state == SERVICE_RELOAD)
                         service_handle_watchdog(s);
 
                 service_set_state(s, s->deserialized_state);
@@ -1970,6 +1981,7 @@ static void service_enter_stop_post(Service *s, ServiceResult f) {
                 s->result = f;
 
         service_unwatch_control_pid(s);
+        unit_watch_all_pids(UNIT(s));
 
         s->control_command = s->exec_command[SERVICE_EXEC_STOP_POST];
         if (s->control_command) {
@@ -2010,6 +2022,8 @@ static void service_enter_signal(Service *s, ServiceState state, ServiceResult f
         if (f != SERVICE_SUCCESS)
                 s->result = f;
 
+        unit_watch_all_pids(UNIT(s));
+
         r = unit_kill_context(
                         UNIT(s),
                         &s->kill_context,
@@ -2055,6 +2069,7 @@ static void service_enter_stop(Service *s, ServiceResult f) {
                 s->result = f;
 
         service_unwatch_control_pid(s);
+        unit_watch_all_pids(UNIT(s));
 
         s->control_command = s->exec_command[SERVICE_EXEC_STOP];
         if (s->control_command) {
@@ -2961,6 +2976,62 @@ fail:
         service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_RESOURCES);
 }
 
+static void service_notify_cgroup_empty_event(Unit *u) {
+        Service *s = SERVICE(u);
+
+        assert(u);
+
+        log_debug_unit(u->id, "%s: cgroup is empty", u->id);
+
+        switch (s->state) {
+
+                /* Waiting for SIGCHLD is usually more interesting,
+                 * because it includes return codes/signals. Which is
+                 * why we ignore the cgroup events for most cases,
+                 * except when we don't know pid which to expect the
+                 * SIGCHLD for. */
+
+        case SERVICE_START:
+        case SERVICE_START_POST:
+                /* If we were hoping for the daemon to write its PID file,
+                 * we can give up now. */
+                if (s->pid_file_pathspec) {
+                        log_warning_unit(u->id,
+                                         "%s never wrote its PID file. Failing.", UNIT(s)->id);
+                        service_unwatch_pid_file(s);
+                        if (s->state == SERVICE_START)
+                                service_enter_signal(s, SERVICE_FINAL_SIGTERM, SERVICE_FAILURE_RESOURCES);
+                        else
+                                service_enter_stop(s, SERVICE_FAILURE_RESOURCES);
+                }
+                break;
+
+        case SERVICE_RUNNING:
+                /* service_enter_running() will figure out what to do */
+                service_enter_running(s, SERVICE_SUCCESS);
+                break;
+
+        case SERVICE_STOP_SIGTERM:
+        case SERVICE_STOP_SIGKILL:
+
+                if (main_pid_good(s) <= 0 && !control_pid_good(s))
+                        service_enter_stop_post(s, SERVICE_SUCCESS);
+
+                break;
+
+        case SERVICE_STOP_POST:
+        case SERVICE_FINAL_SIGTERM:
+        case SERVICE_FINAL_SIGKILL:
+                if (main_pid_good(s) <= 0 && !control_pid_good(s))
+                        service_enter_dead(s, SERVICE_SUCCESS, true);
+
+                break;
+
+        default:
+                ;
+        }
+}
+
 static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
         Service *s = SERVICE(u);
         ServiceResult f;
@@ -3229,6 +3300,18 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
 
         /* Notify clients about changed exit status */
         unit_add_to_dbus_queue(u);
+
+        /* We got one SIGCHLD for the service, let's watch all
+         * processes that are now running of the service, and watch
+         * that. Among the PIDs we then watch will be children
+         * reassigned to us, which hopefully allows us to identify
+         * when all children are gone */
+        unit_tidy_watch_pids(u, s->main_pid, s->control_pid);
+        unit_watch_all_pids(u);
+
+        /* If the PID set is empty now, then let's finish this off */
+        if (set_isempty(u->pids))
+                service_notify_cgroup_empty_event(u);
 }
 
 static void service_timer_event(Unit *u, uint64_t elapsed, Watch* w) {
@@ -3332,61 +3415,6 @@ static void service_timer_event(Unit *u, uint64_t elapsed, Watch* w) {
         }
 }
 
-static void service_notify_cgroup_empty_event(Unit *u) {
-        Service *s = SERVICE(u);
-
-        assert(u);
-
-        log_debug_unit(u->id, "%s: cgroup is empty", u->id);
-
-        switch (s->state) {
-
-                /* Waiting for SIGCHLD is usually more interesting,
-                 * because it includes return codes/signals. Which is
-                 * why we ignore the cgroup events for most cases,
-                 * except when we don't know pid which to expect the
-                 * SIGCHLD for. */
-
-        case SERVICE_START:
-        case SERVICE_START_POST:
-                /* If we were hoping for the daemon to write its PID file,
-                 * we can give up now. */
-                if (s->pid_file_pathspec) {
-                        log_warning_unit(u->id,
-                                         "%s never wrote its PID file. Failing.", UNIT(s)->id);
-                        service_unwatch_pid_file(s);
-                        if (s->state == SERVICE_START)
-                                service_enter_signal(s, SERVICE_FINAL_SIGTERM, SERVICE_FAILURE_RESOURCES);
-                        else
-                                service_enter_stop(s, SERVICE_FAILURE_RESOURCES);
-                }
-                break;
-
-        case SERVICE_RUNNING:
-                /* service_enter_running() will figure out what to do */
-                service_enter_running(s, SERVICE_SUCCESS);
-                break;
-
-        case SERVICE_STOP_SIGTERM:
-        case SERVICE_STOP_SIGKILL:
-
-                if (main_pid_good(s) <= 0 && !control_pid_good(s))
-                        service_enter_stop_post(s, SERVICE_SUCCESS);
-
-                break;
-
-        case SERVICE_FINAL_SIGTERM:
-        case SERVICE_FINAL_SIGKILL:
-                if (main_pid_good(s) <= 0 && !control_pid_good(s))
-                        service_enter_dead(s, SERVICE_SUCCESS, true);
-
-                break;
-
-        default:
-                ;
-        }
-}
-
 static void service_notify_message(Unit *u, pid_t pid, char **tags) {
         Service *s = SERVICE(u);
         const char *e;
diff --git a/src/core/unit.c b/src/core/unit.c
index de34ddc843..57a406d1dd 100644
--- a/src/core/unit.c
+++ b/src/core/unit.c
@@ -469,6 +469,8 @@ void unit_free(Unit *u) {
 
         set_free_free(u->names);
 
+        unit_unwatch_all_pids(u);
+
         condition_free_list(u->conditions);
 
         unit_ref_unset(&u->slice);
@@ -1656,13 +1658,25 @@ void unit_unwatch_fd(Unit *u, Watch *w) {
 }
 
 int unit_watch_pid(Unit *u, pid_t pid) {
+        int q, r;
+
         assert(u);
         assert(pid >= 1);
 
+        r = set_ensure_allocated(&u->pids, trivial_hash_func, trivial_compare_func);
+        if (r < 0)
+                return r;
+
         /* Watch a specific PID. We only support one unit watching
          * each PID for now. */
 
-        return hashmap_put(u->manager->watch_pids, LONG_TO_PTR(pid), u);
+        r = set_put(u->pids, LONG_TO_PTR(pid));
+
+        q = hashmap_put(u->manager->watch_pids, LONG_TO_PTR(pid), u);
+        if (q < 0)
+                return q;
+
+        return r;
 }
 
 void unit_unwatch_pid(Unit *u, pid_t pid) {
@@ -1670,6 +1684,102 @@ void unit_unwatch_pid(Unit *u, pid_t pid) {
         assert(pid >= 1);
 
         hashmap_remove_value(u->manager->watch_pids, LONG_TO_PTR(pid), u);
+        set_remove(u->pids, LONG_TO_PTR(pid));
+}
+
+static int watch_pids_in_path(Unit *u, const char *path) {
+        _cleanup_closedir_ DIR *d = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        int ret = 0, r;
+
+        assert(u);
+        assert(path);
+
+        /* Adds all PIDs from a specific cgroup path to the set of PIDs we watch. */
+
+        r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
+        if (r >= 0) {
+                pid_t pid;
+
+                while ((r = cg_read_pid(f, &pid)) > 0) {
+                        r = unit_watch_pid(u, pid);
+                        if (r < 0 && ret >= 0)
+                                ret = r;
+                }
+                if (r < 0 && ret >= 0)
+                        ret = r;
+
+        } else if (ret >= 0)
+                ret = r;
+
+        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
+        if (r >= 0) {
+                char *fn;
+
+                while ((r = cg_read_subgroup(d, &fn)) > 0) {
+                        _cleanup_free_ char *p = NULL;
+
+                        p = strjoin(path, "/", fn, NULL);
+                        free(fn);
+
+                        if (!p)
+                                return -ENOMEM;
+
+                        r = watch_pids_in_path(u, p);
+                        if (r < 0 && ret >= 0)
+                                ret = r;
+                }
+                if (r < 0 && ret >= 0)
+                        ret = r;
+
+        } else if (ret >= 0)
+                ret = r;
+
+        return ret;
+}
+
+
+int unit_watch_all_pids(Unit *u) {
+        assert(u);
+
+        if (!u->cgroup_path)
+                return -ENOENT;
+
+        /* Adds all PIDs from our cgroup to the set of PIDs we watch */
+
+        return watch_pids_in_path(u, u->cgroup_path);
+}
+
+void unit_unwatch_all_pids(Unit *u) {
+        Iterator i;
+        void *e;
+
+        assert(u);
+
+        SET_FOREACH(e, u->pids, i)
+                hashmap_remove_value(u->manager->watch_pids, e, u);
+
+        set_free(u->pids);
+        u->pids = NULL;
+}
+
+void unit_tidy_watch_pids(Unit *u, pid_t except1, pid_t except2) {
+        Iterator i;
+        void *e;
+
+        assert(u);
+
+        /* Cleans dead PIDs from our list */
+
+        SET_FOREACH(e, u->pids, i) {
+                pid_t pid = PTR_TO_LONG(e);
+
+                if (pid == except1 || pid == except2)
+                        continue;
+
+                if (kill(pid, 0) < 0 && errno == ESRCH)
+                        set_remove(u->pids, e);
+        }
 }
 
 int unit_watch_timer(Unit *u, clockid_t clock_id, bool relative, usec_t usec, Watch *w) {
diff --git a/src/core/unit.h b/src/core/unit.h
index 6dd750f8c2..6dff25e9bc 100644
--- a/src/core/unit.h
+++ b/src/core/unit.h
@@ -198,6 +198,11 @@ struct Unit {
         /* CGroup realize members queue */
         LIST_FIELDS(Unit, cgroup_queue);
 
+        /* PIDs we keep an eye on. Note that a unit might have many
+         * more, but these are the ones we care enough about to
+         * process SIGCHLD for */
+        Set *pids;
+
         /* Used during GC sweeps */
         unsigned gc_marker;
 
@@ -531,6 +536,10 @@ void unit_unwatch_fd(Unit *u, Watch *w);
 
 int unit_watch_pid(Unit *u, pid_t pid);
 void unit_unwatch_pid(Unit *u, pid_t pid);
+int unit_watch_all_pids(Unit *u);
+void unit_unwatch_all_pids(Unit *u);
+
+void unit_tidy_watch_pids(Unit *u, pid_t except1, pid_t except2);
 
 int unit_watch_timer(Unit *u, clockid_t, bool relative, usec_t usec, Watch *w);
 void unit_unwatch_timer(Unit *u, Watch *w);