From 62a769136df4065ce0711625e1e78ec996447862 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 12 Jan 2018 13:41:05 +0100 Subject: [PATCH] core: rework how we track which PIDs to watch for a unit Previously, we'd maintain two hashmaps keyed by PIDs, pointing to Unit interested in SIGCHLD events for them. This scheme allowed a specific PID to be watched by exactly 0, 1 or 2 units. With this rework this is replaced by a single hashmap which is primarily keyed by the PID and points to a Unit interested in it. However, it optionally also keyed by the negated PID, in which case it points to a NULL terminated array of additional Unit objects also interested. This scheme means arbitrary numbers of Units may now watch the same PID. Runtime and memory behaviour should not be impact by this change, as for the common case (i.e. each PID only watched by a single unit) behaviour stays the same, but for the uncommon case (a PID watched by more than one unit) we only pay with a single additional memory allocation for the array. Why this all? Primarily, because allowing exactly two units to watch a specific PID is not sufficient for some niche cases, as processes can belong to more than one unit these days: 1. sd_notify() with MAINPID= can be used to attach a process from a different cgroup to multiple units. 2. Similar, the PIDFile= setting in unit files can be used for similar setups, 3. By creating a scope unit a main process of a service may join a different unit, too. 4. On cgroupsv1 we frequently end up watching all processes remaining in a scope, and if a process opens lots of scopes one after the other it might thus end up being watch by many of them. This patch hence removes the 2-unit-per-PID limit. It also makes a couple of other changes, some of them quite relevant: - manager_get_unit_by_pid() (and the bus call wrapping it) when there's ambiguity will prefer returning the Unit the process belongs to based on cgroup membership, and only check the watch-pids hashmap if that fails. This change in logic is probably more in line with what people expect and makes things more stable as each process can belong to exactly one cgroup only. - Every SIGCHLD event is now dispatched to all units interested in its PID. Previously, there was some magic conditionalization: the SIGCHLD would only be dispatched to the unit if it was only interested in a single PID only, or the PID belonged to the control or main PID or we didn't dispatch a signle SIGCHLD to the unit in the current event loop iteration yet. These rules were quite arbitrary and also redundant as the the per-unit handlers would filter the PIDs anyway a second time. With this change we'll hence relax the rules: all we do now is dispatch every SIGCHLD event exactly once to each unit interested in it, and it's up to the unit to then use or ignore this. We use a generation counter in the unit to ensure that we only invoke the unit handler once for each event, protecting us from confusion if a unit is both associated with a specific PID through cgroup membership and through the "watch_pids" logic. It also protects us from being confused if the "watch_pids" hashmap is altered while we are dispatching to it (which is a very likely case). - sd_notify() message dispatching has been reworked to be very similar to SIGCHLD handling now. A generation counter is used for dispatching as well. This also adds a new test that validates that "watch_pid" registration and unregstration works correctly. --- src/core/cgroup.c | 24 +++++--- src/core/manager.c | 137 ++++++++++++++++++++++++++++++---------------- src/core/manager.h | 23 +++++--- src/core/unit.c | 91 +++++++++++++++++++++++------- src/core/unit.h | 6 +- src/test/meson.build | 11 ++++ src/test/test-watch-pid.c | 96 ++++++++++++++++++++++++++++++++ 7 files changed, 301 insertions(+), 87 deletions(-) create mode 100644 src/test/test-watch-pid.c diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 47a3b6d..bffdeb6 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -2186,40 +2186,46 @@ Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) { Unit *manager_get_unit_by_pid_cgroup(Manager *m, pid_t pid) { _cleanup_free_ char *cgroup = NULL; - int r; assert(m); - if (pid <= 0) + if (!pid_is_valid(pid)) return NULL; - r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup); - if (r < 0) + if (cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup) < 0) return NULL; return manager_get_unit_by_cgroup(m, cgroup); } Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) { - Unit *u; + Unit *u, **array; assert(m); - if (pid <= 0) + /* Note that a process might be owned by multiple units, we return only one here, which is good enough for most + * cases, though not strictly correct. We prefer the one reported by cgroup membership, as that's the most + * relevant one as children of the process will be assigned to that one, too, before all else. */ + + if (!pid_is_valid(pid)) return NULL; if (pid == getpid_cached()) return hashmap_get(m->units, SPECIAL_INIT_SCOPE); - u = hashmap_get(m->watch_pids1, PID_TO_PTR(pid)); + u = manager_get_unit_by_pid_cgroup(m, pid); if (u) return u; - u = hashmap_get(m->watch_pids2, PID_TO_PTR(pid)); + u = hashmap_get(m->watch_pids, PID_TO_PTR(pid)); if (u) return u; - return manager_get_unit_by_pid_cgroup(m, pid); + array = hashmap_get(m->watch_pids, PID_TO_PTR(-pid)); + if (array) + return array[0]; + + return NULL; } int manager_notify_cgroup_empty(Manager *m, const char *cgroup) { diff --git a/src/core/manager.c b/src/core/manager.c index 2dec25b..4130b24 100644 --- a/src/core/manager.c +++ b/src/core/manager.c @@ -1205,8 +1205,7 @@ Manager* manager_free(Manager *m) { hashmap_free(m->units); hashmap_free(m->units_by_invocation_id); hashmap_free(m->jobs); - hashmap_free(m->watch_pids1); - hashmap_free(m->watch_pids2); + hashmap_free(m->watch_pids); hashmap_free(m->watch_bus); set_free(m->startup_units); @@ -1915,22 +1914,27 @@ static void manager_invoke_notify_message( const char *buf, FDSet *fds) { - _cleanup_strv_free_ char **tags = NULL; - assert(m); assert(u); assert(ucred); assert(buf); - tags = strv_split(buf, NEWLINE); - if (!tags) { - log_oom(); + if (u->notifygen == m->notifygen) /* Already invoked on this same unit in this same iteration? */ return; - } + u->notifygen = m->notifygen; + + if (UNIT_VTABLE(u)->notify_message) { + _cleanup_strv_free_ char **tags = NULL; + + tags = strv_split(buf, NEWLINE); + if (!tags) { + log_oom(); + return; + } - if (UNIT_VTABLE(u)->notify_message) UNIT_VTABLE(u)->notify_message(u, ucred, tags, fds); - else if (DEBUG_LOGGING) { + + } else if (DEBUG_LOGGING) { _cleanup_free_ char *x = NULL, *y = NULL; x = ellipsize(buf, 20, 90); @@ -1964,9 +1968,11 @@ static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t struct cmsghdr *cmsg; struct ucred *ucred = NULL; - Unit *u1, *u2, *u3; + _cleanup_free_ Unit **array_copy = NULL; + Unit *u1, *u2, **array; int r, *fd_array = NULL; unsigned n_fds = 0; + bool found = false; ssize_t n; assert(m); @@ -2033,22 +2039,41 @@ static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t /* Make sure it's NUL-terminated. */ buf[n] = 0; - /* Notify every unit that might be interested, but try - * to avoid notifying the same one multiple times. */ + /* Increase the generation counter used for filtering out duplicate unit invocations. */ + m->notifygen++; + + /* Notify every unit that might be interested, which might be multiple. */ u1 = manager_get_unit_by_pid_cgroup(m, ucred->pid); - if (u1) - manager_invoke_notify_message(m, u1, ucred, buf, fds); + u2 = hashmap_get(m->watch_pids, PID_TO_PTR(ucred->pid)); + array = hashmap_get(m->watch_pids, PID_TO_PTR(-ucred->pid)); + if (array) { + size_t k = 0; - u2 = hashmap_get(m->watch_pids1, PID_TO_PTR(ucred->pid)); - if (u2 && u2 != u1) - manager_invoke_notify_message(m, u2, ucred, buf, fds); + while (array[k]) + k++; - u3 = hashmap_get(m->watch_pids2, PID_TO_PTR(ucred->pid)); - if (u3 && u3 != u2 && u3 != u1) - manager_invoke_notify_message(m, u3, ucred, buf, fds); + array_copy = newdup(Unit*, array, k+1); + if (!array_copy) + log_oom(); + } + /* And now invoke the per-unit callbacks. Note that manager_invoke_notify_message() will handle duplicate units + * make sure we only invoke each unit's handler once. */ + if (u1) { + manager_invoke_notify_message(m, u1, ucred, buf, fds); + found = true; + } + if (u2) { + manager_invoke_notify_message(m, u2, ucred, buf, fds); + found = true; + } + if (array_copy) + for (size_t i = 0; array_copy[i]; i++) { + manager_invoke_notify_message(m, array_copy[i], ucred, buf, fds); + found = true; + } - if (!u1 && !u2 && !u3) - log_warning("Cannot find unit for notify message of PID "PID_FMT".", ucred->pid); + if (!found) + log_warning("Cannot find unit for notify message of PID "PID_FMT", ignoring.", ucred->pid); if (fdset_size(fds) > 0) log_warning("Got extra auxiliary fds with notification message, closing them."); @@ -2056,29 +2081,25 @@ static int manager_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t return 0; } -static void invoke_sigchld_event(Manager *m, Unit *u, const siginfo_t *si) { - uint64_t iteration; +static void manager_invoke_sigchld_event( + Manager *m, + Unit *u, + const siginfo_t *si) { assert(m); assert(u); assert(si); - sd_event_get_iteration(m->event, &iteration); - - log_unit_debug(u, "Child "PID_FMT" belongs to %s", si->si_pid, u->id); + /* Already invoked the handler of this unit in this iteration? Then don't process this again */ + if (u->sigchldgen == m->sigchldgen) + return; + u->sigchldgen = m->sigchldgen; + log_unit_debug(u, "Child "PID_FMT" belongs to %s.", si->si_pid, u->id); unit_unwatch_pid(u, si->si_pid); - if (UNIT_VTABLE(u)->sigchld_event) { - if (set_size(u->pids) <= 1 || - iteration != u->sigchldgen || - unit_main_pid(u) == si->si_pid || - unit_control_pid(u) == si->si_pid) { - UNIT_VTABLE(u)->sigchld_event(u, si->si_pid, si->si_code, si->si_status); - u->sigchldgen = iteration; - } else - log_debug("%s already issued a sigchld this iteration %" PRIu64 ", skipping. Pids still being watched %d", u->id, iteration, set_size(u->pids)); - } + if (UNIT_VTABLE(u)->sigchld_event) + UNIT_VTABLE(u)->sigchld_event(u, si->si_pid, si->si_code, si->si_status); } static int manager_dispatch_sigchld(sd_event_source *source, void *userdata) { @@ -2105,8 +2126,9 @@ static int manager_dispatch_sigchld(sd_event_source *source, void *userdata) { goto turn_off; if (IN_SET(si.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED)) { + _cleanup_free_ Unit **array_copy = NULL; _cleanup_free_ char *name = NULL; - Unit *u1, *u2, *u3; + Unit *u1, *u2, **array; (void) get_process_comm(si.si_pid, &name); @@ -2118,17 +2140,36 @@ static int manager_dispatch_sigchld(sd_event_source *source, void *userdata) { ? exit_status_to_string(si.si_status, EXIT_STATUS_FULL) : signal_to_string(si.si_status))); - /* And now figure out the unit this belongs - * to, it might be multiple... */ + /* Increase the generation counter used for filtering out duplicate unit invocations */ + m->sigchldgen++; + + /* And now figure out the unit this belongs to, it might be multiple... */ u1 = manager_get_unit_by_pid_cgroup(m, si.si_pid); + u2 = hashmap_get(m->watch_pids, PID_TO_PTR(si.si_pid)); + array = hashmap_get(m->watch_pids, PID_TO_PTR(-si.si_pid)); + if (array) { + size_t n = 0; + + /* Cound how many entries the array has */ + while (array[n]) + n++; + + /* Make a copy of the array so that we don't trip up on the array changing beneath us */ + array_copy = newdup(Unit*, array, n+1); + if (!array_copy) + log_oom(); + } + + /* Finally, execute them all. Note that u1, u2 and the array might contain duplicates, but + * that's fine, manager_invoke_sigchld_event() will ensure we only invoke the handlers once for + * each iteration. */ if (u1) - invoke_sigchld_event(m, u1, &si); - u2 = hashmap_get(m->watch_pids1, PID_TO_PTR(si.si_pid)); - if (u2 && u2 != u1) - invoke_sigchld_event(m, u2, &si); - u3 = hashmap_get(m->watch_pids2, PID_TO_PTR(si.si_pid)); - if (u3 && u3 != u2 && u3 != u1) - invoke_sigchld_event(m, u3, &si); + manager_invoke_sigchld_event(m, u1, &si); + if (u2) + manager_invoke_sigchld_event(m, u2, &si); + if (array_copy) + for (size_t i = 0; array_copy[i]; i++) + manager_invoke_sigchld_event(m, array_copy[i], &si); } /* And now, we actually reap the zombie. */ diff --git a/src/core/manager.h b/src/core/manager.h index 3af780f..90d5258 100644 --- a/src/core/manager.h +++ b/src/core/manager.h @@ -145,14 +145,14 @@ struct Manager { sd_event *event; - /* We use two hash tables here, since the same PID might be - * watched by two different units: once the unit that forked - * it off, and possibly a different unit to which it was - * joined as cgroup member. Since we know that it is either - * one or two units for each PID we just use to hashmaps - * here. */ - Hashmap *watch_pids1; /* pid => Unit object n:1 */ - Hashmap *watch_pids2; /* pid => Unit object n:1 */ + /* This maps PIDs we care about to units that are interested in. We allow multiple units to he interested in + * the same PID and multiple PIDs to be relevant to the same unit. Since in most cases only a single unit will + * be interested in the same PID we use a somewhat special encoding here: the first unit interested in a PID is + * stored directly in the hashmap, keyed by the PID unmodified. If there are other units interested too they'll + * be stored in a NULL-terminated array, and keyed by the negative PID. This is safe as pid_t is signed and + * negative PIDs are not used for regular processes but process groups, which we don't care about in this + * context, but this allows us to use the negative range for our own purposes. */ + Hashmap *watch_pids; /* pid => unit as well as -pid => array of units */ /* A set contains all units which cgroup should be refreshed after startup */ Set *startup_units; @@ -350,8 +350,13 @@ struct Manager { int first_boot; /* tri-state */ - /* prefixes of e.g. RuntimeDirectory= */ + /* Prefixes of e.g. RuntimeDirectory= */ char *prefix[_EXEC_DIRECTORY_TYPE_MAX]; + + /* Used in the SIGCHLD and sd_notify() message invocation logic to avoid that we dispatch the same event + * multiple times on the same unit. */ + unsigned sigchldgen; + unsigned notifygen; }; #define MANAGER_IS_SYSTEM(m) ((m)->unit_file_scope == UNIT_FILE_SYSTEM) diff --git a/src/core/unit.c b/src/core/unit.c index 308fe3f..97585a7 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -2534,44 +2534,97 @@ void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, bool reload_su } int unit_watch_pid(Unit *u, pid_t pid) { - int q, r; + int r; assert(u); - assert(pid >= 1); + assert(pid_is_valid(pid)); - /* Watch a specific PID. We only support one or two units - * watching each PID for now, not more. */ + /* Watch a specific PID */ r = set_ensure_allocated(&u->pids, NULL); if (r < 0) return r; - r = hashmap_ensure_allocated(&u->manager->watch_pids1, NULL); + r = hashmap_ensure_allocated(&u->manager->watch_pids, NULL); if (r < 0) return r; - r = hashmap_put(u->manager->watch_pids1, PID_TO_PTR(pid), u); - if (r == -EEXIST) { - r = hashmap_ensure_allocated(&u->manager->watch_pids2, NULL); - if (r < 0) - return r; + /* First try, let's add the unit keyed by "pid". */ + r = hashmap_put(u->manager->watch_pids, PID_TO_PTR(pid), u); + if (r == -EEXIST) { + Unit **array; + bool found = false; + size_t n = 0; - r = hashmap_put(u->manager->watch_pids2, PID_TO_PTR(pid), u); - } + /* OK, the "pid" key is already assigned to a different unit. Let's see if the "-pid" key (which points + * to an array of Units rather than just a Unit), lists us already. */ - q = set_put(u->pids, PID_TO_PTR(pid)); - if (q < 0) - return q; + array = hashmap_get(u->manager->watch_pids, PID_TO_PTR(-pid)); + if (array) + for (; array[n]; n++) + if (array[n] == u) + found = true; - return r; + if (found) /* Found it already? if so, do nothing */ + r = 0; + else { + Unit **new_array; + + /* Allocate a new array */ + new_array = new(Unit*, n + 2); + if (!new_array) + return -ENOMEM; + + memcpy_safe(new_array, array, sizeof(Unit*) * n); + new_array[n] = u; + new_array[n+1] = NULL; + + /* Add or replace the old array */ + r = hashmap_replace(u->manager->watch_pids, PID_TO_PTR(-pid), new_array); + if (r < 0) { + free(new_array); + return r; + } + + free(array); + } + } else if (r < 0) + return r; + + r = set_put(u->pids, PID_TO_PTR(pid)); + if (r < 0) + return r; + + return 0; } void unit_unwatch_pid(Unit *u, pid_t pid) { + Unit **array; + assert(u); - assert(pid >= 1); + assert(pid_is_valid(pid)); + + /* First let's drop the unit in case it's keyed as "pid". */ + (void) hashmap_remove_value(u->manager->watch_pids, PID_TO_PTR(pid), u); + + /* Then, let's also drop the unit, in case it's in the array keyed by -pid */ + array = hashmap_get(u->manager->watch_pids, PID_TO_PTR(-pid)); + if (array) { + size_t n, m = 0; + + /* Let's iterate through the array, dropping our own entry */ + for (n = 0; array[n]; n++) + if (array[n] != u) + array[m++] = array[n]; + array[m] = NULL; + + if (m == 0) { + /* The array is now empty, remove the entire entry */ + assert(hashmap_remove(u->manager->watch_pids, PID_TO_PTR(-pid)) == array); + free(array); + } + } - (void) hashmap_remove_value(u->manager->watch_pids1, PID_TO_PTR(pid), u); - (void) hashmap_remove_value(u->manager->watch_pids2, PID_TO_PTR(pid), u); (void) set_remove(u->pids, PID_TO_PTR(pid)); } diff --git a/src/core/unit.h b/src/core/unit.h index f3e991e..4cd0706 100644 --- a/src/core/unit.h +++ b/src/core/unit.h @@ -236,8 +236,10 @@ struct Unit { * process SIGCHLD for */ Set *pids; - /* Used in sigchld event invocation to avoid repeat events being invoked */ - uint64_t sigchldgen; + /* Used in SIGCHLD and sd_notify() message event invocation logic to avoid that we dispatch the same event + * multiple times on the same unit. */ + unsigned sigchldgen; + unsigned notifygen; /* Used during GC sweeps */ unsigned gc_marker; diff --git a/src/test/meson.build b/src/test/meson.build index 18e957d..9242e49 100644 --- a/src/test/meson.build +++ b/src/test/meson.build @@ -377,6 +377,17 @@ tests += [ libselinux, libblkid]], + [['src/test/test-watch-pid.c', + 'src/test/test-helper.c'], + [libcore, + libshared], + [libmount, + threads, + librt, + libseccomp, + libselinux, + libblkid]], + [['src/test/test-hashmap.c', 'src/test/test-hashmap-plain.c', test_hashmap_ordered_c], diff --git a/src/test/test-watch-pid.c b/src/test/test-watch-pid.c new file mode 100644 index 0000000..ed6c3d0 --- /dev/null +++ b/src/test/test-watch-pid.c @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include "log.h" +#include "manager.h" +#include "rm-rf.h" +#include "test-helper.h" +#include "tests.h" + +int main(int argc, char *argv[]) { + _cleanup_(rm_rf_physical_and_freep) char *runtime_dir = NULL; + Unit *a, *b, *c, *u; + Manager *m; + int r; + + log_set_max_level(LOG_DEBUG); + log_parse_environment(); + log_open(); + + if (getuid() != 0) { + log_notice("Not running as root, skipping kernel related tests."); + return EXIT_TEST_SKIP; + } + + r = enter_cgroup_subroot(); + if (r == -ENOMEDIUM) { + log_notice("cgroupfs not available, skipping tests"); + return EXIT_TEST_SKIP; + } + + assert_se(set_unit_path(get_testdata_dir("")) >= 0); + assert_se(runtime_dir = setup_fake_runtime_dir()); + + assert_se(manager_new(UNIT_FILE_USER, true, &m) >= 0); + assert_se(manager_startup(m, NULL, NULL) >= 0); + + assert_se(a = unit_new(m, sizeof(Service))); + assert_se(unit_add_name(a, "a.service") >= 0); + assert_se(set_isempty(a->pids)); + + assert_se(b = unit_new(m, sizeof(Service))); + assert_se(unit_add_name(b, "b.service") >= 0); + assert_se(set_isempty(b->pids)); + + assert_se(c = unit_new(m, sizeof(Service))); + assert_se(unit_add_name(c, "c.service") >= 0); + assert_se(set_isempty(c->pids)); + + assert_se(hashmap_isempty(m->watch_pids)); + assert_se(manager_get_unit_by_pid(m, 4711) == NULL); + + assert_se(unit_watch_pid(a, 4711) >= 0); + assert_se(manager_get_unit_by_pid(m, 4711) == a); + + assert_se(unit_watch_pid(a, 4711) >= 0); + assert_se(manager_get_unit_by_pid(m, 4711) == a); + + assert_se(unit_watch_pid(b, 4711) >= 0); + u = manager_get_unit_by_pid(m, 4711); + assert_se(u == a || u == b); + + assert_se(unit_watch_pid(b, 4711) >= 0); + u = manager_get_unit_by_pid(m, 4711); + assert_se(u == a || u == b); + + assert_se(unit_watch_pid(c, 4711) >= 0); + u = manager_get_unit_by_pid(m, 4711); + assert_se(u == a || u == b || u == c); + + assert_se(unit_watch_pid(c, 4711) >= 0); + u = manager_get_unit_by_pid(m, 4711); + assert_se(u == a || u == b || u == c); + + unit_unwatch_pid(b, 4711); + u = manager_get_unit_by_pid(m, 4711); + assert_se(u == a || u == c); + + unit_unwatch_pid(b, 4711); + u = manager_get_unit_by_pid(m, 4711); + assert_se(u == a || u == c); + + unit_unwatch_pid(a, 4711); + assert_se(manager_get_unit_by_pid(m, 4711) == c); + + unit_unwatch_pid(a, 4711); + assert_se(manager_get_unit_by_pid(m, 4711) == c); + + unit_unwatch_pid(c, 4711); + assert_se(manager_get_unit_by_pid(m, 4711) == NULL); + + unit_unwatch_pid(c, 4711); + assert_se(manager_get_unit_by_pid(m, 4711) == NULL); + + manager_free(m); + + return 0; +} -- 2.7.4