From 50be4f4a4655ac76b893dbe9da82a1710ac7ed0d Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 31 May 2018 15:41:59 +0200 Subject: [PATCH] core: rework how we track service and scope PIDs This reworks how systemd tracks processes on cgroupv1 systems where cgroup notification is not reliable. Previously, whenever we had reason to believe that new processes showed up or got removed we'd scan the cgroup of the scope or service unit for new processes, and would tidy up the list of PIDs previously watched. This scanning is relatively slow, and does not scale well. With this change behaviour is changed: instead of scanning for new/removed processes right away we do this work in a per-unit deferred event loop job. This event source is scheduled at a very low priority, so that it is executed when we have time but does not starve other event sources. This has two benefits: this expensive work is coalesced, if events happen in quick succession, and we won't delay SIGCHLD handling for too long. This patch basically replaces all direct invocation of unit_watch_all_pids() in scope.c and service.c with invocations of the new unit_enqueue_rewatch_pids() call which just enqueues a request of watching/tidying up the PID sets (with one exception: in scope_enter_signal() and service_enter_signal() we'll still do unit_watch_all_pids() synchronously first, since we really want to know all processes we are about to kill so that we can track them properly. Moreover, all direct invocations of unit_tidy_watch_pids() and unit_synthesize_cgroup_empty_event() are removed too, when the unit_enqueue_rewatch_pids() call is invoked, as the queued job will run those operations too. All of this is done on cgroupsv1 systems only, and is disabled on cgroupsv2 systems as cgroup-empty notifications are reliable there, and we do not need SIGCHLD events to track processes there. Fixes: #9138 --- src/core/scope.c | 29 ++++++++++---------- src/core/service.c | 34 ++++++++++++----------- src/core/unit.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- src/core/unit.h | 7 ++++- 4 files changed, 115 insertions(+), 34 deletions(-) diff --git a/src/core/scope.c b/src/core/scope.c index 7410f15..0d22c75 100644 --- a/src/core/scope.c +++ b/src/core/scope.c @@ -92,8 +92,10 @@ static void scope_set_state(Scope *s, ScopeState state) { if (!IN_SET(state, SCOPE_STOP_SIGTERM, SCOPE_STOP_SIGKILL)) s->timer_event_source = sd_event_source_unref(s->timer_event_source); - if (IN_SET(state, SCOPE_DEAD, SCOPE_FAILED)) + if (IN_SET(state, SCOPE_DEAD, SCOPE_FAILED)) { unit_unwatch_all_pids(UNIT(s)); + unit_dequeue_rewatch_pids(UNIT(s)); + } if (state != old_state) log_debug("%s changed %s -> %s", UNIT(s)->id, scope_state_to_string(old_state), scope_state_to_string(state)); @@ -212,7 +214,7 @@ static int scope_coldplug(Unit *u) { } if (!IN_SET(s->deserialized_state, SCOPE_DEAD, SCOPE_FAILED)) - unit_watch_all_pids(UNIT(s)); + (void) unit_enqueue_rewatch_pids(u); bus_scope_track_controller(s); @@ -257,7 +259,12 @@ static void scope_enter_signal(Scope *s, ScopeState state, ScopeResult f) { if (s->result == SCOPE_SUCCESS) s->result = f; - unit_watch_all_pids(UNIT(s)); + /* Before sending any signal, make sure we track all members of this cgroup */ + (void) unit_watch_all_pids(UNIT(s)); + + /* Also, enqueue a job that we recheck all our PIDs a bit later, given that it's likely some processes have + * died now */ + (void) unit_enqueue_rewatch_pids(UNIT(s)); /* If we have a controller set let's ask the controller nicely to terminate the scope, instead of us going * directly into SIGTERM berserk mode */ @@ -455,17 +462,13 @@ static void scope_notify_cgroup_empty_event(Unit *u) { } static void scope_sigchld_event(Unit *u, pid_t pid, int code, int status) { - assert(u); /* If we get a SIGCHLD event for one of the processes we were interested in, then we look for others to * watch, under the assumption that we'll sooner or later get a SIGCHLD for them, as the original * process we watched was probably the parent of them, and they are hence now our children. */ - unit_tidy_watch_pids(u, 0, 0); - unit_watch_all_pids(u); - /* If the PID set is empty now, then let's finish this off. */ - unit_synthesize_cgroup_empty_event(u); + (void) unit_enqueue_rewatch_pids(u); } static int scope_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) { @@ -515,13 +518,9 @@ int scope_abandon(Scope *s) { scope_set_state(s, SCOPE_ABANDONED); - /* The client is no longer watching the remaining processes, - * so let's step in here, under the assumption that the - * remaining processes will be sooner or later reassigned to - * us as parent. */ - - unit_tidy_watch_pids(UNIT(s), 0, 0); - unit_watch_all_pids(UNIT(s)); + /* The client is no longer watching the remaining processes, so let's step in here, under the assumption that + * the remaining processes will be sooner or later reassigned to us as parent. */ + (void) unit_enqueue_rewatch_pids(UNIT(s)); return 0; } diff --git a/src/core/service.c b/src/core/service.c index bd62fbc..db0c1dc 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -1058,8 +1058,10 @@ static void service_set_state(Service *s, ServiceState state) { s->control_command_id = _SERVICE_EXEC_COMMAND_INVALID; } - if (IN_SET(state, SERVICE_DEAD, SERVICE_FAILED, SERVICE_AUTO_RESTART)) + if (IN_SET(state, SERVICE_DEAD, SERVICE_FAILED, SERVICE_AUTO_RESTART)) { unit_unwatch_all_pids(UNIT(s)); + unit_dequeue_rewatch_pids(UNIT(s)); + } if (!IN_SET(state, SERVICE_START_PRE, SERVICE_START, SERVICE_START_POST, @@ -1154,17 +1156,15 @@ static int service_coldplug(Unit *u) { return r; } - if (!IN_SET(s->deserialized_state, SERVICE_DEAD, SERVICE_FAILED, SERVICE_AUTO_RESTART)) - unit_watch_all_pids(UNIT(s)); - - if (IN_SET(s->deserialized_state, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD)) - service_start_watchdog(s); - if (!IN_SET(s->deserialized_state, SERVICE_DEAD, SERVICE_FAILED, SERVICE_AUTO_RESTART)) { + (void) unit_enqueue_rewatch_pids(u); (void) unit_setup_dynamic_creds(u); (void) unit_setup_exec_runtime(u); } + if (IN_SET(s->deserialized_state, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD)) + service_start_watchdog(s); + if (UNIT_ISSET(s->accept_socket)) { Socket* socket = SOCKET(UNIT_DEREF(s->accept_socket)); @@ -1679,7 +1679,8 @@ static void service_enter_stop_post(Service *s, ServiceResult f) { s->result = f; service_unwatch_control_pid(s); - unit_watch_all_pids(UNIT(s)); + + (void) unit_enqueue_rewatch_pids(UNIT(s)); s->control_command = s->exec_command[SERVICE_EXEC_STOP_POST]; if (s->control_command) { @@ -1731,7 +1732,12 @@ static void service_enter_signal(Service *s, ServiceState state, ServiceResult f if (s->result == SERVICE_SUCCESS) s->result = f; - unit_watch_all_pids(UNIT(s)); + /* Before sending any signal, make sure we track all members of this cgroup */ + (void) unit_watch_all_pids(UNIT(s)); + + /* Also, enqueue a job that we recheck all our PIDs a bit later, given that it's likely some processes have + * died now */ + (void) unit_enqueue_rewatch_pids(UNIT(s)); r = unit_kill_context( UNIT(s), @@ -1772,7 +1778,7 @@ fail: static void service_enter_stop_by_notify(Service *s) { assert(s); - unit_watch_all_pids(UNIT(s)); + (void) unit_enqueue_rewatch_pids(UNIT(s)); service_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), s->timeout_stop_usec)); @@ -1789,7 +1795,7 @@ static void service_enter_stop(Service *s, ServiceResult f) { s->result = f; service_unwatch_control_pid(s); - unit_watch_all_pids(UNIT(s)); + (void) unit_enqueue_rewatch_pids(UNIT(s)); s->control_command = s->exec_command[SERVICE_EXEC_STOP]; if (s->control_command) { @@ -3290,11 +3296,7 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) { /* If we get a SIGCHLD event for one of the processes we were interested in, then we look for others to watch, * under the assumption that we'll sooner or later get a SIGCHLD for them, as the original process we watched * was probably the parent of them, and they are hence now our children. */ - unit_tidy_watch_pids(u, s->main_pid, s->control_pid); - unit_watch_all_pids(u); - - /* If the PID set is empty now, then let's check if the cgroup is empty too and finish off the unit. */ - unit_synthesize_cgroup_empty_event(u); + (void) unit_enqueue_rewatch_pids(u); } static int service_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata) { diff --git a/src/core/unit.c b/src/core/unit.c index bfe6336..7265f95 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -567,8 +567,9 @@ void unit_free(Unit *u) { unit_done(u); - sd_bus_slot_unref(u->match_bus_slot); + unit_dequeue_rewatch_pids(u); + sd_bus_slot_unref(u->match_bus_slot); sd_bus_track_unref(u->bus_track); u->deserialized_refs = strv_free(u->deserialized_refs); @@ -2605,7 +2606,8 @@ void unit_unwatch_all_pids(Unit *u) { u->pids = set_free(u->pids); } -void unit_tidy_watch_pids(Unit *u, pid_t except1, pid_t except2) { +static void unit_tidy_watch_pids(Unit *u) { + pid_t except1, except2; Iterator i; void *e; @@ -2613,6 +2615,9 @@ void unit_tidy_watch_pids(Unit *u, pid_t except1, pid_t except2) { /* Cleans dead PIDs from our list */ + except1 = unit_main_pid(u); + except2 = unit_control_pid(u); + SET_FOREACH(e, u->pids, i) { pid_t pid = PTR_TO_PID(e); @@ -2624,6 +2629,76 @@ void unit_tidy_watch_pids(Unit *u, pid_t except1, pid_t except2) { } } +static int on_rewatch_pids_event(sd_event_source *s, void *userdata) { + Unit *u = userdata; + + assert(s); + assert(u); + + unit_tidy_watch_pids(u); + unit_watch_all_pids(u); + + /* If the PID set is empty now, then let's finish this off. */ + unit_synthesize_cgroup_empty_event(u); + + return 0; +} + +int unit_enqueue_rewatch_pids(Unit *u) { + int r; + + assert(u); + + if (!u->cgroup_path) + return -ENOENT; + + r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); + if (r < 0) + return r; + if (r > 0) /* On unified we can use proper notifications */ + return 0; + + /* Enqueues a low-priority job that will clean up dead PIDs from our list of PIDs to watch and subscribe to new + * PIDs that might have appeared. We do this in a delayed job because the work might be quite slow, as it + * involves issuing kill(pid, 0) on all processes we watch. */ + + if (!u->rewatch_pids_event_source) { + _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL; + + r = sd_event_add_defer(u->manager->event, &s, on_rewatch_pids_event, u); + if (r < 0) + return log_error_errno(r, "Failed to allocate event source for tidying watched PIDs: %m"); + + r = sd_event_source_set_priority(s, SD_EVENT_PRIORITY_IDLE); + if (r < 0) + return log_error_errno(r, "Failed to adjust priority of event source for tidying watched PIDs: m"); + + (void) sd_event_source_set_description(s, "tidy-watch-pids"); + + u->rewatch_pids_event_source = TAKE_PTR(s); + } + + r = sd_event_source_set_enabled(u->rewatch_pids_event_source, SD_EVENT_ONESHOT); + if (r < 0) + return log_error_errno(r, "Failed to enable event source for tidying watched PIDs: %m"); + + return 0; +} + +void unit_dequeue_rewatch_pids(Unit *u) { + int r; + assert(u); + + if (!u->rewatch_pids_event_source) + return; + + r = sd_event_source_set_enabled(u->rewatch_pids_event_source, SD_EVENT_OFF); + if (r < 0) + log_warning_errno(r, "Failed to disable event source for tidying watched PIDs, ignoring: %m"); + + u->rewatch_pids_event_source = sd_event_source_unref(u->rewatch_pids_event_source); +} + bool unit_job_is_applicable(Unit *u, JobType j) { assert(u); assert(j >= 0 && j < _JOB_TYPE_MAX); diff --git a/src/core/unit.h b/src/core/unit.h index f1b5d61..76270a0 100644 --- a/src/core/unit.h +++ b/src/core/unit.h @@ -279,6 +279,10 @@ typedef struct Unit { uint64_t ip_accounting_extra[_CGROUP_IP_ACCOUNTING_METRIC_MAX]; + /* Low-priority event source which is used to remove watched PIDs that have gone away, and subscribe to any new + * ones which might have appeared. */ + sd_event_source *rewatch_pids_event_source; + /* How to start OnFailure units */ JobMode on_failure_job_mode; @@ -650,7 +654,8 @@ int unit_watch_pid(Unit *u, pid_t pid); void unit_unwatch_pid(Unit *u, pid_t pid); void unit_unwatch_all_pids(Unit *u); -void unit_tidy_watch_pids(Unit *u, pid_t except1, pid_t except2); +int unit_enqueue_rewatch_pids(Unit *u); +void unit_dequeue_rewatch_pids(Unit *u); int unit_install_bus_match(Unit *u, sd_bus *bus, const char *name); int unit_watch_bus_name(Unit *u, const char *name); -- 2.7.4