From 6592b9759cae509b407a3b49603498468bf5d276 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 7 Feb 2018 22:52:52 +0100 Subject: [PATCH] core: add new new bus call for migrating foreign processes to scope/service units MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This adds a new bus call to service and scope units called AttachProcesses() that moves arbitrary processes into the cgroup of the unit. The primary user for this new API is systemd itself: the systemd --user instance uses this call of the systemd --system instance to migrate processes if itself gets the request to migrate processes and the kernel refuses this due to access restrictions. The primary use-case of this is to make "systemd-run --scope --user …" invoked from user session scopes work correctly on pure cgroupsv2 environments. There, the kernel refuses to migrate processes between two unprivileged-owned cgroups unless the requestor as well as the ownership of the closest parent cgroup all match. This however is not the case between the session-XYZ.scope unit of a login session and the user@ABC.service of the systemd --user instance. The new logic always tries to move the processes on its own, but if that doesn't work when being the user manager, then the system manager is asked to do it instead. The new operation is relatively restrictive: it will only allow to move the processes like this if the caller is root, or the UID of the target unit, caller and process all match. Note that this means that unprivileged users cannot attach processes to scope units, as those do not have "owning" users (i.e. they have now User= field). Fixes: #3388 --- src/core/cgroup.c | 145 +++++++++++++++++++++++++++++++-- src/core/cgroup.h | 3 +- src/core/dbus-manager.c | 21 +++++ src/core/dbus-scope.c | 32 ++++++-- src/core/dbus-unit.c | 113 +++++++++++++++++++++++++ src/core/dbus-unit.h | 1 + src/core/org.freedesktop.systemd1.conf | 16 ++++ src/core/scope.c | 2 +- src/core/unit.c | 28 +++++++ src/core/unit.h | 2 + 10 files changed, 346 insertions(+), 17 deletions(-) diff --git a/src/core/cgroup.c b/src/core/cgroup.c index d05e338..edb702c 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -24,6 +24,7 @@ #include "alloc-util.h" #include "blockdev-util.h" #include "bpf-firewall.h" +#include "bus-error.h" #include "cgroup-util.h" #include "cgroup.h" #include "fd-util.h" @@ -1303,13 +1304,12 @@ void unit_update_cgroup_members_masks(Unit *u) { } } -static const char *migrate_callback(CGroupMask mask, void *userdata) { - Unit *u = userdata; +const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask) { - assert(mask != 0); - assert(u); + /* Returns the realized cgroup path of the specified unit where all specified controllers are available. */ while (u) { + if (u->cgroup_path && u->cgroup_realized && (u->cgroup_realized_mask & mask) == mask) @@ -1321,6 +1321,10 @@ static const char *migrate_callback(CGroupMask mask, void *userdata) { return NULL; } +static const char *migrate_callback(CGroupMask mask, void *userdata) { + return unit_get_realized_cgroup_path(userdata, mask); +} + char *unit_default_cgroup_path(Unit *u) { _cleanup_free_ char *escaped = NULL, *slice = NULL; int r; @@ -1503,19 +1507,142 @@ static int unit_create_cgroup( return 0; } -int unit_attach_pids_to_cgroup(Unit *u) { +static int unit_attach_pid_to_cgroup_via_bus(Unit *u, pid_t pid, const char *suffix_path) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + char *pp; int r; + assert(u); - r = unit_realize_cgroup(u); + if (MANAGER_IS_SYSTEM(u->manager)) + return -EINVAL; + + if (!u->manager->system_bus) + return -EIO; + + if (!u->cgroup_path) + return -EINVAL; + + /* Determine this unit's cgroup path relative to our cgroup root */ + pp = path_startswith(u->cgroup_path, u->manager->cgroup_root); + if (!pp) + return -EINVAL; + + pp = strjoina("/", pp, suffix_path); + path_kill_slashes(pp); + + r = sd_bus_call_method(u->manager->system_bus, + "org.freedesktop.systemd1", + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + "AttachProcessesToUnit", + &error, NULL, + "ssau", + NULL /* empty unit name means client's unit, i.e. us */, pp, 1, (uint32_t) pid); if (r < 0) - return r; + return log_unit_debug_errno(u, r, "Failed to attach unit process " PID_FMT " via the bus: %s", pid, bus_error_message(&error, r)); + + return 0; +} + +int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) { + CGroupMask delegated_mask; + const char *p; + Iterator i; + void *pidp; + int r, q; + + assert(u); + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return -EINVAL; + + if (set_isempty(pids)) + return 0; - r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u); + r = unit_realize_cgroup(u); if (r < 0) return r; - return 0; + if (isempty(suffix_path)) + p = u->cgroup_path; + else + p = strjoina(u->cgroup_path, "/", suffix_path); + + delegated_mask = unit_get_delegate_mask(u); + + r = 0; + SET_FOREACH(pidp, pids, i) { + pid_t pid = PTR_TO_PID(pidp); + CGroupController c; + + /* First, attach the PID to the main cgroup hierarchy */ + q = cg_attach(SYSTEMD_CGROUP_CONTROLLER, p, pid); + if (q < 0) { + log_unit_debug_errno(u, q, "Couldn't move process " PID_FMT " to requested cgroup '%s': %m", pid, p); + + if (MANAGER_IS_USER(u->manager) && IN_SET(q, -EPERM, -EACCES)) { + int z; + + /* If we are in a user instance, and we can't move the process ourselves due to + * permission problems, let's ask the system instance about it instead. Since it's more + * privileged it might be able to move the process across the leaves of a subtree who's + * top node is not owned by us. */ + + z = unit_attach_pid_to_cgroup_via_bus(u, pid, suffix_path); + if (z < 0) + log_unit_debug_errno(u, z, "Couldn't move process " PID_FMT " to requested cgroup '%s' via the system bus either: %m", pid, p); + else + continue; /* When the bus thing worked via the bus we are fully done for this PID. */ + } + + if (r >= 0) + r = q; /* Remember first error */ + + continue; + } + + q = cg_all_unified(); + if (q < 0) + return q; + if (q > 0) + continue; + + /* In the legacy hierarchy, attach the process to the request cgroup if possible, and if not to the + * innermost realized one */ + + for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) { + CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c); + const char *realized; + + if (!(u->manager->cgroup_supported & bit)) + continue; + + /* If this controller is delegated and realized, honour the caller's request for the cgroup suffix. */ + if (delegated_mask & u->cgroup_realized_mask & bit) { + q = cg_attach(cgroup_controller_to_string(c), p, pid); + if (q >= 0) + continue; /* Success! */ + + log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to requested cgroup %s in controller %s, falling back to unit's cgroup: %m", + pid, p, cgroup_controller_to_string(c)); + } + + /* So this controller is either not delegate or realized, or something else weird happened. In + * that case let's attach the PID at least to the closest cgroup up the tree that is + * realized. */ + realized = unit_get_realized_cgroup_path(u, bit); + if (!realized) + continue; /* Not even realized in the root slice? Then let's not bother */ + + q = cg_attach(cgroup_controller_to_string(c), realized, pid); + if (q < 0) + log_unit_debug_errno(u, q, "Failed to attach PID " PID_FMT " to realized cgroup %s in controller %s, ignoring: %m", + pid, realized, cgroup_controller_to_string(c)); + } + } + + return r; } static void cgroup_xattr_apply(Unit *u) { diff --git a/src/core/cgroup.h b/src/core/cgroup.h index 5113313..e2e875d 100644 --- a/src/core/cgroup.h +++ b/src/core/cgroup.h @@ -167,6 +167,7 @@ bool unit_get_needs_bpf(Unit *u); void unit_update_cgroup_members_masks(Unit *u); +const char *unit_get_realized_cgroup_path(Unit *u, CGroupMask mask); char *unit_default_cgroup_path(Unit *u); int unit_set_cgroup_path(Unit *u, const char *path); int unit_pick_cgroup_path(Unit *u); @@ -178,7 +179,7 @@ int unit_watch_cgroup(Unit *u); void unit_add_to_cgroup_empty_queue(Unit *u); -int unit_attach_pids_to_cgroup(Unit *u); +int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path); int manager_setup_cgroup(Manager *m); void manager_shutdown_cgroup(Manager *m, bool delete); diff --git a/src/core/dbus-manager.c b/src/core/dbus-manager.c index 945645c..889779d 100644 --- a/src/core/dbus-manager.c +++ b/src/core/dbus-manager.c @@ -863,6 +863,26 @@ static int method_get_unit_processes(sd_bus_message *message, void *userdata, sd return bus_unit_method_get_processes(message, u, error); } +static int method_attach_processes_to_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Manager *m = userdata; + const char *name; + Unit *u; + int r; + + assert(message); + assert(m); + + r = sd_bus_message_read(message, "s", &name); + if (r < 0) + return r; + + r = bus_get_unit_by_name(m, message, name, &u, error); + if (r < 0) + return r; + + return bus_unit_method_attach_processes(message, u, error); +} + static int transient_unit_from_message( Manager *m, sd_bus_message *message, @@ -2504,6 +2524,7 @@ const sd_bus_vtable bus_manager_vtable[] = { SD_BUS_METHOD("UnrefUnit", "s", NULL, method_unref_unit, SD_BUS_VTABLE_UNPRIVILEGED), SD_BUS_METHOD("StartTransientUnit", "ssa(sv)a(sa(sv))", "o", method_start_transient_unit, SD_BUS_VTABLE_UNPRIVILEGED), SD_BUS_METHOD("GetUnitProcesses", "s", "a(sus)", method_get_unit_processes, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("AttachProcessesToUnit", "ssau", NULL, method_attach_processes_to_unit, SD_BUS_VTABLE_UNPRIVILEGED), SD_BUS_METHOD("GetJob", "u", "o", method_get_job, SD_BUS_VTABLE_UNPRIVILEGED), SD_BUS_METHOD("GetJobAfter", "u", "a(usssoo)", method_get_job_waiting, SD_BUS_VTABLE_UNPRIVILEGED), SD_BUS_METHOD("GetJobBefore", "u", "a(usssoo)", method_get_job_waiting, SD_BUS_VTABLE_UNPRIVILEGED), diff --git a/src/core/dbus-scope.c b/src/core/dbus-scope.c index a0c4a65..4d9ced8 100644 --- a/src/core/dbus-scope.c +++ b/src/core/dbus-scope.c @@ -89,17 +89,39 @@ static int bus_scope_set_transient_property( return bus_set_transient_usec(UNIT(s), name, &s->timeout_stop_usec, message, flags, error); if (streq(name, "PIDs")) { + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; unsigned n = 0; - uint32_t pid; r = sd_bus_message_enter_container(message, 'a', "u"); if (r < 0) return r; - while ((r = sd_bus_message_read(message, "u", &pid)) > 0) { + for (;;) { + uint32_t upid; + pid_t pid; - if (pid <= 1) - return -EINVAL; + r = sd_bus_message_read(message, "u", &upid); + if (r < 0) + return r; + if (r == 0) + break; + + if (upid == 0) { + if (!creds) { + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds); + if (r < 0) + return r; + } + + r = sd_bus_creds_get_pid(creds, &pid); + if (r < 0) + return r; + } else + pid = (uid_t) upid; + + r = unit_pid_attachable(UNIT(s), pid, error); + if (r < 0) + return r; if (!UNIT_WRITE_FLAGS_NOOP(flags)) { r = unit_watch_pid(UNIT(s), pid); @@ -109,8 +131,6 @@ static int bus_scope_set_transient_property( n++; } - if (r < 0) - return r; r = sd_bus_message_exit_container(message); if (r < 0) diff --git a/src/core/dbus-unit.c b/src/core/dbus-unit.c index 7085eee..50a5ab9 100644 --- a/src/core/dbus-unit.c +++ b/src/core/dbus-unit.c @@ -1127,6 +1127,118 @@ static int property_get_ip_counter( return sd_bus_message_append(reply, "t", value); } +int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd_bus_error *error) { + + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + _cleanup_(set_freep) Set *pids = NULL; + Unit *u = userdata; + const char *path; + int r; + + assert(message); + + /* This migrates the processes with the specified PIDs into the cgroup of this unit, optionally below a + * specified cgroup path. Obviously this only works for units that actually maintain a cgroup + * representation. If a process is already in the cgroup no operation is executed – in this case the specified + * subcgroup path has no effect! */ + + r = mac_selinux_unit_access_check(u, message, "start", error); + if (r < 0) + return r; + + r = sd_bus_message_read(message, "s", &path); + if (r < 0) + return r; + + path = empty_to_null(path); + if (path) { + if (!path_is_absolute(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Control group path is not absolute: %s", path); + + if (!path_is_normalized(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Control group path is not normalized: %s", path); + } + + if (!unit_cgroup_delegate(u)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Process migration not available on non-delegated units."); + + if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(u))) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unit is not active, refusing."); + + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID|SD_BUS_CREDS_PID, &creds); + if (r < 0) + return r; + + r = sd_bus_message_enter_container(message, 'a', "u"); + if (r < 0) + return r; + for (;;) { + uid_t process_uid, sender_uid; + uint32_t upid; + pid_t pid; + + r = sd_bus_message_read(message, "u", &upid); + if (r < 0) + return r; + if (r == 0) + break; + + if (upid == 0) { + r = sd_bus_creds_get_pid(creds, &pid); + if (r < 0) + return r; + } else + pid = (uid_t) upid; + + /* Filter out duplicates */ + if (set_contains(pids, PID_TO_PTR(pid))) + continue; + + /* Check if this process is suitable for attaching to this unit */ + r = unit_pid_attachable(u, pid, error); + if (r < 0) + return r; + + /* Let's query the sender's UID, so that we can make our security decisions */ + r = sd_bus_creds_get_euid(creds, &sender_uid); + if (r < 0) + return r; + + /* Let's validate security: if the sender is root, then all is OK. If the sender is is any other unit, + * then the process' UID and the target unit's UID have to match the sender's UID */ + if (sender_uid != 0 && sender_uid != getuid()) { + r = get_process_uid(pid, &process_uid); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to retrieve process UID: %m"); + + if (process_uid != sender_uid) + return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Process " PID_FMT " not owned by client's UID. Refusing.", pid); + if (process_uid != u->ref_uid) + return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Process " PID_FMT " not owned by target unit's UID. Refusing.", pid); + } + + if (!pids) { + pids = set_new(NULL); + if (!pids) + return -ENOMEM; + } + + r = set_put(pids, PID_TO_PTR(pid)); + if (r < 0) + return r; + } + + r = sd_bus_message_exit_container(message); + if (r < 0) + return r; + + r = unit_attach_pids_to_cgroup(u, pids, path); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to attach processes to control group: %m"); + + return sd_bus_reply_method_return(message, NULL); +} + const sd_bus_vtable bus_unit_cgroup_vtable[] = { SD_BUS_VTABLE_START(0), SD_BUS_PROPERTY("Slice", "s", property_get_slice, 0, 0), @@ -1139,6 +1251,7 @@ const sd_bus_vtable bus_unit_cgroup_vtable[] = { SD_BUS_PROPERTY("IPEgressBytes", "t", property_get_ip_counter, 0, 0), SD_BUS_PROPERTY("IPEgressPackets", "t", property_get_ip_counter, 0, 0), SD_BUS_METHOD("GetProcesses", NULL, "a(sus)", bus_unit_method_get_processes, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD("AttachProcesses", "sau", NULL, bus_unit_method_attach_processes, SD_BUS_VTABLE_UNPRIVILEGED), SD_BUS_VTABLE_END }; diff --git a/src/core/dbus-unit.h b/src/core/dbus-unit.h index d7066ee..bb4e43f 100644 --- a/src/core/dbus-unit.h +++ b/src/core/dbus-unit.h @@ -37,6 +37,7 @@ int bus_unit_method_reset_failed(sd_bus_message *message, void *userdata, sd_bus int bus_unit_set_properties(Unit *u, sd_bus_message *message, UnitWriteFlags flags, bool commit, sd_bus_error *error); int bus_unit_method_set_properties(sd_bus_message *message, void *userdata, sd_bus_error *error); int bus_unit_method_get_processes(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd_bus_error *error); int bus_unit_method_ref(sd_bus_message *message, void *userdata, sd_bus_error *error); int bus_unit_method_unref(sd_bus_message *message, void *userdata, sd_bus_error *error); diff --git a/src/core/org.freedesktop.systemd1.conf b/src/core/org.freedesktop.systemd1.conf index a97edac..645c8f1 100644 --- a/src/core/org.freedesktop.systemd1.conf +++ b/src/core/org.freedesktop.systemd1.conf @@ -236,6 +236,10 @@ + + + + + + + + + + diff --git a/src/core/scope.c b/src/core/scope.c index 47c47bc..5b9c2bb 100644 --- a/src/core/scope.c +++ b/src/core/scope.c @@ -343,7 +343,7 @@ static int scope_start(Unit *u) { unit_export_state_files(UNIT(s)); - r = unit_attach_pids_to_cgroup(u); + r = unit_attach_pids_to_cgroup(u, UNIT(s)->pids, NULL); if (r < 0) { log_unit_warning_errno(UNIT(s), r, "Failed to add PIDs to scope's control group: %m"); scope_enter_dead(s, SCOPE_FAILURE_RESOURCES); diff --git a/src/core/unit.c b/src/core/unit.c index 47a06e4..26d97fa 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -5362,6 +5362,34 @@ const char *unit_label_path(Unit *u) { return p; } +int unit_pid_attachable(Unit *u, pid_t pid, sd_bus_error *error) { + int r; + + assert(u); + + /* Checks whether the specified PID is generally good for attaching, i.e. a valid PID, not our manager itself, + * and not a kernel thread either */ + + /* First, a simple range check */ + if (!pid_is_valid(pid)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Process identifier " PID_FMT " is not valid.", pid); + + /* Some extra safety check */ + if (pid == 1 || pid == getpid_cached()) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Process " PID_FMT " is a manager processs, refusing.", pid); + + /* Don't even begin to bother with kernel threads */ + r = is_kernel_thread(pid); + if (r == -ESRCH) + return sd_bus_error_setf(error, SD_BUS_ERROR_UNIX_PROCESS_ID_UNKNOWN, "Process with ID " PID_FMT " does not exist.", pid); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to determine whether process " PID_FMT " is a kernel thread: %m", pid); + if (r > 0) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Process " PID_FMT " is a kernel thread, refusing.", pid); + + return 0; +} + static const char* const collect_mode_table[_COLLECT_MODE_MAX] = { [COLLECT_INACTIVE] = "inactive", [COLLECT_INACTIVE_OR_FAILED] = "inactive-or-failed", diff --git a/src/core/unit.h b/src/core/unit.h index 6d49c7d..617f044 100644 --- a/src/core/unit.h +++ b/src/core/unit.h @@ -806,6 +806,8 @@ bool unit_needs_console(Unit *u); const char *unit_label_path(Unit *u); +int unit_pid_attachable(Unit *unit, pid_t pid, sd_bus_error *error); + /* Macros which append UNIT= or USER_UNIT= to the message */ #define log_unit_full(unit, level, error, ...) \ -- 2.7.4