From 5686391b006ee82d8a4559067ad9818e3e631247 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 17 Jul 2018 11:47:14 +0200 Subject: [PATCH] core: introduce new Type=exec service type Users are often surprised that "systemd-run" command lines like "systemd-run -p User=idontexist /bin/true" will return successfully, even though the logs show that the process couldn't be invoked, as the user "idontexist" doesn't exist. This is because Type=simple will only wait until fork() succeeded before returning start-up success. This patch adds a new service type Type=exec, which is very similar to Type=simple, but waits until the child process completed the execve() before returning success. It uses a pipe that has O_CLOEXEC set for this logic, so that the kernel automatically sends POLLHUP on it when the execve() succeeded but leaves the pipe open if not. This means PID 1 waits exactly until the execve() succeeded in the child, and not longer and not shorter, which is the desired functionality. Making use of this new functionality, the command line "systemd-run -p User=idontexist -p Type=exec /bin/true" will now fail, as expected. --- src/core/execute.c | 89 ++++++++++++++++++++++++---- src/core/execute.h | 3 + src/core/mount.c | 9 +-- src/core/service.c | 167 +++++++++++++++++++++++++++++++++++++++++++++++++---- src/core/service.h | 4 ++ src/core/socket.c | 9 +-- src/core/swap.c | 1 + 7 files changed, 254 insertions(+), 28 deletions(-) diff --git a/src/core/execute.c b/src/core/execute.c index 8c5139d..0f36f6c 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -2573,6 +2573,7 @@ static int close_remaining_fds( const DynamicCreds *dcreds, int user_lookup_fd, int socket_fd, + int exec_fd, int *fds, size_t n_fds) { size_t n_dont_close = 0; @@ -2589,6 +2590,8 @@ static int close_remaining_fds( if (socket_fd >= 0) dont_close[n_dont_close++] = socket_fd; + if (exec_fd >= 0) + dont_close[n_dont_close++] = exec_fd; if (n_fds > 0) { memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds); n_dont_close += n_fds; @@ -2732,9 +2735,10 @@ static int exec_child( int *exit_status) { _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL; - _cleanup_free_ char *home_buffer = NULL; + int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1; _cleanup_free_ gid_t *supplementary_gids = NULL; const char *username = NULL, *groupname = NULL; + _cleanup_free_ char *home_buffer = NULL; const char *home = NULL, *shell = NULL; dev_t journal_stream_dev = 0; ino_t journal_stream_ino = 0; @@ -2754,7 +2758,6 @@ static int exec_child( #endif uid_t uid = UID_INVALID; gid_t gid = GID_INVALID; - int r, ngids = 0; size_t n_fds; ExecDirectoryType dt; int secure_bits; @@ -2798,8 +2801,8 @@ static int exec_child( /* In case anything used libc syslog(), close this here, too */ closelog(); - n_fds = n_storage_fds + n_socket_fds; - r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, fds, n_fds); + n_fds = n_socket_fds + n_storage_fds; + r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds); if (r < 0) { *exit_status = EXIT_FDS; return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m"); @@ -3172,9 +3175,45 @@ static int exec_child( } /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are - * more aggressive this time since socket_fd and the netns fds we don't need anymore. The custom endpoint fd - * was needed to upload the policy and can now be closed as well. */ - r = close_all_fds(fds, n_fds); + * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd + * however if we have it as we want to keep it open until the final execve(). */ + + if (params->exec_fd >= 0) { + exec_fd = params->exec_fd; + + if (exec_fd < 3 + (int) n_fds) { + int moved_fd; + + /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the + * process we are about to execute. */ + + moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds); + if (moved_fd < 0) { + *exit_status = EXIT_FDS; + return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m"); + } + + safe_close(exec_fd); + exec_fd = moved_fd; + } else { + /* This fd should be FD_CLOEXEC already, but let's make sure. */ + r = fd_cloexec(exec_fd, true); + if (r < 0) { + *exit_status = EXIT_FDS; + return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m"); + } + } + + fds_with_exec_fd = newa(int, n_fds + 1); + memcpy(fds_with_exec_fd, fds, n_fds * sizeof(int)); + fds_with_exec_fd[n_fds] = exec_fd; + n_fds_with_exec_fd = n_fds + 1; + } else { + fds_with_exec_fd = fds; + n_fds_with_exec_fd = n_fds; + } + + r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd); if (r >= 0) r = shift_fds(fds, n_fds); if (r >= 0) @@ -3184,6 +3223,11 @@ static int exec_child( return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m"); } + /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off + * and at the right fd numbers. The are no other fds open, with one exception: the exec_fd if it is defined, + * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we + * came this far. */ + secure_bits = context->secure_bits; if (needs_sandboxing) { @@ -3414,10 +3458,35 @@ static int exec_child( LOG_UNIT_INVOCATION_ID(unit)); } + if (exec_fd >= 0) { + uint8_t hot = 1; + + /* We have finished with all our initializations. Let's now let the manager know that. From this point + * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */ + + if (write(exec_fd, &hot, sizeof(hot)) < 0) { + *exit_status = EXIT_EXEC; + return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m"); + } + } + execve(command->path, final_argv, accum_env); + r = -errno; + + if (exec_fd >= 0) { + uint8_t hot = 0; + + /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager + * that POLLHUP on it no longer means execve() succeeded. */ + + if (write(exec_fd, &hot, sizeof(hot)) < 0) { + *exit_status = EXIT_EXEC; + return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m"); + } + } - if (errno == ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) { - log_struct_errno(LOG_INFO, errno, + if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) { + log_struct_errno(LOG_INFO, r, "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR, LOG_UNIT_ID(unit), LOG_UNIT_INVOCATION_ID(unit), @@ -3428,7 +3497,7 @@ static int exec_child( } *exit_status = EXIT_EXEC; - return log_unit_error_errno(unit, errno, "Failed to execute command: %m"); + return log_unit_error_errno(unit, r, "Failed to execute command: %m"); } static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l); diff --git a/src/core/execute.h b/src/core/execute.h index aa6f1cc..cd5165c 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -317,6 +317,9 @@ struct ExecParameters { int stdin_fd; int stdout_fd; int stderr_fd; + + /* An fd that is closed by the execve(), and thus will result in EOF when the execve() is done */ + int exec_fd; }; #include "unit.h" diff --git a/src/core/mount.c b/src/core/mount.c index 21437da..16229d4 100644 --- a/src/core/mount.c +++ b/src/core/mount.c @@ -747,10 +747,11 @@ static void mount_dump(Unit *u, FILE *f, const char *prefix) { static int mount_spawn(Mount *m, ExecCommand *c, pid_t *_pid) { ExecParameters exec_params = { - .flags = EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN, - .stdin_fd = -1, - .stdout_fd = -1, - .stderr_fd = -1, + .flags = EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN, + .stdin_fd = -1, + .stdout_fd = -1, + .stderr_fd = -1, + .exec_fd = -1, }; pid_t pid; int r; diff --git a/src/core/service.c b/src/core/service.c index fd64503..6abf87c 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -79,9 +79,10 @@ static const UnitActiveState state_translation_table_idle[_SERVICE_STATE_MAX] = [SERVICE_AUTO_RESTART] = UNIT_ACTIVATING }; -static int service_dispatch_io(sd_event_source *source, int fd, uint32_t events, void *userdata); +static int service_dispatch_inotify_io(sd_event_source *source, int fd, uint32_t events, void *userdata); static int service_dispatch_timer(sd_event_source *source, usec_t usec, void *userdata); static int service_dispatch_watchdog(sd_event_source *source, usec_t usec, void *userdata); +static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t events, void *userdata); static void service_enter_signal(Service *s, ServiceState state, ServiceResult f); static void service_enter_reload_by_notify(Service *s); @@ -389,6 +390,7 @@ static void service_done(Unit *u) { service_stop_watchdog(s); s->timer_event_source = sd_event_source_unref(s->timer_event_source); + s->exec_fd_event_source = sd_event_source_unref(s->exec_fd_event_source); service_release_resources(u); } @@ -1066,6 +1068,9 @@ static void service_set_state(Service *s, ServiceState state) { !(state == SERVICE_DEAD && UNIT(s)->job)) service_close_socket_fd(s); + if (state != SERVICE_START) + s->exec_fd_event_source = sd_event_source_unref(s->exec_fd_event_source); + if (!IN_SET(state, SERVICE_START_POST, SERVICE_RUNNING, SERVICE_RELOAD)) service_stop_watchdog(s); @@ -1296,6 +1301,63 @@ static int service_collect_fds( return 0; } +static int service_allocate_exec_fd_event_source( + Service *s, + int fd, + sd_event_source **ret_event_source) { + + _cleanup_(sd_event_source_unrefp) sd_event_source *source = NULL; + int r; + + assert(s); + assert(fd >= 0); + assert(ret_event_source); + + r = sd_event_add_io(UNIT(s)->manager->event, &source, fd, 0, service_dispatch_exec_io, s); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to allocate exec_fd event source: %m"); + + /* This is a bit lower priority than SIGCHLD, as that carries a lot more interesting failure information */ + + r = sd_event_source_set_priority(source, SD_EVENT_PRIORITY_NORMAL-3); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to adjust priority of exec_fd event source: %m"); + + (void) sd_event_source_set_description(source, "service event_fd"); + + r = sd_event_source_set_io_fd_own(source, true); + if (r < 0) + return log_unit_error_errno(UNIT(s), r, "Failed to pass ownership of fd to event source: %m"); + + *ret_event_source = TAKE_PTR(source); + return 0; +} + +static int service_allocate_exec_fd( + Service *s, + sd_event_source **ret_event_source, + int* ret_exec_fd) { + + _cleanup_close_pair_ int p[2] = { -1, -1 }; + int r; + + assert(s); + assert(ret_event_source); + assert(ret_exec_fd); + + if (pipe2(p, O_CLOEXEC|O_NONBLOCK) < 0) + return log_unit_error_errno(UNIT(s), errno, "Failed to allocate exec_fd pipe: %m"); + + r = service_allocate_exec_fd_event_source(s, p[0], ret_event_source); + if (r < 0) + return r; + + p[0] = -1; + *ret_exec_fd = TAKE_FD(p[1]); + + return 0; +} + static bool service_exec_needs_notify_socket(Service *s, ExecFlags flags) { assert(s); @@ -1330,7 +1392,9 @@ static int service_spawn( .exec_fd = -1, }; _cleanup_strv_free_ char **final_env = NULL, **our_env = NULL, **fd_names = NULL; + _cleanup_(sd_event_source_unrefp) sd_event_source *exec_fd_source = NULL; size_t n_socket_fds = 0, n_storage_fds = 0, n_env = 0; + _cleanup_close_ int exec_fd = -1; _cleanup_free_ int *fds = NULL; pid_t pid; int r; @@ -1363,6 +1427,14 @@ static int service_spawn( log_unit_debug(UNIT(s), "Passing %zu fds to service", n_socket_fds + n_storage_fds); } + if (!FLAGS_SET(flags, EXEC_IS_CONTROL) && s->type == SERVICE_EXEC) { + assert(!s->exec_fd_event_source); + + r = service_allocate_exec_fd(s, &exec_fd_source, &exec_fd); + if (r < 0) + return r; + } + r = service_arm_timer(s, usec_add(now(CLOCK_MONOTONIC), timeout)); if (r < 0) return r; @@ -1462,6 +1534,7 @@ static int service_spawn( exec_params.stdin_fd = s->stdin_fd; exec_params.stdout_fd = s->stdout_fd; exec_params.stderr_fd = s->stderr_fd; + exec_params.exec_fd = exec_fd; r = exec_spawn(UNIT(s), c, @@ -1473,6 +1546,9 @@ static int service_spawn( if (r < 0) return r; + s->exec_fd_event_source = TAKE_PTR(exec_fd_source); + s->exec_fd_hot = false; + r = unit_watch_pid(UNIT(s), pid); if (r < 0) /* FIXME: we need to do something here */ return r; @@ -1984,14 +2060,12 @@ static void service_enter_start(Service *s) { s->control_pid = pid; service_set_state(s, SERVICE_START); - } else if (IN_SET(s->type, SERVICE_ONESHOT, SERVICE_DBUS, SERVICE_NOTIFY)) { + } else if (IN_SET(s->type, SERVICE_ONESHOT, SERVICE_DBUS, SERVICE_NOTIFY, SERVICE_EXEC)) { - /* For oneshot services we wait until the start - * process exited, too, but it is our main process. */ + /* For oneshot services we wait until the start process exited, too, but it is our main process. */ - /* For D-Bus services we know the main pid right away, - * but wait for the bus name to appear on the - * bus. Notify services are similar. */ + /* For D-Bus services we know the main pid right away, but wait for the bus name to appear on the + * bus. 'notify' and 'exec' services are similar. */ service_set_main_pid(s, pid); service_set_state(s, SERVICE_START); @@ -2444,6 +2518,13 @@ static int service_serialize(Unit *u, FILE *f, FDSet *fds) { if (r < 0) return r; + if (s->exec_fd_event_source) { + r = unit_serialize_item_fd(u, f, fds, "exec-fd", sd_event_source_get_io_fd(s->exec_fd_event_source)); + if (r < 0) + return r; + unit_serialize_item(u, f, "exec-fd-hot", yes_no(s->exec_fd_hot)); + } + if (UNIT_ISSET(s->accept_socket)) { r = unit_serialize_item(u, f, "accept-socket", UNIT_DEREF(s->accept_socket)->id); if (r < 0) @@ -2777,6 +2858,18 @@ static int service_deserialize_item(Unit *u, const char *key, const char *value, s->stderr_fd = fdset_remove(fds, fd); s->exec_context.stdio_as_fds = true; } + } else if (streq(key, "exec-fd")) { + int fd; + + if (safe_atoi(value, &fd) < 0 || fd < 0 || !fdset_contains(fds, fd)) + log_unit_debug(u, "Failed to parse exec-fd value: %s", value); + else { + s->exec_fd_event_source = sd_event_source_unref(s->exec_fd_event_source); + + fd = fdset_remove(fds, fd); + if (service_allocate_exec_fd_event_source(s, fd, &s->exec_fd_event_source) < 0) + safe_close(fd); + } } else if (streq(key, "watchdog-override-usec")) { usec_t watchdog_override_usec; if (timestamp_deserialize(value, &watchdog_override_usec) < 0) @@ -2860,7 +2953,7 @@ static int service_watch_pid_file(Service *s) { log_unit_debug(UNIT(s), "Setting watch for PID file %s", s->pid_file_pathspec->path); - r = path_spec_watch(s->pid_file_pathspec, service_dispatch_io); + r = path_spec_watch(s->pid_file_pathspec, service_dispatch_inotify_io); if (r < 0) goto fail; @@ -2904,7 +2997,7 @@ static int service_demand_pid_file(Service *s) { return service_watch_pid_file(s); } -static int service_dispatch_io(sd_event_source *source, int fd, uint32_t events, void *userdata) { +static int service_dispatch_inotify_io(sd_event_source *source, int fd, uint32_t events, void *userdata) { PathSpec *p = userdata; Service *s; @@ -2937,6 +3030,59 @@ fail: return 0; } +static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t events, void *userdata) { + Service *s = SERVICE(userdata); + + assert(s); + + log_unit_debug(UNIT(s), "got exec-fd event"); + + /* If Type=exec is set, we'll consider a service started successfully the instant we invoked execve() + * successfully for it. We implement this through a pipe() towards the child, which the kernel automatically + * closes for us due to O_CLOEXEC on execve() in the child, which then triggers EOF on the pipe in the + * parent. We need to be careful however, as there are other reasons that we might cause the child's side of + * the pipe to be closed (for example, a simple exit()). To deal with that we'll ignore EOFs on the pipe unless + * the child signalled us first that it is about to call the execve(). It does so by sending us a simple + * non-zero byte via the pipe. We also provide the child with a way to inform us in case execve() failed: if it + * sends a zero byte we'll ignore POLLHUP on the fd again. */ + + for (;;) { + uint8_t x; + ssize_t n; + + n = read(fd, &x, sizeof(x)); + if (n < 0) { + if (errno == EAGAIN) /* O_NONBLOCK in effect → everything queued has now been processed. */ + return 0; + + return log_unit_error_errno(UNIT(s), errno, "Failed to read from exec_fd: %m"); + } + if (n == 0) { /* EOF → the event we are waiting for */ + + s->exec_fd_event_source = sd_event_source_unref(s->exec_fd_event_source); + + if (s->exec_fd_hot) { /* Did the child tell us to expect EOF now? */ + log_unit_debug(UNIT(s), "Got EOF on exec-fd"); + + s->exec_fd_hot = false; + + /* Nice! This is what we have been waiting for. Transition to next state. */ + if (s->type == SERVICE_EXEC && s->state == SERVICE_START) + service_enter_start_post(s); + } else + log_unit_debug(UNIT(s), "Got EOF on exec-fd while it was disabled, ignoring."); + + return 0; + } + + /* A byte was read → this turns on/off the exec fd logic */ + assert(n == sizeof(x)); + s->exec_fd_hot = x; + } + + return 0; +} + static void service_notify_cgroup_empty_event(Unit *u) { Service *s = SERVICE(u); @@ -3846,7 +3992,8 @@ static const char* const service_type_table[_SERVICE_TYPE_MAX] = { [SERVICE_ONESHOT] = "oneshot", [SERVICE_DBUS] = "dbus", [SERVICE_NOTIFY] = "notify", - [SERVICE_IDLE] = "idle" + [SERVICE_IDLE] = "idle", + [SERVICE_EXEC] = "exec", }; DEFINE_STRING_TABLE_LOOKUP(service_type, ServiceType); diff --git a/src/core/service.h b/src/core/service.h index 9c06e91..b53e420 100644 --- a/src/core/service.h +++ b/src/core/service.h @@ -30,6 +30,7 @@ typedef enum ServiceType { SERVICE_DBUS, /* we fork and wait until a specific D-Bus name appears on the bus */ SERVICE_NOTIFY, /* we fork and wait until a daemon sends us a ready message with sd_notify() */ SERVICE_IDLE, /* much like simple, but delay exec() until all jobs are dispatched. */ + SERVICE_EXEC, /* we fork and wait until we execute exec() (this means our own setup is waited for) */ _SERVICE_TYPE_MAX, _SERVICE_TYPE_INVALID = -1 } ServiceType; @@ -165,6 +166,8 @@ struct Service { NotifyAccess notify_access; NotifyState notify_state; + sd_event_source *exec_fd_event_source; + ServiceFDStore *fd_store; size_t n_fd_store; unsigned n_fd_store_max; @@ -179,6 +182,7 @@ struct Service { unsigned n_restarts; bool flush_n_restarts; + bool exec_fd_hot; }; extern const UnitVTable service_vtable; diff --git a/src/core/socket.c b/src/core/socket.c index 56d3222..d488c64 100644 --- a/src/core/socket.c +++ b/src/core/socket.c @@ -1867,10 +1867,11 @@ static int socket_coldplug(Unit *u) { static int socket_spawn(Socket *s, ExecCommand *c, pid_t *_pid) { ExecParameters exec_params = { - .flags = EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN, - .stdin_fd = -1, - .stdout_fd = -1, - .stderr_fd = -1, + .flags = EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT|EXEC_APPLY_TTY_STDIN, + .stdin_fd = -1, + .stdout_fd = -1, + .stderr_fd = -1, + .exec_fd = -1, }; pid_t pid; int r; diff --git a/src/core/swap.c b/src/core/swap.c index b78b1aa..e01e61e 100644 --- a/src/core/swap.c +++ b/src/core/swap.c @@ -606,6 +606,7 @@ static int swap_spawn(Swap *s, ExecCommand *c, pid_t *_pid) { .stdin_fd = -1, .stdout_fd = -1, .stderr_fd = -1, + .exec_fd = -1, }; pid_t pid; int r; -- 2.7.4