core: add two new special ExecStart= character prefixes
authorLennart Poettering <lennart@poettering.net>
Wed, 9 Aug 2017 14:09:04 +0000 (16:09 +0200)
committerLennart Poettering <lennart@poettering.net>
Thu, 10 Aug 2017 13:04:32 +0000 (15:04 +0200)
This patch adds two new special character prefixes to ExecStart= and
friends, in addition to the existing "-", "@" and "+":

"!"  → much like "+", except with a much reduced effect as it only
       disables the actual setresuid()/setresgid()/setgroups() calls, but
       leaves all other security features on, including namespace
       options. This is very useful in combination with
       RuntimeDirectory= or DynamicUser= and similar option, as a user
       is still allocated and used for the runtime directory, but the
       actual UID/GID dropping is left to the daemon process itself.
       This should make RuntimeDirectory= a lot more useful for daemons
       which insist on doing their own privilege dropping.

"!!" → Similar to "!", but on systems supporting ambient caps this
       becomes a NOP. This makes it relatively straightforward to write
       unit files that make use of ambient capabilities to let systemd
       drop all privs while retaining compatibility with systems that
       lack ambient caps, where priv dropping is the left to the daemon
       codes themselves.

This is an alternative approach to #6564 and related PRs.

man/systemd.service.xml
src/core/execute.c
src/core/execute.h
src/core/load-fragment.c
src/shared/seccomp-util.c
src/shared/seccomp-util.h

index da35a52..9863532 100644 (file)
         <varname>ExecStop=</varname> are not valid.)</para>
 
         <para>For each of the specified commands, the first argument must be an absolute path to an
-        executable. Optionally, if this file name is prefixed with <literal>@</literal>, the second token will be
-        passed as <literal>argv[0]</literal> to the executed process, followed by the further arguments specified.  If
-        the absolute filename is prefixed with <literal>-</literal>, an exit code of the command normally considered a
-        failure (i.e. non-zero exit status or abnormal exit due to signal) is ignored and considered success.  If the
-        absolute path is prefixed with <literal>+</literal> then it is executed with full
-        privileges. <literal>@</literal>, <literal>-</literal>, and <literal>+</literal> may be used together and they
-        can appear in any order.</para>
+        executable. Optionally, this file name may be prefixed with a number of special characters:</para>
+
+        <table>
+          <title>Special executable prefixes</title>
+
+          <tgroup cols='2'>
+            <colspec colname='prefix'/>
+            <colspec colname='meaning'/>
+
+            <thead>
+              <row>
+                <entry>Prefix</entry>
+                <entry>Effect</entry>
+              </row>
+            </thead>
+            <tbody>
+              <row>
+                <entry><literal>@</literal></entry>
+                <entry>If the executable path is prefixed with <literal>@</literal>, the second specified token will be passed as <literal>argv[0]</literal> to the executed process (instead of the actual filename), followed by the further arguments specified.</entry>
+              </row>
+
+              <row>
+                <entry><literal>-</literal></entry>
+                <entry>If the executable path is prefixed with <literal>-</literal>, an exit code of the command normally considered a failure (i.e. non-zero exit status or abnormal exit due to signal) is ignored and considered success.</entry>
+              </row>
+
+              <row>
+                <entry><literal>+</literal></entry>
+                <entry>If the executable path is prefixed with <literal>+</literal> then the process is executed with full privileges. In this mode privilege restrictions configured with <varname>User=</varname>, <varname>Group=</varname>, <varname>CapabilityBoundingSet=</varname> or the various file system namespacing options (such as <varname>PrivateDevices=</varname>, <varname>PrivateTmp=</varname>) are not applied to the invoked command line (but still affect any other <varname>ExecStart=</varname>, <varname>ExecStop=</varname>, … lines).</entry>
+              </row>
+
+              <row>
+                <entry><literal>!</literal></entry>
+
+                <entry>Similar to the <literal>+</literal> character discussed above this permits invoking command lines with elevated privileges. However, unlike <literal>+</literal> the <literal>!</literal> character exclusively alters the effect of <varname>User=</varname>, <varname>Group=</varname> and <varname>SupplementaryGroups=</varname>, i.e. only the stanzas the affect user and group credentials. Note that this setting may be combined with <varname>DynamicUser=</varname>, in which case a dynamic user/group pair is allocated before the command is invoked, but credential changing is left to the executed process itself.</entry>
+              </row>
+
+              <row>
+                <entry><literal>!!</literal></entry>
+
+                <entry>This prefix is very similar to <literal>!!</literal>, however it only has an effect on systems lacking support for ambient process capabilities, i.e. without support for <varname>AmbientCapabilities=</varname>. It's intended to be used for unit files that take benefit of ambient capabilities to run processes with minimal privileges wherever possible while remaining compatible with systems that lack ambient capabilities support. Note that when <literal>!!</literal> is used, and a system lacking ambient capability support is detected any configured <varname>SystemCallFilter=</varname> and <varname>CapabilityBoundingSet=</varname> stanzas are implicitly modified, in order to permit spawned processes to drop credentials and capabilites themselves, even if this is configured to not be allowed. Moreover, if this prefix is used and a system lacking ambient capability support is detected <varname>AmbientCapabilities=</varname> will be skipped and not be applied. On systems supporting ambient capabilities, <literal>!!</literal> has no effect and is redundant.</entry>
+              </row>
+            </tbody>
+          </tgroup>
+        </table>
+
+        <para><literal>@</literal>, <literal>-</literal>, and one of
+        <literal>+</literal>/<literal>!</literal>/<literal>!!</literal> may be used together and they can appear in any
+        order. However, only one of <literal>+</literal>, <literal>!</literal>, <literal>!!</literal> may be used a at
+        time. Note that these prefixes are also supported for the other command line settings,
+        i.e. <varname>ExecStartPre=</varname>, <varname>ExecStartPost=</varname>, <varname>ExecReload</varname>,
+        <varname>ExecStop=</varname> and <varname>ExecStopPost=</varname>.</para>
 
         <para>If more than one command is specified, the commands are
         invoked sequentially in the order they appear in the unit
index 6caf13b..46e0b18 100644 (file)
@@ -1312,8 +1312,9 @@ static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
         return true;
 }
 
-static int apply_syscall_filter(const Unit* u, const ExecContext *c) {
+static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
         uint32_t negative_action, default_action, action;
+        int r;
 
         assert(u);
         assert(c);
@@ -1334,6 +1335,12 @@ static int apply_syscall_filter(const Unit* u, const ExecContext *c) {
                 action = negative_action;
         }
 
+        if (needs_ambient_hack) {
+                r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
+                if (r < 0)
+                        return r;
+        }
+
         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action);
 }
 
@@ -2004,7 +2011,7 @@ static int apply_mount_namespace(
                 .protect_kernel_modules = context->protect_kernel_modules,
                 .mount_apivfs = context->mount_apivfs,
         };
-        bool apply_restrictions;
+        bool needs_sandboxing;
         int r;
 
         assert(context);
@@ -2039,18 +2046,18 @@ static int apply_mount_namespace(
         if (!context->dynamic_user && root_dir)
                 ns_info.ignore_protect_paths = true;
 
-        apply_restrictions = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
+        needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
 
         r = setup_namespace(root_dir, root_image,
                             &ns_info, rw,
-                            apply_restrictions ? context->read_only_paths : NULL,
-                            apply_restrictions ? context->inaccessible_paths : NULL,
+                            needs_sandboxing ? context->read_only_paths : NULL,
+                            needs_sandboxing ? context->inaccessible_paths : NULL,
                             context->bind_mounts,
                             context->n_bind_mounts,
                             tmp,
                             var,
-                            apply_restrictions ? context->protect_home : PROTECT_HOME_NO,
-                            apply_restrictions ? context->protect_system : PROTECT_SYSTEM_NO,
+                            needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
+                            needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
                             context->mount_flags,
                             DISSECT_IMAGE_DISCARD_ON_LOOP);
 
@@ -2302,7 +2309,10 @@ static int exec_child(
         const char *home = NULL, *shell = NULL;
         dev_t journal_stream_dev = 0;
         ino_t journal_stream_ino = 0;
-        bool needs_sandboxing, needs_mount_namespace;
+        bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
+                needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
+                needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
+                needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
 #ifdef HAVE_SELINUX
         bool use_selinux = false;
 #endif
@@ -2317,6 +2327,7 @@ static int exec_child(
         int i, r, ngids = 0;
         unsigned n_fds;
         ExecDirectoryType dt;
+        int secure_bits;
 
         assert(unit);
         assert(command);
@@ -2653,17 +2664,19 @@ static int exec_child(
                 return r;
         }
 
+        /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
 
-        if (needs_sandboxing) {
-                if (context->pam_name && username) {
-                        r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
-                        if (r < 0) {
-                                *exit_status = EXIT_PAM;
-                                return r;
-                        }
-                }
+        /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
+        needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
 
+        /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
+        if (needs_ambient_hack)
+                needs_setuid = false;
+        else
+                needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
+
+        if (needs_sandboxing) {
                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
                  * present. The actual MAC context application will happen later, as late as possible, to avoid
                  * impacting our own code paths. */
@@ -2671,15 +2684,22 @@ static int exec_child(
 #ifdef HAVE_SELINUX
                 use_selinux = mac_selinux_use();
 #endif
-
 #ifdef HAVE_SMACK
                 use_smack = mac_smack_use();
 #endif
-
 #ifdef HAVE_APPARMOR
                 use_apparmor = mac_apparmor_use();
 #endif
+        }
 
+        if (needs_setuid) {
+                if (context->pam_name && username) {
+                        r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
+                        if (r < 0) {
+                                *exit_status = EXIT_PAM;
+                                return r;
+                        }
+                }
         }
 
         if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) {
@@ -2705,13 +2725,15 @@ static int exec_child(
                 return r;
 
         /* Drop groups as early as possbile */
-        if (needs_sandboxing) {
+        if (needs_setuid) {
                 r = enforce_groups(context, gid, supplementary_gids, ngids);
                 if (r < 0) {
                         *exit_status = EXIT_GROUP;
                         return r;
                 }
+        }
 
+        if (needs_sandboxing) {
 #ifdef HAVE_SELINUX
                 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
                         r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
@@ -2731,12 +2753,9 @@ static int exec_child(
                 }
         }
 
-        /* We repeat the fd closing here, to make sure that
-         * nothing is leaked from the PAM modules. Note that
-         * we are more aggressive this time since socket_fd
-         * and the netns fds we don't need anymore. The custom
-         * endpoint fd was needed to upload the policy and can
-         * now be closed as well. */
+        /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
+         * more aggressive this time since socket_fd and the netns fds we don't need anymore. The custom endpoint fd
+         * was needed to upload the policy and can now be closed as well. */
         r = close_all_fds(fds, n_fds);
         if (r >= 0)
                 r = shift_fds(fds, n_fds);
@@ -2747,9 +2766,10 @@ static int exec_child(
                 return r;
         }
 
-        if (needs_sandboxing) {
+        secure_bits = context->secure_bits;
 
-                int secure_bits = context->secure_bits;
+        if (needs_sandboxing) {
+                uint64_t bset;
 
                 for (i = 0; i < _RLIMIT_MAX; i++) {
 
@@ -2771,8 +2791,17 @@ static int exec_child(
                         }
                 }
 
-                if (!cap_test_all(context->capability_bounding_set)) {
-                        r = capability_bounding_set_drop(context->capability_bounding_set, false);
+                bset = context->capability_bounding_set;
+                /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
+                 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
+                 * instead of us doing that */
+                if (needs_ambient_hack)
+                        bset |= (UINT64_C(1) << CAP_SETPCAP) |
+                                (UINT64_C(1) << CAP_SETUID) |
+                                (UINT64_C(1) << CAP_SETGID);
+
+                if (!cap_test_all(bset)) {
+                        r = capability_bounding_set_drop(bset, false);
                         if (r < 0) {
                                 *exit_status = EXIT_CAPABILITIES;
                                 *error_message = strdup("Failed to drop capabilities");
@@ -2782,7 +2811,8 @@ static int exec_child(
 
                 /* This is done before enforce_user, but ambient set
                  * does not survive over setresuid() if keep_caps is not set. */
-                if (context->capability_ambient_set != 0) {
+                if (!needs_ambient_hack &&
+                    context->capability_ambient_set != 0) {
                         r = capability_ambient_set_apply(context->capability_ambient_set, true);
                         if (r < 0) {
                                 *exit_status = EXIT_CAPABILITIES;
@@ -2790,7 +2820,9 @@ static int exec_child(
                                 return r;
                         }
                 }
+        }
 
+        if (needs_setuid) {
                 if (context->user) {
                         r = enforce_user(context, uid);
                         if (r < 0) {
@@ -2798,7 +2830,9 @@ static int exec_child(
                                 (void) asprintf(error_message, "Failed to change UID to "UID_FMT, uid);
                                 return r;
                         }
-                        if (context->capability_ambient_set != 0) {
+
+                        if (!needs_ambient_hack &&
+                            context->capability_ambient_set != 0) {
 
                                 /* Fix the ambient capabilities after user change. */
                                 r = capability_ambient_set_apply(context->capability_ambient_set, false);
@@ -2818,7 +2852,9 @@ static int exec_child(
                                 secure_bits |= 1<<SECURE_KEEP_CAPS;
                         }
                 }
+        }
 
+        if (needs_sandboxing) {
                 /* Apply the MAC contexts late, but before seccomp syscall filtering, as those should really be last to
                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
@@ -2863,10 +2899,8 @@ static int exec_child(
                 }
 #endif
 
-                /* PR_GET_SECUREBITS is not privileged, while
-                 * PR_SET_SECUREBITS is. So to suppress
-                 * potential EPERMs we'll try not to call
-                 * PR_SET_SECUREBITS unless necessary. */
+                /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
+                 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
                 if (prctl(PR_GET_SECUREBITS) != secure_bits)
                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
                                 *exit_status = EXIT_SECUREBITS;
@@ -2940,7 +2974,7 @@ static int exec_child(
 
                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
                  * by the filter as little as possible. */
-                r = apply_syscall_filter(unit, context);
+                r = apply_syscall_filter(unit, context, needs_ambient_hack);
                 if (r < 0) {
                         *exit_status = EXIT_SECCOMP;
                         *error_message = strdup("Failed to apply syscall filters");
index 0c5811c..9a28269 100644 (file)
@@ -91,6 +91,8 @@ struct ExecStatus {
 typedef enum ExecCommandFlags {
         EXEC_COMMAND_IGNORE_FAILURE = 1,
         EXEC_COMMAND_FULLY_PRIVILEGED = 2,
+        EXEC_COMMAND_NO_SETUID = 4,
+        EXEC_COMMAND_AMBIENT_MAGIC = 8,
 } ExecCommandFlags;
 
 struct ExecCommand {
index 07ad3c5..7bcce9b 100644 (file)
@@ -608,7 +608,8 @@ int config_parse_exec(
         p = rvalue;
         do {
                 _cleanup_free_ char *path = NULL, *firstword = NULL;
-                bool separate_argv0 = false, ignore = false, privileged = false;
+                ExecCommandFlags flags = 0;
+                bool ignore = false, separate_argv0 = false;
                 _cleanup_free_ ExecCommand *nce = NULL;
                 _cleanup_strv_free_ char **n = NULL;
                 size_t nlen = 0, nbufsize = 0;
@@ -622,18 +623,31 @@ int config_parse_exec(
 
                 f = firstword;
                 for (;;) {
-                        /* We accept an absolute path as first argument.
-                         * If it's prefixed with - and the path doesn't exist,
-                         * we ignore it instead of erroring out;
-                         * if it's prefixed with @, we allow overriding of argv[0];
-                         * and if it's prefixed with +, it will be run with full privileges */
-                        if (*f == '-' && !ignore)
+                        /* We accept an absolute path as first argument.  If it's prefixed with - and the path doesn't
+                         * exist, we ignore it instead of erroring out; if it's prefixed with @, we allow overriding of
+                         * argv[0]; if it's prefixed with +, it will be run with full privileges and no sandboxing; if
+                         * it's prefixed with '!' we apply sandboxing, but do not change user/group credentials; if
+                         * it's prefixed with '!!', then we apply user/group credentials if the kernel supports ambient
+                         * capabilities -- if it doesn't we don't apply the credentials themselves, but do apply most
+                         * other sandboxing, with some special exceptions for changing UID.
+                         *
+                         * The idea is that '!!' may be used to write services that can take benefit of systemd's
+                         * UID/GID dropping if the kernel supports ambient creds, but provide an automatic fallback to
+                         * privilege dropping within the daemon if the kernel does not offer that. */
+
+                        if (*f == '-' && !(flags & EXEC_COMMAND_IGNORE_FAILURE)) {
+                                flags |= EXEC_COMMAND_IGNORE_FAILURE;
                                 ignore = true;
-                        else if (*f == '@' && !separate_argv0)
+                        else if (*f == '@' && !separate_argv0)
                                 separate_argv0 = true;
-                        else if (*f == '+' && !privileged)
-                                privileged = true;
-                        else
+                        else if (*f == '+' && !(flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID|EXEC_COMMAND_AMBIENT_MAGIC)))
+                                flags |= EXEC_COMMAND_FULLY_PRIVILEGED;
+                        else if (*f == '!' && !(flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID|EXEC_COMMAND_AMBIENT_MAGIC)))
+                                flags |= EXEC_COMMAND_NO_SETUID;
+                        else if (*f == '!' && !(flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_AMBIENT_MAGIC))) {
+                                flags &= ~EXEC_COMMAND_NO_SETUID;
+                                flags |= EXEC_COMMAND_AMBIENT_MAGIC;
+                        } else
                                 break;
                         f++;
                 }
@@ -752,9 +766,7 @@ int config_parse_exec(
 
                 nce->argv = n;
                 nce->path = path;
-                nce->flags =
-                        (ignore ? EXEC_COMMAND_IGNORE_FAILURE : 0) |
-                        (privileged ? EXEC_COMMAND_FULLY_PRIVILEGED : 0);
+                nce->flags = flags;
 
                 exec_command_append_list(e, nce);
 
index e80d98e..dd6d4fb 100644 (file)
@@ -1364,3 +1364,41 @@ int parse_syscall_archs(char **l, Set **archs) {
 
         return 0;
 }
+
+int seccomp_filter_set_add(Set *filter, bool add, const SyscallFilterSet *set) {
+        const char *i;
+        int r;
+
+        assert(set);
+
+        NULSTR_FOREACH(i, set->value) {
+
+                if (i[0] == '@') {
+                        const SyscallFilterSet *more;
+
+                        more = syscall_filter_set_find(i);
+                        if (!more)
+                                return -ENXIO;
+
+
+                        r = seccomp_filter_set_add(filter, add, more);
+                        if (r < 0)
+                                return r;
+                } else {
+                        int id;
+
+                        id = seccomp_syscall_resolve_name(i);
+                        if (id == __NR_SCMP_ERROR)
+                                return -ENXIO;
+
+                        if (add) {
+                                r = set_put(filter, INT_TO_PTR(id + 1));
+                                if (r < 0)
+                                        return r;
+                        } else
+                                (void) set_remove(filter, INT_TO_PTR(id + 1));
+                }
+        }
+
+        return 0;
+}
index f6b6889..0edffa1 100644 (file)
@@ -67,6 +67,8 @@ extern const SyscallFilterSet syscall_filter_sets[];
 
 const SyscallFilterSet *syscall_filter_set_find(const char *name);
 
+int seccomp_filter_set_add(Set *s, bool b, const SyscallFilterSet *set);
+
 int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action);
 int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Set* set, uint32_t action);