core: add a per-unit setting MountAPIVFS= for mounting /dev, /proc, /sys in conjuncti...
author Lennart Poettering <lennart@poettering.net>
Thu, 22 Dec 2016 22:34:35 +0000 (23:34 +0100)
committer Lennart Poettering <lennart@poettering.net>
Tue, 7 Feb 2017 10:22:05 +0000 (11:22 +0100)
This adds a boolean unit file setting MountAPIVFS=. If set, the three
main API VFS mounts will be mounted for the service. This only has an
effect on RootDirectory=, which it makes a ton more useful.

(This is basically the /dev + /proc + /sys mounting code posted in the
original #4727, but rebased on current git, and with the automatic logic
replaced by explicit logic controlled by a unit file setting)

man/systemd.exec.xml
src/core/dbus-execute.c
src/core/execute.c
src/core/execute.h
src/core/load-fragment-gperf.gperf.m4
src/core/namespace.c
src/core/namespace.h
src/shared/bus-unit-util.c

index bb38ea2..e594dc1 100644 (file)
         the <function>chroot()</function> jail. Note that setting this parameter might result in additional
         dependencies to be added to the unit (see above).</para>
 
-        <para>The <varname>PrivateUsers=</varname> setting is particularly useful in conjunction with
-        <varname>RootDirectory=</varname>. For details, see below.</para></listitem>
+        <para>The <varname>MountAPIVFS=</varname> and <varname>PrivateUsers=</varname> settings are particularly useful
+        in conjunction with <varname>RootDirectory=</varname>. For details, see below.</para></listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><varname>MountAPIVFS=</varname></term>
+
+        <listitem><para>Takes a boolean argument. If on, a private mount namespace for the unit's processes is created
+        and the API file systems <filename>/proc</filename>, <filename>/sys</filename> and <filename>/dev</filename>
+        will be mounted inside of it, unless they are already mounted. Note that this option has no effect unless used
+        in conjunction with <varname>RootDirectory=</varname> as these three mounts are generally mounted in the host
+        anyway, and unless the root directory is changed the private mount namespace will be a 1:1 copy of the host's,
+        and include these three mounts. Note that the <filename>/dev</filename> file system of the host is bind mounted
+        if this option is used without <varname>PrivateDevices=</varname>. To run the service with a private, minimal
+        version of <filename>/dev/</filename>, combine this option with
+        <varname>PrivateDevices=</varname>.</para></listitem>
       </varlistentry>
 
       <varlistentry>
index cc10e2d..60b0288 100644 (file)
@@ -828,6 +828,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
         SD_BUS_PROPERTY("RestrictNamespaces", "t", bus_property_get_ulong, offsetof(ExecContext, restrict_namespaces), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("BindPaths", "a(ssbt)", property_get_bind_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("BindReadOnlyPaths", "a(ssbt)", property_get_bind_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("MountAPIVFS", "b", bus_property_get_bool, offsetof(ExecContext, mount_apivfs), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_VTABLE_END
 };
 
@@ -1207,7 +1208,7 @@ int bus_exec_context_set_transient_property(
                               "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers",
                               "NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute",
                               "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables",
-                              "ProtectKernelModules", "ProtectControlGroups")) {
+                              "ProtectKernelModules", "ProtectControlGroups", "MountAPIVFS")) {
                 int b;
 
                 r = sd_bus_message_read(message, "b", &b);
@@ -1247,6 +1248,8 @@ int bus_exec_context_set_transient_property(
                                 c->protect_kernel_modules = b;
                         else if (streq(name, "ProtectControlGroups"))
                                 c->protect_control_groups = b;
+                        else if (streq(name, "MountAPIVFS"))
+                                c->mount_apivfs = b;
 
                         unit_write_drop_in_private_format(u, mode, name, "%s=%s", name, yes_no(b));
                 }
index aa0ddb5..54f6418 100644 (file)
@@ -1662,6 +1662,9 @@ static bool exec_needs_mount_namespace(
             context->protect_control_groups)
                 return true;
 
+        if (context->mount_apivfs)
+                return true;
+
         return false;
 }
 
@@ -1942,6 +1945,7 @@ static int apply_mount_namespace(Unit *u, const ExecContext *context,
                 .protect_control_groups = context->protect_control_groups,
                 .protect_kernel_tunables = context->protect_kernel_tunables,
                 .protect_kernel_modules = context->protect_kernel_modules,
+                .mount_apivfs = context->mount_apivfs,
         };
 
         assert(context);
@@ -3294,6 +3298,7 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
                 "%sPrivateUsers: %s\n"
                 "%sProtectHome: %s\n"
                 "%sProtectSystem: %s\n"
+                "%sMountAPIVFS: %s\n"
                 "%sIgnoreSIGPIPE: %s\n"
                 "%sMemoryDenyWriteExecute: %s\n"
                 "%sRestrictRealtime: %s\n",
@@ -3310,6 +3315,7 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
                 prefix, yes_no(c->private_users),
                 prefix, protect_home_to_string(c->protect_home),
                 prefix, protect_system_to_string(c->protect_system),
+                prefix, yes_no(c->mount_apivfs),
                 prefix, yes_no(c->ignore_sigpipe),
                 prefix, yes_no(c->memory_deny_write_execute),
                 prefix, yes_no(c->restrict_realtime));
index f8694ef..6fd5a6e 100644 (file)
@@ -183,6 +183,7 @@ struct ExecContext {
         bool protect_kernel_tunables;
         bool protect_kernel_modules;
         bool protect_control_groups;
+        bool mount_apivfs;
 
         bool no_new_privileges;
 
index 15f22a2..07f2a70 100644 (file)
@@ -101,6 +101,7 @@ $1.PrivateUsers,                 config_parse_bool,                  0,
 $1.ProtectSystem,                config_parse_protect_system,        0,                             offsetof($1, exec_context)
 $1.ProtectHome,                  config_parse_protect_home,          0,                             offsetof($1, exec_context)
 $1.MountFlags,                   config_parse_exec_mount_flags,      0,                             offsetof($1, exec_context)
+$1.MountAPIVFS,                  config_parse_bool,                  0,                             offsetof($1, exec_context.mount_apivfs)
 $1.Personality,                  config_parse_personality,           0,                             offsetof($1, exec_context.personality)
 $1.RuntimeDirectoryMode,         config_parse_mode,                  0,                             offsetof($1, exec_context.runtime_directory_mode)
 $1.RuntimeDirectory,             config_parse_runtime_directory,     0,                             offsetof($1, exec_context.runtime_directory)
index d0fdc3d..10917f7 100644 (file)
@@ -52,10 +52,13 @@ typedef enum MountMode {
         INACCESSIBLE,
         BIND_MOUNT,
         BIND_MOUNT_RECURSIVE,
-        READONLY,
         PRIVATE_TMP,
         PRIVATE_VAR_TMP,
         PRIVATE_DEV,
+        BIND_DEV,
+        SYSFS,
+        PROCFS,
+        READONLY,
         READWRITE,
 } MountMode;
 
@@ -70,13 +73,13 @@ typedef struct MountEntry {
         char *source_malloc;
 } MountEntry;
 
-/*
- * The following Protect tables are to protect paths and mark some of them
- * READONLY, in case a path is covered by an option from another table, then
- * it is marked READWRITE in the current one, and the more restrictive mode is
- * applied from that other table. This way all options can be combined in a
- * safe and comprehensible way for users.
- */
+/* If MountAPIVFS= is used, let's mount /proc, /dev and /sys into it, but only as a fallback if the user hasn't
+ * mounted something there already. These mounts are hence overridden by any other explicitly configured mounts. */
+static const MountEntry apivfs_table[] = {
+        { "/proc",               PROCFS,       false },
+        { "/dev",                BIND_DEV,     false },
+        { "/sys",                SYSFS,        false },
+};
 
 /* ProtectKernelTunables= option and the related filesystem APIs */
 static const MountEntry protect_kernel_tunables_table[] = {
@@ -465,7 +468,7 @@ static void drop_outside_root(const char *root_directory, MountEntry *m, unsigne
         *n = t - m;
 }
 
-static int mount_dev(MountEntry *m) {
+static int mount_private_dev(MountEntry *m) {
         static const char devnodes[] =
                 "/dev/null\0"
                 "/dev/zero\0"
@@ -604,6 +607,62 @@ fail:
         return r;
 }
 
+static int mount_bind_dev(MountEntry *m) {
+        int r;
+
+        assert(m);
+
+        /* Implements the little brother of mount_private_dev(): simply bind mounts the host's /dev into the service's
+         * /dev. This is only used when RootDirectory= is set. */
+
+        r = path_is_mount_point(mount_entry_path(m), NULL, 0);
+        if (r < 0)
+                return log_debug_errno(r, "Unable to determine whether /dev is already mounted: %m");
+        if (r > 0) /* make this a NOP if /dev is already a mount point */
+                return 0;
+
+        if (mount("/dev", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
+                return log_debug_errno(errno, "Failed to bind mount %s: %m", mount_entry_path(m));
+
+        return 1;
+}
+
+static int mount_sysfs(MountEntry *m) {
+        int r;
+
+        assert(m);
+
+        r = path_is_mount_point(mount_entry_path(m), NULL, 0);
+        if (r < 0)
+                return log_debug_errno(r, "Unable to determine whether /sys is already mounted: %m");
+        if (r > 0) /* make this a NOP if /sys is already a mount point */
+                return 0;
+
+        /* Bind mount the host's version so that we get all child mounts of it, too. */
+        if (mount("/sys", mount_entry_path(m), NULL, MS_BIND|MS_REC, NULL) < 0)
+                return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
+
+        return 1;
+}
+
+static int mount_procfs(MountEntry *m) {
+        int r;
+
+        assert(m);
+
+        r = path_is_mount_point(mount_entry_path(m), NULL, 0);
+        if (r < 0)
+                return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
+        if (r > 0) /* make this a NOP if /proc is already a mount point */
+                return 0;
+
+        /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
+        if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
+                return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
+
+        return 1;
+}
+
 static int mount_entry_chase(
                 const char *root_directory,
                 MountEntry *m,
@@ -691,6 +750,7 @@ static int apply_mount(
 
         case BIND_MOUNT_RECURSIVE:
                 /* Also chase the source mount */
+
                 r = mount_entry_chase(root_directory, m, mount_entry_source(m), &m->source_malloc);
                 if (r <= 0)
                         return r;
@@ -707,7 +767,16 @@ static int apply_mount(
                 break;
 
         case PRIVATE_DEV:
-                return mount_dev(m);
+                return mount_private_dev(m);
+
+        case BIND_DEV:
+                return mount_bind_dev(m);
+
+        case SYSFS:
+                return mount_sysfs(m);
+
+        case PROCFS:
+                return mount_procfs(m);
 
         default:
                 assert_not_reached("Unknown mode");
@@ -729,7 +798,7 @@ static int make_read_only(MountEntry *m, char **blacklist) {
 
         if (mount_entry_read_only(m))
                 r = bind_remount_recursive(mount_entry_path(m), true, blacklist);
-        else if (m->mode == PRIVATE_DEV) { /* Can be readonly but the submounts can't*/
+        else if (m->mode == PRIVATE_DEV) { /* Superblock can be readonly but the submounts can't*/
                 if (mount(NULL, mount_entry_path(m), NULL, MS_REMOUNT|DEV_MOUNT_OPTIONS|MS_RDONLY, NULL) < 0)
                         r = -errno;
         } else
@@ -745,6 +814,17 @@ static int make_read_only(MountEntry *m, char **blacklist) {
         return r;
 }
 
+static bool namespace_info_mount_apivfs(const NameSpaceInfo *ns_info) {
+        assert(ns_info);
+
+        /* ProtectControlGroups= and ProtectKernelTunables= imply MountAPIVFS=, since to protect the API VFS mounts,
+         * they need to be around in the first place... */
+
+        return ns_info->mount_apivfs ||
+                ns_info->protect_control_groups ||
+                ns_info->protect_kernel_tunables;
+}
+
 static unsigned namespace_calculate_mounts(
                 const NameSpaceInfo *ns_info,
                 char** read_write_paths,
@@ -781,7 +861,8 @@ static unsigned namespace_calculate_mounts(
                 (ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
                 (ns_info->protect_control_groups ? 1 : 0) +
                 (ns_info->protect_kernel_modules ? ELEMENTSOF(protect_kernel_modules_table) : 0) +
-                protect_home_cnt + protect_system_cnt;
+                protect_home_cnt + protect_system_cnt +
+                (namespace_info_mount_apivfs(ns_info) ? ELEMENTSOF(apivfs_table) : 0);
 }
 
 int setup_namespace(
@@ -885,6 +966,12 @@ int setup_namespace(
                 if (r < 0)
                         goto finish;
 
+                if (namespace_info_mount_apivfs(ns_info)) {
+                        r = append_static_mounts(&m, apivfs_table, ELEMENTSOF(apivfs_table), ns_info->ignore_protect_paths);
+                        if (r < 0)
+                                goto finish;
+                }
+
                 assert(mounts + n_mounts == m);
 
                 /* Prepend the root directory where that's necessary */
index de3edc4..bb9de98 100644 (file)
@@ -50,6 +50,7 @@ struct NameSpaceInfo {
         bool protect_control_groups:1;
         bool protect_kernel_tunables:1;
         bool protect_kernel_modules:1;
+        bool mount_apivfs:1;
 };
 
 struct BindMount {
index 829be2c..b6da20a 100644 (file)
@@ -208,7 +208,7 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen
                               "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges",
                               "SyslogLevelPrefix", "Delegate", "RemainAfterElapse", "MemoryDenyWriteExecute",
                               "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables",
-                              "ProtectKernelModules", "ProtectControlGroups")) {
+                              "ProtectKernelModules", "ProtectControlGroups", "MountAPIVFS")) {
 
                 r = parse_boolean(eq);
                 if (r < 0)