nspawn: R/W support for /sys, and /proc/sys
authorSergiusz Urbaniak <sergiusz.urbaniak@gmail.com>
Fri, 14 Oct 2016 12:00:15 +0000 (14:00 +0200)
committerSergiusz Urbaniak <sergiusz.urbaniak@gmail.com>
Fri, 18 Nov 2016 08:50:40 +0000 (09:50 +0100)
This commit adds the possibility to leave /sys, and /proc/sys read-write.
It introduces a new (undocumented) env var SYSTEMD_NSPAWN_API_VFS_WRITABLE
to enable this feature.

If set to "yes", /sys, and /proc/sys will be read-write.
If set to "no", /sys, and /proc/sys will be read-only.
If set to "network" /proc/sys/net will be read-write. This is useful in
use-cases, where systemd-nspawn is used in an external network
namespace.

This adds the possibility to start privileged containers which need more
control over settings in the /proc, and /sys filesystem.

This is also a follow-up on the discussion from
https://github.com/systemd/systemd/pull/4018#r76971862 where an
introduction of a simple env var to enable R/W support for those
directories was already discussed.

src/nspawn/nspawn-mount.c
src/nspawn/nspawn-mount.h
src/nspawn/nspawn.c

index 0c24b8e..95bb3c0 100644 (file)
@@ -225,9 +225,10 @@ static int tmpfs_patch_options(
         return !!buf;
 }
 
-int mount_sysfs(const char *dest) {
+int mount_sysfs(const char *dest, MountSettingsMask mount_settings) {
         const char *full, *top, *x;
         int r;
+        unsigned long extra_flags = 0;
 
         top = prefix_roota(dest, "/sys");
         r = path_check_fstype(top, SYSFS_MAGIC);
@@ -244,8 +245,11 @@ int mount_sysfs(const char *dest) {
 
         (void) mkdir(full, 0755);
 
+        if (mount_settings & MOUNT_APPLY_APIVFS_RO)
+                extra_flags |= MS_RDONLY;
+
         r = mount_verbose(LOG_ERR, "sysfs", full, "sysfs",
-                          MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
+                          MS_NOSUID|MS_NOEXEC|MS_NODEV|extra_flags, NULL);
         if (r < 0)
                 return r;
 
@@ -267,7 +271,7 @@ int mount_sysfs(const char *dest) {
                         return r;
 
                 r = mount_verbose(LOG_ERR, NULL, to, NULL,
-                                  MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL);
+                                  MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);
                 if (r < 0)
                         return r;
         }
@@ -291,7 +295,7 @@ int mount_sysfs(const char *dest) {
         }
 
         return mount_verbose(LOG_ERR, NULL, top, NULL,
-                             MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL);
+                             MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);
 }
 
 static int mkdir_userns(const char *path, mode_t mode, bool in_userns, uid_t uid_shift) {
@@ -348,8 +352,7 @@ static int mkdir_userns_p(const char *prefix, const char *path, mode_t mode, boo
 }
 
 int mount_all(const char *dest,
-              bool use_userns, bool in_userns,
-              bool use_netns,
+              MountSettingsMask mount_settings,
               uid_t uid_shift, uid_t uid_range,
               const char *selinux_apifs_context) {
 
@@ -359,41 +362,52 @@ int mount_all(const char *dest,
                 const char *type;
                 const char *options;
                 unsigned long flags;
-                bool fatal;
-                bool in_userns;
-                bool use_netns;
+                MountSettingsMask mount_settings;
         } MountPoint;
 
         static const MountPoint mount_table[] = {
-                { "proc",                "/proc",               "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,                              true,  true,  false },
-                { "/proc/sys",           "/proc/sys",           NULL,    NULL,        MS_BIND,                                                   true,  true,  false },   /* Bind mount first ...*/
-                { "/proc/sys/net",       "/proc/sys/net",       NULL,    NULL,        MS_BIND,                                                   true,  true,  true  },   /* (except for this) */
-                { NULL,                  "/proc/sys",           NULL,    NULL,        MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true,  true,  false },   /* ... then, make it r/o */
-                { "/proc/sysrq-trigger", "/proc/sysrq-trigger", NULL,    NULL,        MS_BIND,                                                   false, true,  false },   /* Bind mount first ...*/
-                { NULL,                  "/proc/sysrq-trigger", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, true,  false },   /* ... then, make it r/o */
-                { "tmpfs",               "/sys",                "tmpfs", "mode=755",  MS_NOSUID|MS_NOEXEC|MS_NODEV,                              true,  false, true  },
-                { "sysfs",               "/sys",                "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV,                    true,  false, false },
-                { "tmpfs",               "/dev",                "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,                                  true,  false, false },
-                { "tmpfs",               "/dev/shm",            "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,                         true,  false, false },
-                { "tmpfs",               "/run",                "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,                         true,  false, false },
-                { "tmpfs",               "/tmp",                "tmpfs", "mode=1777", MS_STRICTATIME,                                            true,  true,  false },
+                /* inner child mounts */
+                { "proc",                "/proc",               "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,                              MOUNT_FATAL|MOUNT_IN_USERNS },
+                { "/proc/sys",           "/proc/sys",           NULL,    NULL,        MS_BIND,                                                   MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO },                          /* Bind mount first ...*/
+                { "/proc/sys/net",       "/proc/sys/net",       NULL,    NULL,        MS_BIND,                                                   MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS }, /* (except for this) */
+                { NULL,                  "/proc/sys",           NULL,    NULL,        MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO },                          /* ... then, make it r/o */
+                { "/proc/sysrq-trigger", "/proc/sysrq-trigger", NULL,    NULL,        MS_BIND,                                                               MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO },                          /* Bind mount first ...*/
+                { NULL,                  "/proc/sysrq-trigger", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,             MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO },                          /* ... then, make it r/o */
+                { "tmpfs",               "/tmp",                "tmpfs", "mode=1777", MS_STRICTATIME,                                            MOUNT_FATAL|MOUNT_IN_USERNS },
+
+                /* outer child mounts */
+                { "tmpfs",               "/sys",                "tmpfs", "mode=755",  MS_NOSUID|MS_NOEXEC|MS_NODEV,                              MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS },
+                { "sysfs",               "/sys",                "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV,                    MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO },    /* skipped if above was mounted */
+                { "sysfs",               "/sys",                "sysfs", NULL,                  MS_NOSUID|MS_NOEXEC|MS_NODEV,                    MOUNT_FATAL },                          /* skipped if above was mounted */
+
+                { "tmpfs",               "/dev",                "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,                                  MOUNT_FATAL },
+                { "tmpfs",               "/dev/shm",            "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,                         MOUNT_FATAL },
+                { "tmpfs",               "/run",                "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,                         MOUNT_FATAL },
 #ifdef HAVE_SELINUX
-                { "/sys/fs/selinux",     "/sys/fs/selinux",     NULL,     NULL,       MS_BIND,                                                   false, false, false },  /* Bind mount first */
-                { NULL,                  "/sys/fs/selinux",     NULL,     NULL,       MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false, false },  /* Then, make it r/o */
+                { "/sys/fs/selinux",     "/sys/fs/selinux",     NULL,     NULL,       MS_BIND,                                                   0 },  /* Bind mount first */
+                { NULL,                  "/sys/fs/selinux",     NULL,     NULL,       MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, 0 },  /* Then, make it r/o */
 #endif
         };
 
         unsigned k;
         int r;
+        bool use_userns = (mount_settings & MOUNT_USE_USERNS);
+        bool netns = (mount_settings & MOUNT_APPLY_APIVFS_NETNS);
+        bool ro = (mount_settings & MOUNT_APPLY_APIVFS_RO);
+        bool in_userns = (mount_settings & MOUNT_IN_USERNS);
 
         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
                 _cleanup_free_ char *where = NULL, *options = NULL;
                 const char *o;
+                bool fatal = (mount_table[k].mount_settings & MOUNT_FATAL);
+
+                if (in_userns != (bool)(mount_table[k].mount_settings & MOUNT_IN_USERNS))
+                        continue;
 
-                if (in_userns != mount_table[k].in_userns)
+                if (!netns && (bool)(mount_table[k].mount_settings & MOUNT_APPLY_APIVFS_NETNS))
                         continue;
 
-                if (!use_netns && mount_table[k].use_netns)
+                if (!ro && (bool)(mount_table[k].mount_settings & MOUNT_APPLY_APIVFS_RO))
                         continue;
 
                 where = prefix_root(dest, mount_table[k].where);
@@ -410,7 +424,7 @@ int mount_all(const char *dest,
 
                 r = mkdir_userns_p(dest, where, 0755, in_userns, uid_shift);
                 if (r < 0 && r != -EEXIST) {
-                        if (mount_table[k].fatal)
+                        if (fatal)
                                 return log_error_errno(r, "Failed to create directory %s: %m", where);
 
                         log_debug_errno(r, "Failed to create directory %s: %m", where);
@@ -429,13 +443,13 @@ int mount_all(const char *dest,
                                 o = options;
                 }
 
-                r = mount_verbose(mount_table[k].fatal ? LOG_ERR : LOG_DEBUG,
+                r = mount_verbose(fatal ? LOG_ERR : LOG_DEBUG,
                                   mount_table[k].what,
                                   where,
                                   mount_table[k].type,
                                   mount_table[k].flags,
                                   o);
-                if (r < 0 && mount_table[k].fatal)
+                if (r < 0 && fatal)
                         return r;
         }
 
index 7307a83..74aee7e 100644 (file)
 
 #include "cgroup-util.h"
 
+typedef enum MountSettingsMask {
+        MOUNT_FATAL              = 1 << 0, /* if set, a mount error is considered fatal */
+        MOUNT_USE_USERNS         = 1 << 1, /* if set, mounts are patched considering uid/gid shifts in a user namespace */
+        MOUNT_IN_USERNS          = 1 << 2, /* if set, the mount is executed in the inner child, otherwise in the outer child */
+        MOUNT_APPLY_APIVFS_RO    = 1 << 3, /* if set, /proc/sys, and /sysfs will be mounted read-only, otherwise read-write. */
+        MOUNT_APPLY_APIVFS_NETNS = 1 << 4, /* if set, /proc/sys/net will be mounted read-write.
+                                              Works only if MOUNT_APPLY_APIVFS_RO is also set. */
+} MountSettingsMask;
+
 typedef enum VolatileMode {
         VOLATILE_NO,
         VOLATILE_YES,
@@ -57,8 +66,8 @@ int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s);
 
 int custom_mount_compare(const void *a, const void *b);
 
-int mount_all(const char *dest, bool use_userns, bool in_userns, bool use_netns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
-int mount_sysfs(const char *dest);
+int mount_all(const char *dest, MountSettingsMask mount_settings, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
+int mount_sysfs(const char *dest, MountSettingsMask mount_settings);
 
 int mount_cgroups(const char *dest, CGroupUnified unified_requested, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context, bool use_cgns);
 int mount_systemd_cgroup_writable(const char *dest, CGroupUnified unified_requested);
index 9b9ae90..69b9efe 100644 (file)
@@ -195,6 +195,7 @@ static const char *arg_container_service_name = "systemd-nspawn";
 static bool arg_notify_ready = false;
 static bool arg_use_cgns = true;
 static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
+static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO;
 
 static void help(void) {
         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
@@ -378,6 +379,31 @@ static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
         arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
 }
 
+static void parse_mount_settings_env(void) {
+        int r;
+        const char *e;
+
+        e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
+        if (!e)
+                return;
+
+        if (streq(e, "network")) {
+                arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
+                return;
+        }
+
+        r = parse_boolean(e);
+        if (r < 0) {
+                log_warning_errno(r, "Failed to parse SYSTEMD_NSPAWN_API_VFS_WRITABLE from environment, ignoring.");
+                return;
+        } else if (r > 0)
+                arg_mount_settings &= ~MOUNT_APPLY_APIVFS_RO;
+        else
+                arg_mount_settings |= MOUNT_APPLY_APIVFS_RO;
+
+        arg_mount_settings &= ~MOUNT_APPLY_APIVFS_NETNS;
+}
+
 static int parse_argv(int argc, char *argv[]) {
 
         enum {
@@ -1070,6 +1096,14 @@ static int parse_argv(int argc, char *argv[]) {
         parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
         parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
 
+        if (arg_userns_mode != USER_NAMESPACE_NO)
+                arg_mount_settings |= MOUNT_USE_USERNS;
+
+        if (arg_private_network)
+                arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
+
+        parse_mount_settings_env();
+
         if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
             !(arg_clone_ns_flags & CLONE_NEWUTS)) {
                 arg_register = false;
@@ -1164,6 +1198,15 @@ static int parse_argv(int argc, char *argv[]) {
 }
 
 static int verify_arguments(void) {
+        if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network) {
+                log_error("Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
+                return -EINVAL;
+        }
+
+        if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO)) {
+                log_error("Cannot combine --private-users with read-write mounts.");
+                return -EINVAL;
+        }
 
         if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
@@ -2700,9 +2743,7 @@ static int inner_child(
                 return log_error_errno(r, "Couldn't become new root: %m");
 
         r = mount_all(NULL,
-                      arg_userns_mode != USER_NAMESPACE_NO,
-                      true,
-                      arg_private_network,
+                      arg_mount_settings | MOUNT_IN_USERNS,
                       arg_uid_shift,
                       arg_uid_range,
                       arg_selinux_apifs_context);
@@ -2710,7 +2751,7 @@ static int inner_child(
         if (r < 0)
                 return r;
 
-        r = mount_sysfs(NULL);
+        r = mount_sysfs(NULL, arg_mount_settings);
         if (r < 0)
                 return r;
 
@@ -3077,9 +3118,7 @@ static int outer_child(
         }
 
         r = mount_all(directory,
-                      arg_userns_mode != USER_NAMESPACE_NO,
-                      false,
-                      arg_private_network,
+                      arg_mount_settings,
                       arg_uid_shift,
                       arg_uid_range,
                       arg_selinux_apifs_context);