1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
23 #include <sys/mount.h>
28 #include <sys/types.h>
30 #include <sys/syscall.h>
37 #include "path-util.h"
38 #include "namespace.h"
41 #include "loopback-setup.h"
43 #include "dev-setup.h"
47 typedef enum MountMode {
48 /* This is ordered by priority! */
57 typedef struct BindMount {
/* Iterates over the string vector strv and processes each entry for the
 * given mode, working on the BindMount cursor passed in via p.
 *
 * For the INACCESSIBLE, READONLY and READWRITE modes an entry starting
 * with a dash is recognised as specially marked (presumably this feeds
 * the ignore flag tested elsewhere in this file -- TODO confirm).
 * Each entry is also tested with path_is_absolute(). */
64 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
69 STRV_FOREACH(i, strv) {
/* A leading dash is only meaningful for the three directory list modes */
74 if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
/* Entries that are not absolute paths are singled out here
 * (presumably rejected -- TODO confirm) */
79 if (!path_is_absolute(*i))
/* Comparison callback with the qsort() signature; both arguments are
 * treated as pointers to BindMount entries.
 *
 * Entries whose paths are equal according to path_equal() are compared
 * by their mode value. Otherwise path_startswith() is tried in both
 * directions so that an entry whose path is a prefix of the other can
 * be ordered relative to it. */
90 static int mount_path_compare(const void *a, const void *b) {
91 const BindMount *p = a, *q = b;
93 if (path_equal(p->path, q->path)) {
95 /* If the paths are equal, check the mode */
96 if (p->mode < q->mode)
99 if (p->mode > q->mode)
105 /* If the paths are not equal, then order prefixes first */
106 if (path_startswith(p->path, q->path))
109 if (path_startswith(q->path, p->path))
/* Walks the *n entries of the array m and compares the path of each
 * entry with the path of the entry remembered in previous; on equal
 * paths the earlier entry is the one that wins.
 *
 * The count is passed by pointer (presumably so it can be updated to
 * the number of entries kept -- TODO confirm). Only neighbouring
 * entries are compared, so the array is expected to be sorted so that
 * equal paths are adjacent. */
115 static void drop_duplicates(BindMount *m, unsigned *n) {
116 BindMount *f, *t, *previous;
/* f visits every entry; t starts at the array head as well
 * (presumably the position the next kept entry goes to -- TODO confirm) */
121 for (f = m, t = m, previous = NULL; f < m+*n; f++) {
123 /* The first one wins */
124 if (previous && path_equal(f->path, previous->path))
/* Assembles a fresh device directory in a temporary location and then
 * moves it onto /dev/.
 *
 * Steps visible here: create a temporary directory with mkdtemp(),
 * mount a tmpfs on its dev subdirectory, bind mount /dev/pts and
 * /dev/shm into it, add bind mounts for mqueue, kdbus and hugepages,
 * add symlinks for ptmx and log, create the device nodes named in
 * devnodes with mknod(), call dev_setup() and finally move the tree
 * with MS_MOVE. */
137 static int mount_dev(BindMount *m) {
138 static const char devnodes[] =
146 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
147 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devkdbus = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
148 _cleanup_umask_ mode_t u;
/* mkdtemp() fills in the XXXXXX part of the template in place */
155 if (!mkdtemp(temporary_mount))
158 dev = strappenda(temporary_mount, "/dev");
/* The new device directory lives on its own tmpfs instance */
160 if (mount("tmpfs", dev, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=755") < 0) {
165 devpts = strappenda(temporary_mount, "/dev/pts");
167 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
/* ptmx is a relative symlink into the pts directory; the result of
 * symlink() is not checked */
172 devptmx = strappenda(temporary_mount, "/dev/ptmx");
173 symlink("pts/ptmx", devptmx);
175 devshm = strappenda(temporary_mount, "/dev/shm");
176 mkdir(devshm, 01777);
177 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
/* For mqueue, kdbus and hugepages the results of mkdir() and mount()
 * are discarded, so a failure of these calls does not stop the setup */
183 devmqueue = strappenda(temporary_mount, "/dev/mqueue");
184 mkdir(devmqueue, 0755);
185 mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
187 devkdbus = strappenda(temporary_mount, "/dev/kdbus");
188 mkdir(devkdbus, 0755);
189 mount("/dev/kdbus", devkdbus, NULL, MS_BIND, NULL);
191 devhugepages = strappenda(temporary_mount, "/dev/hugepages");
192 mkdir(devhugepages, 0755);
193 mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
/* The log entry is a symlink to the journal socket path; the result of
 * symlink() is not checked */
195 devlog = strappenda(temporary_mount, "/dev/log");
196 symlink("/run/systemd/journal/dev-log", devlog);
/* One pass per name stored in devnodes */
198 NULSTR_FOREACH(d, devnodes) {
199 _cleanup_free_ char *dn = NULL;
/* Entries that are neither block nor character devices take this branch */
212 if (!S_ISBLK(st.st_mode) &&
213 !S_ISCHR(st.st_mode)) {
/* dn is the node name placed below the temporary directory */
221 dn = strappend(temporary_mount, d);
/* The node is created with the mode and device number held in st;
 * label_context_set() and label_context_clear() bracket the mknod()
 * call (presumably to apply a security label -- TODO confirm) */
227 label_context_set(d, st.st_mode);
228 r = mknod(dn, st.st_mode, st.st_rdev);
229 label_context_clear();
237 dev_setup(temporary_mount);
/* Move the assembled tree from the temporary location onto /dev/ */
239 if (mount(dev, "/dev/", NULL, MS_MOVE, NULL) < 0) {
245 rmdir(temporary_mount);
/* NOTE(review): the calls below look like cleanup for a failure path
 * (unmount, then remove the temporary directory) -- confirm */
260 umount(devhugepages);
270 rmdir(temporary_mount);
/* Establishes the mount for one entry m: a source (what) is selected
 * depending on the mode of the entry and is then bind mounted
 * recursively onto m->path.
 *
 * When the mount fails with ENOENT and m->ignore is set, the failure
 * is handled by a separate branch (presumably it is tolerated -- TODO
 * confirm). */
275 static int apply_mount(
278 const char *var_tmp_dir) {
289 /* First, get rid of everything that is below if there
290 * is anything... Then, overmount it with an
291 * inaccessible directory. */
292 umount_recursive(m->path, 0);
294 what = "/run/systemd/inaccessible";
299 /* Nothing to mount here, we just later toggle the
300 * MS_RDONLY bit for the mount point */
307 case PRIVATE_VAR_TMP:
/* Every known mode is expected to be handled by an explicit case */
315 assert_not_reached("Unknown mode");
/* Recursive bind mount of the selected source onto the target path */
320 r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL);
322 log_debug("Successfully mounted %s to %s", what, m->path);
323 else if (m->ignore && errno == ENOENT)
/* Applies the final access state of one entry by calling
 * bind_remount_recursive() on m->path.
 *
 * INACCESSIBLE and READONLY entries pass true as the second argument;
 * READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP and PRIVATE_DEV entries pass
 * false (the flag presumably selects read-only -- TODO confirm).
 * A result of -ENOENT combined with m->ignore is handled by a separate
 * branch. */
329 static int make_read_only(BindMount *m) {
334 if (IN_SET(m->mode, INACCESSIBLE, READONLY))
335 r = bind_remount_recursive(m->path, true);
336 else if (IN_SET(m->mode, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV))
337 r = bind_remount_recursive(m->path, false);
/* Missing paths are treated specially for entries flagged as ignorable */
341 if (m->ignore && r == -ENOENT)
348 char** read_write_dirs,
349 char** read_only_dirs,
350 char** inaccessible_dirs,
354 ProtectHome protect_home,
355 ProtectSystem protect_system,
356 unsigned mount_flags) {
358 BindMount *m, *mounts = NULL;
362 if (mount_flags == 0)
363 mount_flags = MS_SHARED;
365 if (unshare(CLONE_NEWNS) < 0)
368 n = !!tmp_dir + !!var_tmp_dir +
369 strv_length(read_write_dirs) +
370 strv_length(read_only_dirs) +
371 strv_length(inaccessible_dirs) +
373 (protect_home != PROTECT_HOME_NO ? 3 : 0) +
374 (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
375 (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
378 m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
379 r = append_mounts(&m, read_write_dirs, READWRITE);
383 r = append_mounts(&m, read_only_dirs, READONLY);
387 r = append_mounts(&m, inaccessible_dirs, INACCESSIBLE);
393 m->mode = PRIVATE_TMP;
398 m->path = "/var/tmp";
399 m->mode = PRIVATE_VAR_TMP;
405 m->mode = PRIVATE_DEV;
409 if (protect_home != PROTECT_HOME_NO) {
410 r = append_mounts(&m, STRV_MAKE("-/home", "-/run/user", "-/root"), protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
415 if (protect_system != PROTECT_SYSTEM_NO) {
416 r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL ? STRV_MAKE("/usr", "-/boot", "/etc") : STRV_MAKE("/usr", "-/boot"), READONLY);
421 assert(mounts + n == m);
423 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
424 drop_duplicates(mounts, &n);
428 /* Remount / as SLAVE so that nothing now mounted in the namespace
429 shows up in the parent */
430 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
433 for (m = mounts; m < mounts + n; ++m) {
434 r = apply_mount(m, tmp_dir, var_tmp_dir);
439 for (m = mounts; m < mounts + n; ++m) {
440 r = make_read_only(m);
446 /* Remount / as the desired mode. Note that this will not
447 * reestablish propagation from our side to the host, since
448 * what's disconnected is disconnected. */
449 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
458 for (m = mounts; m < mounts + n; ++m)
460 umount2(m->path, MNT_DETACH);
/* Prepares one private temporary directory below prefix.
 *
 * The directory name is built as a template of the form
 * prefix/systemd-private-BOOTID-ID-XXXXXX, where BOOTID is the string
 * form of the current boot id and ID is the id argument. Inside it a
 * subdirectory named tmp is created with mode 0777 plus the sticky bit.
 *
 * The path argument is an output parameter (how it is filled in is not
 * part of the lines below -- TODO confirm against the full function). */
466 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
467 _cleanup_free_ char *x = NULL;
468 char bid[SD_ID128_STRING_MAX];
476 /* We include the boot id in the directory so that after a
477 * reboot we can easily identify obsolete directories. */
479 r = sd_id128_get_boot(&boot_id);
/* x is the name template; the trailing XXXXXX is a placeholder */
483 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL);
/* A umask of 0000 lets the mode given to mkdir() apply unmodified */
491 RUN_WITH_UMASK(0000) {
494 y = strappenda(x, "/tmp");
496 if (mkdir(y, 0777 | S_ISVTX) < 0)
/* Prepares the private directories for /tmp and /var/tmp by calling
 * setup_one_tmp_dir() once per prefix with the same id. The results
 * are collected in the locals a and b; tmp_dir and var_tmp_dir are the
 * output parameters of this function. */
506 int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
514 r = setup_one_tmp_dir(id, "/tmp", &a);
518 r = setup_one_tmp_dir(id, "/var/tmp", &b);
/* t names the tmp subdirectory inside the first directory
 * (NOTE(review): looks like it is needed to undo the first step when
 * the second one fails -- confirm) */
522 t = strappenda(a, "/tmp");
/* Creates a network namespace or joins an existing one, using the
 * socket pair netns_storage_socket as the place where the namespace
 * file descriptor is kept.
 *
 * While holding a lockf() lock on netns_storage_socket[0], a
 * non-blocking recvmsg() is attempted. If it fails with EAGAIN nothing
 * is stored yet: a new namespace is created with unshare(CLONE_NEWNET)
 * and /proc/self/ns/net is opened to obtain a descriptor for it.
 * If a message is received, the descriptor carried in its SCM_RIGHTS
 * control message is passed to setns(). A descriptor is then written to
 * netns_storage_socket[1] with sendmsg() and the lock is released. */
536 int setup_netns(int netns_storage_socket[2]) {
537 _cleanup_close_ int netns = -1;
539 struct cmsghdr cmsghdr;
/* Control buffer sized for exactly one int, that is one descriptor */
540 uint8_t buf[CMSG_SPACE(sizeof(int))];
543 .msg_control = &control,
544 .msg_controllen = sizeof(control),
546 struct cmsghdr *cmsg;
549 assert(netns_storage_socket);
550 assert(netns_storage_socket[0] >= 0);
551 assert(netns_storage_socket[1] >= 0);
553 /* We use the passed socketpair as a storage buffer for our
554 * namespace reference fd. Whatever process runs this first
555 * shall create a new namespace, all others should just join
556 * it. To serialize that we use a file lock on the socket
559 * It's a bit crazy, but hey, works great! */
561 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
/* MSG_DONTWAIT makes an empty socket show up as EAGAIN instead of blocking */
564 if (recvmsg(netns_storage_socket[0], &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC) < 0) {
565 if (errno != EAGAIN) {
570 /* Nothing stored yet, so let's create a new namespace */
572 if (unshare(CLONE_NEWNET) < 0) {
579 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
587 /* Yay, found something, so let's join the namespace */
/* Look through the received control messages for the one carrying a descriptor */
589 for (cmsg = CMSG_FIRSTHDR(&mh); cmsg; cmsg = CMSG_NXTHDR(&mh, cmsg)) {
590 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
591 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
592 netns = *(int*) CMSG_DATA(cmsg);
596 if (setns(netns, CLONE_NEWNET) < 0) {
/* Build an SCM_RIGHTS control message that carries the netns descriptor */
604 cmsg = CMSG_FIRSTHDR(&mh);
605 cmsg->cmsg_level = SOL_SOCKET;
606 cmsg->cmsg_type = SCM_RIGHTS;
607 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
608 memcpy(CMSG_DATA(cmsg), &netns, sizeof(int));
609 mh.msg_controllen = cmsg->cmsg_len;
611 if (sendmsg(netns_storage_socket[1], &mh, MSG_DONTWAIT|MSG_NOSIGNAL) < 0) {
/* Release the lock taken above; the result of this call is not checked */
617 lockf(netns_storage_socket[0], F_ULOCK, 0);
/* String names of the ProtectHome settings, indexed by enum value. */
622 static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
623 [PROTECT_HOME_NO] = "no",
624 [PROTECT_HOME_YES] = "yes",
625 [PROTECT_HOME_READ_ONLY] = "read-only",
/* Presumably generates the string conversion helpers for ProtectHome
 * from the table above -- TODO confirm against the macro definition */
628 DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
/* String names of the ProtectSystem settings, indexed by enum value. */
630 static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
631 [PROTECT_SYSTEM_NO] = "no",
632 [PROTECT_SYSTEM_YES] = "yes",
633 [PROTECT_SYSTEM_FULL] = "full",
/* Presumably generates the string conversion helpers for ProtectSystem
 * from the table above -- TODO confirm against the macro definition */
636 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);