if (context->mount_apivfs && (context->root_image || context->root_directory))
return true;
+ if (context->dynamic_user &&
+ (!strv_isempty(context->directories[EXEC_DIRECTORY_RUNTIME].paths) ||
+ !strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
+ !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
+ !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
+ return true;
+
return false;
}
}
STRV_FOREACH(rt, context->directories[type].paths) {
- _cleanup_free_ char *p;
+ _cleanup_free_ char *p = NULL, *pp = NULL;
+ const char *effective;
p = strjoin(params->prefix[type], "/", *rt);
if (!p) {
if (r < 0)
goto fail;
- r = mkdir_label(p, context->directories[type].mode);
- if (r < 0 && r != -EEXIST)
- goto fail;
+ if (context->dynamic_user && type != EXEC_DIRECTORY_CONFIGURATION) {
+ _cleanup_free_ char *private_root = NULL, *relative = NULL, *parent = NULL;
+
+ /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
+ * want to avoid leaving a directory around fully accessible that is owned by a dynamic user
+ * whose UID is later on reused. To lock this down we use the same trick used by container
+ * managers to prohibit host users to get access to files of the same UID in containers: we
+ * place everything inside a directory that has an access mode of 0700 and is owned root:root,
+ * so that it acts as security boundary for unprivileged host code. We then use fs namespacing
+ * to make this directory permeable for the service itself.
+ *
+ * Specifically: for a service which wants a special directory "foo/" we first create a
+ * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of
+ * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way,
+ * privileged host users can access "foo/" as usual, but unprivileged host users can't look
+ * into it. Inside of the namespaceof the container "private/" is replaced by a more liberally
+ * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus
+ * disabling the access boundary for the service and making sure it only gets access to the
+ * dirs it needs but no others. Tricky? Yes, absolutely, but it works!
+ *
+ * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be
+ * owned by the service itself. */
+
+ private_root = strjoin(params->prefix[type], "/private");
+ if (!private_root) {
+ r = -ENOMEM;
+ goto fail;
+ }
+
+ /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
+ r = mkdir_safe_label(private_root, 0700, 0, 0);
+ if (r < 0)
+ goto fail;
+
+ pp = strjoin(private_root, "/", *rt);
+ if (!pp) {
+ r = -ENOMEM;
+ goto fail;
+ }
+
+ /* Create all directories between the configured directory and this private root, and mark them 0755 */
+ r = mkdir_parents_label(pp, 0755);
+ if (r < 0)
+ goto fail;
+
+ /* Finally, create the actual directory for the service */
+ r = mkdir_label(pp, context->directories[type].mode);
+ if (r < 0 && r != -EEXIST)
+ goto fail;
+
+ parent = dirname_malloc(p);
+ if (!parent) {
+ r = -ENOMEM;
+ goto fail;
+ }
+
+ r = path_make_relative(parent, pp, &relative);
+ if (r < 0)
+ goto fail;
+
+ /* And link it up from the original place */
+ r = symlink_idempotent(relative, p);
+ if (r < 0)
+ goto fail;
+
+ effective = pp;
+
+ } else {
+ r = mkdir_label(p, context->directories[type].mode);
+ if (r < 0 && r != -EEXIST)
+ goto fail;
+
+ effective = p;
+ }
/* First lock down the access mode */
- if (chmod(p, context->directories[type].mode) < 0) {
+ if (chmod(effective, context->directories[type].mode) < 0) {
r = -errno;
goto fail;
}
continue;
/* Then, change the ownership of the whole tree, if necessary */
- r = path_chown_recursive(p, uid, gid);
+ r = path_chown_recursive(effective, uid, gid);
if (r < 0)
goto fail;
}
return 0;
}
+static int compile_bind_mounts(
+ const ExecContext *context,
+ const ExecParameters *params,
+ BindMount **ret_bind_mounts,
+ unsigned *ret_n_bind_mounts,
+ char ***ret_empty_directories) {
+
+ _cleanup_strv_free_ char **empty_directories = NULL;
+ BindMount *bind_mounts;
+ unsigned n, h = 0, i;
+ ExecDirectoryType t;
+ int r;
+
+ assert(context);
+ assert(params);
+ assert(ret_bind_mounts);
+ assert(ret_n_bind_mounts);
+ assert(ret_empty_directories);
+
+ n = context->n_bind_mounts;
+ for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
+ if (!params->prefix[t])
+ continue;
+
+ n += strv_length(context->directories[t].paths);
+ }
+
+ if (n <= 0) {
+ *ret_bind_mounts = NULL;
+ *ret_n_bind_mounts = 0;
+ *ret_empty_directories = NULL;
+ return 0;
+ }
+
+ bind_mounts = new(BindMount, n);
+ if (!bind_mounts)
+ return -ENOMEM;
+
+ for (i = 0; context->n_bind_mounts; i++) {
+ BindMount *item = context->bind_mounts + i;
+ char *s, *d;
+
+ s = strdup(item->source);
+ if (!s) {
+ r = -ENOMEM;
+ goto finish;
+ }
+
+ d = strdup(item->destination);
+ if (!d) {
+ free(s);
+ r = -ENOMEM;
+ goto finish;
+ }
+
+ bind_mounts[h++] = (BindMount) {
+ .source = s,
+ .destination = d,
+ .read_only = item->read_only,
+ .recursive = item->recursive,
+ .ignore_enoent = item->ignore_enoent,
+ };
+ }
+
+ for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
+ char **suffix;
+
+ if (!params->prefix[t])
+ continue;
+
+ if (strv_isempty(context->directories[t].paths))
+ continue;
+
+ if (context->dynamic_user && t != EXEC_DIRECTORY_CONFIGURATION) {
+ char *private_root;
+
+ /* So this is for a dynamic user, and we need to make sure the process can access its own
+ * directory. For that we overmount the usually inaccessible "private" subdirectory with a
+ * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
+
+ private_root = strjoin(params->prefix[t], "/private");
+ if (!private_root) {
+ r = -ENOMEM;
+ goto finish;
+ }
+
+ r = strv_consume(&empty_directories, private_root);
+ if (r < 0) {
+ r = -ENOMEM;
+ goto finish;
+ }
+ }
+
+ STRV_FOREACH(suffix, context->directories[t].paths) {
+ char *s, *d;
+
+ if (context->dynamic_user && t != EXEC_DIRECTORY_CONFIGURATION)
+ s = strjoin(params->prefix[t], "/private/", *suffix);
+ else
+ s = strjoin(params->prefix[t], "/", *suffix);
+ if (!s) {
+ r = -ENOMEM;
+ goto finish;
+ }
+
+ d = strdup(s);
+ if (!d) {
+ free(s);
+ r = -ENOMEM;
+ goto finish;
+ }
+
+ bind_mounts[h++] = (BindMount) {
+ .source = s,
+ .destination = d,
+ .read_only = false,
+ .recursive = true,
+ .ignore_enoent = false,
+ };
+ }
+ }
+
+ assert(h == n);
+
+ *ret_bind_mounts = bind_mounts;
+ *ret_n_bind_mounts = n;
+ *ret_empty_directories = empty_directories;
+
+ empty_directories = NULL;
+
+ return (int) n;
+
+finish:
+ bind_mount_free_many(bind_mounts, h);
+ return r;
+}
+
static int apply_mount_namespace(
Unit *u,
ExecCommand *command,
const ExecParameters *params,
ExecRuntime *runtime) {
- _cleanup_strv_free_ char **rw = NULL;
+ _cleanup_strv_free_ char **rw = NULL, **empty_directories = NULL;
char *tmp = NULL, *var = NULL;
const char *root_dir = NULL, *root_image = NULL;
NameSpaceInfo ns_info = {
.mount_apivfs = context->mount_apivfs,
};
bool needs_sandboxing;
+ BindMount *bind_mounts = NULL;
+ unsigned n_bind_mounts = 0;
int r;
assert(context);
root_dir = context->root_directory;
}
+ r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
+ if (r < 0)
+ return r;
+
/*
* If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
* sandbox info, otherwise enforce it, don't ignore protected paths and
&ns_info, rw,
needs_sandboxing ? context->read_only_paths : NULL,
needs_sandboxing ? context->inaccessible_paths : NULL,
- context->bind_mounts,
- context->n_bind_mounts,
+ empty_directories,
+ bind_mounts,
+ n_bind_mounts,
tmp,
var,
needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
context->mount_flags,
DISSECT_IMAGE_DISCARD_ON_LOOP);
+ bind_mount_free_many(bind_mounts, n_bind_mounts);
+
/* If we couldn't set up the namespace this is probably due to a
* missing capability. In this case, silently proceeed. */
if (IN_SET(r, -EPERM, -EACCES)) {
if (!p)
return -ENOMEM;
- /* We execute this synchronously, since we need to be
- * sure this is gone when we start the service
+ /* We execute this synchronously, since we need to be sure this is gone when we start the service
* next. */
(void) rm_rf(p, REMOVE_ROOT);
+
+ /* Also destroy any matching subdirectory below /private/. This is done to support DynamicUser=1
+ * setups. Note that we don't conditionalize here on that though, as the namespace is same way, and it
+ * makes us a bit more robust towards changing unit settings. Or to say this differently: in the worst
+ * case this is a NOP. */
+
+ free(p);
+ p = strjoin(runtime_prefix, "/private/", *i);
+ if (!p)
+ return -ENOMEM;
+
+ (void) rm_rf(p, REMOVE_ROOT);
}
return 0;
PRIVATE_VAR_TMP,
PRIVATE_DEV,
BIND_DEV,
+ EMPTY_DIR,
SYSFS,
PROCFS,
READONLY,
return 0;
}
+static int append_empty_dir_mounts(MountEntry **p, char **strv) {
+ char **i;
+
+ assert(p);
+
+ /* Adds tmpfs mounts to provide readable but empty directories. This is primarily used to implement the
+ * "/private/" boundary directories for DynamicUser=1. */
+
+ STRV_FOREACH(i, strv) {
+
+ *((*p)++) = (MountEntry) {
+ .path_const = *i,
+ .mode = EMPTY_DIR,
+ .ignore = false,
+ .has_prefix = false,
+ .read_only = true,
+ };
+ }
+
+ return 0;
+}
+
static int append_bind_mounts(MountEntry **p, const BindMount *binds, unsigned n) {
unsigned i;
return 1;
}
+static int mount_empty_dir(MountEntry *m) {
+ assert(m);
+
+ /* First, get rid of everything that is below if there is anything. Then, overmount with our new empty dir */
+
+ (void) mkdir_p_label(mount_entry_path(m), 0755);
+ (void) umount_recursive(mount_entry_path(m), 0);
+
+ if (mount("tmpfs", mount_entry_path(m), "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
+ return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
+
+ return 1;
+}
+
static int mount_entry_chase(
const char *root_directory,
MountEntry *m,
make = true;
break;
+ case EMPTY_DIR:
+ return mount_empty_dir(m);
+
case PRIVATE_TMP:
what = tmp_dir;
make = true;
char** read_write_paths,
char** read_only_paths,
char** inaccessible_paths,
+ char** empty_directories,
const BindMount *bind_mounts,
unsigned n_bind_mounts,
const char* tmp_dir,
strv_length(read_write_paths) +
strv_length(read_only_paths) +
strv_length(inaccessible_paths) +
+ strv_length(empty_directories) +
n_bind_mounts +
ns_info->private_dev +
(ns_info->protect_kernel_tunables ? ELEMENTSOF(protect_kernel_tunables_table) : 0) +
char** read_write_paths,
char** read_only_paths,
char** inaccessible_paths,
+ char** empty_directories,
const BindMount *bind_mounts,
unsigned n_bind_mounts,
const char* tmp_dir,
read_write_paths,
read_only_paths,
inaccessible_paths,
+ empty_directories,
bind_mounts, n_bind_mounts,
tmp_dir, var_tmp_dir,
protect_home, protect_system);
if (r < 0)
goto finish;
+ r = append_empty_dir_mounts(&m, empty_directories);
+ if (r < 0)
+ goto finish;
+
r = append_bind_mounts(&m, bind_mounts, n_bind_mounts);
if (r < 0)
goto finish;