From 96bedbe2e5301fe0e93993de0e9a31baf2679168 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 14 Sep 2017 10:18:57 +0200 Subject: [PATCH] nspawn: replace syscall blacklist by a whitelist Let's lock things down a bit, and maintain a list of what's permitted rather than a list of what's prohibited in nspawn (also to make things a bit more like Docker and friends). Note that this slightly alters the effect of --system-call-filter=, as the negative list now takes precedence over the positive list. However, given that the option is just a few days old and not included in any released version it should be fine to change it at this point in time. Note that the whitelist is a good chunk more restrictive than the previous blacklist. Specifically: - fanotify is not permitted (given the buffer size issues it's problematic in containers) - nfsservctl is not permitted (NFS server support is not virtualized) - pkey_xyz stuff is not permitted (really new stuff I don't grok) - @cpu-emulation is prohibited (untested legacy stuff mostly, and if people really want to run dosemu in nspawn, they should use --system-call-filter=@cpu-emulation and all should be good) --- man/systemd-nspawn.xml | 6 +- src/nspawn/nspawn-seccomp.c | 193 +++++++++++++++++++++++++++++++++++--------- 2 files changed, 159 insertions(+), 40 deletions(-) diff --git a/man/systemd-nspawn.xml b/man/systemd-nspawn.xml index c4db6a3..3951e32 100644 --- a/man/systemd-nspawn.xml +++ b/man/systemd-nspawn.xml @@ -723,9 +723,9 @@ system calls will be permitted. The list may optionally be prefixed by ~, in which case all listed system calls are prohibited. If this command line option is used multiple times the configured lists are combined. If both a positive and a negative list (that is one system call list without and one with the - ~ prefix) are configured, the positive list takes precedence over the negative list. 
Note - that systemd-nspawn always implements a system call blacklist (as opposed to a whitelist), - and this command line option hence adds or removes entries from the default blacklist, depending on the + ~ prefix) are configured, the negative list takes precedence over the positive list. Note + that systemd-nspawn always implements a system call whitelist (as opposed to a blacklist), + and this command line option hence adds or removes entries from the default whitelist, depending on the ~ prefix. Note that the applied system call filter is also altered implicitly if additional capabilities are passed using the --capabilities=. diff --git a/src/nspawn/nspawn-seccomp.c b/src/nspawn/nspawn-seccomp.c index a6f7a7d..db3d098 100644 --- a/src/nspawn/nspawn-seccomp.c +++ b/src/nspawn/nspawn-seccomp.c @@ -47,47 +47,154 @@ static int seccomp_add_default_syscall_filter( static const struct { uint64_t capability; const char* name; - } blacklist[] = { - { 0, "@obsolete" }, - { 0, "@keyring" }, /* keyring is not namespaced */ - { 0, "bpf" }, - { 0, "kexec_file_load" }, - { 0, "kexec_load" }, - { 0, "lookup_dcookie" }, - { 0, "open_by_handle_at" }, - { 0, "perf_event_open" }, - { 0, "quotactl" }, - { 0, "@swap" }, - { CAP_SYSLOG, "syslog" }, - { CAP_SYS_MODULE, "@module" }, - { CAP_SYS_PACCT, "acct" }, - { CAP_SYS_PTRACE, "process_vm_readv" }, - { CAP_SYS_PTRACE, "process_vm_writev" }, - { CAP_SYS_PTRACE, "ptrace" }, - { CAP_SYS_RAWIO, "@raw-io" }, - { CAP_SYS_TIME, "@clock" }, + } whitelist[] = { + /* Let's use set names where we can */ + { 0, "@basic-io" }, + { 0, "@credentials" }, + { 0, "@default" }, + { 0, "@file-system" }, + { 0, "@io-event" }, + { 0, "@ipc" }, + { 0, "@mount" }, + { 0, "@network-io" }, + { 0, "@process" }, + { 0, "@resources" }, + { 0, "@setuid" }, + { 0, "@signal" }, + { 0, "@timer" }, + + /* The following four are sets we optionally enable, in case the caps have been configured for it */ + { CAP_SYS_TIME, "@clock" }, + { CAP_SYS_MODULE, "@module" }, + 
{ CAP_SYS_RAWIO, "@raw-io" }, + { CAP_IPC_LOCK, "@memlock" }, + + /* Plus a good set of additional syscalls which are not part of any of the groups above */ + { 0, "brk" }, + { 0, "capset" }, + { 0, "chown" }, + { 0, "chown32" }, + { 0, "copy_file_range" }, + { 0, "fadvise64" }, + { 0, "fadvise64_64" }, + { 0, "fchown" }, + { 0, "fchown32" }, + { 0, "fchownat" }, + { 0, "fdatasync" }, + { 0, "flock" }, + { 0, "fsync" }, + { 0, "get_mempolicy" }, + { 0, "getcpu" }, + { 0, "getpriority" }, + { 0, "getrandom" }, + { 0, "io_cancel" }, + { 0, "io_destroy" }, + { 0, "io_getevents" }, + { 0, "io_setup" }, + { 0, "io_submit" }, + { 0, "ioctl" }, + { 0, "ioprio_get" }, + { 0, "kcmp" }, + { 0, "lchown" }, + { 0, "lchown32" }, + { 0, "madvise" }, + { 0, "mincore" }, + { 0, "mprotect" }, + { 0, "mremap" }, + { 0, "msync" }, + { 0, "name_to_handle_at" }, + { 0, "oldolduname" }, + { 0, "olduname" }, + { 0, "personality" }, + { 0, "preadv2" }, + { 0, "pwritev2" }, + { 0, "readahead" }, + { 0, "readdir" }, + { 0, "remap_file_pages" }, + { 0, "sched_get_priority_max" }, + { 0, "sched_get_priority_min" }, + { 0, "sched_getaffinity" }, + { 0, "sched_getattr" }, + { 0, "sched_getparam" }, + { 0, "sched_getscheduler" }, + { 0, "sched_rr_get_interval" }, + { 0, "sched_yield" }, + { 0, "seccomp" }, + { 0, "sendfile" }, + { 0, "sendfile64" }, + { 0, "setdomainname" }, + { 0, "setfsgid" }, + { 0, "setfsgid32" }, + { 0, "setfsuid" }, + { 0, "setfsuid32" }, + { 0, "sethostname" }, + { 0, "setpgid" }, + { 0, "setsid" }, + { 0, "splice" }, + { 0, "sync" }, + { 0, "sync_file_range" }, + { 0, "syncfs" }, + { 0, "sysinfo" }, + { 0, "tee" }, + { 0, "ugetrlimit" }, + { 0, "umask" }, + { 0, "uname" }, + { 0, "userfaultfd" }, + { 0, "vmsplice" }, + + /* The following individual syscalls are added depending on specified caps */ + { CAP_SYS_PACCT, "acct" }, + { CAP_SYS_PTRACE, "process_vm_readv" }, + { CAP_SYS_PTRACE, "process_vm_writev" }, + { CAP_SYS_PTRACE, "ptrace" }, + { CAP_SYS_BOOT, "reboot" }, 
+ { CAP_SYSLOG, "syslog" }, + { CAP_SYS_TTY_CONFIG, "vhangup" }, + + /* + * The following syscalls and groups are knowingly excluded: + * + * @cpu-emulation + * @keyring (NB: keyring is not namespaced!) + * @obsolete + * @swap + * + * bpf (NB: bpffs is not namespaced!) + * fanotify_init + * fanotify_mark + * kexec_file_load + * kexec_load + * lookup_dcookie + * nfsservctl + * open_by_handle_at + * perf_event_open + * pkey_alloc + * pkey_free + * pkey_mprotect + * quotactl + */ }; int r, c = 0; size_t i; char **p; - for (i = 0; i < ELEMENTSOF(blacklist); i++) { - if (blacklist[i].capability != 0 && (cap_list_retain & (1ULL << blacklist[i].capability))) + for (i = 0; i < ELEMENTSOF(whitelist); i++) { + if (whitelist[i].capability != 0 && (cap_list_retain & (1ULL << whitelist[i].capability)) == 0) continue; - r = seccomp_add_syscall_filter_item(ctx, blacklist[i].name, SCMP_ACT_ERRNO(EPERM), syscall_whitelist); + r = seccomp_add_syscall_filter_item(ctx, whitelist[i].name, SCMP_ACT_ALLOW, syscall_blacklist); if (r < 0) /* If the system call is not known on this architecture, then that's fine, let's ignore it */ - log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", blacklist[i].name); + log_debug_errno(r, "Failed to add rule for system call %s on %s, ignoring: %m", whitelist[i].name, seccomp_arch_to_string(arch)); else c++; } - STRV_FOREACH(p, syscall_blacklist) { - r = seccomp_add_syscall_filter_item(ctx, *p, SCMP_ACT_ERRNO(EPERM), syscall_whitelist); + STRV_FOREACH(p, syscall_whitelist) { + r = seccomp_add_syscall_filter_item(ctx, *p, SCMP_ACT_ALLOW, syscall_blacklist); if (r < 0) - log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", *p); + log_debug_errno(r, "Failed to add rule for system call %s on %s, ignoring: %m", *p, seccomp_arch_to_string(arch)); else c++; } @@ -106,17 +213,32 @@ int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **sys SECCOMP_FOREACH_LOCAL_ARCH(arch) { 
_cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; - int n; - log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch)); + log_debug("Applying whitelist on architecture: %s", seccomp_arch_to_string(arch)); - r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW); + r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ERRNO(EPERM)); if (r < 0) return log_error_errno(r, "Failed to allocate seccomp object: %m"); - n = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain, syscall_whitelist, syscall_blacklist); - if (n < 0) - return n; + r = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain, syscall_whitelist, syscall_blacklist); + if (r < 0) + return r; + + r = seccomp_load(seccomp); + if (IN_SET(r, -EPERM, -EACCES)) + return log_error_errno(r, "Failed to install seccomp filter: %m"); + if (r < 0) + log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch)); + } + + SECCOMP_FOREACH_LOCAL_ARCH(arch) { + _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL; + + log_debug("Applying NETLINK_AUDIT mask on architecture: %s", seccomp_arch_to_string(arch)); + + r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW); + if (r < 0) + return log_error_errno(r, "Failed to allocate seccomp object: %m"); /* Audit is broken in containers, much of the userspace audit hookup will fail if running inside a @@ -133,13 +255,10 @@ int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **sys 2, SCMP_A0(SCMP_CMP_EQ, AF_NETLINK), SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT)); - if (r < 0) + if (r < 0) { log_debug_errno(r, "Failed to add audit seccomp rule, ignoring: %m"); - else - n++; - - if (n <= 0) /* no rule added? then skip this architecture */ continue; + } r = seccomp_load(seccomp); if (IN_SET(r, -EPERM, -EACCES)) -- 2.7.4