nspawn: replace syscall blacklist by a whitelist
author Lennart Poettering <lennart@poettering.net>
Thu, 14 Sep 2017 08:18:57 +0000 (10:18 +0200)
committer Lennart Poettering <lennart@poettering.net>
Thu, 14 Sep 2017 13:45:21 +0000 (15:45 +0200)
Let's lock things down a bit, and maintain a list of what's permitted
rather than a list of what's prohibited in nspawn (also to make things a
bit more like Docker and friends).

Note that this slightly alters the effect of --system-call-filter=, as
the negative list now takes precedence over the positive list.
However, given that the option is just a few days old and not included
in any released version it should be fine to change it at this point in
time.

Note that the whitelist is a good chunk more restrictive than the
previous blacklist. Specifically:

- fanotify is not permitted (given the buffer size issues it's
  problematic in containers)
- nfsservctl is not permitted (NFS server support is not virtualized)
- pkey_xyz stuff is not permitted (really new stuff I don't grok)
- @cpu-emulation is prohibited (untested legacy stuff mostly, and if
  people really want to run dosemu in nspawn, they should use
  --system-call-filter=@cpu-emulation and all should be good)

man/systemd-nspawn.xml
src/nspawn/nspawn-seccomp.c

index c4db6a3..3951e32 100644 (file)
         system calls will be permitted. The list may optionally be prefixed by <literal>~</literal>, in which case all
         listed system calls are prohibited. If this command line option is used multiple times the configured lists are
         combined. If both a positive and a negative list (that is one system call list without and one with the
-        <literal>~</literal> prefix) are configured, the positive list takes precedence over the negative list. Note
-        that <command>systemd-nspawn</command> always implements a system call blacklist (as opposed to a whitelist),
-        and this command line option hence adds or removes entries from the default blacklist, depending on the
+        <literal>~</literal> prefix) are configured, the negative list takes precedence over the positive list. Note
+        that <command>systemd-nspawn</command> always implements a system call whitelist (as opposed to a blacklist),
+        and this command line option hence adds or removes entries from the default whitelist, depending on the
         <literal>~</literal> prefix. Note that the applied system call filter is also altered implicitly if additional
         capabilities are passed using the <command>--capabilities=</command>.</para></listitem>
       </varlistentry>
index a6f7a7d..db3d098 100644 (file)
@@ -47,47 +47,154 @@ static int seccomp_add_default_syscall_filter(
         static const struct {
                 uint64_t capability;
                 const char* name;
-        } blacklist[] = {
-                { 0,              "@obsolete"           },
-                { 0,              "@keyring"            }, /* keyring is not namespaced */
-                { 0,              "bpf"                 },
-                { 0,              "kexec_file_load"     },
-                { 0,              "kexec_load"          },
-                { 0,              "lookup_dcookie"      },
-                { 0,              "open_by_handle_at"   },
-                { 0,              "perf_event_open"     },
-                { 0,              "quotactl"            },
-                { 0,              "@swap"               },
-                { CAP_SYSLOG,     "syslog"              },
-                { CAP_SYS_MODULE, "@module"             },
-                { CAP_SYS_PACCT,  "acct"                },
-                { CAP_SYS_PTRACE, "process_vm_readv"    },
-                { CAP_SYS_PTRACE, "process_vm_writev"   },
-                { CAP_SYS_PTRACE, "ptrace"              },
-                { CAP_SYS_RAWIO,  "@raw-io"             },
-                { CAP_SYS_TIME,   "@clock"              },
+        } whitelist[] = {
+                /* Let's use set names where we can */
+                { 0,                  "@basic-io"              },
+                { 0,                  "@credentials"           },
+                { 0,                  "@default"               },
+                { 0,                  "@file-system"           },
+                { 0,                  "@io-event"              },
+                { 0,                  "@ipc"                   },
+                { 0,                  "@mount"                 },
+                { 0,                  "@network-io"            },
+                { 0,                  "@process"               },
+                { 0,                  "@resources"             },
+                { 0,                  "@setuid"                },
+                { 0,                  "@signal"                },
+                { 0,                  "@timer"                 },
+
+                /* The following four are sets we optionally enable, in case the caps have been configured for it */
+                { CAP_SYS_TIME,       "@clock"                 },
+                { CAP_SYS_MODULE,     "@module"                },
+                { CAP_SYS_RAWIO,      "@raw-io"                },
+                { CAP_IPC_LOCK,       "@memlock"               },
+
+                /* Plus a good set of additional syscalls which are not part of any of the groups above */
+                { 0,                  "brk"                    },
+                { 0,                  "capset"                 },
+                { 0,                  "chown"                  },
+                { 0,                  "chown32"                },
+                { 0,                  "copy_file_range"        },
+                { 0,                  "fadvise64"              },
+                { 0,                  "fadvise64_64"           },
+                { 0,                  "fchown"                 },
+                { 0,                  "fchown32"               },
+                { 0,                  "fchownat"               },
+                { 0,                  "fdatasync"              },
+                { 0,                  "flock"                  },
+                { 0,                  "fsync"                  },
+                { 0,                  "get_mempolicy"          },
+                { 0,                  "getcpu"                 },
+                { 0,                  "getpriority"            },
+                { 0,                  "getrandom"              },
+                { 0,                  "io_cancel"              },
+                { 0,                  "io_destroy"             },
+                { 0,                  "io_getevents"           },
+                { 0,                  "io_setup"               },
+                { 0,                  "io_submit"              },
+                { 0,                  "ioctl"                  },
+                { 0,                  "ioprio_get"             },
+                { 0,                  "kcmp"                   },
+                { 0,                  "lchown"                 },
+                { 0,                  "lchown32"               },
+                { 0,                  "madvise"                },
+                { 0,                  "mincore"                },
+                { 0,                  "mprotect"               },
+                { 0,                  "mremap"                 },
+                { 0,                  "msync"                  },
+                { 0,                  "name_to_handle_at"      },
+                { 0,                  "oldolduname"            },
+                { 0,                  "olduname"               },
+                { 0,                  "personality"            },
+                { 0,                  "preadv2"                },
+                { 0,                  "pwritev2"               },
+                { 0,                  "readahead"              },
+                { 0,                  "readdir"                },
+                { 0,                  "remap_file_pages"       },
+                { 0,                  "sched_get_priority_max" },
+                { 0,                  "sched_get_priority_min" },
+                { 0,                  "sched_getaffinity"      },
+                { 0,                  "sched_getattr"          },
+                { 0,                  "sched_getparam"         },
+                { 0,                  "sched_getscheduler"     },
+                { 0,                  "sched_rr_get_interval"  },
+                { 0,                  "sched_yield"            },
+                { 0,                  "seccomp"                },
+                { 0,                  "sendfile"               },
+                { 0,                  "sendfile64"             },
+                { 0,                  "setdomainname"          },
+                { 0,                  "setfsgid"               },
+                { 0,                  "setfsgid32"             },
+                { 0,                  "setfsuid"               },
+                { 0,                  "setfsuid32"             },
+                { 0,                  "sethostname"            },
+                { 0,                  "setpgid"                },
+                { 0,                  "setsid"                 },
+                { 0,                  "splice"                 },
+                { 0,                  "sync"                   },
+                { 0,                  "sync_file_range"        },
+                { 0,                  "syncfs"                 },
+                { 0,                  "sysinfo"                },
+                { 0,                  "tee"                    },
+                { 0,                  "ugetrlimit"             },
+                { 0,                  "umask"                  },
+                { 0,                  "uname"                  },
+                { 0,                  "userfaultfd"            },
+                { 0,                  "vmsplice"               },
+
+                /* The following individual syscalls are added depending on specified caps */
+                { CAP_SYS_PACCT,      "acct"                   },
+                { CAP_SYS_PTRACE,     "process_vm_readv"       },
+                { CAP_SYS_PTRACE,     "process_vm_writev"      },
+                { CAP_SYS_PTRACE,     "ptrace"                 },
+                { CAP_SYS_BOOT,       "reboot"                 },
+                { CAP_SYSLOG,         "syslog"                 },
+                { CAP_SYS_TTY_CONFIG, "vhangup"                },
+
+                /*
+                 * The following syscalls and groups are knowingly excluded:
+                 *
+                 * @cpu-emulation
+                 * @keyring           (NB: keyring is not namespaced!)
+                 * @obsolete
+                 * @swap
+                 *
+                 * bpf                (NB: bpffs is not namespaced!)
+                 * fanotify_init
+                 * fanotify_mark
+                 * kexec_file_load
+                 * kexec_load
+                 * lookup_dcookie
+                 * nfsservctl
+                 * open_by_handle_at
+                 * perf_event_open
+                 * pkey_alloc
+                 * pkey_free
+                 * pkey_mprotect
+                 * quotactl
+                 */
         };
 
         int r, c = 0;
         size_t i;
         char **p;
 
-        for (i = 0; i < ELEMENTSOF(blacklist); i++) {
-                if (blacklist[i].capability != 0 && (cap_list_retain & (1ULL << blacklist[i].capability)))
+        for (i = 0; i < ELEMENTSOF(whitelist); i++) {
+                if (whitelist[i].capability != 0 && (cap_list_retain & (1ULL << whitelist[i].capability)) == 0)
                         continue;
 
-                r = seccomp_add_syscall_filter_item(ctx, blacklist[i].name, SCMP_ACT_ERRNO(EPERM), syscall_whitelist);
+                r = seccomp_add_syscall_filter_item(ctx, whitelist[i].name, SCMP_ACT_ALLOW, syscall_blacklist);
                 if (r < 0)
                         /* If the system call is not known on this architecture, then that's fine, let's ignore it */
-                        log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", blacklist[i].name);
+                        log_debug_errno(r, "Failed to add rule for system call %s on %s, ignoring: %m", whitelist[i].name, seccomp_arch_to_string(arch));
                 else
                         c++;
         }
 
-        STRV_FOREACH(p, syscall_blacklist) {
-                r = seccomp_add_syscall_filter_item(ctx, *p, SCMP_ACT_ERRNO(EPERM), syscall_whitelist);
+        STRV_FOREACH(p, syscall_whitelist) {
+                r = seccomp_add_syscall_filter_item(ctx, *p, SCMP_ACT_ALLOW, syscall_blacklist);
                 if (r < 0)
-                        log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", *p);
+                        log_debug_errno(r, "Failed to add rule for system call %s on %s, ignoring: %m", *p, seccomp_arch_to_string(arch));
                 else
                         c++;
         }
@@ -106,17 +213,32 @@ int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **sys
 
         SECCOMP_FOREACH_LOCAL_ARCH(arch) {
                 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
-                int n;
 
-                log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
+                log_debug("Applying whitelist on architecture: %s", seccomp_arch_to_string(arch));
 
-                r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
+                r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ERRNO(EPERM));
                 if (r < 0)
                         return log_error_errno(r, "Failed to allocate seccomp object: %m");
 
-                n = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain, syscall_whitelist, syscall_blacklist);
-                if (n < 0)
-                        return n;
+                r = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain, syscall_whitelist, syscall_blacklist);
+                if (r < 0)
+                        return r;
+
+                r = seccomp_load(seccomp);
+                if (IN_SET(r, -EPERM, -EACCES))
+                        return log_error_errno(r, "Failed to install seccomp filter: %m");
+                if (r < 0)
+                        log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+        }
+
+        SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+                _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
+
+                log_debug("Applying NETLINK_AUDIT mask on architecture: %s", seccomp_arch_to_string(arch));
+
+                r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to allocate seccomp object: %m");
 
                 /*
                   Audit is broken in containers, much of the userspace audit hookup will fail if running inside a
@@ -133,13 +255,10 @@ int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **sys
                                 2,
                                 SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                                 SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
-                if (r < 0)
+                if (r < 0) {
                         log_debug_errno(r, "Failed to add audit seccomp rule, ignoring: %m");
-                else
-                        n++;
-
-                if (n <= 0) /* no rule added? then skip this architecture */
                         continue;
+                }
 
                 r = seccomp_load(seccomp);
                 if (IN_SET(r, -EPERM, -EACCES))