core: when applying syscall filters, use ENOSYS for unknown calls
author: Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl>
Fri, 1 Dec 2023 18:03:23 +0000 (19:03 +0100)
committer: Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl>
Sat, 2 Dec 2023 00:20:28 +0000 (01:20 +0100)
glibc started using fchmodat2 to implement fchmod with flags [1], but
the current version of libseccomp does not support fchmodat2 [2]. This is
causing problems with programs sandboxed by systemd. libseccomp needs to know
a syscall to be able to set any kind of filter for it, so for syscalls unknown
by libseccomp we would always do the default action, i.e. either return the
errno set by SystemCallErrorNumber or send a fatal signal. For glibc to ignore
the unknown syscall and gracefully fall back to the older implementation,
we need to return ENOSYS. In particular, tar now fails with the default
SystemCallFilter="@system-service" sandbox [3].

This is of course a wider problem: any time the kernel gains new syscalls,
before libseccomp and systemd have caught up, we'd behave incorrectly. Let's
do the same as we already were doing in nspawn since
3573e032f26724949e86626eace058d006b8bf70, and do the "default action" only
for syscalls which are known by us and libseccomp, and return ENOSYS for
anything else. This means that users can start using a sandbox with the new
syscalls only after libseccomp and systemd have been updated, but before that
happens they get behaviour that is backwards-compatible.

[1] https://github.com/bminor/glibc/commit/65341f7bbea824d2ff9d37db15d8be162df42bd3
[2] https://github.com/seccomp/libseccomp/issues/406
[3] https://github.com/systemd/systemd/issues/30250

Fixes https://github.com/systemd/systemd/issues/30250.

In seccomp_restrict_sxid() there's a chunk conditionalized with
'#if defined(__SNR_fchmodat2)'. We need to keep that because
seccomp_restrict_sxid()/seccomp_restrict_suid_sgid() uses SCMP_ACT_ALLOW as the default action.

src/shared/seccomp-util.c

index bb970d5..95c704d 100644 (file)
@@ -1129,7 +1129,9 @@ int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* filter
 
                 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
 
-                r = seccomp_init_for_arch(&seccomp, arch, default_action);
+                /* We install ENOSYS as the default action, but it will only apply to syscalls which are not
+                 * in the @known set. */
+                r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ERRNO(ENOSYS));
                 if (r < 0)
                         return r;
 
@@ -1164,6 +1166,23 @@ int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* filter
                         }
                 }
 
+                NULSTR_FOREACH(name, syscall_filter_sets[SYSCALL_FILTER_SET_KNOWN].value) {
+                        int id;
+
+                        id = seccomp_syscall_resolve_name(name);
+                        if (id < 0)
+                                continue;
+
+                        /* Ignore the syscall if it was already handled above */
+                        if (hashmap_contains(filter, INT_TO_PTR(id + 1)))
+                                continue;
+
+                        r = seccomp_rule_add_exact(seccomp, default_action, id, 0);
+                        if (r < 0 && r != -EDOM)  /* EDOM means that the syscall is not available for arch */
+                                return log_debug_errno(r, "Failed to add rule for system call %s() / %d: %m",
+                                                       name, id);
+                }
+
                 r = seccomp_load(seccomp);
                 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
                         return r;