2 * sandbox.c: Process sandboxing
4 * Copyright (C) 2017 Colin Watson.
6 * This file is part of man-db.
8 * man-db is free software; you can redistribute it and/or modify it
9 * under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * man-db is distributed in the hope that it will be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with man-db; if not, write to the Free Software Foundation,
20 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 * Some of the syscall lists in this file come from systemd, whose
23 * copyright/licensing statement is as follows. Per LGPLv2.1 s. 3, I have
24 * altered the original references to LGPLv2.1 to refer to GPLv2 instead.
26 * Copyright 2014 Lennart Poettering
28 * systemd is free software; you can redistribute it and/or modify it
29 * under the terms of the GNU General Public License as published by
30 * the Free Software Foundation; either version 2 of the License, or
31 * (at your option) any later version.
33 * systemd is distributed in the hope that it will be useful, but
34 * WITHOUT ANY WARRANTY; without even the implied warranty of
35 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
36 * General Public License for more details.
38 * You should have received a copy of the GNU General Public License
39 * along with systemd; If not, see <https://www.gnu.org/licenses/>.
44 #endif /* HAVE_CONFIG_H */
49 #include <sys/types.h>
54 #ifdef HAVE_LIBSECCOMP
55 # include <sys/ioctl.h>
57 # include <sys/mman.h>
58 # include <sys/prctl.h>
60 # include <sys/socket.h>
63 #endif /* HAVE_LIBSECCOMP */
65 #include "manconfig.h"
// NOTE(review): fragment of a conditionally compiled member list; the
// enclosing declaration is not visible in this listing. The member below is
// presumably the permissive_ctx field that sandbox_init assigns and
// _sandbox_load reads -- confirm against the full file.
72 #ifdef HAVE_LIBSECCOMP
74 scmp_filter_ctx permissive_ctx;
75 #else /* !HAVE_LIBSECCOMP */
77 #endif /* HAVE_LIBSECCOMP */
80 #ifdef HAVE_LIBSECCOMP
// File-scope flag, initially 0. It is tested by can_load_seccomp and set to
// 1 by _sandbox_load after seccomp_load fails with EINVAL or EFAULT, so that
// loading a filter is not attempted again.
81 static int seccomp_filter_unavailable = 0;
// Emit a debug message stating that seccomp filtering requires a kernel
// configured with CONFIG_SECCOMP_FILTER. Takes no arguments.
83 static void gripe_seccomp_filter_unavailable (void)
85 debug ("seccomp filtering requires a kernel configured with "
86 "CONFIG_SECCOMP_FILTER\n");
// Look for the substring needle (plain strstr match) in the LD_PRELOAD
// environment variable and in the contents of /etc/ld.so.preload.
// The file contents are kept in a function-local static string that is
// filled in while it is still NULL, so later calls reuse the stored copy.
// NOTE(review): the return statements are not visible in this listing;
// callers use the result as a truth value.
89 static int search_ld_preload (const char *needle)
91 const char *ld_preload_env;
92 static char *ld_preload_file = NULL;
// First source: the environment variable.
94 ld_preload_env = getenv ("LD_PRELOAD");
95 if (ld_preload_env && strstr (ld_preload_env, needle) != NULL)
// Second source: the system-wide preload file, loaded on first use.
98 if (!ld_preload_file) {
103 fd = open ("/etc/ld.so.preload", O_RDONLY);
// Only a successfully opened, non-empty file is mapped (read-only, private).
104 if (fd >= 0 && fstat (fd, &st) >= 0 && st.st_size)
105 mapped = mmap (NULL, st.st_size, PROT_READ,
106 MAP_PRIVATE | MAP_FILE, fd, 0);
// NOTE(review): mmap reports failure with MAP_FAILED rather than NULL;
// confirm that the guard on the hidden line before this copy tests for that.
// NOTE(review): no close of fd is visible in this listing -- confirm.
108 ld_preload_file = xstrndup (mapped, st.st_size);
109 munmap (mapped, st.st_size);
// Presumably the fallback when nothing was mapped: store an empty string so
// the file is not read again.
111 ld_preload_file = xstrdup ("");
115 /* This isn't very accurate: /etc/ld.so.preload may contain
116 * comments. On the other hand, glibc says "it should only be used
117 * for emergencies and testing". File a bug if this is a problem
120 if (strstr (ld_preload_file, needle) != NULL)
126 /* Can we load a seccomp filter into this process?
128 * This guard allows us to call sandbox_load in code paths that may
129 * conditionally do so again.
// Decide whether a seccomp filter can be loaded into this process.
// Checks visible here, in order:
//   1. the seccomp_filter_unavailable flag (logs via
//      gripe_seccomp_filter_unavailable),
//   2. a non-empty MAN_DISABLE_SECCOMP environment variable,
//   3. a /vgpreload entry found by search_ld_preload (Valgrind),
//   4. the value returned by prctl (PR_GET_SECCOMP).
// NOTE(review): the return statements are not visible in this listing;
// _sandbox_load uses the result as a truth value.
131 static int can_load_seccomp (void)
133 const char *man_disable_seccomp;
136 if (seccomp_filter_unavailable) {
137 gripe_seccomp_filter_unavailable ();
141 man_disable_seccomp = getenv ("MAN_DISABLE_SECCOMP");
142 if (man_disable_seccomp && *man_disable_seccomp) {
143 debug ("seccomp filter disabled by user request\n");
147 /* Valgrind causes the child process to make some system calls we
148 * don't want to allow in general, so disable seccomp when running
151 * The correct approach seems to be to either require valgrind.h at
152 * build-time or copy valgrind.h into this project and then use the
153 * RUNNING_ON_VALGRIND macro, but I'd really rather not add a
154 * build-dependency for this or take a copy of a >6000-line header
155 * file. Since the goal of this is only to disable the seccomp
156 * filter under Valgrind, this will do for now.
158 if (search_ld_preload ("/vgpreload")) {
159 debug ("seccomp filter disabled while running under "
// Ask the kernel for the current seccomp mode of this process.
164 seccomp_status = prctl (PR_GET_SECCOMP);
// Status 0 is presumably the case in which loading is permitted (the
// statement that follows is not shown).
166 if (seccomp_status == 0)
// The remaining branches only log why loading is not possible: prctl error,
// status 2 (a filter is already active), or an unexpected status value.
169 if (seccomp_status == -1) {
171 debug ("running kernel does not support seccomp\n");
173 debug ("unknown error getting seccomp status: %s\n",
175 } else if (seccomp_status == 2)
176 debug ("seccomp already enabled\n");
178 debug ("unknown return value from PR_GET_SECCOMP: %d\n",
182 #endif /* HAVE_LIBSECCOMP */
184 #ifdef HAVE_LIBSECCOMP
// Helper macros used inside make_seccomp_filter. Each one resolves a syscall
// name with seccomp_syscall_resolve_name, compares the result against
// __NR_SCMP_ERROR, and adds an SCMP_ACT_ALLOW rule to the variable named ctx
// at the expansion site; a failing seccomp_rule_add is reported through
// error (FATAL, errno, ...).
//   SC_ALLOW (name)                   rule with no argument comparison
//   SC_ALLOW_ARG_1 (name, cmp1)       rule with one argument comparison
//   SC_ALLOW_ARG_2 (name, cmp1, cmp2) rule with two argument comparisons
// NOTE(review): the statement executed when the name resolves to
// __NR_SCMP_ERROR is on a line not shown in this listing.
186 #define SC_ALLOW(name) \
188 int nr = seccomp_syscall_resolve_name (name); \
189 if (nr == __NR_SCMP_ERROR) \
191 if (seccomp_rule_add (ctx, SCMP_ACT_ALLOW, nr, 0) < 0) \
192 error (FATAL, errno, "can't add seccomp rule"); \
195 #define SC_ALLOW_ARG_1(name, cmp1) \
197 int nr = seccomp_syscall_resolve_name (name); \
198 if (nr == __NR_SCMP_ERROR) \
200 if (seccomp_rule_add (ctx, SCMP_ACT_ALLOW, nr, 1, cmp1) < 0) \
201 error (FATAL, errno, "can't add seccomp rule"); \
204 #define SC_ALLOW_ARG_2(name, cmp1, cmp2) \
206 int nr = seccomp_syscall_resolve_name (name); \
207 if (nr == __NR_SCMP_ERROR) \
209 if (seccomp_rule_add (ctx, SCMP_ACT_ALLOW, nr, \
210 2, cmp1, cmp2) < 0) \
211 error (FATAL, errno, "can't add seccomp rule"); \
214 /* Create a seccomp filter.
216 * If permissive is true, then the returned filter will allow limited file
217 * creation (although not making executable files). This obviously
218 * constitutes less effective confinement, but it's necessary for some
219 * subprocesses (such as groff) that need the ability to write to temporary
220 * files. Confining these further requires additional tools that can do
221 * path-based filtering or similar, such as AppArmor.
// Build a libseccomp filter context whose default action is SCMP_ACT_TRAP
// and populate it with SCMP_ACT_ALLOW rules through the SC_ALLOW macros.
// permissive: when non-zero, extra syscalls that create or modify files
//   (futimesat, link, mkdir, rename, rmdir, symlink, truncate, unlink, utime
//   and their variants) are allowed as well.
// The error (FATAL, ...) call after seccomp_init presumably handles a failed
// initialisation; its guard is not shown.
// NOTE(review): the embedded numbering has gaps, so this listing omits lines
// of the function, presumably including the return statement and some of the
// syscall entries.
223 static scmp_filter_ctx make_seccomp_filter (int permissive)
// Mode bits that the mode-checking rules below require to be clear:
// set-user-id, set-group-id and every execute bit.
226 mode_t mode_mask = S_ISUID | S_ISGID | S_IXUSR | S_IXGRP | S_IXOTH;
227 int create_mask = O_CREAT
230 #endif /* O_TMPFILE */
233 debug ("initialising seccomp filter (permissive: %d)\n", permissive);
234 ctx = seccomp_init (SCMP_ACT_TRAP);
236 error (FATAL, errno, "can't initialise seccomp filter");
238 /* Allow sibling architectures for x86, since people sometimes mix
239 * and match architectures there for performance reasons.
// The results of the seccomp_arch_add calls are not checked in the lines shown.
241 switch (seccomp_arch_native ()) {
243 seccomp_arch_add (ctx, SCMP_ARCH_X86_64);
244 seccomp_arch_add (ctx, SCMP_ARCH_X32);
246 case SCMP_ARCH_X86_64:
247 seccomp_arch_add (ctx, SCMP_ARCH_X86);
248 seccomp_arch_add (ctx, SCMP_ARCH_X32);
251 seccomp_arch_add (ctx, SCMP_ARCH_X86);
252 seccomp_arch_add (ctx, SCMP_ARCH_X86_64);
256 /* This sandbox is intended to allow operations that might
257 * reasonably be needed in simple data-transforming pipes: it should
258 * allow the process to do most reasonable things to itself, to read
259 * and write data from and to already-open file descriptors, to open
260 * files in read-only mode, and to fork new processes with the same
261 * restrictions. (If permissive is true, then it should also allow
262 * limited file creation; see the header comment above.)
264 * Since I currently know of no library with suitable syscall lists,
265 * the syscall lists here are taken from
266 * systemd:src/shared/seccomp-util.c, last updated from commit
267 * 67eb5b380a7b7eed82f658190bff4ca2d83e9abe (2017-11-30).
270 /* systemd: SystemCallFilter=@default */
271 SC_ALLOW ("clock_getres");
272 SC_ALLOW ("clock_gettime");
273 SC_ALLOW ("clock_nanosleep");
276 SC_ALLOW ("exit_group");
278 SC_ALLOW ("get_robust_list");
279 SC_ALLOW ("get_thread_area");
280 SC_ALLOW ("getegid");
281 SC_ALLOW ("getegid32");
282 SC_ALLOW ("geteuid");
283 SC_ALLOW ("geteuid32");
285 SC_ALLOW ("getgid32");
286 SC_ALLOW ("getgroups");
287 SC_ALLOW ("getgroups32");
288 SC_ALLOW ("getpgid");
289 SC_ALLOW ("getpgrp");
291 SC_ALLOW ("getppid");
292 SC_ALLOW ("getresgid");
293 SC_ALLOW ("getresgid32");
294 SC_ALLOW ("getresuid");
295 SC_ALLOW ("getresuid32");
296 SC_ALLOW ("getrlimit");
299 SC_ALLOW ("gettimeofday");
301 SC_ALLOW ("getuid32");
302 SC_ALLOW ("membarrier");
303 SC_ALLOW ("nanosleep");
305 SC_ALLOW ("prlimit64");
306 SC_ALLOW ("restart_syscall");
307 SC_ALLOW ("rt_sigreturn");
308 SC_ALLOW ("sched_yield");
309 SC_ALLOW ("set_robust_list");
310 SC_ALLOW ("set_thread_area");
311 SC_ALLOW ("set_tid_address");
312 SC_ALLOW ("set_tls");
313 SC_ALLOW ("sigreturn");
315 SC_ALLOW ("ugetrlimit");
317 /* systemd: SystemCallFilter=@basic-io */
318 SC_ALLOW ("_llseek");
324 SC_ALLOW ("pread64");
326 SC_ALLOW ("preadv2");
327 SC_ALLOW ("pwrite64");
328 SC_ALLOW ("pwritev");
329 SC_ALLOW ("pwritev2");
335 /* systemd: SystemCallFilter=@file-system (subset) */
// The chmod-family and creat rules match only when the mode argument has
// none of the mode_mask bits set (masked comparison against 0).
// NOTE(review): any guard on permissive for these rules is not shown.
339 SC_ALLOW_ARG_1 ("chmod",
340 SCMP_A1 (SCMP_CMP_MASKED_EQ, mode_mask, 0));
341 SC_ALLOW_ARG_1 ("creat",
342 SCMP_A1 (SCMP_CMP_MASKED_EQ, mode_mask, 0));
344 SC_ALLOW ("faccessat");
345 SC_ALLOW ("fallocate");
348 SC_ALLOW_ARG_1 ("fchmod",
349 SCMP_A1 (SCMP_CMP_MASKED_EQ, mode_mask, 0));
350 SC_ALLOW_ARG_1 ("fchmodat",
351 SCMP_A2 (SCMP_CMP_MASKED_EQ, mode_mask, 0));
354 SC_ALLOW ("fcntl64");
356 SC_ALLOW ("fstat64");
357 SC_ALLOW ("fstatat64");
358 SC_ALLOW ("fstatfs");
359 SC_ALLOW ("fstatfs64");
360 SC_ALLOW ("ftruncate");
361 SC_ALLOW ("ftruncate64");
362 if (permissive) SC_ALLOW ("futimesat");
364 SC_ALLOW ("getdents");
365 SC_ALLOW ("getdents64");
366 if (permissive) SC_ALLOW ("link");
367 if (permissive) SC_ALLOW ("linkat");
369 SC_ALLOW ("lstat64");
370 if (permissive) SC_ALLOW ("mkdir");
371 if (permissive) SC_ALLOW ("mkdirat");
375 SC_ALLOW ("newfstatat");
376 SC_ALLOW ("oldfstat");
377 SC_ALLOW ("oldlstat");
378 SC_ALLOW ("oldstat");
// open and openat with O_CREAT set in the flags argument: matched only when
// the mode argument has none of the mode_mask bits set. The O_TMPFILE rules
// that follow have the same shape.
// NOTE(review): the condition enclosing these rules is not shown.
380 SC_ALLOW_ARG_2 ("open",
381 SCMP_A1 (SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
382 SCMP_A2 (SCMP_CMP_MASKED_EQ, mode_mask, 0));
383 SC_ALLOW_ARG_2 ("openat",
384 SCMP_A2 (SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
385 SCMP_A3 (SCMP_CMP_MASKED_EQ, mode_mask, 0));
387 SC_ALLOW_ARG_2 ("open",
388 SCMP_A1 (SCMP_CMP_MASKED_EQ,
389 O_TMPFILE, O_TMPFILE),
390 SCMP_A2 (SCMP_CMP_MASKED_EQ, mode_mask, 0));
391 SC_ALLOW_ARG_2 ("openat",
392 SCMP_A2 (SCMP_CMP_MASKED_EQ,
393 O_TMPFILE, O_TMPFILE),
394 SCMP_A3 (SCMP_CMP_MASKED_EQ, mode_mask, 0));
395 #endif /* O_TMPFILE */
// open and openat where none of the create_mask flags are set.
396 SC_ALLOW_ARG_1 ("open",
397 SCMP_A1 (SCMP_CMP_MASKED_EQ, create_mask, 0));
398 SC_ALLOW_ARG_1 ("openat",
399 SCMP_A2 (SCMP_CMP_MASKED_EQ, create_mask, 0));
// open and openat filtered on the access-mode bits (O_ACCMODE); the value
// compared against is on lines not shown.
401 SC_ALLOW_ARG_1 ("open",
402 SCMP_A1 (SCMP_CMP_MASKED_EQ, O_ACCMODE,
404 SC_ALLOW_ARG_1 ("openat",
405 SCMP_A2 (SCMP_CMP_MASKED_EQ, O_ACCMODE,
408 SC_ALLOW ("readlink");
409 SC_ALLOW ("readlinkat");
410 if (permissive) SC_ALLOW ("rename");
411 if (permissive) SC_ALLOW ("renameat");
412 if (permissive) SC_ALLOW ("renameat2");
413 if (permissive) SC_ALLOW ("rmdir");
417 SC_ALLOW ("statfs64");
419 if (permissive) SC_ALLOW ("symlink");
420 if (permissive) SC_ALLOW ("symlinkat");
421 if (permissive) SC_ALLOW ("truncate");
422 if (permissive) SC_ALLOW ("truncateat");
423 if (permissive) SC_ALLOW ("unlink");
424 if (permissive) SC_ALLOW ("unlinkat");
425 if (permissive) SC_ALLOW ("utime");
426 if (permissive) SC_ALLOW ("utimensat");
427 if (permissive) SC_ALLOW ("utimes");
429 /* systemd: SystemCallFilter=@io-event */
430 SC_ALLOW ("_newselect");
431 SC_ALLOW ("epoll_create");
432 SC_ALLOW ("epoll_create1");
433 SC_ALLOW ("epoll_ctl");
434 SC_ALLOW ("epoll_ctl_old");
435 SC_ALLOW ("epoll_pwait");
436 SC_ALLOW ("epoll_wait");
437 SC_ALLOW ("epoll_wait_old");
438 SC_ALLOW ("eventfd");
439 SC_ALLOW ("eventfd2");
442 SC_ALLOW ("pselect6");
445 /* systemd: SystemCallFilter=@ipc (subset) */
449 /* systemd: SystemCallFilter=@process (subset) */
450 SC_ALLOW ("arch_prctl");
453 SC_ALLOW ("execveat");
455 SC_ALLOW ("getrusage");
460 SC_ALLOW ("waitpid");
462 /* systemd: SystemCallFilter=@signal */
463 SC_ALLOW ("rt_sigaction");
464 SC_ALLOW ("rt_sigpending");
465 SC_ALLOW ("rt_sigprocmask");
466 SC_ALLOW ("rt_sigsuspend");
467 SC_ALLOW ("rt_sigtimedwait");
468 SC_ALLOW ("sigaction");
469 SC_ALLOW ("sigaltstack");
471 SC_ALLOW ("signalfd");
472 SC_ALLOW ("signalfd4");
473 SC_ALLOW ("sigpending");
474 SC_ALLOW ("sigprocmask");
475 SC_ALLOW ("sigsuspend");
477 /* systemd: SystemCallFilter=@sync */
478 SC_ALLOW ("fdatasync");
482 SC_ALLOW ("sync_file_range");
485 /* Extra syscalls not in any of systemd's sets. */
486 SC_ALLOW ("arm_fadvise64_64");
487 SC_ALLOW ("arm_sync_file_range");
489 SC_ALLOW ("fadvise64");
490 SC_ALLOW ("fadvise64_64");
// ioctl is allowed only for the TCGETS and TIOCGWINSZ requests (exact match
// on the second syscall argument).
494 SC_ALLOW_ARG_1 ("ioctl", SCMP_A1 (SCMP_CMP_EQ, TCGETS));
495 SC_ALLOW_ARG_1 ("ioctl", SCMP_A1 (SCMP_CMP_EQ, TIOCGWINSZ));
497 SC_ALLOW ("madvise");
498 SC_ALLOW ("mprotect");
500 SC_ALLOW ("sched_getaffinity");
501 SC_ALLOW ("sync_file_range2");
502 SC_ALLOW ("sysinfo");
505 /* Allow killing processes and threads. This is unfortunate but
506 * unavoidable: groff uses kill to explicitly pass on SIGPIPE to its
507 * child processes, and we can't do any more sophisticated filtering
513 /* Allow some relatively harmless System V shared memory operations.
514 * These seem to be popular among the sort of program that wants to
515 * install itself in /etc/ld.so.preload or similar (e.g. antivirus
516 * programs and VPNs).
518 SC_ALLOW_ARG_1 ("shmat", SCMP_A2 (SCMP_CMP_EQ, SHM_RDONLY));
519 SC_ALLOW_ARG_1 ("shmctl", SCMP_A1 (SCMP_CMP_EQ, IPC_STAT));
523 /* Some antivirus programs use an LD_PRELOAD wrapper that wants to
524 * talk to a private daemon using a Unix-domain socket. We really
525 * don't want to allow these syscalls in general, but if such a
526 * thing is in use we probably have no choice.
528 * snoopy is an execve monitoring tool that may log messages to
// Socket-related syscalls are added only when search_ld_preload finds one of
// these two preload libraries; socket itself is limited to AF_UNIX.
531 if (search_ld_preload ("libesets_pac.so") ||
532 search_ld_preload ("libsnoopy.so")) {
533 SC_ALLOW ("connect");
534 SC_ALLOW ("recvmsg");
536 SC_ALLOW ("setsockopt");
537 SC_ALLOW_ARG_1 ("socket", SCMP_A0 (SCMP_CMP_EQ, AF_UNIX));
539 /* ESET sends messages to a System V message queue. */
540 if (search_ld_preload ("libesets_pac.so")) {
541 SC_ALLOW_ARG_1 ("msgget", SCMP_A1 (SCMP_CMP_EQ, 0));
// The helper macros are removed once the rule list is complete.
548 #undef SC_ALLOW_ARG_2
549 #undef SC_ALLOW_ARG_1
552 #endif /* HAVE_LIBSECCOMP */
554 /* Create a sandbox for processing untrusted data.
556 * This only sets up data structures; the caller must call sandbox_load to
557 * actually enter the sandbox.
// Allocate a man_sandbox with XZALLOC and, when built with HAVE_LIBSECCOMP,
// create both filters up front: make_seccomp_filter (0) is stored in ctx and
// make_seccomp_filter (1) in permissive_ctx. No filter is loaded here.
// NOTE(review): the return statement is not visible in this listing;
// presumably the allocated sandbox is returned.
559 man_sandbox *sandbox_init (void)
561 man_sandbox *sandbox = XZALLOC (man_sandbox);
563 #ifdef HAVE_LIBSECCOMP
564 sandbox->ctx = make_seccomp_filter (0);
565 sandbox->permissive_ctx = make_seccomp_filter (1);
566 #else /* !HAVE_LIBSECCOMP */
568 #endif /* HAVE_LIBSECCOMP */
// Load one of the filters held by sandbox into the current process.
// sandbox: the structure whose filter contexts were built by sandbox_init.
// permissive: non-zero selects sandbox->permissive_ctx; presumably
//   sandbox->ctx is used otherwise (that assignment is not shown).
// The visible work is compiled only with HAVE_LIBSECCOMP and runs only when
// can_load_seccomp reports true.
// If seccomp_load fails with EINVAL or EFAULT, the failure is logged and
// seccomp_filter_unavailable is set so later calls skip loading. The trailing
// message fragment presumably belongs to a fatal error report for any other
// errno; the head of that call is not shown.
573 static void _sandbox_load (man_sandbox *sandbox, int permissive) {
574 #ifdef HAVE_LIBSECCOMP
575 if (can_load_seccomp ()) {
578 debug ("loading seccomp filter (permissive: %d)\n",
581 ctx = sandbox->permissive_ctx;
584 if (seccomp_load (ctx) < 0) {
585 if (errno == EINVAL || errno == EFAULT) {
586 /* The kernel doesn't give us particularly
587 * fine-grained errors. EINVAL could in
588 * theory be an invalid BPF program, but
589 * it's much more likely that the running
590 * kernel doesn't support seccomp filtering.
591 * EFAULT normally means a programming
592 * error, but it could also be returned here
593 * by some versions of qemu-user
594 * (https://bugs.launchpad.net/bugs/1726394).
596 gripe_seccomp_filter_unavailable ();
597 /* Don't try this again. */
598 seccomp_filter_unavailable = 1;
601 "can't load seccomp filter");
604 #endif /* HAVE_LIBSECCOMP */
607 /* Enter a sandbox for processing untrusted data. */
// Enter the strict sandbox by calling _sandbox_load with permissive set to 0.
// data: the man_sandbox to use, passed as a void pointer.
608 void sandbox_load (void *data)
610 man_sandbox *sandbox = data;
612 _sandbox_load (sandbox, 0);
615 /* Enter a sandbox for processing untrusted data, allowing limited file
// Enter the permissive sandbox by calling _sandbox_load with permissive set
// to 1.
// data: the man_sandbox to use, passed as a void pointer.
618 void sandbox_load_permissive (void *data)
620 man_sandbox *sandbox = data;
622 _sandbox_load (sandbox, 1);
625 /* Free a sandbox for processing untrusted data. */
// Release the resources held by a sandbox.
// data: the man_sandbox to release, passed as a void pointer.
// NOTE(review): the lines shown release only sandbox->ctx, while sandbox_init
// also creates sandbox->permissive_ctx. Confirm that the second context is
// released somewhere; otherwise it leaks.
// NOTE(review): the end of this function lies beyond the last line of this
// listing, so any further cleanup is not visible.
626 void sandbox_free (void *data) {
627 man_sandbox *sandbox = data;
629 #ifdef HAVE_LIBSECCOMP
630 seccomp_release (sandbox->ctx);
631 #endif /* HAVE_LIBSECCOMP */