Merge tag 'lsm-pr-20220801' of git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/lsm
[platform/kernel/linux-starfive.git] / tools / testing / selftests / seccomp / seccomp_bpf.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
4  *
5  * Test code for seccomp bpf.
6  */
7
8 #define _GNU_SOURCE
9 #include <sys/types.h>
10
11 /*
12  * glibc 2.26 and later have SIGSYS in siginfo_t. Before that,
13  * we need to use the kernel's siginfo.h file and trick glibc
14  * into accepting it.
15  */
16 #if !__GLIBC_PREREQ(2, 26)
17 # include <asm/siginfo.h>
18 # define __have_siginfo_t 1
19 # define __have_sigval_t 1
20 # define __have_sigevent_t 1
21 #endif
22
23 #include <errno.h>
24 #include <linux/filter.h>
25 #include <sys/prctl.h>
26 #include <sys/ptrace.h>
27 #include <sys/user.h>
28 #include <linux/prctl.h>
29 #include <linux/ptrace.h>
30 #include <linux/seccomp.h>
31 #include <pthread.h>
32 #include <semaphore.h>
33 #include <signal.h>
34 #include <stddef.h>
35 #include <stdbool.h>
36 #include <string.h>
37 #include <time.h>
38 #include <limits.h>
39 #include <linux/elf.h>
40 #include <sys/uio.h>
41 #include <sys/utsname.h>
42 #include <sys/fcntl.h>
43 #include <sys/mman.h>
44 #include <sys/times.h>
45 #include <sys/socket.h>
46 #include <sys/ioctl.h>
47 #include <linux/kcmp.h>
48 #include <sys/resource.h>
49 #include <sys/capability.h>
50
51 #include <unistd.h>
52 #include <sys/syscall.h>
53 #include <poll.h>
54
55 #include "../kselftest_harness.h"
56 #include "../clone3/clone3_selftests.h"
57
/*
 * De-conflict with the selftests tree: only provide SKIP() when the harness
 * has not defined it already, mapping it onto XFAIL().
 */
#if !defined(SKIP)
# define SKIP(s, ...)	XFAIL(s, ##__VA_ARGS__)
#endif

/* Smaller of two values. Note: an argument may be evaluated twice. */
#define MIN(X, Y) ((Y) > (X) ? (X) : (Y))
64
/*
 * Fallback definitions for prctl() options and seccomp modes, used only
 * when the included headers do not already provide them.
 */
#if !defined(PR_SET_PTRACER)
# define PR_SET_PTRACER 0x59616d61
#endif

/* PR_GET_NO_NEW_PRIVS is assumed to be missing whenever the SET variant is. */
#if !defined(PR_SET_NO_NEW_PRIVS)
# define PR_SET_NO_NEW_PRIVS 38
# define PR_GET_NO_NEW_PRIVS 39
#endif

#if !defined(PR_SECCOMP_EXT)
# define PR_SECCOMP_EXT 43
#endif

#if !defined(SECCOMP_EXT_ACT)
# define SECCOMP_EXT_ACT 1
#endif

#if !defined(SECCOMP_EXT_ACT_TSYNC)
# define SECCOMP_EXT_ACT_TSYNC 1
#endif

#if !defined(SECCOMP_MODE_STRICT)
# define SECCOMP_MODE_STRICT 1
#endif

#if !defined(SECCOMP_MODE_FILTER)
# define SECCOMP_MODE_FILTER 2
#endif
93
94 #ifndef SECCOMP_RET_ALLOW
95 struct seccomp_data {
96         int nr;
97         __u32 arch;
98         __u64 instruction_pointer;
99         __u64 args[6];
100 };
101 #endif
102
103 #ifndef SECCOMP_RET_KILL_PROCESS
104 #define SECCOMP_RET_KILL_PROCESS 0x80000000U /* kill the process */
105 #define SECCOMP_RET_KILL_THREAD  0x00000000U /* kill the thread */
106 #endif
107 #ifndef SECCOMP_RET_KILL
108 #define SECCOMP_RET_KILL         SECCOMP_RET_KILL_THREAD
109 #define SECCOMP_RET_TRAP         0x00030000U /* disallow and force a SIGSYS */
110 #define SECCOMP_RET_ERRNO        0x00050000U /* returns an errno */
111 #define SECCOMP_RET_TRACE        0x7ff00000U /* pass to a tracer or disallow */
112 #define SECCOMP_RET_ALLOW        0x7fff0000U /* allow */
113 #endif
114 #ifndef SECCOMP_RET_LOG
115 #define SECCOMP_RET_LOG          0x7ffc0000U /* allow after logging */
116 #endif
117
/*
 * Per-architecture seccomp() syscall number, for libc headers that do not
 * define __NR_seccomp. Unknown architectures get a warning at build time
 * and the placeholder number 0xffff.
 */
#if !defined(__NR_seccomp)
# if defined(__i386__)
#  define __NR_seccomp 354
# elif defined(__x86_64__)
#  define __NR_seccomp 317
# elif defined(__arm__)
#  define __NR_seccomp 383
# elif defined(__aarch64__) || defined(__riscv) || defined(__csky__)
#  define __NR_seccomp 277
# elif defined(__hppa__)
#  define __NR_seccomp 338
# elif defined(__powerpc__)
#  define __NR_seccomp 358
# elif defined(__s390__)
#  define __NR_seccomp 348
# elif defined(__xtensa__)
#  define __NR_seccomp 337
# elif defined(__sh__)
#  define __NR_seccomp 372
# else
#  warning "seccomp syscall number unknown for this architecture"
#  define __NR_seccomp 0xffff
# endif
#endif
146
147 #ifndef SECCOMP_SET_MODE_STRICT
148 #define SECCOMP_SET_MODE_STRICT 0
149 #endif
150
151 #ifndef SECCOMP_SET_MODE_FILTER
152 #define SECCOMP_SET_MODE_FILTER 1
153 #endif
154
155 #ifndef SECCOMP_GET_ACTION_AVAIL
156 #define SECCOMP_GET_ACTION_AVAIL 2
157 #endif
158
159 #ifndef SECCOMP_GET_NOTIF_SIZES
160 #define SECCOMP_GET_NOTIF_SIZES 3
161 #endif
162
163 #ifndef SECCOMP_FILTER_FLAG_TSYNC
164 #define SECCOMP_FILTER_FLAG_TSYNC (1UL << 0)
165 #endif
166
167 #ifndef SECCOMP_FILTER_FLAG_LOG
168 #define SECCOMP_FILTER_FLAG_LOG (1UL << 1)
169 #endif
170
171 #ifndef SECCOMP_FILTER_FLAG_SPEC_ALLOW
172 #define SECCOMP_FILTER_FLAG_SPEC_ALLOW (1UL << 2)
173 #endif
174
175 #ifndef PTRACE_SECCOMP_GET_METADATA
176 #define PTRACE_SECCOMP_GET_METADATA     0x420d
177
178 struct seccomp_metadata {
179         __u64 filter_off;       /* Input: which filter */
180         __u64 flags;             /* Output: filter's flags */
181 };
182 #endif
183
184 #ifndef SECCOMP_FILTER_FLAG_NEW_LISTENER
185 #define SECCOMP_FILTER_FLAG_NEW_LISTENER        (1UL << 3)
186 #endif
187
188 #ifndef SECCOMP_RET_USER_NOTIF
189 #define SECCOMP_RET_USER_NOTIF 0x7fc00000U
190
191 #define SECCOMP_IOC_MAGIC               '!'
192 #define SECCOMP_IO(nr)                  _IO(SECCOMP_IOC_MAGIC, nr)
193 #define SECCOMP_IOR(nr, type)           _IOR(SECCOMP_IOC_MAGIC, nr, type)
194 #define SECCOMP_IOW(nr, type)           _IOW(SECCOMP_IOC_MAGIC, nr, type)
195 #define SECCOMP_IOWR(nr, type)          _IOWR(SECCOMP_IOC_MAGIC, nr, type)
196
197 /* Flags for seccomp notification fd ioctl. */
198 #define SECCOMP_IOCTL_NOTIF_RECV        SECCOMP_IOWR(0, struct seccomp_notif)
199 #define SECCOMP_IOCTL_NOTIF_SEND        SECCOMP_IOWR(1, \
200                                                 struct seccomp_notif_resp)
201 #define SECCOMP_IOCTL_NOTIF_ID_VALID    SECCOMP_IOW(2, __u64)
202
203 struct seccomp_notif {
204         __u64 id;
205         __u32 pid;
206         __u32 flags;
207         struct seccomp_data data;
208 };
209
210 struct seccomp_notif_resp {
211         __u64 id;
212         __s64 val;
213         __s32 error;
214         __u32 flags;
215 };
216
217 struct seccomp_notif_sizes {
218         __u16 seccomp_notif;
219         __u16 seccomp_notif_resp;
220         __u16 seccomp_data;
221 };
222 #endif
223
224 #ifndef SECCOMP_IOCTL_NOTIF_ADDFD
225 /* On success, the return value is the remote process's added fd number */
226 #define SECCOMP_IOCTL_NOTIF_ADDFD       SECCOMP_IOW(3,  \
227                                                 struct seccomp_notif_addfd)
228
229 /* valid flags for seccomp_notif_addfd */
230 #define SECCOMP_ADDFD_FLAG_SETFD        (1UL << 0) /* Specify remote fd */
231
232 struct seccomp_notif_addfd {
233         __u64 id;
234         __u32 flags;
235         __u32 srcfd;
236         __u32 newfd;
237         __u32 newfd_flags;
238 };
239 #endif
240
241 #ifndef SECCOMP_ADDFD_FLAG_SEND
242 #define SECCOMP_ADDFD_FLAG_SEND (1UL << 1) /* Addfd and return it, atomically */
243 #endif
244
245 struct seccomp_notif_addfd_small {
246         __u64 id;
247         char weird[4];
248 };
249 #define SECCOMP_IOCTL_NOTIF_ADDFD_SMALL \
250         SECCOMP_IOW(3, struct seccomp_notif_addfd_small)
251
252 struct seccomp_notif_addfd_big {
253         union {
254                 struct seccomp_notif_addfd addfd;
255                 char buf[sizeof(struct seccomp_notif_addfd) + 8];
256         };
257 };
258 #define SECCOMP_IOCTL_NOTIF_ADDFD_BIG   \
259         SECCOMP_IOWR(3, struct seccomp_notif_addfd_big)
260
/* Fallback ptrace event message values; EXIT is assumed missing with ENTRY. */
#if !defined(PTRACE_EVENTMSG_SYSCALL_ENTRY)
# define PTRACE_EVENTMSG_SYSCALL_ENTRY	1
# define PTRACE_EVENTMSG_SYSCALL_EXIT	2
#endif

/* Fallback user-notification response flag. */
#if !defined(SECCOMP_USER_NOTIF_FLAG_CONTINUE)
# define SECCOMP_USER_NOTIF_FLAG_CONTINUE 0x00000001
#endif

/* Fallback filter flag bits 4 and 5. */
#if !defined(SECCOMP_FILTER_FLAG_TSYNC_ESRCH)
# define SECCOMP_FILTER_FLAG_TSYNC_ESRCH (1UL << 4)
#endif

#if !defined(SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV)
# define SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV (1UL << 5)
#endif
277
#ifndef seccomp
/*
 * Thin wrapper around the raw seccomp syscall.
 *
 * errno is cleared before the call, so a non-zero errno afterwards was set
 * by this invocation. Returns whatever syscall() returns, narrowed to int.
 */
int seccomp(unsigned int op, unsigned int flags, void *args)
{
	long rc;

	errno = 0;
	rc = syscall(__NR_seccomp, op, flags, args);

	return rc;
}
#endif
285
/*
 * Byte offset, within struct seccomp_data, of the 32-bit word of args[_n]
 * that the filters load: the start of the 64-bit slot on little-endian,
 * sizeof(__u32) bytes further in on big-endian.
 */
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]) + sizeof(__u32))
#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]))
#else
#error "wut? Unknown __BYTE_ORDER__?!"
#endif

/* Marker values used as thread exit statuses by the sibling tests. */
#define SIBLING_EXIT_UNKILLED	0xbadbeef
#define SIBLING_EXIT_FAILURE	0xbadface
#define SIBLING_EXIT_NEWPRIVS	0xbadfeed
297
298 static int __filecmp(pid_t pid1, pid_t pid2, int fd1, int fd2)
299 {
300 #ifdef __NR_kcmp
301         errno = 0;
302         return syscall(__NR_kcmp, pid1, pid2, KCMP_FILE, fd1, fd2);
303 #else
304         errno = ENOSYS;
305         return -1;
306 #endif
307 }
308
/*
 * Wrapper macro (rather than a function) so that TH_LOG reports the location
 * where filecmp() is actually used. A missing kcmp() syscall is logged and
 * then treated as "files match" (result 0); any other result is passed on.
 */
#define filecmp(pid1, pid2, fd1, fd2)	({				\
	int _ret = __filecmp(pid1, pid2, fd1, fd2);			\
									\
	if (_ret < 0 && errno == ENOSYS) {				\
		TH_LOG("kcmp() syscall missing (test is less accurate)");\
		_ret = 0;						\
	}								\
	_ret; })
321
322 TEST(kcmp)
323 {
324         int ret;
325
326         ret = __filecmp(getpid(), getpid(), 1, 1);
327         EXPECT_EQ(ret, 0);
328         if (ret != 0 && errno == ENOSYS)
329                 SKIP(return, "Kernel does not support kcmp() (missing CONFIG_KCMP?)");
330 }
331
332 TEST(mode_strict_support)
333 {
334         long ret;
335
336         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, NULL, NULL);
337         ASSERT_EQ(0, ret) {
338                 TH_LOG("Kernel does not support CONFIG_SECCOMP");
339         }
340         syscall(__NR_exit, 0);
341 }
342
343 TEST_SIGNAL(mode_strict_cannot_call_prctl, SIGKILL)
344 {
345         long ret;
346
347         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, NULL, NULL);
348         ASSERT_EQ(0, ret) {
349                 TH_LOG("Kernel does not support CONFIG_SECCOMP");
350         }
351         syscall(__NR_prctl, PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
352                 NULL, NULL, NULL);
353         EXPECT_FALSE(true) {
354                 TH_LOG("Unreachable!");
355         }
356 }
357
358 /* Note! This doesn't test no new privs behavior */
359 TEST(no_new_privs_support)
360 {
361         long ret;
362
363         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
364         EXPECT_EQ(0, ret) {
365                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
366         }
367 }
368
369 /* Tests kernel support by checking for a copy_from_user() fault on NULL. */
370 TEST(mode_filter_support)
371 {
372         long ret;
373
374         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
375         ASSERT_EQ(0, ret) {
376                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
377         }
378         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, NULL, NULL);
379         EXPECT_EQ(-1, ret);
380         EXPECT_EQ(EFAULT, errno) {
381                 TH_LOG("Kernel does not support CONFIG_SECCOMP_FILTER!");
382         }
383 }
384
385 TEST(mode_filter_without_nnp)
386 {
387         struct sock_filter filter[] = {
388                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
389         };
390         struct sock_fprog prog = {
391                 .len = (unsigned short)ARRAY_SIZE(filter),
392                 .filter = filter,
393         };
394         long ret;
395
396         ret = prctl(PR_GET_NO_NEW_PRIVS, 0, NULL, 0, 0);
397         ASSERT_LE(0, ret) {
398                 TH_LOG("Expected 0 or unsupported for NO_NEW_PRIVS");
399         }
400         errno = 0;
401         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
402         /* Succeeds with CAP_SYS_ADMIN, fails without */
403         /* TODO(wad) check caps not euid */
404         if (geteuid()) {
405                 EXPECT_EQ(-1, ret);
406                 EXPECT_EQ(EACCES, errno);
407         } else {
408                 EXPECT_EQ(0, ret);
409         }
410 }
411
412 #define MAX_INSNS_PER_PATH 32768
413
414 TEST(filter_size_limits)
415 {
416         int i;
417         int count = BPF_MAXINSNS + 1;
418         struct sock_filter allow[] = {
419                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
420         };
421         struct sock_filter *filter;
422         struct sock_fprog prog = { };
423         long ret;
424
425         filter = calloc(count, sizeof(*filter));
426         ASSERT_NE(NULL, filter);
427
428         for (i = 0; i < count; i++)
429                 filter[i] = allow[0];
430
431         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
432         ASSERT_EQ(0, ret);
433
434         prog.filter = filter;
435         prog.len = count;
436
437         /* Too many filter instructions in a single filter. */
438         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
439         ASSERT_NE(0, ret) {
440                 TH_LOG("Installing %d insn filter was allowed", prog.len);
441         }
442
443         /* One less is okay, though. */
444         prog.len -= 1;
445         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
446         ASSERT_EQ(0, ret) {
447                 TH_LOG("Installing %d insn filter wasn't allowed", prog.len);
448         }
449 }
450
451 TEST(filter_chain_limits)
452 {
453         int i;
454         int count = BPF_MAXINSNS;
455         struct sock_filter allow[] = {
456                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
457         };
458         struct sock_filter *filter;
459         struct sock_fprog prog = { };
460         long ret;
461
462         filter = calloc(count, sizeof(*filter));
463         ASSERT_NE(NULL, filter);
464
465         for (i = 0; i < count; i++)
466                 filter[i] = allow[0];
467
468         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
469         ASSERT_EQ(0, ret);
470
471         prog.filter = filter;
472         prog.len = 1;
473
474         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
475         ASSERT_EQ(0, ret);
476
477         prog.len = count;
478
479         /* Too many total filter instructions. */
480         for (i = 0; i < MAX_INSNS_PER_PATH; i++) {
481                 ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
482                 if (ret != 0)
483                         break;
484         }
485         ASSERT_NE(0, ret) {
486                 TH_LOG("Allowed %d %d-insn filters (total with penalties:%d)",
487                        i, count, i * (count + 4));
488         }
489 }
490
491 TEST(mode_filter_cannot_move_to_strict)
492 {
493         struct sock_filter filter[] = {
494                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
495         };
496         struct sock_fprog prog = {
497                 .len = (unsigned short)ARRAY_SIZE(filter),
498                 .filter = filter,
499         };
500         long ret;
501
502         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
503         ASSERT_EQ(0, ret);
504
505         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
506         ASSERT_EQ(0, ret);
507
508         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, 0, 0);
509         EXPECT_EQ(-1, ret);
510         EXPECT_EQ(EINVAL, errno);
511 }
512
513
514 TEST(mode_filter_get_seccomp)
515 {
516         struct sock_filter filter[] = {
517                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
518         };
519         struct sock_fprog prog = {
520                 .len = (unsigned short)ARRAY_SIZE(filter),
521                 .filter = filter,
522         };
523         long ret;
524
525         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
526         ASSERT_EQ(0, ret);
527
528         ret = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
529         EXPECT_EQ(0, ret);
530
531         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
532         ASSERT_EQ(0, ret);
533
534         ret = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
535         EXPECT_EQ(2, ret);
536 }
537
538
539 TEST(ALLOW_all)
540 {
541         struct sock_filter filter[] = {
542                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
543         };
544         struct sock_fprog prog = {
545                 .len = (unsigned short)ARRAY_SIZE(filter),
546                 .filter = filter,
547         };
548         long ret;
549
550         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
551         ASSERT_EQ(0, ret);
552
553         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
554         ASSERT_EQ(0, ret);
555 }
556
557 TEST(empty_prog)
558 {
559         struct sock_filter filter[] = {
560         };
561         struct sock_fprog prog = {
562                 .len = (unsigned short)ARRAY_SIZE(filter),
563                 .filter = filter,
564         };
565         long ret;
566
567         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
568         ASSERT_EQ(0, ret);
569
570         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
571         EXPECT_EQ(-1, ret);
572         EXPECT_EQ(EINVAL, errno);
573 }
574
575 TEST(log_all)
576 {
577         struct sock_filter filter[] = {
578                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LOG),
579         };
580         struct sock_fprog prog = {
581                 .len = (unsigned short)ARRAY_SIZE(filter),
582                 .filter = filter,
583         };
584         long ret;
585         pid_t parent = getppid();
586
587         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
588         ASSERT_EQ(0, ret);
589
590         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
591         ASSERT_EQ(0, ret);
592
593         /* getppid() should succeed and be logged (no check for logging) */
594         EXPECT_EQ(parent, syscall(__NR_getppid));
595 }
596
597 TEST_SIGNAL(unknown_ret_is_kill_inside, SIGSYS)
598 {
599         struct sock_filter filter[] = {
600                 BPF_STMT(BPF_RET|BPF_K, 0x10000000U),
601         };
602         struct sock_fprog prog = {
603                 .len = (unsigned short)ARRAY_SIZE(filter),
604                 .filter = filter,
605         };
606         long ret;
607
608         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
609         ASSERT_EQ(0, ret);
610
611         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
612         ASSERT_EQ(0, ret);
613         EXPECT_EQ(0, syscall(__NR_getpid)) {
614                 TH_LOG("getpid() shouldn't ever return");
615         }
616 }
617
618 /* return code >= 0x80000000 is unused. */
619 TEST_SIGNAL(unknown_ret_is_kill_above_allow, SIGSYS)
620 {
621         struct sock_filter filter[] = {
622                 BPF_STMT(BPF_RET|BPF_K, 0x90000000U),
623         };
624         struct sock_fprog prog = {
625                 .len = (unsigned short)ARRAY_SIZE(filter),
626                 .filter = filter,
627         };
628         long ret;
629
630         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
631         ASSERT_EQ(0, ret);
632
633         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
634         ASSERT_EQ(0, ret);
635         EXPECT_EQ(0, syscall(__NR_getpid)) {
636                 TH_LOG("getpid() shouldn't ever return");
637         }
638 }
639
640 TEST_SIGNAL(KILL_all, SIGSYS)
641 {
642         struct sock_filter filter[] = {
643                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
644         };
645         struct sock_fprog prog = {
646                 .len = (unsigned short)ARRAY_SIZE(filter),
647                 .filter = filter,
648         };
649         long ret;
650
651         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
652         ASSERT_EQ(0, ret);
653
654         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
655         ASSERT_EQ(0, ret);
656 }
657
658 TEST_SIGNAL(KILL_one, SIGSYS)
659 {
660         struct sock_filter filter[] = {
661                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
662                         offsetof(struct seccomp_data, nr)),
663                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
664                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
665                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
666         };
667         struct sock_fprog prog = {
668                 .len = (unsigned short)ARRAY_SIZE(filter),
669                 .filter = filter,
670         };
671         long ret;
672         pid_t parent = getppid();
673
674         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
675         ASSERT_EQ(0, ret);
676
677         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
678         ASSERT_EQ(0, ret);
679
680         EXPECT_EQ(parent, syscall(__NR_getppid));
681         /* getpid() should never return. */
682         EXPECT_EQ(0, syscall(__NR_getpid));
683 }
684
685 TEST_SIGNAL(KILL_one_arg_one, SIGSYS)
686 {
687         void *fatal_address;
688         struct sock_filter filter[] = {
689                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
690                         offsetof(struct seccomp_data, nr)),
691                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_times, 1, 0),
692                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
693                 /* Only both with lower 32-bit for now. */
694                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(0)),
695                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K,
696                         (unsigned long)&fatal_address, 0, 1),
697                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
698                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
699         };
700         struct sock_fprog prog = {
701                 .len = (unsigned short)ARRAY_SIZE(filter),
702                 .filter = filter,
703         };
704         long ret;
705         pid_t parent = getppid();
706         struct tms timebuf;
707         clock_t clock = times(&timebuf);
708
709         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
710         ASSERT_EQ(0, ret);
711
712         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
713         ASSERT_EQ(0, ret);
714
715         EXPECT_EQ(parent, syscall(__NR_getppid));
716         EXPECT_LE(clock, syscall(__NR_times, &timebuf));
717         /* times() should never return. */
718         EXPECT_EQ(0, syscall(__NR_times, &fatal_address));
719 }
720
721 TEST_SIGNAL(KILL_one_arg_six, SIGSYS)
722 {
723 #ifndef __NR_mmap2
724         int sysno = __NR_mmap;
725 #else
726         int sysno = __NR_mmap2;
727 #endif
728         struct sock_filter filter[] = {
729                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
730                         offsetof(struct seccomp_data, nr)),
731                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, sysno, 1, 0),
732                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
733                 /* Only both with lower 32-bit for now. */
734                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(5)),
735                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, 0x0C0FFEE, 0, 1),
736                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
737                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
738         };
739         struct sock_fprog prog = {
740                 .len = (unsigned short)ARRAY_SIZE(filter),
741                 .filter = filter,
742         };
743         long ret;
744         pid_t parent = getppid();
745         int fd;
746         void *map1, *map2;
747         int page_size = sysconf(_SC_PAGESIZE);
748
749         ASSERT_LT(0, page_size);
750
751         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
752         ASSERT_EQ(0, ret);
753
754         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
755         ASSERT_EQ(0, ret);
756
757         fd = open("/dev/zero", O_RDONLY);
758         ASSERT_NE(-1, fd);
759
760         EXPECT_EQ(parent, syscall(__NR_getppid));
761         map1 = (void *)syscall(sysno,
762                 NULL, page_size, PROT_READ, MAP_PRIVATE, fd, page_size);
763         EXPECT_NE(MAP_FAILED, map1);
764         /* mmap2() should never return. */
765         map2 = (void *)syscall(sysno,
766                  NULL, page_size, PROT_READ, MAP_PRIVATE, fd, 0x0C0FFEE);
767         EXPECT_EQ(MAP_FAILED, map2);
768
769         /* The test failed, so clean up the resources. */
770         munmap(map1, page_size);
771         munmap(map2, page_size);
772         close(fd);
773 }
774
775 /* This is a thread task to die via seccomp filter violation. */
776 void *kill_thread(void *data)
777 {
778         bool die = (bool)data;
779
780         if (die) {
781                 prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
782                 return (void *)SIBLING_EXIT_FAILURE;
783         }
784
785         return (void *)SIBLING_EXIT_UNKILLED;
786 }
787
/* Selects which filter kill_thread_or_group() installs first. */
enum kill_t {
	KILL_THREAD = 0,	/* SECCOMP_RET_KILL_THREAD on prctl */
	KILL_PROCESS = 1,	/* SECCOMP_RET_KILL_PROCESS on prctl */
	RET_UNKNOWN = 2		/* the undefined action value 0xAAAAAAAA on prctl */
};
793
794 /* Prepare a thread that will kill itself or both of us. */
795 void kill_thread_or_group(struct __test_metadata *_metadata,
796                           enum kill_t kill_how)
797 {
798         pthread_t thread;
799         void *status;
800         /* Kill only when calling __NR_prctl. */
801         struct sock_filter filter_thread[] = {
802                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
803                         offsetof(struct seccomp_data, nr)),
804                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
805                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL_THREAD),
806                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
807         };
808         struct sock_fprog prog_thread = {
809                 .len = (unsigned short)ARRAY_SIZE(filter_thread),
810                 .filter = filter_thread,
811         };
812         int kill = kill_how == KILL_PROCESS ? SECCOMP_RET_KILL_PROCESS : 0xAAAAAAAA;
813         struct sock_filter filter_process[] = {
814                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
815                         offsetof(struct seccomp_data, nr)),
816                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
817                 BPF_STMT(BPF_RET|BPF_K, kill),
818                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
819         };
820         struct sock_fprog prog_process = {
821                 .len = (unsigned short)ARRAY_SIZE(filter_process),
822                 .filter = filter_process,
823         };
824
825         ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
826                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
827         }
828
829         ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0,
830                              kill_how == KILL_THREAD ? &prog_thread
831                                                      : &prog_process));
832
833         /*
834          * Add the KILL_THREAD rule again to make sure that the KILL_PROCESS
835          * flag cannot be downgraded by a new filter.
836          */
837         if (kill_how == KILL_PROCESS)
838                 ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog_thread));
839
840         /* Start a thread that will exit immediately. */
841         ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)false));
842         ASSERT_EQ(0, pthread_join(thread, &status));
843         ASSERT_EQ(SIBLING_EXIT_UNKILLED, (unsigned long)status);
844
845         /* Start a thread that will die immediately. */
846         ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)true));
847         ASSERT_EQ(0, pthread_join(thread, &status));
848         ASSERT_NE(SIBLING_EXIT_FAILURE, (unsigned long)status);
849
850         /*
851          * If we get here, only the spawned thread died. Let the parent know
852          * the whole process didn't die (i.e. this thread, the spawner,
853          * stayed running).
854          */
855         exit(42);
856 }
857
858 TEST(KILL_thread)
859 {
860         int status;
861         pid_t child_pid;
862
863         child_pid = fork();
864         ASSERT_LE(0, child_pid);
865         if (child_pid == 0) {
866                 kill_thread_or_group(_metadata, KILL_THREAD);
867                 _exit(38);
868         }
869
870         ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
871
872         /* If only the thread was killed, we'll see exit 42. */
873         ASSERT_TRUE(WIFEXITED(status));
874         ASSERT_EQ(42, WEXITSTATUS(status));
875 }
876
877 TEST(KILL_process)
878 {
879         int status;
880         pid_t child_pid;
881
882         child_pid = fork();
883         ASSERT_LE(0, child_pid);
884         if (child_pid == 0) {
885                 kill_thread_or_group(_metadata, KILL_PROCESS);
886                 _exit(38);
887         }
888
889         ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
890
891         /* If the entire process was killed, we'll see SIGSYS. */
892         ASSERT_TRUE(WIFSIGNALED(status));
893         ASSERT_EQ(SIGSYS, WTERMSIG(status));
894 }
895
896 TEST(KILL_unknown)
897 {
898         int status;
899         pid_t child_pid;
900
901         child_pid = fork();
902         ASSERT_LE(0, child_pid);
903         if (child_pid == 0) {
904                 kill_thread_or_group(_metadata, RET_UNKNOWN);
905                 _exit(38);
906         }
907
908         ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
909
910         /* If the entire process was killed, we'll see SIGSYS. */
911         EXPECT_TRUE(WIFSIGNALED(status)) {
912                 TH_LOG("Unknown SECCOMP_RET is only killing the thread?");
913         }
914         ASSERT_EQ(SIGSYS, WTERMSIG(status));
915 }
916
917 /* TODO(wad) add 64-bit versus 32-bit arg tests. */
918 TEST(arg_out_of_range)
919 {
920         struct sock_filter filter[] = {
921                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(6)),
922                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
923         };
924         struct sock_fprog prog = {
925                 .len = (unsigned short)ARRAY_SIZE(filter),
926                 .filter = filter,
927         };
928         long ret;
929
930         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
931         ASSERT_EQ(0, ret);
932
933         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
934         EXPECT_EQ(-1, ret);
935         EXPECT_EQ(EINVAL, errno);
936 }
937
/*
 * Declares, in the current scope, a filter _read_filter_<name> that makes
 * read() return SECCOMP_RET_ERRNO | errnum and allows every other syscall,
 * plus the matching program descriptor prog_<name>.
 *
 * The second parameter is called errnum rather than errno so that it does
 * not reuse the name of the <errno.h> macro.
 */
#define ERRNO_FILTER(name, errnum)					\
	struct sock_filter _read_filter_##name[] = {			\
		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,				\
			offsetof(struct seccomp_data, nr)),		\
		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),	\
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | errnum),	\
		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),		\
	};								\
	struct sock_fprog prog_##name = {				\
		.filter = _read_filter_##name,				\
		.len = (unsigned short)ARRAY_SIZE(_read_filter_##name),	\
	}
950
951 /* Make sure basic errno values are correctly passed through a filter. */
952 TEST(ERRNO_valid)
953 {
954         ERRNO_FILTER(valid, E2BIG);
955         long ret;
956         pid_t parent = getppid();
957
958         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
959         ASSERT_EQ(0, ret);
960
961         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_valid);
962         ASSERT_EQ(0, ret);
963
964         EXPECT_EQ(parent, syscall(__NR_getppid));
965         EXPECT_EQ(-1, read(-1, NULL, 0));
966         EXPECT_EQ(E2BIG, errno);
967 }
968
969 /* Make sure an errno of zero is correctly handled by the arch code. */
970 TEST(ERRNO_zero)
971 {
972         ERRNO_FILTER(zero, 0);
973         long ret;
974         pid_t parent = getppid();
975
976         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
977         ASSERT_EQ(0, ret);
978
979         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_zero);
980         ASSERT_EQ(0, ret);
981
982         EXPECT_EQ(parent, syscall(__NR_getppid));
983         /* "errno" of 0 is ok. */
984         EXPECT_EQ(0, read(-1, NULL, 0));
985 }
986
987 /*
988  * The SECCOMP_RET_DATA mask is 16 bits wide, but errno is smaller.
989  * This tests that the errno value gets capped correctly, fixed by
990  * 580c57f10768 ("seccomp: cap SECCOMP_RET_ERRNO data to MAX_ERRNO").
991  */
992 TEST(ERRNO_capped)
993 {
994         ERRNO_FILTER(capped, 4096);
995         long ret;
996         pid_t parent = getppid();
997
998         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
999         ASSERT_EQ(0, ret);
1000
1001         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_capped);
1002         ASSERT_EQ(0, ret);
1003
1004         EXPECT_EQ(parent, syscall(__NR_getppid));
1005         EXPECT_EQ(-1, read(-1, NULL, 0));
1006         EXPECT_EQ(4095, errno);
1007 }
1008
1009 /*
1010  * Filters are processed in reverse order: last applied is executed first.
1011  * Since only the SECCOMP_RET_ACTION mask is tested for return values, the
1012  * SECCOMP_RET_DATA mask results will follow the most recently applied
1013  * matching filter return (and not the lowest or highest value).
1014  */
1015 TEST(ERRNO_order)
1016 {
1017         ERRNO_FILTER(first,  11);
1018         ERRNO_FILTER(second, 13);
1019         ERRNO_FILTER(third,  12);
1020         long ret;
1021         pid_t parent = getppid();
1022
1023         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1024         ASSERT_EQ(0, ret);
1025
1026         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_first);
1027         ASSERT_EQ(0, ret);
1028
1029         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_second);
1030         ASSERT_EQ(0, ret);
1031
1032         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_third);
1033         ASSERT_EQ(0, ret);
1034
1035         EXPECT_EQ(parent, syscall(__NR_getppid));
1036         EXPECT_EQ(-1, read(-1, NULL, 0));
1037         EXPECT_EQ(12, errno);
1038 }
1039
1040 FIXTURE(TRAP) {
1041         struct sock_fprog prog;
1042 };
1043
1044 FIXTURE_SETUP(TRAP)
1045 {
1046         struct sock_filter filter[] = {
1047                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1048                         offsetof(struct seccomp_data, nr)),
1049                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
1050                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRAP),
1051                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1052         };
1053
1054         memset(&self->prog, 0, sizeof(self->prog));
1055         self->prog.filter = malloc(sizeof(filter));
1056         ASSERT_NE(NULL, self->prog.filter);
1057         memcpy(self->prog.filter, filter, sizeof(filter));
1058         self->prog.len = (unsigned short)ARRAY_SIZE(filter);
1059 }
1060
1061 FIXTURE_TEARDOWN(TRAP)
1062 {
1063         if (self->prog.filter)
1064                 free(self->prog.filter);
1065 }
1066
1067 TEST_F_SIGNAL(TRAP, dfl, SIGSYS)
1068 {
1069         long ret;
1070
1071         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1072         ASSERT_EQ(0, ret);
1073
1074         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
1075         ASSERT_EQ(0, ret);
1076         syscall(__NR_getpid);
1077 }
1078
1079 /* Ensure that SIGSYS overrides SIG_IGN */
1080 TEST_F_SIGNAL(TRAP, ign, SIGSYS)
1081 {
1082         long ret;
1083
1084         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1085         ASSERT_EQ(0, ret);
1086
1087         signal(SIGSYS, SIG_IGN);
1088
1089         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
1090         ASSERT_EQ(0, ret);
1091         syscall(__NR_getpid);
1092 }
1093
1094 static siginfo_t TRAP_info;
1095 static volatile int TRAP_nr;
1096 static void TRAP_action(int nr, siginfo_t *info, void *void_context)
1097 {
1098         memcpy(&TRAP_info, info, sizeof(TRAP_info));
1099         TRAP_nr = nr;
1100 }
1101
1102 TEST_F(TRAP, handler)
1103 {
1104         int ret, test;
1105         struct sigaction act;
1106         sigset_t mask;
1107
1108         memset(&act, 0, sizeof(act));
1109         sigemptyset(&mask);
1110         sigaddset(&mask, SIGSYS);
1111
1112         act.sa_sigaction = &TRAP_action;
1113         act.sa_flags = SA_SIGINFO;
1114         ret = sigaction(SIGSYS, &act, NULL);
1115         ASSERT_EQ(0, ret) {
1116                 TH_LOG("sigaction failed");
1117         }
1118         ret = sigprocmask(SIG_UNBLOCK, &mask, NULL);
1119         ASSERT_EQ(0, ret) {
1120                 TH_LOG("sigprocmask failed");
1121         }
1122
1123         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1124         ASSERT_EQ(0, ret);
1125         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
1126         ASSERT_EQ(0, ret);
1127         TRAP_nr = 0;
1128         memset(&TRAP_info, 0, sizeof(TRAP_info));
1129         /* Expect the registers to be rolled back. (nr = error) may vary
1130          * based on arch. */
1131         ret = syscall(__NR_getpid);
1132         /* Silence gcc warning about volatile. */
1133         test = TRAP_nr;
1134         EXPECT_EQ(SIGSYS, test);
1135         struct local_sigsys {
1136                 void *_call_addr;       /* calling user insn */
1137                 int _syscall;           /* triggering system call number */
1138                 unsigned int _arch;     /* AUDIT_ARCH_* of syscall */
1139         } *sigsys = (struct local_sigsys *)
1140 #ifdef si_syscall
1141                 &(TRAP_info.si_call_addr);
1142 #else
1143                 &TRAP_info.si_pid;
1144 #endif
1145         EXPECT_EQ(__NR_getpid, sigsys->_syscall);
1146         /* Make sure arch is non-zero. */
1147         EXPECT_NE(0, sigsys->_arch);
1148         EXPECT_NE(0, (unsigned long)sigsys->_call_addr);
1149 }
1150
1151 FIXTURE(precedence) {
1152         struct sock_fprog allow;
1153         struct sock_fprog log;
1154         struct sock_fprog trace;
1155         struct sock_fprog error;
1156         struct sock_fprog trap;
1157         struct sock_fprog kill;
1158 };
1159
1160 FIXTURE_SETUP(precedence)
1161 {
1162         struct sock_filter allow_insns[] = {
1163                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1164         };
1165         struct sock_filter log_insns[] = {
1166                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1167                         offsetof(struct seccomp_data, nr)),
1168                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1169                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1170                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LOG),
1171         };
1172         struct sock_filter trace_insns[] = {
1173                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1174                         offsetof(struct seccomp_data, nr)),
1175                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1176                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1177                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE),
1178         };
1179         struct sock_filter error_insns[] = {
1180                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1181                         offsetof(struct seccomp_data, nr)),
1182                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1183                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1184                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO),
1185         };
1186         struct sock_filter trap_insns[] = {
1187                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1188                         offsetof(struct seccomp_data, nr)),
1189                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1190                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1191                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRAP),
1192         };
1193         struct sock_filter kill_insns[] = {
1194                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1195                         offsetof(struct seccomp_data, nr)),
1196                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1197                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1198                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
1199         };
1200
1201         memset(self, 0, sizeof(*self));
1202 #define FILTER_ALLOC(_x) \
1203         self->_x.filter = malloc(sizeof(_x##_insns)); \
1204         ASSERT_NE(NULL, self->_x.filter); \
1205         memcpy(self->_x.filter, &_x##_insns, sizeof(_x##_insns)); \
1206         self->_x.len = (unsigned short)ARRAY_SIZE(_x##_insns)
1207         FILTER_ALLOC(allow);
1208         FILTER_ALLOC(log);
1209         FILTER_ALLOC(trace);
1210         FILTER_ALLOC(error);
1211         FILTER_ALLOC(trap);
1212         FILTER_ALLOC(kill);
1213 }
1214
1215 FIXTURE_TEARDOWN(precedence)
1216 {
1217 #define FILTER_FREE(_x) if (self->_x.filter) free(self->_x.filter)
1218         FILTER_FREE(allow);
1219         FILTER_FREE(log);
1220         FILTER_FREE(trace);
1221         FILTER_FREE(error);
1222         FILTER_FREE(trap);
1223         FILTER_FREE(kill);
1224 }
1225
1226 TEST_F(precedence, allow_ok)
1227 {
1228         pid_t parent, res = 0;
1229         long ret;
1230
1231         parent = getppid();
1232         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1233         ASSERT_EQ(0, ret);
1234
1235         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1236         ASSERT_EQ(0, ret);
1237         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1238         ASSERT_EQ(0, ret);
1239         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1240         ASSERT_EQ(0, ret);
1241         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1242         ASSERT_EQ(0, ret);
1243         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1244         ASSERT_EQ(0, ret);
1245         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
1246         ASSERT_EQ(0, ret);
1247         /* Should work just fine. */
1248         res = syscall(__NR_getppid);
1249         EXPECT_EQ(parent, res);
1250 }
1251
1252 TEST_F_SIGNAL(precedence, kill_is_highest, SIGSYS)
1253 {
1254         pid_t parent, res = 0;
1255         long ret;
1256
1257         parent = getppid();
1258         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1259         ASSERT_EQ(0, ret);
1260
1261         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1262         ASSERT_EQ(0, ret);
1263         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1264         ASSERT_EQ(0, ret);
1265         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1266         ASSERT_EQ(0, ret);
1267         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1268         ASSERT_EQ(0, ret);
1269         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1270         ASSERT_EQ(0, ret);
1271         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
1272         ASSERT_EQ(0, ret);
1273         /* Should work just fine. */
1274         res = syscall(__NR_getppid);
1275         EXPECT_EQ(parent, res);
1276         /* getpid() should never return. */
1277         res = syscall(__NR_getpid);
1278         EXPECT_EQ(0, res);
1279 }
1280
1281 TEST_F_SIGNAL(precedence, kill_is_highest_in_any_order, SIGSYS)
1282 {
1283         pid_t parent;
1284         long ret;
1285
1286         parent = getppid();
1287         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1288         ASSERT_EQ(0, ret);
1289
1290         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1291         ASSERT_EQ(0, ret);
1292         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
1293         ASSERT_EQ(0, ret);
1294         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1295         ASSERT_EQ(0, ret);
1296         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1297         ASSERT_EQ(0, ret);
1298         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1299         ASSERT_EQ(0, ret);
1300         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1301         ASSERT_EQ(0, ret);
1302         /* Should work just fine. */
1303         EXPECT_EQ(parent, syscall(__NR_getppid));
1304         /* getpid() should never return. */
1305         EXPECT_EQ(0, syscall(__NR_getpid));
1306 }
1307
1308 TEST_F_SIGNAL(precedence, trap_is_second, SIGSYS)
1309 {
1310         pid_t parent;
1311         long ret;
1312
1313         parent = getppid();
1314         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1315         ASSERT_EQ(0, ret);
1316
1317         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1318         ASSERT_EQ(0, ret);
1319         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1320         ASSERT_EQ(0, ret);
1321         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1322         ASSERT_EQ(0, ret);
1323         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1324         ASSERT_EQ(0, ret);
1325         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1326         ASSERT_EQ(0, ret);
1327         /* Should work just fine. */
1328         EXPECT_EQ(parent, syscall(__NR_getppid));
1329         /* getpid() should never return. */
1330         EXPECT_EQ(0, syscall(__NR_getpid));
1331 }
1332
1333 TEST_F_SIGNAL(precedence, trap_is_second_in_any_order, SIGSYS)
1334 {
1335         pid_t parent;
1336         long ret;
1337
1338         parent = getppid();
1339         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1340         ASSERT_EQ(0, ret);
1341
1342         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1343         ASSERT_EQ(0, ret);
1344         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1345         ASSERT_EQ(0, ret);
1346         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1347         ASSERT_EQ(0, ret);
1348         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1349         ASSERT_EQ(0, ret);
1350         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1351         ASSERT_EQ(0, ret);
1352         /* Should work just fine. */
1353         EXPECT_EQ(parent, syscall(__NR_getppid));
1354         /* getpid() should never return. */
1355         EXPECT_EQ(0, syscall(__NR_getpid));
1356 }
1357
1358 TEST_F(precedence, errno_is_third)
1359 {
1360         pid_t parent;
1361         long ret;
1362
1363         parent = getppid();
1364         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1365         ASSERT_EQ(0, ret);
1366
1367         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1368         ASSERT_EQ(0, ret);
1369         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1370         ASSERT_EQ(0, ret);
1371         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1372         ASSERT_EQ(0, ret);
1373         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1374         ASSERT_EQ(0, ret);
1375         /* Should work just fine. */
1376         EXPECT_EQ(parent, syscall(__NR_getppid));
1377         EXPECT_EQ(0, syscall(__NR_getpid));
1378 }
1379
1380 TEST_F(precedence, errno_is_third_in_any_order)
1381 {
1382         pid_t parent;
1383         long ret;
1384
1385         parent = getppid();
1386         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1387         ASSERT_EQ(0, ret);
1388
1389         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1390         ASSERT_EQ(0, ret);
1391         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1392         ASSERT_EQ(0, ret);
1393         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1394         ASSERT_EQ(0, ret);
1395         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1396         ASSERT_EQ(0, ret);
1397         /* Should work just fine. */
1398         EXPECT_EQ(parent, syscall(__NR_getppid));
1399         EXPECT_EQ(0, syscall(__NR_getpid));
1400 }
1401
1402 TEST_F(precedence, trace_is_fourth)
1403 {
1404         pid_t parent;
1405         long ret;
1406
1407         parent = getppid();
1408         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1409         ASSERT_EQ(0, ret);
1410
1411         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1412         ASSERT_EQ(0, ret);
1413         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1414         ASSERT_EQ(0, ret);
1415         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1416         ASSERT_EQ(0, ret);
1417         /* Should work just fine. */
1418         EXPECT_EQ(parent, syscall(__NR_getppid));
1419         /* No ptracer */
1420         EXPECT_EQ(-1, syscall(__NR_getpid));
1421 }
1422
1423 TEST_F(precedence, trace_is_fourth_in_any_order)
1424 {
1425         pid_t parent;
1426         long ret;
1427
1428         parent = getppid();
1429         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1430         ASSERT_EQ(0, ret);
1431
1432         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1433         ASSERT_EQ(0, ret);
1434         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1435         ASSERT_EQ(0, ret);
1436         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1437         ASSERT_EQ(0, ret);
1438         /* Should work just fine. */
1439         EXPECT_EQ(parent, syscall(__NR_getppid));
1440         /* No ptracer */
1441         EXPECT_EQ(-1, syscall(__NR_getpid));
1442 }
1443
1444 TEST_F(precedence, log_is_fifth)
1445 {
1446         pid_t mypid, parent;
1447         long ret;
1448
1449         mypid = getpid();
1450         parent = getppid();
1451         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1452         ASSERT_EQ(0, ret);
1453
1454         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1455         ASSERT_EQ(0, ret);
1456         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1457         ASSERT_EQ(0, ret);
1458         /* Should work just fine. */
1459         EXPECT_EQ(parent, syscall(__NR_getppid));
1460         /* Should also work just fine */
1461         EXPECT_EQ(mypid, syscall(__NR_getpid));
1462 }
1463
1464 TEST_F(precedence, log_is_fifth_in_any_order)
1465 {
1466         pid_t mypid, parent;
1467         long ret;
1468
1469         mypid = getpid();
1470         parent = getppid();
1471         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1472         ASSERT_EQ(0, ret);
1473
1474         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1475         ASSERT_EQ(0, ret);
1476         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1477         ASSERT_EQ(0, ret);
1478         /* Should work just fine. */
1479         EXPECT_EQ(parent, syscall(__NR_getppid));
1480         /* Should also work just fine */
1481         EXPECT_EQ(mypid, syscall(__NR_getpid));
1482 }
1483
/* Fallback when the headers do not provide the seccomp ptrace option. */
#if !defined(PTRACE_O_TRACESECCOMP)
# define PTRACE_O_TRACESECCOMP	0x00000080
#endif

/*
 * Catch the Ubuntu 12.04 value error: drop any header definition that
 * does not evaluate to 7 in the preprocessor, then supply 7 ourselves.
 */
#if defined(PTRACE_EVENT_SECCOMP) && PTRACE_EVENT_SECCOMP != 7
# undef PTRACE_EVENT_SECCOMP
#endif
#if !defined(PTRACE_EVENT_SECCOMP)
# define PTRACE_EVENT_SECCOMP 7
#endif

/* Bits of a wait status above the low 16. */
#define PTRACE_EVENT_MASK(status) ((status) >> 16)
/*
 * Run flag for the tracer: set by whoever starts the tracer loop and
 * cleared by tracer_stop(). Because it is written from a signal handler
 * it is declared volatile sig_atomic_t, the type ISO C guarantees can be
 * safely stored to from a handler and re-read by the interrupted code.
 */
volatile sig_atomic_t tracer_running;

/*
 * Signal handler that requests tracer shutdown.
 * @sig: delivered signal number (unused).
 */
void tracer_stop(int sig)
{
	(void)sig;
	tracer_running = false;
}
1503
1504 typedef void tracer_func_t(struct __test_metadata *_metadata,
1505                            pid_t tracee, int status, void *args);
1506
1507 void start_tracer(struct __test_metadata *_metadata, int fd, pid_t tracee,
1508             tracer_func_t tracer_func, void *args, bool ptrace_syscall)
1509 {
1510         int ret = -1;
1511         struct sigaction action = {
1512                 .sa_handler = tracer_stop,
1513         };
1514
1515         /* Allow external shutdown. */
1516         tracer_running = true;
1517         ASSERT_EQ(0, sigaction(SIGUSR1, &action, NULL));
1518
1519         errno = 0;
1520         while (ret == -1 && errno != EINVAL)
1521                 ret = ptrace(PTRACE_ATTACH, tracee, NULL, 0);
1522         ASSERT_EQ(0, ret) {
1523                 kill(tracee, SIGKILL);
1524         }
1525         /* Wait for attach stop */
1526         wait(NULL);
1527
1528         ret = ptrace(PTRACE_SETOPTIONS, tracee, NULL, ptrace_syscall ?
1529                                                       PTRACE_O_TRACESYSGOOD :
1530                                                       PTRACE_O_TRACESECCOMP);
1531         ASSERT_EQ(0, ret) {
1532                 TH_LOG("Failed to set PTRACE_O_TRACESECCOMP");
1533                 kill(tracee, SIGKILL);
1534         }
1535         ret = ptrace(ptrace_syscall ? PTRACE_SYSCALL : PTRACE_CONT,
1536                      tracee, NULL, 0);
1537         ASSERT_EQ(0, ret);
1538
1539         /* Unblock the tracee */
1540         ASSERT_EQ(1, write(fd, "A", 1));
1541         ASSERT_EQ(0, close(fd));
1542
1543         /* Run until we're shut down. Must assert to stop execution. */
1544         while (tracer_running) {
1545                 int status;
1546
1547                 if (wait(&status) != tracee)
1548                         continue;
1549
1550                 if (WIFSIGNALED(status)) {
1551                         /* Child caught a fatal signal. */
1552                         return;
1553                 }
1554                 if (WIFEXITED(status)) {
1555                         /* Child exited with code. */
1556                         return;
1557                 }
1558
1559                 /* Check if we got an expected event. */
1560                 ASSERT_EQ(WIFCONTINUED(status), false);
1561                 ASSERT_EQ(WIFSTOPPED(status), true);
1562                 ASSERT_EQ(WSTOPSIG(status) & SIGTRAP, SIGTRAP) {
1563                         TH_LOG("Unexpected WSTOPSIG: %d", WSTOPSIG(status));
1564                 }
1565
1566                 tracer_func(_metadata, tracee, status, args);
1567
1568                 ret = ptrace(ptrace_syscall ? PTRACE_SYSCALL : PTRACE_CONT,
1569                              tracee, NULL, 0);
1570                 ASSERT_EQ(0, ret);
1571         }
1572         /* Directly report the status of our test harness results. */
1573         syscall(__NR_exit, _metadata->passed ? EXIT_SUCCESS : EXIT_FAILURE);
1574 }
1575
1576 /* Common tracer setup/teardown functions. */
1577 void cont_handler(int num)
1578 { }
1579 pid_t setup_trace_fixture(struct __test_metadata *_metadata,
1580                           tracer_func_t func, void *args, bool ptrace_syscall)
1581 {
1582         char sync;
1583         int pipefd[2];
1584         pid_t tracer_pid;
1585         pid_t tracee = getpid();
1586
1587         /* Setup a pipe for clean synchronization. */
1588         ASSERT_EQ(0, pipe(pipefd));
1589
1590         /* Fork a child which we'll promote to tracer */
1591         tracer_pid = fork();
1592         ASSERT_LE(0, tracer_pid);
1593         signal(SIGALRM, cont_handler);
1594         if (tracer_pid == 0) {
1595                 close(pipefd[0]);
1596                 start_tracer(_metadata, pipefd[1], tracee, func, args,
1597                              ptrace_syscall);
1598                 syscall(__NR_exit, 0);
1599         }
1600         close(pipefd[1]);
1601         prctl(PR_SET_PTRACER, tracer_pid, 0, 0, 0);
1602         read(pipefd[0], &sync, 1);
1603         close(pipefd[0]);
1604
1605         return tracer_pid;
1606 }
1607
1608 void teardown_trace_fixture(struct __test_metadata *_metadata,
1609                             pid_t tracer)
1610 {
1611         if (tracer) {
1612                 int status;
1613                 /*
1614                  * Extract the exit code from the other process and
1615                  * adopt it for ourselves in case its asserts failed.
1616                  */
1617                 ASSERT_EQ(0, kill(tracer, SIGUSR1));
1618                 ASSERT_EQ(tracer, waitpid(tracer, &status, 0));
1619                 if (WEXITSTATUS(status))
1620                         _metadata->passed = 0;
1621         }
1622 }
1623
1624 /* "poke" tracer arguments and function. */
1625 struct tracer_args_poke_t {
1626         unsigned long poke_addr;
1627 };
1628
1629 void tracer_poke(struct __test_metadata *_metadata, pid_t tracee, int status,
1630                  void *args)
1631 {
1632         int ret;
1633         unsigned long msg;
1634         struct tracer_args_poke_t *info = (struct tracer_args_poke_t *)args;
1635
1636         ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
1637         EXPECT_EQ(0, ret);
1638         /* If this fails, don't try to recover. */
1639         ASSERT_EQ(0x1001, msg) {
1640                 kill(tracee, SIGKILL);
1641         }
1642         /*
1643          * Poke in the message.
1644          * Registers are not touched to try to keep this relatively arch
1645          * agnostic.
1646          */
1647         ret = ptrace(PTRACE_POKEDATA, tracee, info->poke_addr, 0x1001);
1648         EXPECT_EQ(0, ret);
1649 }
1650
1651 FIXTURE(TRACE_poke) {
1652         struct sock_fprog prog;
1653         pid_t tracer;
1654         long poked;
1655         struct tracer_args_poke_t tracer_args;
1656 };
1657
1658 FIXTURE_SETUP(TRACE_poke)
1659 {
1660         struct sock_filter filter[] = {
1661                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1662                         offsetof(struct seccomp_data, nr)),
1663                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
1664                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1001),
1665                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1666         };
1667
1668         self->poked = 0;
1669         memset(&self->prog, 0, sizeof(self->prog));
1670         self->prog.filter = malloc(sizeof(filter));
1671         ASSERT_NE(NULL, self->prog.filter);
1672         memcpy(self->prog.filter, filter, sizeof(filter));
1673         self->prog.len = (unsigned short)ARRAY_SIZE(filter);
1674
1675         /* Set up tracer args. */
1676         self->tracer_args.poke_addr = (unsigned long)&self->poked;
1677
1678         /* Launch tracer. */
1679         self->tracer = setup_trace_fixture(_metadata, tracer_poke,
1680                                            &self->tracer_args, false);
1681 }
1682
1683 FIXTURE_TEARDOWN(TRACE_poke)
1684 {
1685         teardown_trace_fixture(_metadata, self->tracer);
1686         if (self->prog.filter)
1687                 free(self->prog.filter);
1688 }
1689
1690 TEST_F(TRACE_poke, read_has_side_effects)
1691 {
1692         ssize_t ret;
1693
1694         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1695         ASSERT_EQ(0, ret);
1696
1697         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
1698         ASSERT_EQ(0, ret);
1699
1700         EXPECT_EQ(0, self->poked);
1701         ret = read(-1, NULL, 0);
1702         EXPECT_EQ(-1, ret);
1703         EXPECT_EQ(0x1001, self->poked);
1704 }
1705
1706 TEST_F(TRACE_poke, getpid_runs_normally)
1707 {
1708         long ret;
1709
1710         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1711         ASSERT_EQ(0, ret);
1712
1713         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
1714         ASSERT_EQ(0, ret);
1715
1716         EXPECT_EQ(0, self->poked);
1717         EXPECT_NE(0, syscall(__NR_getpid));
1718         EXPECT_EQ(0, self->poked);
1719 }
1720
1721 #if defined(__x86_64__)
1722 # define ARCH_REGS              struct user_regs_struct
1723 # define SYSCALL_NUM(_regs)     (_regs).orig_rax
1724 # define SYSCALL_RET(_regs)     (_regs).rax
1725 #elif defined(__i386__)
1726 # define ARCH_REGS              struct user_regs_struct
1727 # define SYSCALL_NUM(_regs)     (_regs).orig_eax
1728 # define SYSCALL_RET(_regs)     (_regs).eax
1729 #elif defined(__arm__)
1730 # define ARCH_REGS              struct pt_regs
1731 # define SYSCALL_NUM(_regs)     (_regs).ARM_r7
1732 # ifndef PTRACE_SET_SYSCALL
1733 #  define PTRACE_SET_SYSCALL   23
1734 # endif
1735 # define SYSCALL_NUM_SET(_regs, _nr)    \
1736                 EXPECT_EQ(0, ptrace(PTRACE_SET_SYSCALL, tracee, NULL, _nr))
1737 # define SYSCALL_RET(_regs)     (_regs).ARM_r0
1738 #elif defined(__aarch64__)
1739 # define ARCH_REGS              struct user_pt_regs
1740 # define SYSCALL_NUM(_regs)     (_regs).regs[8]
1741 # ifndef NT_ARM_SYSTEM_CALL
1742 #  define NT_ARM_SYSTEM_CALL 0x404
1743 # endif
1744 # define SYSCALL_NUM_SET(_regs, _nr)                            \
1745         do {                                                    \
1746                 struct iovec __v;                               \
1747                 typeof(_nr) __nr = (_nr);                       \
1748                 __v.iov_base = &__nr;                           \
1749                 __v.iov_len = sizeof(__nr);                     \
1750                 EXPECT_EQ(0, ptrace(PTRACE_SETREGSET, tracee,   \
1751                                     NT_ARM_SYSTEM_CALL, &__v)); \
1752         } while (0)
1753 # define SYSCALL_RET(_regs)     (_regs).regs[0]
1754 #elif defined(__riscv) && __riscv_xlen == 64
1755 # define ARCH_REGS              struct user_regs_struct
1756 # define SYSCALL_NUM(_regs)     (_regs).a7
1757 # define SYSCALL_RET(_regs)     (_regs).a0
1758 #elif defined(__csky__)
1759 # define ARCH_REGS              struct pt_regs
1760 #  if defined(__CSKYABIV2__)
1761 #   define SYSCALL_NUM(_regs)   (_regs).regs[3]
1762 #  else
1763 #   define SYSCALL_NUM(_regs)   (_regs).regs[9]
1764 #  endif
1765 # define SYSCALL_RET(_regs)     (_regs).a0
1766 #elif defined(__hppa__)
1767 # define ARCH_REGS              struct user_regs_struct
1768 # define SYSCALL_NUM(_regs)     (_regs).gr[20]
1769 # define SYSCALL_RET(_regs)     (_regs).gr[28]
1770 #elif defined(__powerpc__)
1771 # define ARCH_REGS              struct pt_regs
1772 # define SYSCALL_NUM(_regs)     (_regs).gpr[0]
1773 # define SYSCALL_RET(_regs)     (_regs).gpr[3]
1774 # define SYSCALL_RET_SET(_regs, _val)                           \
1775         do {                                                    \
1776                 typeof(_val) _result = (_val);                  \
1777                 if ((_regs.trap & 0xfff0) == 0x3000) {          \
1778                         /*                                      \
1779                          * scv 0 system call uses -ve result    \
1780                          * for error, so no need to adjust.     \
1781                          */                                     \
1782                         SYSCALL_RET(_regs) = _result;           \
1783                 } else {                                        \
1784                         /*                                      \
1785                          * A syscall error is signaled by the   \
1786                          * CR0 SO bit and the code is stored as \
1787                          * a positive value.                    \
1788                          */                                     \
1789                         if (_result < 0) {                      \
1790                                 SYSCALL_RET(_regs) = -_result;  \
1791                                 (_regs).ccr |= 0x10000000;      \
1792                         } else {                                \
1793                                 SYSCALL_RET(_regs) = _result;   \
1794                                 (_regs).ccr &= ~0x10000000;     \
1795                         }                                       \
1796                 }                                               \
1797         } while (0)
1798 # define SYSCALL_RET_SET_ON_PTRACE_EXIT
1799 #elif defined(__s390__)
1800 # define ARCH_REGS              s390_regs
1801 # define SYSCALL_NUM(_regs)     (_regs).gprs[2]
1802 # define SYSCALL_RET_SET(_regs, _val)                   \
1803                 TH_LOG("Can't modify syscall return on this architecture")
1804 #elif defined(__mips__)
1805 # include <asm/unistd_nr_n32.h>
1806 # include <asm/unistd_nr_n64.h>
1807 # include <asm/unistd_nr_o32.h>
1808 # define ARCH_REGS              struct pt_regs
1809 # define SYSCALL_NUM(_regs)                             \
1810         ({                                              \
1811                 typeof((_regs).regs[2]) _nr;            \
1812                 if ((_regs).regs[2] == __NR_O32_Linux)  \
1813                         _nr = (_regs).regs[4];          \
1814                 else                                    \
1815                         _nr = (_regs).regs[2];          \
1816                 _nr;                                    \
1817         })
1818 # define SYSCALL_NUM_SET(_regs, _nr)                    \
1819         do {                                            \
1820                 if ((_regs).regs[2] == __NR_O32_Linux)  \
1821                         (_regs).regs[4] = _nr;          \
1822                 else                                    \
1823                         (_regs).regs[2] = _nr;          \
1824         } while (0)
1825 # define SYSCALL_RET_SET(_regs, _val)                   \
1826                 TH_LOG("Can't modify syscall return on this architecture")
1827 #elif defined(__xtensa__)
1828 # define ARCH_REGS              struct user_pt_regs
1829 # define SYSCALL_NUM(_regs)     (_regs).syscall
1830 /*
1831  * On xtensa syscall return value is in the register
1832  * a2 of the current window which is not fixed.
1833  */
1834 #define SYSCALL_RET(_regs)      (_regs).a[(_regs).windowbase * 4 + 2]
1835 #elif defined(__sh__)
1836 # define ARCH_REGS              struct pt_regs
1837 # define SYSCALL_NUM(_regs)     (_regs).regs[3]
1838 # define SYSCALL_RET(_regs)     (_regs).regs[0]
1839 #else
1840 # error "Do not know how to find your architecture's registers and syscalls"
1841 #endif
1842
/*
 * Fallback SYSCALL_NUM_SET(): unless an architecture supplied its own
 * version above, changing the syscall number is a plain store through
 * that architecture's SYSCALL_NUM() accessor.
 */
#if !defined(SYSCALL_NUM_SET)
# define SYSCALL_NUM_SET(_regs, _nr)                                    \
        do { SYSCALL_NUM(_regs) = (_nr); } while (0)
#endif
1853 /*
1854  * Most architectures can change the syscall return value by just
1855  * writing to the SYSCALL_RET register. This is the default if not
1856  * defined above. If an architecture cannot set the return value
1857  * (for example when the syscall and return value register is
1858  * shared), report it with TH_LOG() in an arch-specific definition
1859  * of SYSCALL_RET_SET() above, and leave SYSCALL_RET undefined.
1860  */
1861 #if !defined(SYSCALL_RET) && !defined(SYSCALL_RET_SET)
1862 # error "One of SYSCALL_RET or SYSCALL_RET_SET is needed for this arch"
1863 #endif
/* Fallback SYSCALL_RET_SET(): store the value through SYSCALL_RET(). */
#if !defined(SYSCALL_RET_SET)
# define SYSCALL_RET_SET(_regs, _val)                                   \
        do { SYSCALL_RET(_regs) = (_val); } while (0)
#endif
1870
/*
 * EXPECT_SYSCALL_RETURN(val, action): evaluate "action" and check that it
 * produced the result "val".
 *
 * When the syscall return can't be changed (SYSCALL_RET is not defined),
 * the check is stubbed out: "val" is ignored and "action" is only expected
 * to evaluate to -1.
 *
 * Otherwise errno is cleared first. A negative "val" is treated as a
 * negated errno value: "action" must evaluate to -1 and leave errno equal
 * to -(val). A non-negative "val" must equal the value of "action".
 *
 * "val" is expanded more than once, so it must not have side effects. It
 * is parenthesized in the sign test so that any expression (for example a
 * conditional expression) can be passed safely.
 */
#ifndef SYSCALL_RET
# define EXPECT_SYSCALL_RETURN(val, action)     EXPECT_EQ(-1, action)
#else
# define EXPECT_SYSCALL_RETURN(val, action)             \
        do {                                            \
                errno = 0;                              \
                if ((val) < 0) {                        \
                        EXPECT_EQ(-1, action);          \
                        EXPECT_EQ(-(val), errno);       \
                } else {                                \
                        EXPECT_EQ(val, action);         \
                }                                       \
        } while (0)
#endif
1886
/*
 * Which ptrace phase (syscall entry or syscall exit) is used to rewrite
 * the syscall number and the syscall return value.
 *
 * The number is always rewritten on entry. The return value is rewritten
 * on entry as well, except on architectures (e.g. powerpc) that can only
 * set syscall return values on syscall exit during ptrace; those define
 * SYSCALL_RET_SET_ON_PTRACE_EXIT.
 */
const bool ptrace_entry_set_syscall_nr = true;
#ifdef SYSCALL_RET_SET_ON_PTRACE_EXIT
const bool ptrace_entry_set_syscall_ret = false;
#else
const bool ptrace_entry_set_syscall_ret = true;
#endif
1898
/*
 * Register access helpers. Both macros use a variable named "tracee" from
 * the caller's scope and evaluate to the ptrace() return value.
 *
 * Use PTRACE_GETREGS and PTRACE_SETREGS when available. This is useful for
 * architectures without HAVE_ARCH_TRACEHOOK (e.g. User-mode Linux). All
 * other architectures go through the NT_PRSTATUS register set with
 * PTRACE_GETREGSET and PTRACE_SETREGSET, describing the buffer with an
 * iovec that covers the whole of "_regs".
 */
#if defined(__x86_64__) || defined(__i386__) || defined(__mips__)
# define ARCH_GETREGS(_regs)    ptrace(PTRACE_GETREGS, tracee, 0, &(_regs))
# define ARCH_SETREGS(_regs)    ptrace(PTRACE_SETREGS, tracee, 0, &(_regs))
#else
# define ARCH_GETREGS(_regs)    ({                                      \
                struct iovec __iov = {                                  \
                        .iov_base = &(_regs),                           \
                        .iov_len = sizeof(_regs),                       \
                };                                                      \
                ptrace(PTRACE_GETREGSET, tracee, NT_PRSTATUS, &__iov);  \
        })
# define ARCH_SETREGS(_regs)    ({                                      \
                struct iovec __iov = {                                  \
                        .iov_base = &(_regs),                           \
                        .iov_len = sizeof(_regs),                       \
                };                                                      \
                ptrace(PTRACE_SETREGSET, tracee, NT_PRSTATUS, &__iov);  \
        })
#endif
1920
1921 /* Architecture-specific syscall fetching routine. */
1922 int get_syscall(struct __test_metadata *_metadata, pid_t tracee)
1923 {
1924         ARCH_REGS regs;
1925
1926         EXPECT_EQ(0, ARCH_GETREGS(regs)) {
1927                 return -1;
1928         }
1929
1930         return SYSCALL_NUM(regs);
1931 }
1932
1933 /* Architecture-specific syscall changing routine. */
1934 void __change_syscall(struct __test_metadata *_metadata,
1935                     pid_t tracee, long *syscall, long *ret)
1936 {
1937         ARCH_REGS orig, regs;
1938
1939         /* Do not get/set registers if we have nothing to do. */
1940         if (!syscall && !ret)
1941                 return;
1942
1943         EXPECT_EQ(0, ARCH_GETREGS(regs)) {
1944                 return;
1945         }
1946         orig = regs;
1947
1948         if (syscall)
1949                 SYSCALL_NUM_SET(regs, *syscall);
1950
1951         if (ret)
1952                 SYSCALL_RET_SET(regs, *ret);
1953
1954         /* Flush any register changes made. */
1955         if (memcmp(&orig, &regs, sizeof(orig)) != 0)
1956                 EXPECT_EQ(0, ARCH_SETREGS(regs));
1957 }
1958
/*
 * Change only the syscall number of @tracee to @syscall; the syscall
 * return value is left untouched.
 */
void change_syscall_nr(struct __test_metadata *_metadata,
                       pid_t tracee, long syscall)
{
        long new_nr = syscall;

        __change_syscall(_metadata, tracee, &new_nr, NULL);
}
1965
/*
 * Change the syscall return value of @tracee to @ret, and at the same
 * time set the syscall number to -1.
 */
void change_syscall_ret(struct __test_metadata *_metadata,
                        pid_t tracee, long ret)
{
        long skip_nr = -1;
        long new_ret = ret;

        __change_syscall(_metadata, tracee, &skip_nr, &new_ret);
}
1974
1975 void tracer_seccomp(struct __test_metadata *_metadata, pid_t tracee,
1976                     int status, void *args)
1977 {
1978         int ret;
1979         unsigned long msg;
1980
1981         EXPECT_EQ(PTRACE_EVENT_MASK(status), PTRACE_EVENT_SECCOMP) {
1982                 TH_LOG("Unexpected ptrace event: %d", PTRACE_EVENT_MASK(status));
1983                 return;
1984         }
1985
1986         /* Make sure we got the right message. */
1987         ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
1988         EXPECT_EQ(0, ret);
1989
1990         /* Validate and take action on expected syscalls. */
1991         switch (msg) {
1992         case 0x1002:
1993                 /* change getpid to getppid. */
1994                 EXPECT_EQ(__NR_getpid, get_syscall(_metadata, tracee));
1995                 change_syscall_nr(_metadata, tracee, __NR_getppid);
1996                 break;
1997         case 0x1003:
1998                 /* skip gettid with valid return code. */
1999                 EXPECT_EQ(__NR_gettid, get_syscall(_metadata, tracee));
2000                 change_syscall_ret(_metadata, tracee, 45000);
2001                 break;
2002         case 0x1004:
2003                 /* skip openat with error. */
2004                 EXPECT_EQ(__NR_openat, get_syscall(_metadata, tracee));
2005                 change_syscall_ret(_metadata, tracee, -ESRCH);
2006                 break;
2007         case 0x1005:
2008                 /* do nothing (allow getppid) */
2009                 EXPECT_EQ(__NR_getppid, get_syscall(_metadata, tracee));
2010                 break;
2011         default:
2012                 EXPECT_EQ(0, msg) {
2013                         TH_LOG("Unknown PTRACE_GETEVENTMSG: 0x%lx", msg);
2014                         kill(tracee, SIGKILL);
2015                 }
2016         }
2017
2018 }
2019
/* State shared between the TRACE_syscall setup, its tests and the tracer. */
FIXTURE(TRACE_syscall) {
        struct sock_fprog prog;
        /* Value returned by setup_trace_fixture(), passed to teardown. */
        pid_t tracer;
        /* Results of gettid, getpid and getppid recorded during setup. */
        pid_t mytid;
        pid_t mypid;
        pid_t parent;
        /* Syscall number saved by tracer_ptrace() at syscall entry. */
        long syscall_nr;
};
2025
2026 void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee,
2027                    int status, void *args)
2028 {
2029         int ret;
2030         unsigned long msg;
2031         static bool entry;
2032         long syscall_nr_val, syscall_ret_val;
2033         long *syscall_nr = NULL, *syscall_ret = NULL;
2034         FIXTURE_DATA(TRACE_syscall) *self = args;
2035
2036         EXPECT_EQ(WSTOPSIG(status) & 0x80, 0x80) {
2037                 TH_LOG("Unexpected WSTOPSIG: %d", WSTOPSIG(status));
2038                 return;
2039         }
2040
2041         /*
2042          * The traditional way to tell PTRACE_SYSCALL entry/exit
2043          * is by counting.
2044          */
2045         entry = !entry;
2046
2047         /* Make sure we got an appropriate message. */
2048         ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
2049         EXPECT_EQ(0, ret);
2050         EXPECT_EQ(entry ? PTRACE_EVENTMSG_SYSCALL_ENTRY
2051                         : PTRACE_EVENTMSG_SYSCALL_EXIT, msg);
2052
2053         /*
2054          * Some architectures only support setting return values during
2055          * syscall exit under ptrace, and on exit the syscall number may
2056          * no longer be available. Therefore, save the initial sycall
2057          * number here, so it can be examined during both entry and exit
2058          * phases.
2059          */
2060         if (entry)
2061                 self->syscall_nr = get_syscall(_metadata, tracee);
2062
2063         /*
2064          * Depending on the architecture's syscall setting abilities, we
2065          * pick which things to set during this phase (entry or exit).
2066          */
2067         if (entry == ptrace_entry_set_syscall_nr)
2068                 syscall_nr = &syscall_nr_val;
2069         if (entry == ptrace_entry_set_syscall_ret)
2070                 syscall_ret = &syscall_ret_val;
2071
2072         /* Now handle the actual rewriting cases. */
2073         switch (self->syscall_nr) {
2074         case __NR_getpid:
2075                 syscall_nr_val = __NR_getppid;
2076                 /* Never change syscall return for this case. */
2077                 syscall_ret = NULL;
2078                 break;
2079         case __NR_gettid:
2080                 syscall_nr_val = -1;
2081                 syscall_ret_val = 45000;
2082                 break;
2083         case __NR_openat:
2084                 syscall_nr_val = -1;
2085                 syscall_ret_val = -ESRCH;
2086                 break;
2087         default:
2088                 /* Unhandled, do nothing. */
2089                 return;
2090         }
2091
2092         __change_syscall(_metadata, tracee, syscall_nr, syscall_ret);
2093 }
2094
/* Parameter that distinguishes the two TRACE_syscall variants below. */
FIXTURE_VARIANT(TRACE_syscall) {
        /*
         * All of the SECCOMP_RET_TRACE behaviors can be tested with either
         * SECCOMP_RET_TRACE+PTRACE_CONT or plain ptrace()+PTRACE_SYSCALL.
         * This indicates if we should use SECCOMP_RET_TRACE (false), or
         * ptrace (true).
         */
        bool use_ptrace;
};

/* Variant that rewrites syscalls from the plain ptrace tracer. */
FIXTURE_VARIANT_ADD(TRACE_syscall, ptrace) {
        .use_ptrace = true,
};

/* Variant that rewrites syscalls from the SECCOMP_RET_TRACE tracer. */
FIXTURE_VARIANT_ADD(TRACE_syscall, seccomp) {
        .use_ptrace = false,
};
2112
2113 FIXTURE_SETUP(TRACE_syscall)
2114 {
2115         struct sock_filter filter[] = {
2116                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2117                         offsetof(struct seccomp_data, nr)),
2118                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
2119                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1002),
2120                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_gettid, 0, 1),
2121                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1003),
2122                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_openat, 0, 1),
2123                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1004),
2124                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
2125                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1005),
2126                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2127         };
2128         struct sock_fprog prog = {
2129                 .len = (unsigned short)ARRAY_SIZE(filter),
2130                 .filter = filter,
2131         };
2132         long ret;
2133
2134         /* Prepare some testable syscall results. */
2135         self->mytid = syscall(__NR_gettid);
2136         ASSERT_GT(self->mytid, 0);
2137         ASSERT_NE(self->mytid, 1) {
2138                 TH_LOG("Running this test as init is not supported. :)");
2139         }
2140
2141         self->mypid = getpid();
2142         ASSERT_GT(self->mypid, 0);
2143         ASSERT_EQ(self->mytid, self->mypid);
2144
2145         self->parent = getppid();
2146         ASSERT_GT(self->parent, 0);
2147         ASSERT_NE(self->parent, self->mypid);
2148
2149         /* Launch tracer. */
2150         self->tracer = setup_trace_fixture(_metadata,
2151                                            variant->use_ptrace ? tracer_ptrace
2152                                                                : tracer_seccomp,
2153                                            self, variant->use_ptrace);
2154
2155         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
2156         ASSERT_EQ(0, ret);
2157
2158         /* Do not install seccomp rewrite filters, as we'll use ptrace instead. */
2159         if (variant->use_ptrace)
2160                 return;
2161
2162         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
2163         ASSERT_EQ(0, ret);
2164 }
2165
/* Hand the tracer started in setup to teardown_trace_fixture(). */
FIXTURE_TEARDOWN(TRACE_syscall)
{
        teardown_trace_fixture(_metadata, self->tracer);
}
2170
/* Negative syscall numbers must fail with -1 and errno set to ENOSYS. */
TEST(negative_ENOSYS)
{
        /*
         * There should be no difference between an "internal" skip
         * and userspace asking for syscall "-1".
         */
        errno = 0;
        EXPECT_EQ(-1, syscall(-1));
        EXPECT_EQ(errno, ENOSYS);
        /* And no difference for "still not valid but not -1". */
        errno = 0;
        EXPECT_EQ(-1, syscall(-101));
        EXPECT_EQ(errno, ENOSYS);
}
2185
/* Run the same negative-syscall checks again with the TRACE_syscall fixture. */
TEST_F(TRACE_syscall, negative_ENOSYS)
{
        negative_ENOSYS(_metadata);
}
2190
/* A syscall the tracer leaves alone must behave normally. */
TEST_F(TRACE_syscall, syscall_allowed)
{
        /* getppid works as expected (no changes). */
        EXPECT_EQ(self->parent, syscall(__NR_getppid));
        EXPECT_NE(self->mypid, syscall(__NR_getppid));
}
2197
/* The tracer swaps the syscall number, so getpid returns getppid's result. */
TEST_F(TRACE_syscall, syscall_redirected)
{
        /* getpid has been redirected to getppid as expected. */
        EXPECT_EQ(self->parent, syscall(__NR_getpid));
        EXPECT_NE(self->mypid, syscall(__NR_getpid));
}
2204
/* The tracer replaces openat's result with the error -ESRCH. */
TEST_F(TRACE_syscall, syscall_errno)
{
        /* Tracer should skip the open syscall, resulting in ESRCH. */
        EXPECT_SYSCALL_RETURN(-ESRCH, syscall(__NR_openat));
}
2210
/* The tracer replaces gettid's result with the made-up value 45000. */
TEST_F(TRACE_syscall, syscall_faked)
{
        /* Tracer skips the gettid syscall and stores an altered return value. */
        EXPECT_SYSCALL_RETURN(45000, syscall(__NR_gettid));
}
2216
2217 TEST_F_SIGNAL(TRACE_syscall, kill_immediate, SIGSYS)
2218 {
2219         struct sock_filter filter[] = {
2220                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2221                         offsetof(struct seccomp_data, nr)),
2222                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_mknodat, 0, 1),
2223                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL_THREAD),
2224                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2225         };
2226         struct sock_fprog prog = {
2227                 .len = (unsigned short)ARRAY_SIZE(filter),
2228                 .filter = filter,
2229         };
2230         long ret;
2231
2232         /* Install "kill on mknodat" filter. */
2233         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
2234         ASSERT_EQ(0, ret);
2235
2236         /* This should immediately die with SIGSYS, regardless of tracer. */
2237         EXPECT_EQ(-1, syscall(__NR_mknodat, -1, NULL, 0, 0));
2238 }
2239
/*
 * The additional filter must be applied to the syscall the tracer
 * redirected to: getpid becomes getppid, which this filter fails with
 * EPERM.
 */
TEST_F(TRACE_syscall, skip_after)
{
        /* Fail getppid with EPERM, allow everything else. */
        struct sock_filter errno_filter[] = {
                BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
                        offsetof(struct seccomp_data, nr)),
                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | EPERM),
                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
        };
        struct sock_fprog errno_prog = {
                .filter = errno_filter,
                .len = (unsigned short)ARRAY_SIZE(errno_filter),
        };
        /* Install additional "errno on getppid" filter. */
        long ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
                         &errno_prog, 0, 0);

        ASSERT_EQ(0, ret);

        /* Tracer will redirect getpid to getppid, and we should see EPERM. */
        errno = 0;
        EXPECT_EQ(-1, syscall(__NR_getpid));
        EXPECT_EQ(EPERM, errno);
}
2264
2265 TEST_F_SIGNAL(TRACE_syscall, kill_after, SIGSYS)
2266 {
2267         struct sock_filter filter[] = {
2268                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2269                         offsetof(struct seccomp_data, nr)),
2270                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
2271                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
2272                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2273         };
2274         struct sock_fprog prog = {
2275                 .len = (unsigned short)ARRAY_SIZE(filter),
2276                 .filter = filter,
2277         };
2278         long ret;
2279
2280         /* Install additional "death on getppid" filter. */
2281         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
2282         ASSERT_EQ(0, ret);
2283
2284         /* Tracer will redirect getpid to getppid, and we should die. */
2285         EXPECT_NE(self->mypid, syscall(__NR_getpid));
2286 }
2287
2288 TEST(seccomp_syscall)
2289 {
2290         struct sock_filter filter[] = {
2291                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2292         };
2293         struct sock_fprog prog = {
2294                 .len = (unsigned short)ARRAY_SIZE(filter),
2295                 .filter = filter,
2296         };
2297         long ret;
2298
2299         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
2300         ASSERT_EQ(0, ret) {
2301                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2302         }
2303
2304         /* Reject insane operation. */
2305         ret = seccomp(-1, 0, &prog);
2306         ASSERT_NE(ENOSYS, errno) {
2307                 TH_LOG("Kernel does not support seccomp syscall!");
2308         }
2309         EXPECT_EQ(EINVAL, errno) {
2310                 TH_LOG("Did not reject crazy op value!");
2311         }
2312
2313         /* Reject strict with flags or pointer. */
2314         ret = seccomp(SECCOMP_SET_MODE_STRICT, -1, NULL);
2315         EXPECT_EQ(EINVAL, errno) {
2316                 TH_LOG("Did not reject mode strict with flags!");
2317         }
2318         ret = seccomp(SECCOMP_SET_MODE_STRICT, 0, &prog);
2319         EXPECT_EQ(EINVAL, errno) {
2320                 TH_LOG("Did not reject mode strict with uargs!");
2321         }
2322
2323         /* Reject insane args for filter. */
2324         ret = seccomp(SECCOMP_SET_MODE_FILTER, -1, &prog);
2325         EXPECT_EQ(EINVAL, errno) {
2326                 TH_LOG("Did not reject crazy filter flags!");
2327         }
2328         ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, NULL);
2329         EXPECT_EQ(EFAULT, errno) {
2330                 TH_LOG("Did not reject NULL filter!");
2331         }
2332
2333         ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
2334         EXPECT_EQ(0, errno) {
2335                 TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER: %s",
2336                         strerror(errno));
2337         }
2338 }
2339
2340 TEST(seccomp_syscall_mode_lock)
2341 {
2342         struct sock_filter filter[] = {
2343                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2344         };
2345         struct sock_fprog prog = {
2346                 .len = (unsigned short)ARRAY_SIZE(filter),
2347                 .filter = filter,
2348         };
2349         long ret;
2350
2351         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
2352         ASSERT_EQ(0, ret) {
2353                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2354         }
2355
2356         ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
2357         ASSERT_NE(ENOSYS, errno) {
2358                 TH_LOG("Kernel does not support seccomp syscall!");
2359         }
2360         EXPECT_EQ(0, ret) {
2361                 TH_LOG("Could not install filter!");
2362         }
2363
2364         /* Make sure neither entry point will switch to strict. */
2365         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0);
2366         EXPECT_EQ(EINVAL, errno) {
2367                 TH_LOG("Switched to mode strict!");
2368         }
2369
2370         ret = seccomp(SECCOMP_SET_MODE_STRICT, 0, NULL);
2371         EXPECT_EQ(EINVAL, errno) {
2372                 TH_LOG("Switched to mode strict!");
2373         }
2374 }
2375
2376 /*
2377  * Test detection of known and unknown filter flags. Userspace needs to be able
2378  * to check if a filter flag is supported by the current kernel and a good way
2379  * of doing that is by attempting to enter filter mode, with the flag bit in
2380  * question set, and a NULL pointer for the _args_ parameter. EFAULT indicates
2381  * that the flag is valid and EINVAL indicates that the flag is invalid.
2382  */
2383 TEST(detect_seccomp_filter_flags)
2384 {
2385         unsigned int flags[] = { SECCOMP_FILTER_FLAG_TSYNC,
2386                                  SECCOMP_FILTER_FLAG_LOG,
2387                                  SECCOMP_FILTER_FLAG_SPEC_ALLOW,
2388                                  SECCOMP_FILTER_FLAG_NEW_LISTENER,
2389                                  SECCOMP_FILTER_FLAG_TSYNC_ESRCH };
2390         unsigned int exclusive[] = {
2391                                 SECCOMP_FILTER_FLAG_TSYNC,
2392                                 SECCOMP_FILTER_FLAG_NEW_LISTENER };
2393         unsigned int flag, all_flags, exclusive_mask;
2394         int i;
2395         long ret;
2396
2397         /* Test detection of individual known-good filter flags */
2398         for (i = 0, all_flags = 0; i < ARRAY_SIZE(flags); i++) {
2399                 int bits = 0;
2400
2401                 flag = flags[i];
2402                 /* Make sure the flag is a single bit! */
2403                 while (flag) {
2404                         if (flag & 0x1)
2405                                 bits ++;
2406                         flag >>= 1;
2407                 }
2408                 ASSERT_EQ(1, bits);
2409                 flag = flags[i];
2410
2411                 ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2412                 ASSERT_NE(ENOSYS, errno) {
2413                         TH_LOG("Kernel does not support seccomp syscall!");
2414                 }
2415                 EXPECT_EQ(-1, ret);
2416                 EXPECT_EQ(EFAULT, errno) {
2417                         TH_LOG("Failed to detect that a known-good filter flag (0x%X) is supported!",
2418                                flag);
2419                 }
2420
2421                 all_flags |= flag;
2422         }
2423
2424         /*
2425          * Test detection of all known-good filter flags combined. But
2426          * for the exclusive flags we need to mask them out and try them
2427          * individually for the "all flags" testing.
2428          */
2429         exclusive_mask = 0;
2430         for (i = 0; i < ARRAY_SIZE(exclusive); i++)
2431                 exclusive_mask |= exclusive[i];
2432         for (i = 0; i < ARRAY_SIZE(exclusive); i++) {
2433                 flag = all_flags & ~exclusive_mask;
2434                 flag |= exclusive[i];
2435
2436                 ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2437                 EXPECT_EQ(-1, ret);
2438                 EXPECT_EQ(EFAULT, errno) {
2439                         TH_LOG("Failed to detect that all known-good filter flags (0x%X) are supported!",
2440                                flag);
2441                 }
2442         }
2443
2444         /* Test detection of an unknown filter flags, without exclusives. */
2445         flag = -1;
2446         flag &= ~exclusive_mask;
2447         ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2448         EXPECT_EQ(-1, ret);
2449         EXPECT_EQ(EINVAL, errno) {
2450                 TH_LOG("Failed to detect that an unknown filter flag (0x%X) is unsupported!",
2451                        flag);
2452         }
2453
2454         /*
2455          * Test detection of an unknown filter flag that may simply need to be
2456          * added to this test
2457          */
2458         flag = flags[ARRAY_SIZE(flags) - 1] << 1;
2459         ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2460         EXPECT_EQ(-1, ret);
2461         EXPECT_EQ(EINVAL, errno) {
2462                 TH_LOG("Failed to detect that an unknown filter flag (0x%X) is unsupported! Does a new flag need to be added to this test?",
2463                        flag);
2464         }
2465 }
2466
2467 TEST(TSYNC_first)
2468 {
2469         struct sock_filter filter[] = {
2470                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2471         };
2472         struct sock_fprog prog = {
2473                 .len = (unsigned short)ARRAY_SIZE(filter),
2474                 .filter = filter,
2475         };
2476         long ret;
2477
2478         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
2479         ASSERT_EQ(0, ret) {
2480                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2481         }
2482
2483         ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2484                       &prog);
2485         ASSERT_NE(ENOSYS, errno) {
2486                 TH_LOG("Kernel does not support seccomp syscall!");
2487         }
2488         EXPECT_EQ(0, ret) {
2489                 TH_LOG("Could not install initial filter with TSYNC!");
2490         }
2491 }
2492
/* Number of sibling slots kept in the TSYNC fixture. */
#define TSYNC_SIBLINGS 2
/* Per-thread state handed to each TSYNC sibling thread. */
struct tsync_sibling {
        /* pthread handle; zero means there is no thread left to kill. */
        pthread_t tid;
        /* Kernel thread id, recorded by the sibling itself via gettid. */
        pid_t system_tid;
        /* Posted by the sibling once it has started up. */
        sem_t *started;
        /* Condition variable (and its mutex) the sibling waits on. */
        pthread_cond_t *cond;
        pthread_mutex_t *mutex;
        /* Non-zero: the sibling installs @prog itself before waiting. */
        int diverge;
        /* Number of condition wake-ups to consume before proceeding. */
        int num_waits;
        /* Filter program installed by the sibling when @diverge is set. */
        struct sock_fprog *prog;
        /* Test metadata of the owning test. */
        struct __test_metadata *metadata;
};
2505
/*
 * PTHREAD_JOIN(tid, status): join the thread "tid" and, on success, reset
 * "tid" to 0.
 *
 * To avoid joining joined threads (which is not allowed by Bionic),
 * make sure we both successfully join and clear the tid to skip a
 * later join attempt during fixture teardown. Any remaining threads
 * will be directly killed during teardown.
 *
 * "tid" must be a modifiable lvalue and is expanded more than once. A
 * failed join is only logged and leaves "tid" unchanged. The message
 * carries no trailing newline, like every other TH_LOG() call here, and
 * prints the tid through unsigned long so a wide pthread_t is not
 * truncated.
 */
#define PTHREAD_JOIN(tid, status)                                       \
        do {                                                            \
                int _rc = pthread_join((tid), (status));                \
                if (_rc) {                                              \
                        TH_LOG("pthread_join of tid %lu failed: %d",    \
                                (unsigned long)(tid), _rc);             \
                } else {                                                \
                        (tid) = 0;                                      \
                }                                                       \
        } while (0)
2522
2523 FIXTURE(TSYNC) {
2524         struct sock_fprog root_prog, apply_prog;
2525         struct tsync_sibling sibling[TSYNC_SIBLINGS];
2526         sem_t started;
2527         pthread_cond_t cond;
2528         pthread_mutex_t mutex;
2529         int sibling_count;
2530 };
2531
2532 FIXTURE_SETUP(TSYNC)
2533 {
2534         struct sock_filter root_filter[] = {
2535                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2536         };
2537         struct sock_filter apply_filter[] = {
2538                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2539                         offsetof(struct seccomp_data, nr)),
2540                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
2541                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
2542                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2543         };
2544
2545         memset(&self->root_prog, 0, sizeof(self->root_prog));
2546         memset(&self->apply_prog, 0, sizeof(self->apply_prog));
2547         memset(&self->sibling, 0, sizeof(self->sibling));
2548         self->root_prog.filter = malloc(sizeof(root_filter));
2549         ASSERT_NE(NULL, self->root_prog.filter);
2550         memcpy(self->root_prog.filter, &root_filter, sizeof(root_filter));
2551         self->root_prog.len = (unsigned short)ARRAY_SIZE(root_filter);
2552
2553         self->apply_prog.filter = malloc(sizeof(apply_filter));
2554         ASSERT_NE(NULL, self->apply_prog.filter);
2555         memcpy(self->apply_prog.filter, &apply_filter, sizeof(apply_filter));
2556         self->apply_prog.len = (unsigned short)ARRAY_SIZE(apply_filter);
2557
2558         self->sibling_count = 0;
2559         pthread_mutex_init(&self->mutex, NULL);
2560         pthread_cond_init(&self->cond, NULL);
2561         sem_init(&self->started, 0, 0);
2562         self->sibling[0].tid = 0;
2563         self->sibling[0].cond = &self->cond;
2564         self->sibling[0].started = &self->started;
2565         self->sibling[0].mutex = &self->mutex;
2566         self->sibling[0].diverge = 0;
2567         self->sibling[0].num_waits = 1;
2568         self->sibling[0].prog = &self->root_prog;
2569         self->sibling[0].metadata = _metadata;
2570         self->sibling[1].tid = 0;
2571         self->sibling[1].cond = &self->cond;
2572         self->sibling[1].started = &self->started;
2573         self->sibling[1].mutex = &self->mutex;
2574         self->sibling[1].diverge = 0;
2575         self->sibling[1].prog = &self->root_prog;
2576         self->sibling[1].num_waits = 1;
2577         self->sibling[1].metadata = _metadata;
2578 }
2579
2580 FIXTURE_TEARDOWN(TSYNC)
2581 {
2582         int sib = 0;
2583
2584         if (self->root_prog.filter)
2585                 free(self->root_prog.filter);
2586         if (self->apply_prog.filter)
2587                 free(self->apply_prog.filter);
2588
2589         for ( ; sib < self->sibling_count; ++sib) {
2590                 struct tsync_sibling *s = &self->sibling[sib];
2591
2592                 if (!s->tid)
2593                         continue;
2594                 /*
2595                  * If a thread is still running, it may be stuck, so hit
2596                  * it over the head really hard.
2597                  */
2598                 pthread_kill(s->tid, 9);
2599         }
2600         pthread_mutex_destroy(&self->mutex);
2601         pthread_cond_destroy(&self->cond);
2602         sem_destroy(&self->started);
2603 }
2604
2605 void *tsync_sibling(void *data)
2606 {
2607         long ret = 0;
2608         struct tsync_sibling *me = data;
2609
2610         me->system_tid = syscall(__NR_gettid);
2611
2612         pthread_mutex_lock(me->mutex);
2613         if (me->diverge) {
2614                 /* Just re-apply the root prog to fork the tree */
2615                 ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
2616                                 me->prog, 0, 0);
2617         }
2618         sem_post(me->started);
2619         /* Return outside of started so parent notices failures. */
2620         if (ret) {
2621                 pthread_mutex_unlock(me->mutex);
2622                 return (void *)SIBLING_EXIT_FAILURE;
2623         }
2624         do {
2625                 pthread_cond_wait(me->cond, me->mutex);
2626                 me->num_waits = me->num_waits - 1;
2627         } while (me->num_waits);
2628         pthread_mutex_unlock(me->mutex);
2629
2630         ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
2631         if (!ret)
2632                 return (void *)SIBLING_EXIT_NEWPRIVS;
2633         read(-1, NULL, 0);
2634         return (void *)SIBLING_EXIT_UNKILLED;
2635 }
2636
2637 void tsync_start_sibling(struct tsync_sibling *sibling)
2638 {
2639         pthread_create(&sibling->tid, NULL, tsync_sibling, (void *)sibling);
2640 }
2641
2642 TEST_F(TSYNC, siblings_fail_prctl)
2643 {
2644         long ret;
2645         void *status;
2646         struct sock_filter filter[] = {
2647                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2648                         offsetof(struct seccomp_data, nr)),
2649                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
2650                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | EINVAL),
2651                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2652         };
2653         struct sock_fprog prog = {
2654                 .len = (unsigned short)ARRAY_SIZE(filter),
2655                 .filter = filter,
2656         };
2657
2658         ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2659                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2660         }
2661
2662         /* Check prctl failure detection by requesting sib 0 diverge. */
2663         ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
2664         ASSERT_NE(ENOSYS, errno) {
2665                 TH_LOG("Kernel does not support seccomp syscall!");
2666         }
2667         ASSERT_EQ(0, ret) {
2668                 TH_LOG("setting filter failed");
2669         }
2670
2671         self->sibling[0].diverge = 1;
2672         tsync_start_sibling(&self->sibling[0]);
2673         tsync_start_sibling(&self->sibling[1]);
2674
2675         while (self->sibling_count < TSYNC_SIBLINGS) {
2676                 sem_wait(&self->started);
2677                 self->sibling_count++;
2678         }
2679
2680         /* Signal the threads to clean up*/
2681         pthread_mutex_lock(&self->mutex);
2682         ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2683                 TH_LOG("cond broadcast non-zero");
2684         }
2685         pthread_mutex_unlock(&self->mutex);
2686
2687         /* Ensure diverging sibling failed to call prctl. */
2688         PTHREAD_JOIN(self->sibling[0].tid, &status);
2689         EXPECT_EQ(SIBLING_EXIT_FAILURE, (long)status);
2690         PTHREAD_JOIN(self->sibling[1].tid, &status);
2691         EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2692 }
2693
2694 TEST_F(TSYNC, two_siblings_with_ancestor)
2695 {
2696         long ret;
2697         void *status;
2698
2699         ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2700                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2701         }
2702
2703         ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2704         ASSERT_NE(ENOSYS, errno) {
2705                 TH_LOG("Kernel does not support seccomp syscall!");
2706         }
2707         ASSERT_EQ(0, ret) {
2708                 TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2709         }
2710         tsync_start_sibling(&self->sibling[0]);
2711         tsync_start_sibling(&self->sibling[1]);
2712
2713         while (self->sibling_count < TSYNC_SIBLINGS) {
2714                 sem_wait(&self->started);
2715                 self->sibling_count++;
2716         }
2717
2718         ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2719                       &self->apply_prog);
2720         ASSERT_EQ(0, ret) {
2721                 TH_LOG("Could install filter on all threads!");
2722         }
2723         /* Tell the siblings to test the policy */
2724         pthread_mutex_lock(&self->mutex);
2725         ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2726                 TH_LOG("cond broadcast non-zero");
2727         }
2728         pthread_mutex_unlock(&self->mutex);
2729         /* Ensure they are both killed and don't exit cleanly. */
2730         PTHREAD_JOIN(self->sibling[0].tid, &status);
2731         EXPECT_EQ(0x0, (long)status);
2732         PTHREAD_JOIN(self->sibling[1].tid, &status);
2733         EXPECT_EQ(0x0, (long)status);
2734 }
2735
2736 TEST_F(TSYNC, two_sibling_want_nnp)
2737 {
2738         void *status;
2739
2740         /* start siblings before any prctl() operations */
2741         tsync_start_sibling(&self->sibling[0]);
2742         tsync_start_sibling(&self->sibling[1]);
2743         while (self->sibling_count < TSYNC_SIBLINGS) {
2744                 sem_wait(&self->started);
2745                 self->sibling_count++;
2746         }
2747
2748         /* Tell the siblings to test no policy */
2749         pthread_mutex_lock(&self->mutex);
2750         ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2751                 TH_LOG("cond broadcast non-zero");
2752         }
2753         pthread_mutex_unlock(&self->mutex);
2754
2755         /* Ensure they are both upset about lacking nnp. */
2756         PTHREAD_JOIN(self->sibling[0].tid, &status);
2757         EXPECT_EQ(SIBLING_EXIT_NEWPRIVS, (long)status);
2758         PTHREAD_JOIN(self->sibling[1].tid, &status);
2759         EXPECT_EQ(SIBLING_EXIT_NEWPRIVS, (long)status);
2760 }
2761
2762 TEST_F(TSYNC, two_siblings_with_no_filter)
2763 {
2764         long ret;
2765         void *status;
2766
2767         /* start siblings before any prctl() operations */
2768         tsync_start_sibling(&self->sibling[0]);
2769         tsync_start_sibling(&self->sibling[1]);
2770         while (self->sibling_count < TSYNC_SIBLINGS) {
2771                 sem_wait(&self->started);
2772                 self->sibling_count++;
2773         }
2774
2775         ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2776                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2777         }
2778
2779         ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2780                       &self->apply_prog);
2781         ASSERT_NE(ENOSYS, errno) {
2782                 TH_LOG("Kernel does not support seccomp syscall!");
2783         }
2784         ASSERT_EQ(0, ret) {
2785                 TH_LOG("Could install filter on all threads!");
2786         }
2787
2788         /* Tell the siblings to test the policy */
2789         pthread_mutex_lock(&self->mutex);
2790         ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2791                 TH_LOG("cond broadcast non-zero");
2792         }
2793         pthread_mutex_unlock(&self->mutex);
2794
2795         /* Ensure they are both killed and don't exit cleanly. */
2796         PTHREAD_JOIN(self->sibling[0].tid, &status);
2797         EXPECT_EQ(0x0, (long)status);
2798         PTHREAD_JOIN(self->sibling[1].tid, &status);
2799         EXPECT_EQ(0x0, (long)status);
2800 }
2801
2802 TEST_F(TSYNC, two_siblings_with_one_divergence)
2803 {
2804         long ret;
2805         void *status;
2806
2807         ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2808                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2809         }
2810
2811         ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2812         ASSERT_NE(ENOSYS, errno) {
2813                 TH_LOG("Kernel does not support seccomp syscall!");
2814         }
2815         ASSERT_EQ(0, ret) {
2816                 TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2817         }
2818         self->sibling[0].diverge = 1;
2819         tsync_start_sibling(&self->sibling[0]);
2820         tsync_start_sibling(&self->sibling[1]);
2821
2822         while (self->sibling_count < TSYNC_SIBLINGS) {
2823                 sem_wait(&self->started);
2824                 self->sibling_count++;
2825         }
2826
2827         ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2828                       &self->apply_prog);
2829         ASSERT_EQ(self->sibling[0].system_tid, ret) {
2830                 TH_LOG("Did not fail on diverged sibling.");
2831         }
2832
2833         /* Wake the threads */
2834         pthread_mutex_lock(&self->mutex);
2835         ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2836                 TH_LOG("cond broadcast non-zero");
2837         }
2838         pthread_mutex_unlock(&self->mutex);
2839
2840         /* Ensure they are both unkilled. */
2841         PTHREAD_JOIN(self->sibling[0].tid, &status);
2842         EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2843         PTHREAD_JOIN(self->sibling[1].tid, &status);
2844         EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2845 }
2846
2847 TEST_F(TSYNC, two_siblings_with_one_divergence_no_tid_in_err)
2848 {
2849         long ret, flags;
2850         void *status;
2851
2852         ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2853                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2854         }
2855
2856         ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2857         ASSERT_NE(ENOSYS, errno) {
2858                 TH_LOG("Kernel does not support seccomp syscall!");
2859         }
2860         ASSERT_EQ(0, ret) {
2861                 TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2862         }
2863         self->sibling[0].diverge = 1;
2864         tsync_start_sibling(&self->sibling[0]);
2865         tsync_start_sibling(&self->sibling[1]);
2866
2867         while (self->sibling_count < TSYNC_SIBLINGS) {
2868                 sem_wait(&self->started);
2869                 self->sibling_count++;
2870         }
2871
2872         flags = SECCOMP_FILTER_FLAG_TSYNC | \
2873                 SECCOMP_FILTER_FLAG_TSYNC_ESRCH;
2874         ret = seccomp(SECCOMP_SET_MODE_FILTER, flags, &self->apply_prog);
2875         ASSERT_EQ(ESRCH, errno) {
2876                 TH_LOG("Did not return ESRCH for diverged sibling.");
2877         }
2878         ASSERT_EQ(-1, ret) {
2879                 TH_LOG("Did not fail on diverged sibling.");
2880         }
2881
2882         /* Wake the threads */
2883         pthread_mutex_lock(&self->mutex);
2884         ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2885                 TH_LOG("cond broadcast non-zero");
2886         }
2887         pthread_mutex_unlock(&self->mutex);
2888
2889         /* Ensure they are both unkilled. */
2890         PTHREAD_JOIN(self->sibling[0].tid, &status);
2891         EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2892         PTHREAD_JOIN(self->sibling[1].tid, &status);
2893         EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2894 }
2895
2896 TEST_F(TSYNC, two_siblings_not_under_filter)
2897 {
2898         long ret, sib;
2899         void *status;
2900         struct timespec delay = { .tv_nsec = 100000000 };
2901
2902         ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2903                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2904         }
2905
2906         /*
2907          * Sibling 0 will have its own seccomp policy
2908          * and Sibling 1 will not be under seccomp at
2909          * all. Sibling 1 will enter seccomp and 0
2910          * will cause failure.
2911          */
2912         self->sibling[0].diverge = 1;
2913         tsync_start_sibling(&self->sibling[0]);
2914         tsync_start_sibling(&self->sibling[1]);
2915
2916         while (self->sibling_count < TSYNC_SIBLINGS) {
2917                 sem_wait(&self->started);
2918                 self->sibling_count++;
2919         }
2920
2921         ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2922         ASSERT_NE(ENOSYS, errno) {
2923                 TH_LOG("Kernel does not support seccomp syscall!");
2924         }
2925         ASSERT_EQ(0, ret) {
2926                 TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2927         }
2928
2929         ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2930                       &self->apply_prog);
2931         ASSERT_EQ(ret, self->sibling[0].system_tid) {
2932                 TH_LOG("Did not fail on diverged sibling.");
2933         }
2934         sib = 1;
2935         if (ret == self->sibling[0].system_tid)
2936                 sib = 0;
2937
2938         pthread_mutex_lock(&self->mutex);
2939
2940         /* Increment the other siblings num_waits so we can clean up
2941          * the one we just saw.
2942          */
2943         self->sibling[!sib].num_waits += 1;
2944
2945         /* Signal the thread to clean up*/
2946         ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2947                 TH_LOG("cond broadcast non-zero");
2948         }
2949         pthread_mutex_unlock(&self->mutex);
2950         PTHREAD_JOIN(self->sibling[sib].tid, &status);
2951         EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2952         /* Poll for actual task death. pthread_join doesn't guarantee it. */
2953         while (!kill(self->sibling[sib].system_tid, 0))
2954                 nanosleep(&delay, NULL);
2955         /* Switch to the remaining sibling */
2956         sib = !sib;
2957
2958         ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2959                       &self->apply_prog);
2960         ASSERT_EQ(0, ret) {
2961                 TH_LOG("Expected the remaining sibling to sync");
2962         };
2963
2964         pthread_mutex_lock(&self->mutex);
2965
2966         /* If remaining sibling didn't have a chance to wake up during
2967          * the first broadcast, manually reduce the num_waits now.
2968          */
2969         if (self->sibling[sib].num_waits > 1)
2970                 self->sibling[sib].num_waits = 1;
2971         ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2972                 TH_LOG("cond broadcast non-zero");
2973         }
2974         pthread_mutex_unlock(&self->mutex);
2975         PTHREAD_JOIN(self->sibling[sib].tid, &status);
2976         EXPECT_EQ(0, (long)status);
2977         /* Poll for actual task death. pthread_join doesn't guarantee it. */
2978         while (!kill(self->sibling[sib].system_tid, 0))
2979                 nanosleep(&delay, NULL);
2980
2981         ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2982                       &self->apply_prog);
2983         ASSERT_EQ(0, ret);  /* just us chickens */
2984 }
2985
2986 /* Make sure restarted syscalls are seen directly as "restart_syscall". */
2987 TEST(syscall_restart)
2988 {
2989         long ret;
2990         unsigned long msg;
2991         pid_t child_pid;
2992         int pipefd[2];
2993         int status;
2994         siginfo_t info = { };
2995         struct sock_filter filter[] = {
2996                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2997                          offsetof(struct seccomp_data, nr)),
2998
2999 #ifdef __NR_sigreturn
3000                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_sigreturn, 7, 0),
3001 #endif
3002                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 6, 0),
3003                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_exit, 5, 0),
3004                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_rt_sigreturn, 4, 0),
3005                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_nanosleep, 5, 0),
3006                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_clock_nanosleep, 4, 0),
3007                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_restart_syscall, 4, 0),
3008
3009                 /* Allow __NR_write for easy logging. */
3010                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_write, 0, 1),
3011                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3012                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
3013                 /* The nanosleep jump target. */
3014                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE|0x100),
3015                 /* The restart_syscall jump target. */
3016                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE|0x200),
3017         };
3018         struct sock_fprog prog = {
3019                 .len = (unsigned short)ARRAY_SIZE(filter),
3020                 .filter = filter,
3021         };
3022 #if defined(__arm__)
3023         struct utsname utsbuf;
3024 #endif
3025
3026         ASSERT_EQ(0, pipe(pipefd));
3027
3028         child_pid = fork();
3029         ASSERT_LE(0, child_pid);
3030         if (child_pid == 0) {
3031                 /* Child uses EXPECT not ASSERT to deliver status correctly. */
3032                 char buf = ' ';
3033                 struct timespec timeout = { };
3034
3035                 /* Attach parent as tracer and stop. */
3036                 EXPECT_EQ(0, ptrace(PTRACE_TRACEME));
3037                 EXPECT_EQ(0, raise(SIGSTOP));
3038
3039                 EXPECT_EQ(0, close(pipefd[1]));
3040
3041                 EXPECT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
3042                         TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3043                 }
3044
3045                 ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
3046                 EXPECT_EQ(0, ret) {
3047                         TH_LOG("Failed to install filter!");
3048                 }
3049
3050                 EXPECT_EQ(1, read(pipefd[0], &buf, 1)) {
3051                         TH_LOG("Failed to read() sync from parent");
3052                 }
3053                 EXPECT_EQ('.', buf) {
3054                         TH_LOG("Failed to get sync data from read()");
3055                 }
3056
3057                 /* Start nanosleep to be interrupted. */
3058                 timeout.tv_sec = 1;
3059                 errno = 0;
3060                 EXPECT_EQ(0, nanosleep(&timeout, NULL)) {
3061                         TH_LOG("Call to nanosleep() failed (errno %d)", errno);
3062                 }
3063
3064                 /* Read final sync from parent. */
3065                 EXPECT_EQ(1, read(pipefd[0], &buf, 1)) {
3066                         TH_LOG("Failed final read() from parent");
3067                 }
3068                 EXPECT_EQ('!', buf) {
3069                         TH_LOG("Failed to get final data from read()");
3070                 }
3071
3072                 /* Directly report the status of our test harness results. */
3073                 syscall(__NR_exit, _metadata->passed ? EXIT_SUCCESS
3074                                                      : EXIT_FAILURE);
3075         }
3076         EXPECT_EQ(0, close(pipefd[0]));
3077
3078         /* Attach to child, setup options, and release. */
3079         ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3080         ASSERT_EQ(true, WIFSTOPPED(status));
3081         ASSERT_EQ(0, ptrace(PTRACE_SETOPTIONS, child_pid, NULL,
3082                             PTRACE_O_TRACESECCOMP));
3083         ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
3084         ASSERT_EQ(1, write(pipefd[1], ".", 1));
3085
3086         /* Wait for nanosleep() to start. */
3087         ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3088         ASSERT_EQ(true, WIFSTOPPED(status));
3089         ASSERT_EQ(SIGTRAP, WSTOPSIG(status));
3090         ASSERT_EQ(PTRACE_EVENT_SECCOMP, (status >> 16));
3091         ASSERT_EQ(0, ptrace(PTRACE_GETEVENTMSG, child_pid, NULL, &msg));
3092         ASSERT_EQ(0x100, msg);
3093         ret = get_syscall(_metadata, child_pid);
3094         EXPECT_TRUE(ret == __NR_nanosleep || ret == __NR_clock_nanosleep);
3095
3096         /* Might as well check siginfo for sanity while we're here. */
3097         ASSERT_EQ(0, ptrace(PTRACE_GETSIGINFO, child_pid, NULL, &info));
3098         ASSERT_EQ(SIGTRAP, info.si_signo);
3099         ASSERT_EQ(SIGTRAP | (PTRACE_EVENT_SECCOMP << 8), info.si_code);
3100         EXPECT_EQ(0, info.si_errno);
3101         EXPECT_EQ(getuid(), info.si_uid);
3102         /* Verify signal delivery came from child (seccomp-triggered). */
3103         EXPECT_EQ(child_pid, info.si_pid);
3104
3105         /* Interrupt nanosleep with SIGSTOP (which we'll need to handle). */
3106         ASSERT_EQ(0, kill(child_pid, SIGSTOP));
3107         ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
3108         ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3109         ASSERT_EQ(true, WIFSTOPPED(status));
3110         ASSERT_EQ(SIGSTOP, WSTOPSIG(status));
3111         ASSERT_EQ(0, ptrace(PTRACE_GETSIGINFO, child_pid, NULL, &info));
3112         /*
3113          * There is no siginfo on SIGSTOP any more, so we can't verify
3114          * signal delivery came from parent now (getpid() == info.si_pid).
3115          * https://lkml.kernel.org/r/CAGXu5jJaZAOzP1qFz66tYrtbuywqb+UN2SOA1VLHpCCOiYvYeg@mail.gmail.com
3116          * At least verify the SIGSTOP via PTRACE_GETSIGINFO.
3117          */
3118         EXPECT_EQ(SIGSTOP, info.si_signo);
3119
3120         /* Restart nanosleep with SIGCONT, which triggers restart_syscall. */
3121         ASSERT_EQ(0, kill(child_pid, SIGCONT));
3122         ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
3123         ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3124         ASSERT_EQ(true, WIFSTOPPED(status));
3125         ASSERT_EQ(SIGCONT, WSTOPSIG(status));
3126         ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
3127
3128         /* Wait for restart_syscall() to start. */
3129         ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3130         ASSERT_EQ(true, WIFSTOPPED(status));
3131         ASSERT_EQ(SIGTRAP, WSTOPSIG(status));
3132         ASSERT_EQ(PTRACE_EVENT_SECCOMP, (status >> 16));
3133         ASSERT_EQ(0, ptrace(PTRACE_GETEVENTMSG, child_pid, NULL, &msg));
3134
3135         ASSERT_EQ(0x200, msg);
3136         ret = get_syscall(_metadata, child_pid);
3137 #if defined(__arm__)
3138         /*
3139          * FIXME:
3140          * - native ARM registers do NOT expose true syscall.
3141          * - compat ARM registers on ARM64 DO expose true syscall.
3142          */
3143         ASSERT_EQ(0, uname(&utsbuf));
3144         if (strncmp(utsbuf.machine, "arm", 3) == 0) {
3145                 EXPECT_EQ(__NR_nanosleep, ret);
3146         } else
3147 #endif
3148         {
3149                 EXPECT_EQ(__NR_restart_syscall, ret);
3150         }
3151
3152         /* Write again to end test. */
3153         ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
3154         ASSERT_EQ(1, write(pipefd[1], "!", 1));
3155         EXPECT_EQ(0, close(pipefd[1]));
3156
3157         ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3158         if (WIFSIGNALED(status) || WEXITSTATUS(status))
3159                 _metadata->passed = 0;
3160 }
3161
3162 TEST_SIGNAL(filter_flag_log, SIGSYS)
3163 {
3164         struct sock_filter allow_filter[] = {
3165                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3166         };
3167         struct sock_filter kill_filter[] = {
3168                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
3169                         offsetof(struct seccomp_data, nr)),
3170                 BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
3171                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
3172                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3173         };
3174         struct sock_fprog allow_prog = {
3175                 .len = (unsigned short)ARRAY_SIZE(allow_filter),
3176                 .filter = allow_filter,
3177         };
3178         struct sock_fprog kill_prog = {
3179                 .len = (unsigned short)ARRAY_SIZE(kill_filter),
3180                 .filter = kill_filter,
3181         };
3182         long ret;
3183         pid_t parent = getppid();
3184
3185         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3186         ASSERT_EQ(0, ret);
3187
3188         /* Verify that the FILTER_FLAG_LOG flag isn't accepted in strict mode */
3189         ret = seccomp(SECCOMP_SET_MODE_STRICT, SECCOMP_FILTER_FLAG_LOG,
3190                       &allow_prog);
3191         ASSERT_NE(ENOSYS, errno) {
3192                 TH_LOG("Kernel does not support seccomp syscall!");
3193         }
3194         EXPECT_NE(0, ret) {
3195                 TH_LOG("Kernel accepted FILTER_FLAG_LOG flag in strict mode!");
3196         }
3197         EXPECT_EQ(EINVAL, errno) {
3198                 TH_LOG("Kernel returned unexpected errno for FILTER_FLAG_LOG flag in strict mode!");
3199         }
3200
3201         /* Verify that a simple, permissive filter can be added with no flags */
3202         ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &allow_prog);
3203         EXPECT_EQ(0, ret);
3204
3205         /* See if the same filter can be added with the FILTER_FLAG_LOG flag */
3206         ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG,
3207                       &allow_prog);
3208         ASSERT_NE(EINVAL, errno) {
3209                 TH_LOG("Kernel does not support the FILTER_FLAG_LOG flag!");
3210         }
3211         EXPECT_EQ(0, ret);
3212
3213         /* Ensure that the kill filter works with the FILTER_FLAG_LOG flag */
3214         ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG,
3215                       &kill_prog);
3216         EXPECT_EQ(0, ret);
3217
3218         EXPECT_EQ(parent, syscall(__NR_getppid));
3219         /* getpid() should never return. */
3220         EXPECT_EQ(0, syscall(__NR_getpid));
3221 }
3222
3223 TEST(get_action_avail)
3224 {
3225         __u32 actions[] = { SECCOMP_RET_KILL_THREAD, SECCOMP_RET_TRAP,
3226                             SECCOMP_RET_ERRNO, SECCOMP_RET_TRACE,
3227                             SECCOMP_RET_LOG,   SECCOMP_RET_ALLOW };
3228         __u32 unknown_action = 0x10000000U;
3229         int i;
3230         long ret;
3231
3232         ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &actions[0]);
3233         ASSERT_NE(ENOSYS, errno) {
3234                 TH_LOG("Kernel does not support seccomp syscall!");
3235         }
3236         ASSERT_NE(EINVAL, errno) {
3237                 TH_LOG("Kernel does not support SECCOMP_GET_ACTION_AVAIL operation!");
3238         }
3239         EXPECT_EQ(ret, 0);
3240
3241         for (i = 0; i < ARRAY_SIZE(actions); i++) {
3242                 ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &actions[i]);
3243                 EXPECT_EQ(ret, 0) {
3244                         TH_LOG("Expected action (0x%X) not available!",
3245                                actions[i]);
3246                 }
3247         }
3248
3249         /* Check that an unknown action is handled properly (EOPNOTSUPP) */
3250         ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &unknown_action);
3251         EXPECT_EQ(ret, -1);
3252         EXPECT_EQ(errno, EOPNOTSUPP);
3253 }
3254
3255 TEST(get_metadata)
3256 {
3257         pid_t pid;
3258         int pipefd[2];
3259         char buf;
3260         struct seccomp_metadata md;
3261         long ret;
3262
3263         /* Only real root can get metadata. */
3264         if (geteuid()) {
3265                 SKIP(return, "get_metadata requires real root");
3266                 return;
3267         }
3268
3269         ASSERT_EQ(0, pipe(pipefd));
3270
3271         pid = fork();
3272         ASSERT_GE(pid, 0);
3273         if (pid == 0) {
3274                 struct sock_filter filter[] = {
3275                         BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3276                 };
3277                 struct sock_fprog prog = {
3278                         .len = (unsigned short)ARRAY_SIZE(filter),
3279                         .filter = filter,
3280                 };
3281
3282                 /* one with log, one without */
3283                 EXPECT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER,
3284                                      SECCOMP_FILTER_FLAG_LOG, &prog));
3285                 EXPECT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog));
3286
3287                 EXPECT_EQ(0, close(pipefd[0]));
3288                 ASSERT_EQ(1, write(pipefd[1], "1", 1));
3289                 ASSERT_EQ(0, close(pipefd[1]));
3290
3291                 while (1)
3292                         sleep(100);
3293         }
3294
3295         ASSERT_EQ(0, close(pipefd[1]));
3296         ASSERT_EQ(1, read(pipefd[0], &buf, 1));
3297
3298         ASSERT_EQ(0, ptrace(PTRACE_ATTACH, pid));
3299         ASSERT_EQ(pid, waitpid(pid, NULL, 0));
3300
3301         /* Past here must not use ASSERT or child process is never killed. */
3302
3303         md.filter_off = 0;
3304         errno = 0;
3305         ret = ptrace(PTRACE_SECCOMP_GET_METADATA, pid, sizeof(md), &md);
3306         EXPECT_EQ(sizeof(md), ret) {
3307                 if (errno == EINVAL)
3308                         SKIP(goto skip, "Kernel does not support PTRACE_SECCOMP_GET_METADATA (missing CONFIG_CHECKPOINT_RESTORE?)");
3309         }
3310
3311         EXPECT_EQ(md.flags, SECCOMP_FILTER_FLAG_LOG);
3312         EXPECT_EQ(md.filter_off, 0);
3313
3314         md.filter_off = 1;
3315         ret = ptrace(PTRACE_SECCOMP_GET_METADATA, pid, sizeof(md), &md);
3316         EXPECT_EQ(sizeof(md), ret);
3317         EXPECT_EQ(md.flags, 0);
3318         EXPECT_EQ(md.filter_off, 1);
3319
3320 skip:
3321         ASSERT_EQ(0, kill(pid, SIGKILL));
3322 }
3323
/*
 * Install a seccomp filter that answers syscall @nr with
 * SECCOMP_RET_USER_NOTIF and allows every other syscall.
 *
 * @nr:    syscall number to match.
 * @flags: passed unchanged as the flags argument of
 *         seccomp(SECCOMP_SET_MODE_FILTER, ...).
 *
 * Returns the seccomp() result unchanged.
 */
static int user_notif_syscall(int nr, unsigned int flags)
{
        struct sock_filter notify_filter[] = {
                /* A = syscall number */
                BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
                        offsetof(struct seccomp_data, nr)),
                BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, nr, 0, 1),
                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_USER_NOTIF),
                BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
        };
        struct sock_fprog notify_prog = {
                .filter = notify_filter,
                .len = (unsigned short)ARRAY_SIZE(notify_filter),
        };

        return seccomp(SECCOMP_SET_MODE_FILTER, flags, &notify_prog);
}

/*
 * Value a child in user_notification_basic expects its getppid() call to
 * return.
 */
#define USER_NOTIF_MAGIC INT_MAX
3343 TEST(user_notification_basic)
3344 {
3345         pid_t pid;
3346         long ret;
3347         int status, listener;
3348         struct seccomp_notif req = {};
3349         struct seccomp_notif_resp resp = {};
3350         struct pollfd pollfd;
3351
3352         struct sock_filter filter[] = {
3353                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3354         };
3355         struct sock_fprog prog = {
3356                 .len = (unsigned short)ARRAY_SIZE(filter),
3357                 .filter = filter,
3358         };
3359
3360         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3361         ASSERT_EQ(0, ret) {
3362                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3363         }
3364
3365         pid = fork();
3366         ASSERT_GE(pid, 0);
3367
3368         /* Check that we get -ENOSYS with no listener attached */
3369         if (pid == 0) {
3370                 if (user_notif_syscall(__NR_getppid, 0) < 0)
3371                         exit(1);
3372                 ret = syscall(__NR_getppid);
3373                 exit(ret >= 0 || errno != ENOSYS);
3374         }
3375
3376         EXPECT_EQ(waitpid(pid, &status, 0), pid);
3377         EXPECT_EQ(true, WIFEXITED(status));
3378         EXPECT_EQ(0, WEXITSTATUS(status));
3379
3380         /* Add some no-op filters for grins. */
3381         EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3382         EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3383         EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3384         EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3385
3386         /* Check that the basic notification machinery works */
3387         listener = user_notif_syscall(__NR_getppid,
3388                                       SECCOMP_FILTER_FLAG_NEW_LISTENER);
3389         ASSERT_GE(listener, 0);
3390
3391         /* Installing a second listener in the chain should EBUSY */
3392         EXPECT_EQ(user_notif_syscall(__NR_getppid,
3393                                      SECCOMP_FILTER_FLAG_NEW_LISTENER),
3394                   -1);
3395         EXPECT_EQ(errno, EBUSY);
3396
3397         pid = fork();
3398         ASSERT_GE(pid, 0);
3399
3400         if (pid == 0) {
3401                 ret = syscall(__NR_getppid);
3402                 exit(ret != USER_NOTIF_MAGIC);
3403         }
3404
3405         pollfd.fd = listener;
3406         pollfd.events = POLLIN | POLLOUT;
3407
3408         EXPECT_GT(poll(&pollfd, 1, -1), 0);
3409         EXPECT_EQ(pollfd.revents, POLLIN);
3410
3411         /* Test that we can't pass garbage to the kernel. */
3412         memset(&req, 0, sizeof(req));
3413         req.pid = -1;
3414         errno = 0;
3415         ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req);
3416         EXPECT_EQ(-1, ret);
3417         EXPECT_EQ(EINVAL, errno);
3418
3419         if (ret) {
3420                 req.pid = 0;
3421                 EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3422         }
3423
3424         pollfd.fd = listener;
3425         pollfd.events = POLLIN | POLLOUT;
3426
3427         EXPECT_GT(poll(&pollfd, 1, -1), 0);
3428         EXPECT_EQ(pollfd.revents, POLLOUT);
3429
3430         EXPECT_EQ(req.data.nr,  __NR_getppid);
3431
3432         resp.id = req.id;
3433         resp.error = 0;
3434         resp.val = USER_NOTIF_MAGIC;
3435
3436         /* check that we make sure flags == 0 */
3437         resp.flags = 1;
3438         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
3439         EXPECT_EQ(errno, EINVAL);
3440
3441         resp.flags = 0;
3442         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3443
3444         EXPECT_EQ(waitpid(pid, &status, 0), pid);
3445         EXPECT_EQ(true, WIFEXITED(status));
3446         EXPECT_EQ(0, WEXITSTATUS(status));
3447 }
3448
3449 TEST(user_notification_with_tsync)
3450 {
3451         int ret;
3452         unsigned int flags;
3453
3454         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3455         ASSERT_EQ(0, ret) {
3456                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3457         }
3458
3459         /* these were exclusive */
3460         flags = SECCOMP_FILTER_FLAG_NEW_LISTENER |
3461                 SECCOMP_FILTER_FLAG_TSYNC;
3462         ASSERT_EQ(-1, user_notif_syscall(__NR_getppid, flags));
3463         ASSERT_EQ(EINVAL, errno);
3464
3465         /* but now they're not */
3466         flags |= SECCOMP_FILTER_FLAG_TSYNC_ESRCH;
3467         ret = user_notif_syscall(__NR_getppid, flags);
3468         close(ret);
3469         ASSERT_LE(0, ret);
3470 }
3471
3472 TEST(user_notification_kill_in_middle)
3473 {
3474         pid_t pid;
3475         long ret;
3476         int listener;
3477         struct seccomp_notif req = {};
3478         struct seccomp_notif_resp resp = {};
3479
3480         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3481         ASSERT_EQ(0, ret) {
3482                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3483         }
3484
3485         listener = user_notif_syscall(__NR_getppid,
3486                                       SECCOMP_FILTER_FLAG_NEW_LISTENER);
3487         ASSERT_GE(listener, 0);
3488
3489         /*
3490          * Check that nothing bad happens when we kill the task in the middle
3491          * of a syscall.
3492          */
3493         pid = fork();
3494         ASSERT_GE(pid, 0);
3495
3496         if (pid == 0) {
3497                 ret = syscall(__NR_getppid);
3498                 exit(ret != USER_NOTIF_MAGIC);
3499         }
3500
3501         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3502         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id), 0);
3503
3504         EXPECT_EQ(kill(pid, SIGKILL), 0);
3505         EXPECT_EQ(waitpid(pid, NULL, 0), pid);
3506
3507         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id), -1);
3508
3509         resp.id = req.id;
3510         ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp);
3511         EXPECT_EQ(ret, -1);
3512         EXPECT_EQ(errno, ENOENT);
3513 }
3514
/* Descriptor signal_handler() writes to; stays -1 until a test assigns one. */
static int handled = -1;

/*
 * Signal handler: push a single "c" byte into @handled and complain through
 * perror() when the write does not transfer exactly one byte. The signal
 * number itself is not used.
 */
static void signal_handler(int signal)
{
	ssize_t written = write(handled, "c", 1);

	if (written == 1)
		return;

	perror("write from signal");
}
3522
3523 TEST(user_notification_signal)
3524 {
3525         pid_t pid;
3526         long ret;
3527         int status, listener, sk_pair[2];
3528         struct seccomp_notif req = {};
3529         struct seccomp_notif_resp resp = {};
3530         char c;
3531
3532         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3533         ASSERT_EQ(0, ret) {
3534                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3535         }
3536
3537         ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);
3538
3539         listener = user_notif_syscall(__NR_gettid,
3540                                       SECCOMP_FILTER_FLAG_NEW_LISTENER);
3541         ASSERT_GE(listener, 0);
3542
3543         pid = fork();
3544         ASSERT_GE(pid, 0);
3545
3546         if (pid == 0) {
3547                 close(sk_pair[0]);
3548                 handled = sk_pair[1];
3549                 if (signal(SIGUSR1, signal_handler) == SIG_ERR) {
3550                         perror("signal");
3551                         exit(1);
3552                 }
3553                 /*
3554                  * ERESTARTSYS behavior is a bit hard to test, because we need
3555                  * to rely on a signal that has not yet been handled. Let's at
3556                  * least check that the error code gets propagated through, and
3557                  * hope that it doesn't break when there is actually a signal :)
3558                  */
3559                 ret = syscall(__NR_gettid);
3560                 exit(!(ret == -1 && errno == 512));
3561         }
3562
3563         close(sk_pair[1]);
3564
3565         memset(&req, 0, sizeof(req));
3566         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3567
3568         EXPECT_EQ(kill(pid, SIGUSR1), 0);
3569
3570         /*
3571          * Make sure the signal really is delivered, which means we're not
3572          * stuck in the user notification code any more and the notification
3573          * should be dead.
3574          */
3575         EXPECT_EQ(read(sk_pair[0], &c, 1), 1);
3576
3577         resp.id = req.id;
3578         resp.error = -EPERM;
3579         resp.val = 0;
3580
3581         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
3582         EXPECT_EQ(errno, ENOENT);
3583
3584         memset(&req, 0, sizeof(req));
3585         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3586
3587         resp.id = req.id;
3588         resp.error = -512; /* -ERESTARTSYS */
3589         resp.val = 0;
3590
3591         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3592
3593         EXPECT_EQ(waitpid(pid, &status, 0), pid);
3594         EXPECT_EQ(true, WIFEXITED(status));
3595         EXPECT_EQ(0, WEXITSTATUS(status));
3596 }
3597
3598 TEST(user_notification_closed_listener)
3599 {
3600         pid_t pid;
3601         long ret;
3602         int status, listener;
3603
3604         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3605         ASSERT_EQ(0, ret) {
3606                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3607         }
3608
3609         listener = user_notif_syscall(__NR_getppid,
3610                                       SECCOMP_FILTER_FLAG_NEW_LISTENER);
3611         ASSERT_GE(listener, 0);
3612
3613         /*
3614          * Check that we get an ENOSYS when the listener is closed.
3615          */
3616         pid = fork();
3617         ASSERT_GE(pid, 0);
3618         if (pid == 0) {
3619                 close(listener);
3620                 ret = syscall(__NR_getppid);
3621                 exit(ret != -1 && errno != ENOSYS);
3622         }
3623
3624         close(listener);
3625
3626         EXPECT_EQ(waitpid(pid, &status, 0), pid);
3627         EXPECT_EQ(true, WIFEXITED(status));
3628         EXPECT_EQ(0, WEXITSTATUS(status));
3629 }
3630
3631 /*
3632  * Check that a pid in a child namespace still shows up as valid in ours.
3633  */
3634 TEST(user_notification_child_pid_ns)
3635 {
3636         pid_t pid;
3637         int status, listener;
3638         struct seccomp_notif req = {};
3639         struct seccomp_notif_resp resp = {};
3640
3641         ASSERT_EQ(unshare(CLONE_NEWUSER | CLONE_NEWPID), 0) {
3642                 if (errno == EINVAL)
3643                         SKIP(return, "kernel missing CLONE_NEWUSER support");
3644         };
3645
3646         listener = user_notif_syscall(__NR_getppid,
3647                                       SECCOMP_FILTER_FLAG_NEW_LISTENER);
3648         ASSERT_GE(listener, 0);
3649
3650         pid = fork();
3651         ASSERT_GE(pid, 0);
3652
3653         if (pid == 0)
3654                 exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
3655
3656         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3657         EXPECT_EQ(req.pid, pid);
3658
3659         resp.id = req.id;
3660         resp.error = 0;
3661         resp.val = USER_NOTIF_MAGIC;
3662
3663         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3664
3665         EXPECT_EQ(waitpid(pid, &status, 0), pid);
3666         EXPECT_EQ(true, WIFEXITED(status));
3667         EXPECT_EQ(0, WEXITSTATUS(status));
3668         close(listener);
3669 }
3670
3671 /*
3672  * Check that a pid in a sibling (i.e. unrelated) namespace shows up as 0, i.e.
3673  * invalid.
3674  */
3675 TEST(user_notification_sibling_pid_ns)
3676 {
3677         pid_t pid, pid2;
3678         int status, listener;
3679         struct seccomp_notif req = {};
3680         struct seccomp_notif_resp resp = {};
3681
3682         ASSERT_EQ(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0), 0) {
3683                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3684         }
3685
3686         listener = user_notif_syscall(__NR_getppid,
3687                                       SECCOMP_FILTER_FLAG_NEW_LISTENER);
3688         ASSERT_GE(listener, 0);
3689
3690         pid = fork();
3691         ASSERT_GE(pid, 0);
3692
3693         if (pid == 0) {
3694                 ASSERT_EQ(unshare(CLONE_NEWPID), 0);
3695
3696                 pid2 = fork();
3697                 ASSERT_GE(pid2, 0);
3698
3699                 if (pid2 == 0)
3700                         exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
3701
3702                 EXPECT_EQ(waitpid(pid2, &status, 0), pid2);
3703                 EXPECT_EQ(true, WIFEXITED(status));
3704                 EXPECT_EQ(0, WEXITSTATUS(status));
3705                 exit(WEXITSTATUS(status));
3706         }
3707
3708         /* Create the sibling ns, and sibling in it. */
3709         ASSERT_EQ(unshare(CLONE_NEWPID), 0) {
3710                 if (errno == EPERM)
3711                         SKIP(return, "CLONE_NEWPID requires CAP_SYS_ADMIN");
3712         }
3713         ASSERT_EQ(errno, 0);
3714
3715         pid2 = fork();
3716         ASSERT_GE(pid2, 0);
3717
3718         if (pid2 == 0) {
3719                 ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3720                 /*
3721                  * The pid should be 0, i.e. the task is in some namespace that
3722                  * we can't "see".
3723                  */
3724                 EXPECT_EQ(req.pid, 0);
3725
3726                 resp.id = req.id;
3727                 resp.error = 0;
3728                 resp.val = USER_NOTIF_MAGIC;
3729
3730                 ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3731                 exit(0);
3732         }
3733
3734         close(listener);
3735
3736         EXPECT_EQ(waitpid(pid, &status, 0), pid);
3737         EXPECT_EQ(true, WIFEXITED(status));
3738         EXPECT_EQ(0, WEXITSTATUS(status));
3739
3740         EXPECT_EQ(waitpid(pid2, &status, 0), pid2);
3741         EXPECT_EQ(true, WIFEXITED(status));
3742         EXPECT_EQ(0, WEXITSTATUS(status));
3743 }
3744
3745 TEST(user_notification_fault_recv)
3746 {
3747         pid_t pid;
3748         int status, listener;
3749         struct seccomp_notif req = {};
3750         struct seccomp_notif_resp resp = {};
3751
3752         ASSERT_EQ(unshare(CLONE_NEWUSER), 0) {
3753                 if (errno == EINVAL)
3754                         SKIP(return, "kernel missing CLONE_NEWUSER support");
3755         }
3756
3757         listener = user_notif_syscall(__NR_getppid,
3758                                       SECCOMP_FILTER_FLAG_NEW_LISTENER);
3759         ASSERT_GE(listener, 0);
3760
3761         pid = fork();
3762         ASSERT_GE(pid, 0);
3763
3764         if (pid == 0)
3765                 exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
3766
3767         /* Do a bad recv() */
3768         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, NULL), -1);
3769         EXPECT_EQ(errno, EFAULT);
3770
3771         /* We should still be able to receive this notification, though. */
3772         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3773         EXPECT_EQ(req.pid, pid);
3774
3775         resp.id = req.id;
3776         resp.error = 0;
3777         resp.val = USER_NOTIF_MAGIC;
3778
3779         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3780
3781         EXPECT_EQ(waitpid(pid, &status, 0), pid);
3782         EXPECT_EQ(true, WIFEXITED(status));
3783         EXPECT_EQ(0, WEXITSTATUS(status));
3784 }
3785
3786 TEST(seccomp_get_notif_sizes)
3787 {
3788         struct seccomp_notif_sizes sizes;
3789
3790         ASSERT_EQ(seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes), 0);
3791         EXPECT_EQ(sizes.seccomp_notif, sizeof(struct seccomp_notif));
3792         EXPECT_EQ(sizes.seccomp_notif_resp, sizeof(struct seccomp_notif_resp));
3793 }
3794
3795 TEST(user_notification_continue)
3796 {
3797         pid_t pid;
3798         long ret;
3799         int status, listener;
3800         struct seccomp_notif req = {};
3801         struct seccomp_notif_resp resp = {};
3802         struct pollfd pollfd;
3803
3804         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3805         ASSERT_EQ(0, ret) {
3806                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3807         }
3808
3809         listener = user_notif_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER);
3810         ASSERT_GE(listener, 0);
3811
3812         pid = fork();
3813         ASSERT_GE(pid, 0);
3814
3815         if (pid == 0) {
3816                 int dup_fd, pipe_fds[2];
3817                 pid_t self;
3818
3819                 ASSERT_GE(pipe(pipe_fds), 0);
3820
3821                 dup_fd = dup(pipe_fds[0]);
3822                 ASSERT_GE(dup_fd, 0);
3823                 EXPECT_NE(pipe_fds[0], dup_fd);
3824
3825                 self = getpid();
3826                 ASSERT_EQ(filecmp(self, self, pipe_fds[0], dup_fd), 0);
3827                 exit(0);
3828         }
3829
3830         pollfd.fd = listener;
3831         pollfd.events = POLLIN | POLLOUT;
3832
3833         EXPECT_GT(poll(&pollfd, 1, -1), 0);
3834         EXPECT_EQ(pollfd.revents, POLLIN);
3835
3836         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3837
3838         pollfd.fd = listener;
3839         pollfd.events = POLLIN | POLLOUT;
3840
3841         EXPECT_GT(poll(&pollfd, 1, -1), 0);
3842         EXPECT_EQ(pollfd.revents, POLLOUT);
3843
3844         EXPECT_EQ(req.data.nr, __NR_dup);
3845
3846         resp.id = req.id;
3847         resp.flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;
3848
3849         /*
3850          * Verify that setting SECCOMP_USER_NOTIF_FLAG_CONTINUE enforces other
3851          * args be set to 0.
3852          */
3853         resp.error = 0;
3854         resp.val = USER_NOTIF_MAGIC;
3855         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
3856         EXPECT_EQ(errno, EINVAL);
3857
3858         resp.error = USER_NOTIF_MAGIC;
3859         resp.val = 0;
3860         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
3861         EXPECT_EQ(errno, EINVAL);
3862
3863         resp.error = 0;
3864         resp.val = 0;
3865         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0) {
3866                 if (errno == EINVAL)
3867                         SKIP(goto skip, "Kernel does not support SECCOMP_USER_NOTIF_FLAG_CONTINUE");
3868         }
3869
3870 skip:
3871         EXPECT_EQ(waitpid(pid, &status, 0), pid);
3872         EXPECT_EQ(true, WIFEXITED(status));
3873         EXPECT_EQ(0, WEXITSTATUS(status)) {
3874                 if (WEXITSTATUS(status) == 2) {
3875                         SKIP(return, "Kernel does not support kcmp() syscall");
3876                         return;
3877                 }
3878         }
3879 }
3880
3881 TEST(user_notification_filter_empty)
3882 {
3883         pid_t pid;
3884         long ret;
3885         int status;
3886         struct pollfd pollfd;
3887         struct __clone_args args = {
3888                 .flags = CLONE_FILES,
3889                 .exit_signal = SIGCHLD,
3890         };
3891
3892         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3893         ASSERT_EQ(0, ret) {
3894                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3895         }
3896
3897         pid = sys_clone3(&args, sizeof(args));
3898         ASSERT_GE(pid, 0);
3899
3900         if (pid == 0) {
3901                 int listener;
3902
3903                 listener = user_notif_syscall(__NR_mknodat, SECCOMP_FILTER_FLAG_NEW_LISTENER);
3904                 if (listener < 0)
3905                         _exit(EXIT_FAILURE);
3906
3907                 if (dup2(listener, 200) != 200)
3908                         _exit(EXIT_FAILURE);
3909
3910                 close(listener);
3911
3912                 _exit(EXIT_SUCCESS);
3913         }
3914
3915         EXPECT_EQ(waitpid(pid, &status, 0), pid);
3916         EXPECT_EQ(true, WIFEXITED(status));
3917         EXPECT_EQ(0, WEXITSTATUS(status));
3918
3919         /*
3920          * The seccomp filter has become unused so we should be notified once
3921          * the kernel gets around to cleaning up task struct.
3922          */
3923         pollfd.fd = 200;
3924         pollfd.events = POLLHUP;
3925
3926         EXPECT_GT(poll(&pollfd, 1, 2000), 0);
3927         EXPECT_GT((pollfd.revents & POLLHUP) ?: 0, 0);
3928 }
3929
/* Trivial thread entry point: ignores @data and returns NULL right away. */
static void *do_thread(void *data)
{
	void *result = NULL;

	return result;
}
3934
3935 TEST(user_notification_filter_empty_threaded)
3936 {
3937         pid_t pid;
3938         long ret;
3939         int status;
3940         struct pollfd pollfd;
3941         struct __clone_args args = {
3942                 .flags = CLONE_FILES,
3943                 .exit_signal = SIGCHLD,
3944         };
3945
3946         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3947         ASSERT_EQ(0, ret) {
3948                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3949         }
3950
3951         pid = sys_clone3(&args, sizeof(args));
3952         ASSERT_GE(pid, 0);
3953
3954         if (pid == 0) {
3955                 pid_t pid1, pid2;
3956                 int listener, status;
3957                 pthread_t thread;
3958
3959                 listener = user_notif_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER);
3960                 if (listener < 0)
3961                         _exit(EXIT_FAILURE);
3962
3963                 if (dup2(listener, 200) != 200)
3964                         _exit(EXIT_FAILURE);
3965
3966                 close(listener);
3967
3968                 pid1 = fork();
3969                 if (pid1 < 0)
3970                         _exit(EXIT_FAILURE);
3971
3972                 if (pid1 == 0)
3973                         _exit(EXIT_SUCCESS);
3974
3975                 pid2 = fork();
3976                 if (pid2 < 0)
3977                         _exit(EXIT_FAILURE);
3978
3979                 if (pid2 == 0)
3980                         _exit(EXIT_SUCCESS);
3981
3982                 if (pthread_create(&thread, NULL, do_thread, NULL) ||
3983                     pthread_join(thread, NULL))
3984                         _exit(EXIT_FAILURE);
3985
3986                 if (pthread_create(&thread, NULL, do_thread, NULL) ||
3987                     pthread_join(thread, NULL))
3988                         _exit(EXIT_FAILURE);
3989
3990                 if (waitpid(pid1, &status, 0) != pid1 || !WIFEXITED(status) ||
3991                     WEXITSTATUS(status))
3992                         _exit(EXIT_FAILURE);
3993
3994                 if (waitpid(pid2, &status, 0) != pid2 || !WIFEXITED(status) ||
3995                     WEXITSTATUS(status))
3996                         _exit(EXIT_FAILURE);
3997
3998                 exit(EXIT_SUCCESS);
3999         }
4000
4001         EXPECT_EQ(waitpid(pid, &status, 0), pid);
4002         EXPECT_EQ(true, WIFEXITED(status));
4003         EXPECT_EQ(0, WEXITSTATUS(status));
4004
4005         /*
4006          * The seccomp filter has become unused so we should be notified once
4007          * the kernel gets around to cleaning up task struct.
4008          */
4009         pollfd.fd = 200;
4010         pollfd.events = POLLHUP;
4011
4012         EXPECT_GT(poll(&pollfd, 1, 2000), 0);
4013         EXPECT_GT((pollfd.revents & POLLHUP) ?: 0, 0);
4014 }
4015
4016 TEST(user_notification_addfd)
4017 {
4018         pid_t pid;
4019         long ret;
4020         int status, listener, memfd, fd, nextfd;
4021         struct seccomp_notif_addfd addfd = {};
4022         struct seccomp_notif_addfd_small small = {};
4023         struct seccomp_notif_addfd_big big = {};
4024         struct seccomp_notif req = {};
4025         struct seccomp_notif_resp resp = {};
4026         /* 100 ms */
4027         struct timespec delay = { .tv_nsec = 100000000 };
4028
4029         /* There may be arbitrary already-open fds at test start. */
4030         memfd = memfd_create("test", 0);
4031         ASSERT_GE(memfd, 0);
4032         nextfd = memfd + 1;
4033
4034         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
4035         ASSERT_EQ(0, ret) {
4036                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
4037         }
4038
4039         /* fd: 4 */
4040         /* Check that the basic notification machinery works */
4041         listener = user_notif_syscall(__NR_getppid,
4042                                       SECCOMP_FILTER_FLAG_NEW_LISTENER);
4043         ASSERT_EQ(listener, nextfd++);
4044
4045         pid = fork();
4046         ASSERT_GE(pid, 0);
4047
4048         if (pid == 0) {
4049                 /* fds will be added and this value is expected */
4050                 if (syscall(__NR_getppid) != USER_NOTIF_MAGIC)
4051                         exit(1);
4052
4053                 /* Atomic addfd+send is received here. Check it is a valid fd */
4054                 if (fcntl(syscall(__NR_getppid), F_GETFD) == -1)
4055                         exit(1);
4056
4057                 exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
4058         }
4059
4060         ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4061
4062         addfd.srcfd = memfd;
4063         addfd.newfd = 0;
4064         addfd.id = req.id;
4065         addfd.flags = 0x0;
4066
4067         /* Verify bad newfd_flags cannot be set */
4068         addfd.newfd_flags = ~O_CLOEXEC;
4069         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4070         EXPECT_EQ(errno, EINVAL);
4071         addfd.newfd_flags = O_CLOEXEC;
4072
4073         /* Verify bad flags cannot be set */
4074         addfd.flags = 0xff;
4075         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4076         EXPECT_EQ(errno, EINVAL);
4077         addfd.flags = 0;
4078
4079         /* Verify that remote_fd cannot be set without setting flags */
4080         addfd.newfd = 1;
4081         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4082         EXPECT_EQ(errno, EINVAL);
4083         addfd.newfd = 0;
4084
4085         /* Verify small size cannot be set */
4086         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_SMALL, &small), -1);
4087         EXPECT_EQ(errno, EINVAL);
4088
4089         /* Verify we can't send bits filled in unknown buffer area */
4090         memset(&big, 0xAA, sizeof(big));
4091         big.addfd = addfd;
4092         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_BIG, &big), -1);
4093         EXPECT_EQ(errno, E2BIG);
4094
4095
4096         /* Verify we can set an arbitrary remote fd */
4097         fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
4098         EXPECT_EQ(fd, nextfd++);
4099         EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0);
4100
4101         /* Verify we can set an arbitrary remote fd with large size */
4102         memset(&big, 0x0, sizeof(big));
4103         big.addfd = addfd;
4104         fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_BIG, &big);
4105         EXPECT_EQ(fd, nextfd++);
4106
4107         /* Verify we can set a specific remote fd */
4108         addfd.newfd = 42;
4109         addfd.flags = SECCOMP_ADDFD_FLAG_SETFD;
4110         fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
4111         EXPECT_EQ(fd, 42);
4112         EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0);
4113
4114         /* Resume syscall */
4115         resp.id = req.id;
4116         resp.error = 0;
4117         resp.val = USER_NOTIF_MAGIC;
4118         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4119
4120         /*
4121          * This sets the ID of the ADD FD to the last request plus 1. The
4122          * notification ID increments 1 per notification.
4123          */
4124         addfd.id = req.id + 1;
4125
4126         /* This spins until the underlying notification is generated */
4127         while (ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd) != -1 &&
4128                errno != -EINPROGRESS)
4129                 nanosleep(&delay, NULL);
4130
4131         memset(&req, 0, sizeof(req));
4132         ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4133         ASSERT_EQ(addfd.id, req.id);
4134
4135         /* Verify we can do an atomic addfd and send */
4136         addfd.newfd = 0;
4137         addfd.flags = SECCOMP_ADDFD_FLAG_SEND;
4138         fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
4139         /*
4140          * Child has earlier "low" fds and now 42, so we expect the next
4141          * lowest available fd to be assigned here.
4142          */
4143         EXPECT_EQ(fd, nextfd++);
4144         ASSERT_EQ(filecmp(getpid(), pid, memfd, fd), 0);
4145
4146         /*
4147          * This sets the ID of the ADD FD to the last request plus 1. The
4148          * notification ID increments 1 per notification.
4149          */
4150         addfd.id = req.id + 1;
4151
4152         /* This spins until the underlying notification is generated */
4153         while (ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd) != -1 &&
4154                errno != -EINPROGRESS)
4155                 nanosleep(&delay, NULL);
4156
4157         memset(&req, 0, sizeof(req));
4158         ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4159         ASSERT_EQ(addfd.id, req.id);
4160
4161         resp.id = req.id;
4162         resp.error = 0;
4163         resp.val = USER_NOTIF_MAGIC;
4164         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4165
4166         /* Wait for child to finish. */
4167         EXPECT_EQ(waitpid(pid, &status, 0), pid);
4168         EXPECT_EQ(true, WIFEXITED(status));
4169         EXPECT_EQ(0, WEXITSTATUS(status));
4170
4171         close(memfd);
4172 }
4173
4174 TEST(user_notification_addfd_rlimit)
4175 {
4176         pid_t pid;
4177         long ret;
4178         int status, listener, memfd;
4179         struct seccomp_notif_addfd addfd = {};
4180         struct seccomp_notif req = {};
4181         struct seccomp_notif_resp resp = {};
4182         const struct rlimit lim = {
4183                 .rlim_cur       = 0,
4184                 .rlim_max       = 0,
4185         };
4186
4187         memfd = memfd_create("test", 0);
4188         ASSERT_GE(memfd, 0);
4189
4190         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
4191         ASSERT_EQ(0, ret) {
4192                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
4193         }
4194
4195         /* Check that the basic notification machinery works */
4196         listener = user_notif_syscall(__NR_getppid,
4197                                       SECCOMP_FILTER_FLAG_NEW_LISTENER);
4198         ASSERT_GE(listener, 0);
4199
4200         pid = fork();
4201         ASSERT_GE(pid, 0);
4202
4203         if (pid == 0)
4204                 exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
4205
4206
4207         ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4208
4209         ASSERT_EQ(prlimit(pid, RLIMIT_NOFILE, &lim, NULL), 0);
4210
4211         addfd.srcfd = memfd;
4212         addfd.newfd_flags = O_CLOEXEC;
4213         addfd.newfd = 0;
4214         addfd.id = req.id;
4215         addfd.flags = 0;
4216
4217         /* Should probably spot check /proc/sys/fs/file-nr */
4218         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4219         EXPECT_EQ(errno, EMFILE);
4220
4221         addfd.flags = SECCOMP_ADDFD_FLAG_SEND;
4222         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4223         EXPECT_EQ(errno, EMFILE);
4224
4225         addfd.newfd = 100;
4226         addfd.flags = SECCOMP_ADDFD_FLAG_SETFD;
4227         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4228         EXPECT_EQ(errno, EBADF);
4229
4230         resp.id = req.id;
4231         resp.error = 0;
4232         resp.val = USER_NOTIF_MAGIC;
4233
4234         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4235
4236         /* Wait for child to finish. */
4237         EXPECT_EQ(waitpid(pid, &status, 0), pid);
4238         EXPECT_EQ(true, WIFEXITED(status));
4239         EXPECT_EQ(0, WEXITSTATUS(status));
4240
4241         close(memfd);
4242 }
4243
4244 /* Make sure PTRACE_O_SUSPEND_SECCOMP requires CAP_SYS_ADMIN. */
4245 FIXTURE(O_SUSPEND_SECCOMP) {
4246         pid_t pid;
4247 };
4248
4249 FIXTURE_SETUP(O_SUSPEND_SECCOMP)
4250 {
4251         ERRNO_FILTER(block_read, E2BIG);
4252         cap_value_t cap_list[] = { CAP_SYS_ADMIN };
4253         cap_t caps;
4254
4255         self->pid = 0;
4256
4257         /* make sure we don't have CAP_SYS_ADMIN */
4258         caps = cap_get_proc();
4259         ASSERT_NE(NULL, caps);
4260         ASSERT_EQ(0, cap_set_flag(caps, CAP_EFFECTIVE, 1, cap_list, CAP_CLEAR));
4261         ASSERT_EQ(0, cap_set_proc(caps));
4262         cap_free(caps);
4263
4264         ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
4265         ASSERT_EQ(0, prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_block_read));
4266
4267         self->pid = fork();
4268         ASSERT_GE(self->pid, 0);
4269
4270         if (self->pid == 0) {
4271                 while (1)
4272                         pause();
4273                 _exit(127);
4274         }
4275 }
4276
4277 FIXTURE_TEARDOWN(O_SUSPEND_SECCOMP)
4278 {
4279         if (self->pid)
4280                 kill(self->pid, SIGKILL);
4281 }
4282
4283 TEST_F(O_SUSPEND_SECCOMP, setoptions)
4284 {
4285         int wstatus;
4286
4287         ASSERT_EQ(0, ptrace(PTRACE_ATTACH, self->pid, NULL, 0));
4288         ASSERT_EQ(self->pid, wait(&wstatus));
4289         ASSERT_EQ(-1, ptrace(PTRACE_SETOPTIONS, self->pid, NULL, PTRACE_O_SUSPEND_SECCOMP));
4290         if (errno == EINVAL)
4291                 SKIP(return, "Kernel does not support PTRACE_O_SUSPEND_SECCOMP (missing CONFIG_CHECKPOINT_RESTORE?)");
4292         ASSERT_EQ(EPERM, errno);
4293 }
4294
4295 TEST_F(O_SUSPEND_SECCOMP, seize)
4296 {
4297         int ret;
4298
4299         ret = ptrace(PTRACE_SEIZE, self->pid, NULL, PTRACE_O_SUSPEND_SECCOMP);
4300         ASSERT_EQ(-1, ret);
4301         if (errno == EINVAL)
4302                 SKIP(return, "Kernel does not support PTRACE_O_SUSPEND_SECCOMP (missing CONFIG_CHECKPOINT_RESTORE?)");
4303         ASSERT_EQ(EPERM, errno);
4304 }
4305
4306 /*
4307  * get_nth - Get the nth, space separated entry in a file.
4308  *
4309  * Returns the length of the read field.
4310  * Throws error if field is zero-lengthed.
4311  */
4312 static ssize_t get_nth(struct __test_metadata *_metadata, const char *path,
4313                      const unsigned int position, char **entry)
4314 {
4315         char *line = NULL;
4316         unsigned int i;
4317         ssize_t nread;
4318         size_t len = 0;
4319         FILE *f;
4320
4321         f = fopen(path, "r");
4322         ASSERT_NE(f, NULL) {
4323                 TH_LOG("Could not open %s: %s", path, strerror(errno));
4324         }
4325
4326         for (i = 0; i < position; i++) {
4327                 nread = getdelim(&line, &len, ' ', f);
4328                 ASSERT_GE(nread, 0) {
4329                         TH_LOG("Failed to read %d entry in file %s", i, path);
4330                 }
4331         }
4332         fclose(f);
4333
4334         ASSERT_GT(nread, 0) {
4335                 TH_LOG("Entry in file %s had zero length", path);
4336         }
4337
4338         *entry = line;
4339         return nread - 1;
4340 }
4341
/*
 * get_proc_stat - Get the task state (D, R, S, ...) of a given PID.
 *
 * Reads the third space separated field of /proc/<pid>/stat, which must
 * be exactly one character long, and returns that character.
 */
static char get_proc_stat(struct __test_metadata *_metadata, pid_t pid)
{
	char proc_path[100] = {0};
	char *line;
	char status;

	snprintf(proc_path, sizeof(proc_path), "/proc/%d/stat", pid);

	/* The field must be exactly one character long. */
	ASSERT_EQ(get_nth(_metadata, proc_path, 3, &line), 1);

	/* Copy the state out before releasing the buffer from get_nth(). */
	status = line[0];
	free(line);
	return status;
}
4357
4358 TEST(user_notification_fifo)
4359 {
4360         struct seccomp_notif_resp resp = {};
4361         struct seccomp_notif req = {};
4362         int i, status, listener;
4363         pid_t pid, pids[3];
4364         __u64 baseid;
4365         long ret;
4366         /* 100 ms */
4367         struct timespec delay = { .tv_nsec = 100000000 };
4368
4369         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
4370         ASSERT_EQ(0, ret) {
4371                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
4372         }
4373
4374         /* Setup a listener */
4375         listener = user_notif_syscall(__NR_getppid,
4376                                       SECCOMP_FILTER_FLAG_NEW_LISTENER);
4377         ASSERT_GE(listener, 0);
4378
4379         pid = fork();
4380         ASSERT_GE(pid, 0);
4381
4382         if (pid == 0) {
4383                 ret = syscall(__NR_getppid);
4384                 exit(ret != USER_NOTIF_MAGIC);
4385         }
4386
4387         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4388         baseid = req.id + 1;
4389
4390         resp.id = req.id;
4391         resp.error = 0;
4392         resp.val = USER_NOTIF_MAGIC;
4393
4394         /* check that we make sure flags == 0 */
4395         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4396
4397         EXPECT_EQ(waitpid(pid, &status, 0), pid);
4398         EXPECT_EQ(true, WIFEXITED(status));
4399         EXPECT_EQ(0, WEXITSTATUS(status));
4400
4401         /* Start children, and generate notifications */
4402         for (i = 0; i < ARRAY_SIZE(pids); i++) {
4403                 pid = fork();
4404                 if (pid == 0) {
4405                         ret = syscall(__NR_getppid);
4406                         exit(ret != USER_NOTIF_MAGIC);
4407                 }
4408                 pids[i] = pid;
4409         }
4410
4411         /* This spins until all of the children are sleeping */
4412 restart_wait:
4413         for (i = 0; i < ARRAY_SIZE(pids); i++) {
4414                 if (get_proc_stat(_metadata, pids[i]) != 'S') {
4415                         nanosleep(&delay, NULL);
4416                         goto restart_wait;
4417                 }
4418         }
4419
4420         /* Read the notifications in order (and respond) */
4421         for (i = 0; i < ARRAY_SIZE(pids); i++) {
4422                 memset(&req, 0, sizeof(req));
4423                 EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4424                 EXPECT_EQ(req.id, baseid + i);
4425                 resp.id = req.id;
4426                 EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4427         }
4428
4429         /* Make sure notifications were received */
4430         for (i = 0; i < ARRAY_SIZE(pids); i++) {
4431                 EXPECT_EQ(waitpid(pids[i], &status, 0), pids[i]);
4432                 EXPECT_EQ(true, WIFEXITED(status));
4433                 EXPECT_EQ(0, WEXITSTATUS(status));
4434         }
4435 }
4436
/* get_proc_syscall - Get the syscall in progress for a given pid
 *
 * Reads the first space separated field of /proc/<pid>/syscall.
 *
 * Returns the current syscall number for a given process
 * Returns -1 if not in syscall (running or blocked)
 */
static long get_proc_syscall(struct __test_metadata *_metadata, int pid)
{
	char proc_path[100] = {0};
	long ret = -1;
	ssize_t nread;
	char *line;

	snprintf(proc_path, sizeof(proc_path), "/proc/%d/syscall", pid);
	nread = get_nth(_metadata, proc_path, 1, &line);
	ASSERT_GT(nread, 0);

	/*
	 * A task that is not blocked is reported as the literal "running";
	 * keep the -1 default for it. Anything else starts with the syscall
	 * number, which the kernel prints in decimal (-1 when blocked outside
	 * of a syscall).
	 */
	if (strncmp("running", line, MIN(7, nread)) != 0)
		ret = strtol(line, NULL, 10);

	free(line);
	return ret;
}
4459
4460 /* Ensure non-fatal signals prior to receive are unmodified */
4461 TEST(user_notification_wait_killable_pre_notification)
4462 {
4463         struct sigaction new_action = {
4464                 .sa_handler = signal_handler,
4465         };
4466         int listener, status, sk_pair[2];
4467         pid_t pid;
4468         long ret;
4469         char c;
4470         /* 100 ms */
4471         struct timespec delay = { .tv_nsec = 100000000 };
4472
4473         ASSERT_EQ(sigemptyset(&new_action.sa_mask), 0);
4474
4475         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
4476         ASSERT_EQ(0, ret)
4477         {
4478                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
4479         }
4480
4481         ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);
4482
4483         listener = user_notif_syscall(
4484                 __NR_getppid, SECCOMP_FILTER_FLAG_NEW_LISTENER |
4485                                       SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV);
4486         ASSERT_GE(listener, 0);
4487
4488         /*
4489          * Check that we can kill the process with SIGUSR1 prior to receiving
4490          * the notification. SIGUSR1 is wired up to a custom signal handler,
4491          * and make sure it gets called.
4492          */
4493         pid = fork();
4494         ASSERT_GE(pid, 0);
4495
4496         if (pid == 0) {
4497                 close(sk_pair[0]);
4498                 handled = sk_pair[1];
4499
4500                 /* Setup the non-fatal sigaction without SA_RESTART */
4501                 if (sigaction(SIGUSR1, &new_action, NULL)) {
4502                         perror("sigaction");
4503                         exit(1);
4504                 }
4505
4506                 ret = syscall(__NR_getppid);
4507                 /* Make sure we got a return from a signal interruption */
4508                 exit(ret != -1 || errno != EINTR);
4509         }
4510
4511         /*
4512          * Make sure we've gotten to the seccomp user notification wait
4513          * from getppid prior to sending any signals
4514          */
4515         while (get_proc_syscall(_metadata, pid) != __NR_getppid &&
4516                get_proc_stat(_metadata, pid) != 'S')
4517                 nanosleep(&delay, NULL);
4518
4519         /* Send non-fatal kill signal */
4520         EXPECT_EQ(kill(pid, SIGUSR1), 0);
4521
4522         /* wait for process to exit (exit checks for EINTR) */
4523         EXPECT_EQ(waitpid(pid, &status, 0), pid);
4524         EXPECT_EQ(true, WIFEXITED(status));
4525         EXPECT_EQ(0, WEXITSTATUS(status));
4526
4527         EXPECT_EQ(read(sk_pair[0], &c, 1), 1);
4528 }
4529
4530 /* Ensure non-fatal signals after receive are blocked */
4531 TEST(user_notification_wait_killable)
4532 {
4533         struct sigaction new_action = {
4534                 .sa_handler = signal_handler,
4535         };
4536         struct seccomp_notif_resp resp = {};
4537         struct seccomp_notif req = {};
4538         int listener, status, sk_pair[2];
4539         pid_t pid;
4540         long ret;
4541         char c;
4542         /* 100 ms */
4543         struct timespec delay = { .tv_nsec = 100000000 };
4544
4545         ASSERT_EQ(sigemptyset(&new_action.sa_mask), 0);
4546
4547         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
4548         ASSERT_EQ(0, ret)
4549         {
4550                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
4551         }
4552
4553         ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);
4554
4555         listener = user_notif_syscall(
4556                 __NR_getppid, SECCOMP_FILTER_FLAG_NEW_LISTENER |
4557                                       SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV);
4558         ASSERT_GE(listener, 0);
4559
4560         pid = fork();
4561         ASSERT_GE(pid, 0);
4562
4563         if (pid == 0) {
4564                 close(sk_pair[0]);
4565                 handled = sk_pair[1];
4566
4567                 /* Setup the sigaction without SA_RESTART */
4568                 if (sigaction(SIGUSR1, &new_action, NULL)) {
4569                         perror("sigaction");
4570                         exit(1);
4571                 }
4572
4573                 /* Make sure that the syscall is completed (no EINTR) */
4574                 ret = syscall(__NR_getppid);
4575                 exit(ret != USER_NOTIF_MAGIC);
4576         }
4577
4578         /*
4579          * Get the notification, to make move the notifying process into a
4580          * non-preemptible (TASK_KILLABLE) state.
4581          */
4582         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4583         /* Send non-fatal kill signal */
4584         EXPECT_EQ(kill(pid, SIGUSR1), 0);
4585
4586         /*
4587          * Make sure the task enters moves to TASK_KILLABLE by waiting for
4588          * D (Disk Sleep) state after receiving non-fatal signal.
4589          */
4590         while (get_proc_stat(_metadata, pid) != 'D')
4591                 nanosleep(&delay, NULL);
4592
4593         resp.id = req.id;
4594         resp.val = USER_NOTIF_MAGIC;
4595         /* Make sure the notification is found and able to be replied to */
4596         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4597
4598         /*
4599          * Make sure that the signal handler does get called once we're back in
4600          * userspace.
4601          */
4602         EXPECT_EQ(read(sk_pair[0], &c, 1), 1);
4603         /* wait for process to exit (exit checks for USER_NOTIF_MAGIC) */
4604         EXPECT_EQ(waitpid(pid, &status, 0), pid);
4605         EXPECT_EQ(true, WIFEXITED(status));
4606         EXPECT_EQ(0, WEXITSTATUS(status));
4607 }
4608
4609 /* Ensure fatal signals after receive are not blocked */
4610 TEST(user_notification_wait_killable_fatal)
4611 {
4612         struct seccomp_notif req = {};
4613         int listener, status;
4614         pid_t pid;
4615         long ret;
4616         /* 100 ms */
4617         struct timespec delay = { .tv_nsec = 100000000 };
4618
4619         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
4620         ASSERT_EQ(0, ret)
4621         {
4622                 TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
4623         }
4624
4625         listener = user_notif_syscall(
4626                 __NR_getppid, SECCOMP_FILTER_FLAG_NEW_LISTENER |
4627                                       SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV);
4628         ASSERT_GE(listener, 0);
4629
4630         pid = fork();
4631         ASSERT_GE(pid, 0);
4632
4633         if (pid == 0) {
4634                 /* This should never complete as it should get a SIGTERM */
4635                 syscall(__NR_getppid);
4636                 exit(1);
4637         }
4638
4639         while (get_proc_stat(_metadata, pid) != 'S')
4640                 nanosleep(&delay, NULL);
4641
4642         /*
4643          * Get the notification, to make move the notifying process into a
4644          * non-preemptible (TASK_KILLABLE) state.
4645          */
4646         EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4647         /* Kill the process with a fatal signal */
4648         EXPECT_EQ(kill(pid, SIGTERM), 0);
4649
4650         /*
4651          * Wait for the process to exit, and make sure the process terminated
4652          * due to the SIGTERM signal.
4653          */
4654         EXPECT_EQ(waitpid(pid, &status, 0), pid);
4655         EXPECT_EQ(true, WIFSIGNALED(status));
4656         EXPECT_EQ(SIGTERM, WTERMSIG(status));
4657 }
4658
4659 /*
4660  * TODO:
4661  * - expand NNP testing
4662  * - better arch-specific TRACE and TRAP handlers.
4663  * - endianness checking when appropriate
4664  * - 64-bit arg prodding
4665  * - arch value testing (x86 modes especially)
4666  * - verify that FILTER_FLAG_LOG filters generate log messages
4667  * - verify that RET_LOG generates log messages
4668  */
4669
4670 TEST_HARNESS_MAIN