Merge tag 'pidfd-updates-v5.3' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux
author: Linus Torvalds <torvalds@linux-foundation.org>
Thu, 11 Jul 2019 05:17:21 +0000 (22:17 -0700)
committer: Linus Torvalds <torvalds@linux-foundation.org>
Thu, 11 Jul 2019 05:17:21 +0000 (22:17 -0700)
Pull pidfd updates from Christian Brauner:
 "This adds two main features.

   - First, it adds polling support for pidfds. This allows process
     managers to know when a (non-parent) process dies in a race-free
     way.

     The notification mechanism used follows the same logic that is
     currently used when the parent of a task is notified of a child's
     death. With this patchset it is possible to put pidfds in an
     {e}poll loop and get reliable notifications for process (i.e.
     thread-group) exit.

   - The second feature complements the first one by making it possible
     to retrieve pollable pidfds for processes that were not created
     using CLONE_PIDFD.

     A lot of processes get created with traditional PID-based calls
     such as fork() or clone() (without CLONE_PIDFD). For these
     processes a caller currently cannot create a pollable pidfd. This
     is a problem for Android's low memory killer (LMK) and service
     managers such as systemd.

  Both patchsets are accompanied by selftests.

  It's perhaps worth noting that the work done so far and the work done
  in this branch for pidfd_open() and polling support do already see
  some adoption:

   - Android is in the process of backporting this work to all their LTS
     kernels [1]

   - Service managers make use of pidfd_send_signal but will need to
     wait until we enable waiting on pidfds for full adoption.

   - And projects I maintain make use of both pidfd_send_signal and
     CLONE_PIDFD [2] and will use polling support and pidfd_open() too"

[1] https://android-review.googlesource.com/q/topic:%22pidfd+polling+support+4.9+backport%22
    https://android-review.googlesource.com/q/topic:%22pidfd+polling+support+4.14+backport%22
    https://android-review.googlesource.com/q/topic:%22pidfd+polling+support+4.19+backport%22

[2] https://github.com/lxc/lxc/blob/aab6e3eb73c343231cdde775db938994fc6f2803/src/lxc/start.c#L1753

* tag 'pidfd-updates-v5.3' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux:
  tests: add pidfd_open() tests
  arch: wire-up pidfd_open()
  pid: add pidfd_open()
  pidfd: add polling selftests
  pidfd: add polling support

29 files changed:
arch/alpha/kernel/syscalls/syscall.tbl
arch/arm/tools/syscall.tbl
arch/arm64/include/asm/unistd.h
arch/arm64/include/asm/unistd32.h
arch/ia64/kernel/syscalls/syscall.tbl
arch/m68k/kernel/syscalls/syscall.tbl
arch/microblaze/kernel/syscalls/syscall.tbl
arch/mips/kernel/syscalls/syscall_n32.tbl
arch/mips/kernel/syscalls/syscall_n64.tbl
arch/mips/kernel/syscalls/syscall_o32.tbl
arch/parisc/kernel/syscalls/syscall.tbl
arch/powerpc/kernel/syscalls/syscall.tbl
arch/s390/kernel/syscalls/syscall.tbl
arch/sh/kernel/syscalls/syscall.tbl
arch/sparc/kernel/syscalls/syscall.tbl
arch/x86/entry/syscalls/syscall_32.tbl
arch/x86/entry/syscalls/syscall_64.tbl
arch/xtensa/kernel/syscalls/syscall.tbl
include/linux/pid.h
include/linux/syscalls.h
include/uapi/asm-generic/unistd.h
kernel/fork.c
kernel/pid.c
kernel/signal.c
tools/testing/selftests/pidfd/.gitignore
tools/testing/selftests/pidfd/Makefile
tools/testing/selftests/pidfd/pidfd.h [new file with mode: 0644]
tools/testing/selftests/pidfd/pidfd_open_test.c [new file with mode: 0644]
tools/testing/selftests/pidfd/pidfd_test.c

index 9e7704e..1db9bbc 100644 (file)
 541    common  fsconfig                        sys_fsconfig
 542    common  fsmount                         sys_fsmount
 543    common  fspick                          sys_fspick
+544    common  pidfd_open                      sys_pidfd_open
index aaf479a..81e6e18 100644 (file)
 431    common  fsconfig                        sys_fsconfig
 432    common  fsmount                         sys_fsmount
 433    common  fspick                          sys_fspick
+434    common  pidfd_open                      sys_pidfd_open
index 2a23614..ede7b88 100644 (file)
@@ -38,7 +38,7 @@
 #define __ARM_NR_compat_set_tls                (__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END            (__ARM_NR_COMPAT_BASE + 0x800)
 
-#define __NR_compat_syscalls           434
+#define __NR_compat_syscalls           435
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
index aa99592..5241592 100644 (file)
@@ -875,6 +875,8 @@ __SYSCALL(__NR_fsconfig, sys_fsconfig)
 __SYSCALL(__NR_fsmount, sys_fsmount)
 #define __NR_fspick 433
 __SYSCALL(__NR_fspick, sys_fspick)
+#define __NR_pidfd_open 434
+__SYSCALL(__NR_pidfd_open, sys_pidfd_open)
 
 /*
  * Please add new compat syscalls above this comment and update
index e01df3f..ecc4492 100644 (file)
 431    common  fsconfig                        sys_fsconfig
 432    common  fsmount                         sys_fsmount
 433    common  fspick                          sys_fspick
+434    common  pidfd_open                      sys_pidfd_open
index 7e3d073..9a3eb25 100644 (file)
 431    common  fsconfig                        sys_fsconfig
 432    common  fsmount                         sys_fsmount
 433    common  fspick                          sys_fspick
+434    common  pidfd_open                      sys_pidfd_open
index 26339e4..ad706f8 100644 (file)
 431    common  fsconfig                        sys_fsconfig
 432    common  fsmount                         sys_fsmount
 433    common  fspick                          sys_fspick
+434    common  pidfd_open                      sys_pidfd_open
index 0e2dd68..97035e1 100644 (file)
 431    n32     fsconfig                        sys_fsconfig
 432    n32     fsmount                         sys_fsmount
 433    n32     fspick                          sys_fspick
+434    n32     pidfd_open                      sys_pidfd_open
index 5eebfa0..d729272 100644 (file)
 431    n64     fsconfig                        sys_fsconfig
 432    n64     fsmount                         sys_fsmount
 433    n64     fspick                          sys_fspick
+434    n64     pidfd_open                      sys_pidfd_open
index 3cc1374..dba084c 100644 (file)
 431    o32     fsconfig                        sys_fsconfig
 432    o32     fsmount                         sys_fsmount
 433    o32     fspick                          sys_fspick
+434    o32     pidfd_open                      sys_pidfd_open
index c9e377d..5022b9e 100644 (file)
 431    common  fsconfig                        sys_fsconfig
 432    common  fsmount                         sys_fsmount
 433    common  fspick                          sys_fspick
+434    common  pidfd_open                      sys_pidfd_open
index 103655d..f2c3bda 100644 (file)
 431    common  fsconfig                        sys_fsconfig
 432    common  fsmount                         sys_fsmount
 433    common  fspick                          sys_fspick
+434    common  pidfd_open                      sys_pidfd_open
index e822b29..6ebacfe 100644 (file)
 431  common    fsconfig                sys_fsconfig                    sys_fsconfig
 432  common    fsmount                 sys_fsmount                     sys_fsmount
 433  common    fspick                  sys_fspick                      sys_fspick
+434  common    pidfd_open              sys_pidfd_open                  sys_pidfd_open
index 016a727..834c9c7 100644 (file)
 431    common  fsconfig                        sys_fsconfig
 432    common  fsmount                         sys_fsmount
 433    common  fspick                          sys_fspick
+434    common  pidfd_open                      sys_pidfd_open
index e047480..c58e71f 100644 (file)
 431    common  fsconfig                        sys_fsconfig
 432    common  fsmount                         sys_fsmount
 433    common  fspick                          sys_fspick
+434    common  pidfd_open                      sys_pidfd_open
index ad968b7..43e4429 100644 (file)
 431    i386    fsconfig                sys_fsconfig                    __ia32_sys_fsconfig
 432    i386    fsmount                 sys_fsmount                     __ia32_sys_fsmount
 433    i386    fspick                  sys_fspick                      __ia32_sys_fspick
+434    i386    pidfd_open              sys_pidfd_open                  __ia32_sys_pidfd_open
index b4e6f9e..1bee0a7 100644 (file)
 431    common  fsconfig                __x64_sys_fsconfig
 432    common  fsmount                 __x64_sys_fsmount
 433    common  fspick                  __x64_sys_fspick
+434    common  pidfd_open              __x64_sys_pidfd_open
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
index 5fa0ee1..782b819 100644 (file)
 431    common  fsconfig                        sys_fsconfig
 432    common  fsmount                         sys_fsmount
 433    common  fspick                          sys_fspick
+434    common  pidfd_open                      sys_pidfd_open
index 3c8ef5a..1484db6 100644 (file)
@@ -3,6 +3,7 @@
 #define _LINUX_PID_H
 
 #include <linux/rculist.h>
+#include <linux/wait.h>
 
 enum pid_type
 {
@@ -60,6 +61,8 @@ struct pid
        unsigned int level;
        /* lists of tasks that use this pid */
        struct hlist_head tasks[PIDTYPE_MAX];
+       /* wait queue for pidfd notifications */
+       wait_queue_head_t wait_pidfd;
        struct rcu_head rcu;
        struct upid numbers[1];
 };
index bc4bbbb..699aed6 100644 (file)
@@ -927,6 +927,7 @@ asmlinkage long sys_clock_adjtime32(clockid_t which_clock,
                                struct old_timex32 __user *tx);
 asmlinkage long sys_syncfs(int fd);
 asmlinkage long sys_setns(int fd, int nstype);
+asmlinkage long sys_pidfd_open(pid_t pid, unsigned int flags);
 asmlinkage long sys_sendmmsg(int fd, struct mmsghdr __user *msg,
                             unsigned int vlen, unsigned flags);
 asmlinkage long sys_process_vm_readv(pid_t pid,
index a87904d..e5684a4 100644 (file)
@@ -844,9 +844,11 @@ __SYSCALL(__NR_fsconfig, sys_fsconfig)
 __SYSCALL(__NR_fsmount, sys_fsmount)
 #define __NR_fspick 433
 __SYSCALL(__NR_fspick, sys_fspick)
+#define __NR_pidfd_open 434
+__SYSCALL(__NR_pidfd_open, sys_pidfd_open)
 
 #undef __NR_syscalls
-#define __NR_syscalls 434
+#define __NR_syscalls 435
 
 /*
  * 32 bit systems traditionally used different
index 847dd14..187c02c 100644 (file)
@@ -1711,8 +1711,34 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
 }
 #endif
 
+/*
+ * Poll support for process exit notification.
+ */
+static unsigned int pidfd_poll(struct file *file, struct poll_table_struct *pts)
+{
+       struct task_struct *task;
+       struct pid *pid = file->private_data;
+       int poll_flags = 0;
+
+       poll_wait(file, &pid->wait_pidfd, pts);
+
+       rcu_read_lock();
+       task = pid_task(pid, PIDTYPE_PID);
+       /*
+        * Inform pollers only when the whole thread group exits.
+        * If the thread group leader exits before all other threads in the
+        * group, then poll(2) should block, similar to the wait(2) family.
+        */
+       if (!task || (task->exit_state && thread_group_empty(task)))
+               poll_flags = POLLIN | POLLRDNORM;
+       rcu_read_unlock();
+
+       return poll_flags;
+}
+
 const struct file_operations pidfd_fops = {
        .release = pidfd_release,
+       .poll = pidfd_poll,
 #ifdef CONFIG_PROC_FS
        .show_fdinfo = pidfd_show_fdinfo,
 #endif
index e5cad0c..16263b5 100644 (file)
@@ -38,6 +38,8 @@
 #include <linux/syscalls.h>
 #include <linux/proc_ns.h>
 #include <linux/proc_fs.h>
+#include <linux/anon_inodes.h>
+#include <linux/sched/signal.h>
 #include <linux/sched/task.h>
 #include <linux/idr.h>
 
@@ -214,6 +216,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)
        for (type = 0; type < PIDTYPE_MAX; ++type)
                INIT_HLIST_HEAD(&pid->tasks[type]);
 
+       init_waitqueue_head(&pid->wait_pidfd);
+
        upid = pid->numbers + ns->level;
        spin_lock_irq(&pidmap_lock);
        if (!(ns->pid_allocated & PIDNS_ADDING))
@@ -451,6 +455,73 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
        return idr_get_next(&ns->idr, &nr);
 }
 
+/**
+ * pidfd_create() - Create a new pid file descriptor.
+ *
+ * @pid:  struct pid that the pidfd will reference
+ *
+ * This creates a new pid file descriptor with the O_CLOEXEC flag set.
+ *
+ * Note, that this function can only be called after the fd table has
+ * been unshared to avoid leaking the pidfd to the new process.
+ *
+ * Return: On success, a cloexec pidfd is returned.
+ *         On error, a negative errno number will be returned.
+ */
+static int pidfd_create(struct pid *pid)
+{
+       int fd;
+
+       fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
+                             O_RDWR | O_CLOEXEC);
+       if (fd < 0)
+               put_pid(pid);
+
+       return fd;
+}
+
+/**
+ * pidfd_open() - Open new pid file descriptor.
+ *
+ * @pid:   pid for which to retrieve a pidfd
+ * @flags: flags to pass
+ *
+ * This creates a new pid file descriptor with the O_CLOEXEC flag set for
+ * the process identified by @pid. Currently, the process identified by
+ * @pid must be a thread-group leader. This restriction currently exists
+ * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot
+ * be used with CLONE_THREAD) and pidfd polling (only supports thread group
+ * leaders).
+ *
+ * Return: On success, a cloexec pidfd is returned.
+ *         On error, a negative errno number will be returned.
+ */
+SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
+{
+       int fd, ret;
+       struct pid *p;
+
+       if (flags)
+               return -EINVAL;
+
+       if (pid <= 0)
+               return -EINVAL;
+
+       p = find_get_pid(pid);
+       if (!p)
+               return -ESRCH;
+
+       ret = 0;
+       rcu_read_lock();
+       if (!pid_task(p, PIDTYPE_TGID))
+               ret = -EINVAL;
+       rcu_read_unlock();
+
+       fd = ret ?: pidfd_create(p);
+       put_pid(p);
+       return fd;
+}
+
 void __init pid_idr_init(void)
 {
        /* Verify no one has done anything silly: */
index 91cb8ca..dabe100 100644 (file)
@@ -1881,6 +1881,14 @@ ret:
        return ret;
 }
 
+static void do_notify_pidfd(struct task_struct *task)
+{
+       struct pid *pid;
+
+       pid = task_pid(task);
+       wake_up_all(&pid->wait_pidfd);
+}
+
 /*
  * Let a parent know about the death of a child.
  * For a stopped/continued status change, use do_notify_parent_cldstop instead.
@@ -1904,6 +1912,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
        BUG_ON(!tsk->ptrace &&
               (tsk->group_leader != tsk || !thread_group_empty(tsk)));
 
+       /* Wake up all pidfd waiters */
+       do_notify_pidfd(tsk);
+
        if (sig != SIGCHLD) {
                /*
                 * This is only possible if parent == real_parent.
index 443fedb..720b2d8 100644 (file)
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
-CFLAGS += -g -I../../../../usr/include/
+CFLAGS += -g -I../../../../usr/include/ -lpthread
 
-TEST_GEN_PROGS := pidfd_test
+TEST_GEN_PROGS := pidfd_test pidfd_open_test
 
 include ../lib.mk
 
diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h
new file mode 100644 (file)
index 0000000..8452e91
--- /dev/null
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __PIDFD_H
+#define __PIDFD_H
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/mount.h>
+
+#include "../kselftest.h"
+
+/*
+ * The kernel reserves 300 pids via RESERVED_PIDS in kernel/pid.c
+ * That means, when it wraps around any pid < 300 will be skipped.
+ * So we need to use a pid > 300 in order to test recycling.
+ */
+#define PID_RECYCLE 1000
+
+/*
+ * Define a few custom error codes for the child process to clearly indicate
+ * what is happening. This way we can tell the difference between a system
+ * error, a test error, etc.
+ */
+#define PIDFD_PASS 0
+#define PIDFD_FAIL 1
+#define PIDFD_ERROR 2
+#define PIDFD_SKIP 3
+#define PIDFD_XFAIL 4
+
+int wait_for_pid(pid_t pid)
+{
+       int status, ret;
+
+again:
+       ret = waitpid(pid, &status, 0);
+       if (ret == -1) {
+               if (errno == EINTR)
+                       goto again;
+
+               return -1;
+       }
+
+       if (!WIFEXITED(status))
+               return -1;
+
+       return WEXITSTATUS(status);
+}
+
+
+#endif /* __PIDFD_H */
diff --git a/tools/testing/selftests/pidfd/pidfd_open_test.c b/tools/testing/selftests/pidfd/pidfd_open_test.c
new file mode 100644 (file)
index 0000000..0377133
--- /dev/null
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/mount.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "pidfd.h"
+#include "../kselftest.h"
+
+static inline int sys_pidfd_open(pid_t pid, unsigned int flags)
+{
+       return syscall(__NR_pidfd_open, pid, flags);
+}
+
+static int safe_int(const char *numstr, int *converted)
+{
+       char *err = NULL;
+       long sli;
+
+       errno = 0;
+       sli = strtol(numstr, &err, 0);
+       if (errno == ERANGE && (sli == LONG_MAX || sli == LONG_MIN))
+               return -ERANGE;
+
+       if (errno != 0 && sli == 0)
+               return -EINVAL;
+
+       if (err == numstr || *err != '\0')
+               return -EINVAL;
+
+       if (sli > INT_MAX || sli < INT_MIN)
+               return -ERANGE;
+
+       *converted = (int)sli;
+       return 0;
+}
+
+static int char_left_gc(const char *buffer, size_t len)
+{
+       size_t i;
+
+       for (i = 0; i < len; i++) {
+               if (buffer[i] == ' ' ||
+                   buffer[i] == '\t')
+                       continue;
+
+               return i;
+       }
+
+       return 0;
+}
+
+static int char_right_gc(const char *buffer, size_t len)
+{
+       int i;
+
+       for (i = len - 1; i >= 0; i--) {
+               if (buffer[i] == ' '  ||
+                   buffer[i] == '\t' ||
+                   buffer[i] == '\n' ||
+                   buffer[i] == '\0')
+                       continue;
+
+               return i + 1;
+       }
+
+       return 0;
+}
+
+static char *trim_whitespace_in_place(char *buffer)
+{
+       buffer += char_left_gc(buffer, strlen(buffer));
+       buffer[char_right_gc(buffer, strlen(buffer))] = '\0';
+       return buffer;
+}
+
+static pid_t get_pid_from_fdinfo_file(int pidfd, const char *key, size_t keylen)
+{
+       int ret;
+       char path[512];
+       FILE *f;
+       size_t n = 0;
+       pid_t result = -1;
+       char *line = NULL;
+
+       snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", pidfd);
+
+       f = fopen(path, "re");
+       if (!f)
+               return -1;
+
+       while (getline(&line, &n, f) != -1) {
+               char *numstr;
+
+               if (strncmp(line, key, keylen))
+                       continue;
+
+               numstr = trim_whitespace_in_place(line + 4);
+               ret = safe_int(numstr, &result);
+               if (ret < 0)
+                       goto out;
+
+               break;
+       }
+
+out:
+       free(line);
+       fclose(f);
+       return result;
+}
+
+int main(int argc, char **argv)
+{
+       int pidfd = -1, ret = 1;
+       pid_t pid;
+
+       ksft_set_plan(3);
+
+       pidfd = sys_pidfd_open(-1, 0);
+       if (pidfd >= 0) {
+               ksft_print_msg(
+                       "%s - succeeded to open pidfd for invalid pid -1\n",
+                       strerror(errno));
+               goto on_error;
+       }
+       ksft_test_result_pass("do not allow invalid pid test: passed\n");
+
+       pidfd = sys_pidfd_open(getpid(), 1);
+       if (pidfd >= 0) {
+               ksft_print_msg(
+                       "%s - succeeded to open pidfd with invalid flag value specified\n",
+                       strerror(errno));
+               goto on_error;
+       }
+       ksft_test_result_pass("do not allow invalid flag test: passed\n");
+
+       pidfd = sys_pidfd_open(getpid(), 0);
+       if (pidfd < 0) {
+               ksft_print_msg("%s - failed to open pidfd\n", strerror(errno));
+               goto on_error;
+       }
+       ksft_test_result_pass("open a new pidfd test: passed\n");
+
+       pid = get_pid_from_fdinfo_file(pidfd, "Pid:", sizeof("Pid:") - 1);
+       ksft_print_msg("pidfd %d refers to process with pid %d\n", pidfd, pid);
+
+       ret = 0;
+
+on_error:
+       if (pidfd >= 0)
+               close(pidfd);
+
+       return !ret ? ksft_exit_pass() : ksft_exit_fail();
+}
index 104c75a..7eaa8a3 100644 (file)
@@ -4,22 +4,49 @@
 #include <errno.h>
 #include <fcntl.h>
 #include <linux/types.h>
+#include <pthread.h>
 #include <sched.h>
 #include <signal.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <syscall.h>
+#include <sys/epoll.h>
+#include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/wait.h>
+#include <time.h>
 #include <unistd.h>
 
+#include "pidfd.h"
 #include "../kselftest.h"
 
 #ifndef __NR_pidfd_send_signal
 #define __NR_pidfd_send_signal -1
 #endif
 
+#define str(s) _str(s)
+#define _str(s) #s
+#define CHILD_THREAD_MIN_WAIT 3 /* seconds */
+
+#define MAX_EVENTS 5
+
+#ifndef CLONE_PIDFD
+#define CLONE_PIDFD 0x00001000
+#endif
+
+static pid_t pidfd_clone(int flags, int *pidfd, int (*fn)(void *))
+{
+       size_t stack_size = 1024;
+       char *stack[1024] = { 0 };
+
+#ifdef __ia64__
+       return __clone2(fn, stack, stack_size, flags | SIGCHLD, NULL, pidfd);
+#else
+       return clone(fn, stack + stack_size, flags | SIGCHLD, NULL, pidfd);
+#endif
+}
+
 static inline int sys_pidfd_send_signal(int pidfd, int sig, siginfo_t *info,
                                        unsigned int flags)
 {
@@ -66,28 +93,6 @@ static int test_pidfd_send_signal_simple_success(void)
        return 0;
 }
 
-static int wait_for_pid(pid_t pid)
-{
-       int status, ret;
-
-again:
-       ret = waitpid(pid, &status, 0);
-       if (ret == -1) {
-               if (errno == EINTR)
-                       goto again;
-
-               return -1;
-       }
-
-       if (ret != pid)
-               goto again;
-
-       if (!WIFEXITED(status))
-               return -1;
-
-       return WEXITSTATUS(status);
-}
-
 static int test_pidfd_send_signal_exited_fail(void)
 {
        int pidfd, ret, saved_errno;
@@ -133,13 +138,6 @@ static int test_pidfd_send_signal_exited_fail(void)
 }
 
 /*
- * The kernel reserves 300 pids via RESERVED_PIDS in kernel/pid.c
- * That means, when it wraps around any pid < 300 will be skipped.
- * So we need to use a pid > 300 in order to test recycling.
- */
-#define PID_RECYCLE 1000
-
-/*
  * Maximum number of cycles we allow. This is equivalent to PID_MAX_DEFAULT.
  * If users set a higher limit or we have cycled PIDFD_MAX_DEFAULT number of
  * times then we skip the test to not go into an infinite loop or block for a
@@ -147,17 +145,6 @@ static int test_pidfd_send_signal_exited_fail(void)
  */
 #define PIDFD_MAX_DEFAULT 0x8000
 
-/*
- * Define a few custom error codes for the child process to clearly indicate
- * what is happening. This way we can tell the difference between a system
- * error, a test error, etc.
- */
-#define PIDFD_PASS 0
-#define PIDFD_FAIL 1
-#define PIDFD_ERROR 2
-#define PIDFD_SKIP 3
-#define PIDFD_XFAIL 4
-
 static int test_pidfd_send_signal_recycled_pid_fail(void)
 {
        int i, ret;
@@ -372,11 +359,192 @@ static int test_pidfd_send_signal_syscall_support(void)
        return 0;
 }
 
+static void *test_pidfd_poll_exec_thread(void *priv)
+{
+       ksft_print_msg("Child Thread: starting. pid %d tid %d ; and sleeping\n",
+                       getpid(), syscall(SYS_gettid));
+       ksft_print_msg("Child Thread: doing exec of sleep\n");
+
+       execl("/bin/sleep", "sleep", str(CHILD_THREAD_MIN_WAIT), (char *)NULL);
+
+       ksft_print_msg("Child Thread: DONE. pid %d tid %d\n",
+                       getpid(), syscall(SYS_gettid));
+       return NULL;
+}
+
+static void poll_pidfd(const char *test_name, int pidfd)
+{
+       int c;
+       int epoll_fd = epoll_create1(EPOLL_CLOEXEC);
+       struct epoll_event event, events[MAX_EVENTS];
+
+       if (epoll_fd == -1)
+               ksft_exit_fail_msg("%s test: Failed to create epoll file descriptor "
+                                  "(errno %d)\n",
+                                  test_name, errno);
+
+       event.events = EPOLLIN;
+       event.data.fd = pidfd;
+
+       if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, pidfd, &event)) {
+               ksft_exit_fail_msg("%s test: Failed to add epoll file descriptor "
+                                  "(errno %d)\n",
+                                  test_name, errno);
+       }
+
+       c = epoll_wait(epoll_fd, events, MAX_EVENTS, 5000);
+       if (c != 1 || !(events[0].events & EPOLLIN))
+               ksft_exit_fail_msg("%s test: Unexpected epoll_wait result (c=%d, events=%x) ",
+                                  "(errno %d)\n",
+                                  test_name, c, events[0].events, errno);
+
+       close(epoll_fd);
+       return;
+
+}
+
+static int child_poll_exec_test(void *args)
+{
+       pthread_t t1;
+
+       ksft_print_msg("Child (pidfd): starting. pid %d tid %d\n", getpid(),
+                       syscall(SYS_gettid));
+       pthread_create(&t1, NULL, test_pidfd_poll_exec_thread, NULL);
+       /*
+        * Exec in the non-leader thread will destroy the leader immediately.
+        * If the wait in the parent returns too soon, the test fails.
+        */
+       while (1)
+               sleep(1);
+}
+
+static void test_pidfd_poll_exec(int use_waitpid)
+{
+       int pid, pidfd = 0;
+       int status, ret;
+       pthread_t t1;
+       time_t prog_start = time(NULL);
+       const char *test_name = "pidfd_poll check for premature notification on child thread exec";
+
+       ksft_print_msg("Parent: pid: %d\n", getpid());
+       pid = pidfd_clone(CLONE_PIDFD, &pidfd, child_poll_exec_test);
+       if (pid < 0)
+               ksft_exit_fail_msg("%s test: pidfd_clone failed (ret %d, errno %d)\n",
+                                  test_name, pid, errno);
+
+       ksft_print_msg("Parent: Waiting for Child (%d) to complete.\n", pid);
+
+       if (use_waitpid) {
+               ret = waitpid(pid, &status, 0);
+               if (ret == -1)
+                       ksft_print_msg("Parent: error\n");
+
+               if (ret == pid)
+                       ksft_print_msg("Parent: Child process waited for.\n");
+       } else {
+               poll_pidfd(test_name, pidfd);
+       }
+
+       time_t prog_time = time(NULL) - prog_start;
+
+       ksft_print_msg("Time waited for child: %lu\n", prog_time);
+
+       close(pidfd);
+
+       if (prog_time < CHILD_THREAD_MIN_WAIT || prog_time > CHILD_THREAD_MIN_WAIT + 2)
+               ksft_exit_fail_msg("%s test: Failed\n", test_name);
+       else
+               ksft_test_result_pass("%s test: Passed\n", test_name);
+}
+
+static void *test_pidfd_poll_leader_exit_thread(void *priv)
+{
+       ksft_print_msg("Child Thread: starting. pid %d tid %d ; and sleeping\n",
+                       getpid(), syscall(SYS_gettid));
+       sleep(CHILD_THREAD_MIN_WAIT);
+       ksft_print_msg("Child Thread: DONE. pid %d tid %d\n", getpid(), syscall(SYS_gettid));
+       return NULL;
+}
+
+static time_t *child_exit_secs;
+static int child_poll_leader_exit_test(void *args)
+{
+       pthread_t t1, t2;
+
+       ksft_print_msg("Child: starting. pid %d tid %d\n", getpid(), syscall(SYS_gettid));
+       pthread_create(&t1, NULL, test_pidfd_poll_leader_exit_thread, NULL);
+       pthread_create(&t2, NULL, test_pidfd_poll_leader_exit_thread, NULL);
+
+       /*
+        * glibc exit calls exit_group syscall, so explicitly call exit only
+        * so that only the group leader exits, leaving the threads alone.
+        */
+       *child_exit_secs = time(NULL);
+       syscall(SYS_exit, 0);
+}
+
+static void test_pidfd_poll_leader_exit(int use_waitpid)
+{
+       int pid, pidfd = 0;
+       int status, ret;
+       time_t prog_start = time(NULL);
+       const char *test_name = "pidfd_poll check for premature notification on non-empty"
+                               "group leader exit";
+
+       child_exit_secs = mmap(NULL, sizeof *child_exit_secs, PROT_READ | PROT_WRITE,
+                       MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+
+       if (child_exit_secs == MAP_FAILED)
+               ksft_exit_fail_msg("%s test: mmap failed (errno %d)\n",
+                                  test_name, errno);
+
+       ksft_print_msg("Parent: pid: %d\n", getpid());
+       pid = pidfd_clone(CLONE_PIDFD, &pidfd, child_poll_leader_exit_test);
+       if (pid < 0)
+               ksft_exit_fail_msg("%s test: pidfd_clone failed (ret %d, errno %d)\n",
+                                  test_name, pid, errno);
+
+       ksft_print_msg("Parent: Waiting for Child (%d) to complete.\n", pid);
+
+       if (use_waitpid) {
+               ret = waitpid(pid, &status, 0);
+               if (ret == -1)
+                       ksft_print_msg("Parent: error\n");
+       } else {
+               /*
+                * This sleep tests for the case where if the child exits, and is in
+                * EXIT_ZOMBIE, but the thread group leader is non-empty, then the poll
+                * doesn't prematurely return even though there are active threads
+                */
+               sleep(1);
+               poll_pidfd(test_name, pidfd);
+       }
+
+       if (ret == pid)
+               ksft_print_msg("Parent: Child process waited for.\n");
+
+       time_t since_child_exit = time(NULL) - *child_exit_secs;
+
+       ksft_print_msg("Time since child exit: %lu\n", since_child_exit);
+
+       close(pidfd);
+
+       if (since_child_exit < CHILD_THREAD_MIN_WAIT ||
+                       since_child_exit > CHILD_THREAD_MIN_WAIT + 2)
+               ksft_exit_fail_msg("%s test: Failed\n", test_name);
+       else
+               ksft_test_result_pass("%s test: Passed\n", test_name);
+}
+
 int main(int argc, char **argv)
 {
        ksft_print_header();
        ksft_set_plan(4);
 
+       test_pidfd_poll_exec(0);
+       test_pidfd_poll_exec(1);
+       test_pidfd_poll_leader_exit(0);
+       test_pidfd_poll_leader_exit(1);
        test_pidfd_send_signal_syscall_support();
        test_pidfd_send_signal_simple_success();
        test_pidfd_send_signal_exited_fail();