Merge branch 'proc-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm...

author Linus Torvalds <torvalds@linux-foundation.org>

Thu, 4 Jun 2020 20:54:34 +0000 (13:54 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Thu, 4 Jun 2020 20:54:34 +0000 (13:54 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 4 Jun 2020 20:54:34 +0000 (13:54 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 4 Jun 2020 20:54:34 +0000 (13:54 -0700)
diff --combined Documentation/filesystems/proc.rst

index 430963e,e2ecf24..996f3cf
--- 1/Documentation/filesystems/proc.rst
--- 2/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@@ -51,6 -51,8 +51,8 @@@ fixes/update part 1.1  Stefani Seibold 
     4   Configuring procfs
     4.1 Mount options
   
+   5   Filesystem behavior
+ 
   Preface
   =======
   
@@@ -543,7 -545,6 +545,7 @@@ encoded manner. The codes are the follo
       hg    huge page advise flag
       nh    no huge page advise flag
       mg    mergable advise flag
+ +    bt  - arm64 BTI guarded page
       ==    =======================================
   
   Note that there is no guarantee that every flag and associated mnemonic will
@@@ -1043,8 -1044,8 +1045,8 @@@ PageTable
                 amount of memory dedicated to the lowest level of page
                 tables.
   NFS_Unstable
- -              NFS pages sent to the server, but not yet committed to stable
- -            storage
+ +              Always zero. Previously counted pages which had been written to
+ +              the server, but had not been committed to stable storage.
   Bounce
                 Memory used for block device "bounce buffers"
   WritebackTmp
@@@ -1871,7 -1872,7 +1873,7 @@@ unbindable        mount is unbindabl
   
   For more information on mount propagation see:
   
- -  Documentation/filesystems/sharedsubtree.txt
+ +  Documentation/filesystems/sharedsubtree.rst
   
   
   3.6   /proc/<pid>/comm  & /proc/<pid>/task/<tid>/comm
@@@ -2143,28 -2144,80 +2145,80 @@@ The following mount options are support
         =========       ========================================================
         hidepid=        Set /proc/<pid>/ access mode.
         gid=            Set the group authorized to learn processes information.
+       subset=         Show only the specified subset of procfs.
         =========       ========================================================
   
- hidepid=0 means classic mode - everybody may access all /proc/<pid>/ directories
- (default).
- 
- hidepid=1 means users may not access any /proc/<pid>/ directories but their
- own.  Sensitive files like cmdline, sched*, status are now protected against
- other users.  This makes it impossible to learn whether any user runs
- specific program (given the program doesn't reveal itself by its behaviour).
- As an additional bonus, as /proc/<pid>/cmdline is unaccessible for other users,
- poorly written programs passing sensitive information via program arguments are
- now protected against local eavesdroppers.
- 
- hidepid=2 means hidepid=1 plus all /proc/<pid>/ will be fully invisible to other
- users.  It doesn't mean that it hides a fact whether a process with a specific
- pid value exists (it can be learned by other means, e.g. by "kill -0 $PID"),
- but it hides process' uid and gid, which may be learned by stat()'ing
- /proc/<pid>/ otherwise.  It greatly complicates an intruder's task of gathering
- information about running processes, whether some daemon runs with elevated
- privileges, whether other user runs some sensitive program, whether other users
- run any program at all, etc.
+ hidepid=off or hidepid=0 means classic mode - everybody may access all
+ /proc/<pid>/ directories (default).
+ 
+ hidepid=noaccess or hidepid=1 means users may not access any /proc/<pid>/
+ directories but their own.  Sensitive files like cmdline, sched*, status are now
+ protected against other users.  This makes it impossible to learn whether any
+ user runs specific program (given the program doesn't reveal itself by its
+ behaviour).  As an additional bonus, as /proc/<pid>/cmdline is unaccessible for
+ other users, poorly written programs passing sensitive information via program
+ arguments are now protected against local eavesdroppers.
+ 
+ hidepid=invisible or hidepid=2 means hidepid=1 plus all /proc/<pid>/ will be
+ fully invisible to other users.  It doesn't mean that it hides a fact whether a
+ process with a specific pid value exists (it can be learned by other means, e.g.
+ by "kill -0 $PID"), but it hides process' uid and gid, which may be learned by
+ stat()'ing /proc/<pid>/ otherwise.  It greatly complicates an intruder's task of
+ gathering information about running processes, whether some daemon runs with
+ elevated privileges, whether other user runs some sensitive program, whether
+ other users run any program at all, etc.
+ 
+ hidepid=ptraceable or hidepid=4 means that procfs should only contain
+ /proc/<pid>/ directories that the caller can ptrace.
   
   gid= defines a group authorized to learn processes information otherwise
   prohibited by hidepid=.  If you use some daemon like identd which needs to learn
   information about processes information, just add identd to this group.
+ 
+ subset=pid hides all top level files and directories in the procfs that
+ are not related to tasks.
+ 
+ 5     Filesystem behavior
+ ----------------------------
+ 
+ Originally, before the advent of pid namespace, procfs was a global file
+ system. It means that there was only one procfs instance in the system.
+ 
+ When pid namespace was added, a separate procfs instance was mounted in
+ each pid namespace. So, procfs mount options are global among all
+ mountpoints within the same namespace.
+ 
+ ::
+ 
+ # grep ^proc /proc/mounts
+ proc /proc proc rw,relatime,hidepid=2 0 0
+ 
+ # strace -e mount mount -o hidepid=1 -t proc proc /tmp/proc
+ mount("proc", "/tmp/proc", "proc", 0, "hidepid=1") = 0
+ +++ exited with 0 +++
+ 
+ # grep ^proc /proc/mounts
+ proc /proc proc rw,relatime,hidepid=2 0 0
+ proc /tmp/proc proc rw,relatime,hidepid=2 0 0
+ 
+ and only after remounting procfs mount options will change at all
+ mountpoints.
+ 
+ # mount -o remount,hidepid=1 -t proc proc /tmp/proc
+ 
+ # grep ^proc /proc/mounts
+ proc /proc proc rw,relatime,hidepid=1 0 0
+ proc /tmp/proc proc rw,relatime,hidepid=1 0 0
+ 
+ This behavior is different from the behavior of other filesystems.
+ 
+ The new procfs behavior is more like other filesystems. Each procfs mount
+ creates a new procfs instance, and mount options affect only that instance.
+ This makes it possible to have several procfs instances displaying tasks
+ with different filtering options in one pid namespace.
+ 
+ # mount -o hidepid=invisible -t proc proc /proc
+ # mount -o hidepid=noaccess -t proc proc /tmp/proc
+ # grep ^proc /proc/mounts
+ proc /proc proc rw,relatime,hidepid=invisible 0 0
+ proc /tmp/proc proc rw,relatime,hidepid=noaccess 0 0
diff --combined fs/exec.c

index 2c46511,6ab1c19..2f0a745
--- 1/fs/exec.c
--- 2/fs/exec.c
+++ b/fs/exec.c
@@@ -1176,7 -1176,6 +1176,6 @@@ static int de_thread(struct task_struc
                 tsk->start_boottime = leader->start_boottime;
   
                 BUG_ON(!same_thread_group(leader, tsk));
-               BUG_ON(has_group_leader_pid(tsk));
                 /*
                  * An exec() starts a new thread group with the
                  * TGID of the previous thread group. Rehash the
@@@ -1186,11 -1185,8 +1185,8 @@@
   
                 /* Become a process group leader with the old leader's pid.
                  * The old leader becomes a thread of the this thread group.
-                * Note: The old leader also uses this pid until release_task
-                *       is called.  Odd but simple and correct.
                  */
-               tsk->pid = leader->pid;
-               change_pid(tsk, PIDTYPE_PID, task_pid(leader));
+               exchange_tids(tsk, leader);
                 transfer_pid(leader, tsk, PIDTYPE_TGID);
                 transfer_pid(leader, tsk, PIDTYPE_PGID);
                 transfer_pid(leader, tsk, PIDTYPE_SID);
@@@ -1317,8 -1313,6 +1313,8 @@@ int flush_old_exec(struct linux_binprm 
          */
         set_mm_exe_file(bprm->mm, bprm->file);
   
+ +      would_dump(bprm, bprm->file);
+ +
         /*
          * Release all of the old mmap stuff
          */
@@@ -1878,6 -1872,8 +1874,6 @@@ static int __do_execve_file(int fd, str
         if (retval < 0)
                 goto out;
   
- -      would_dump(bprm, bprm->file);
- -
         retval = exec_binprm(bprm);
         if (retval < 0)
                 goto out;
diff --combined fs/locks.c

index 1d4f4d5,ab702d6..6fd1f6e
--- 1/fs/locks.c
--- 2/fs/locks.c
+++ b/fs/locks.c
@@@ -61,7 -61,7 +61,7 @@@
    *
    *  Initial implementation of mandatory locks. SunOS turned out to be
    *  a rotten model, so I implemented the "obvious" semantics.
- - *  See 'Documentation/filesystems/mandatory-locking.txt' for details.
+ + *  See 'Documentation/filesystems/mandatory-locking.rst' for details.
    *  Andy Walker (andy@lysaker.kvaerner.no), April 06, 1996.
    *
    *  Don't allow mandatory locks on mmap()'ed files. Added simple functions to
@@@ -2823,7 -2823,7 +2823,7 @@@ static void lock_get_status(struct seq_
   {
         struct inode *inode = NULL;
         unsigned int fl_pid;
-       struct pid_namespace *proc_pidns = file_inode(f->file)->i_sb->s_fs_info;
+       struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb);
   
         fl_pid = locks_translate_pid(fl, proc_pidns);
         /*
@@@ -2901,7 -2901,7 +2901,7 @@@ static int locks_show(struct seq_file *
   {
         struct locks_iterator *iter = f->private;
         struct file_lock *fl, *bfl;
-       struct pid_namespace *proc_pidns = file_inode(f->file)->i_sb->s_fs_info;
+       struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb);
   
         fl = hlist_entry(v, struct file_lock, fl_link);
   
diff --combined fs/proc/base.c

index eb2255e,30c9fce..b1d94d1
--- 1/fs/proc/base.c
--- 2/fs/proc/base.c
+++ b/fs/proc/base.c
@@@ -697,13 -697,21 +697,21 @@@ int proc_setattr(struct dentry *dentry
    * May current process learn task's sched/cmdline info (for hide_pid_min=1)
    * or euid/egid (for hide_pid_min=2)?
    */
- static bool has_pid_permissions(struct pid_namespace *pid,
+ static bool has_pid_permissions(struct proc_fs_info *fs_info,
                                  struct task_struct *task,
-                                int hide_pid_min)
+                                enum proc_hidepid hide_pid_min)
   {
-       if (pid->hide_pid < hide_pid_min)
+       /*
+        * If 'hidepid' mount option is set force a ptrace check,
+        * we indicate that we are using a filesystem syscall
+        * by passing PTRACE_MODE_READ_FSCREDS
+        */
+       if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE)
+               return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
+ 
+       if (fs_info->hide_pid < hide_pid_min)
                 return true;
-       if (in_group_p(pid->pid_gid))
+       if (in_group_p(fs_info->pid_gid))
                 return true;
         return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
   }
@@@ -711,18 -719,18 +719,18 @@@
   
   static int proc_pid_permission(struct inode *inode, int mask)
   {
-       struct pid_namespace *pid = proc_pid_ns(inode);
+       struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
         struct task_struct *task;
         bool has_perms;
   
         task = get_proc_task(inode);
         if (!task)
                 return -ESRCH;
-       has_perms = has_pid_permissions(pid, task, HIDEPID_NO_ACCESS);
+       has_perms = has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS);
         put_task_struct(task);
   
         if (!has_perms) {
-               if (pid->hide_pid == HIDEPID_INVISIBLE) {
+               if (fs_info->hide_pid == HIDEPID_INVISIBLE) {
                         /*
                          * Let's make getdents(), stat(), and open()
                          * consistent with each other.  If a process
@@@ -746,7 -754,7 +754,7 @@@ static const struct inode_operations pr
   static int proc_single_show(struct seq_file *m, void *v)
   {
         struct inode *inode = m->private;
-       struct pid_namespace *ns = proc_pid_ns(inode);
+       struct pid_namespace *ns = proc_pid_ns(inode->i_sb);
         struct pid *pid = proc_pid(inode);
         struct task_struct *task;
         int ret;
@@@ -1415,7 -1423,7 +1423,7 @@@ static const struct file_operations pro
   static int sched_show(struct seq_file *m, void *v)
   {
         struct inode *inode = m->private;
-       struct pid_namespace *ns = proc_pid_ns(inode);
+       struct pid_namespace *ns = proc_pid_ns(inode->i_sb);
         struct task_struct *p;
   
         p = get_proc_task(inode);
@@@ -1573,7 -1581,6 +1581,7 @@@ static ssize_t timens_offsets_write(str
         noffsets = 0;
         for (pos = kbuf; pos; pos = next_line) {
                 struct proc_timens_offset *off = &offsets[noffsets];
+ +              char clock[10];
                 int err;
   
                 /* Find the end of line and ensure we don't look past it */
@@@ -1585,21 -1592,10 +1593,21 @@@
                                 next_line = NULL;
                 }
   
- -              err = sscanf(pos, "%u %lld %lu", &off->clockid,
+ +              err = sscanf(pos, "%9s %lld %lu", clock,
                                 &off->val.tv_sec, &off->val.tv_nsec);
                 if (err != 3 || off->val.tv_nsec >= NSEC_PER_SEC)
                         goto out;
+ +
+ +              clock[sizeof(clock) - 1] = 0;
+ +              if (strcmp(clock, "monotonic") == 0 ||
+ +                  strcmp(clock, __stringify(CLOCK_MONOTONIC)) == 0)
+ +                      off->clockid = CLOCK_MONOTONIC;
+ +              else if (strcmp(clock, "boottime") == 0 ||
+ +                       strcmp(clock, __stringify(CLOCK_BOOTTIME)) == 0)
+ +                      off->clockid = CLOCK_BOOTTIME;
+ +              else
+ +                      goto out;
+ +
                 noffsets++;
                 if (noffsets == ARRAY_SIZE(offsets)) {
                         if (next_line)
@@@ -1909,7 -1905,7 +1917,7 @@@ int pid_getattr(const struct path *path
                 u32 request_mask, unsigned int query_flags)
   {
         struct inode *inode = d_inode(path->dentry);
-       struct pid_namespace *pid = proc_pid_ns(inode);
+       struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
         struct task_struct *task;
   
         generic_fillattr(inode, stat);
@@@ -1919,7 -1915,7 +1927,7 @@@
         rcu_read_lock();
         task = pid_task(proc_pid(inode), PIDTYPE_PID);
         if (task) {
-               if (!has_pid_permissions(pid, task, HIDEPID_INVISIBLE)) {
+               if (!has_pid_permissions(fs_info, task, HIDEPID_INVISIBLE)) {
                         rcu_read_unlock();
                         /*
                          * This doesn't prevent learning whether PID exists,
@@@ -2470,7 -2466,7 +2478,7 @@@ static int proc_timers_open(struct inod
                 return -ENOMEM;
   
         tp->pid = proc_pid(inode);
-       tp->ns = proc_pid_ns(inode);
+       tp->ns = proc_pid_ns(inode->i_sb);
         return 0;
   }
   
@@@ -3312,6 -3308,7 +3320,7 @@@ struct dentry *proc_pid_lookup(struct d
   {
         struct task_struct *task;
         unsigned tgid;
+       struct proc_fs_info *fs_info;
         struct pid_namespace *ns;
         struct dentry *result = ERR_PTR(-ENOENT);
   
@@@ -3319,7 -3316,8 +3328,8 @@@
         if (tgid == ~0U)
                 goto out;
   
-       ns = dentry->d_sb->s_fs_info;
+       fs_info = proc_sb_info(dentry->d_sb);
+       ns = fs_info->pid_ns;
         rcu_read_lock();
         task = find_task_by_pid_ns(tgid, ns);
         if (task)
@@@ -3328,7 -3326,14 +3338,14 @@@
         if (!task)
                 goto out;
   
+       /* Limit procfs to only ptraceable tasks */
+       if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE) {
+               if (!has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS))
+                       goto out_put_task;
+       }
+ 
         result = proc_pid_instantiate(dentry, task, NULL);
+ out_put_task:
         put_task_struct(task);
   out:
         return result;
@@@ -3354,20 -3359,8 +3371,8 @@@ retry
         pid = find_ge_pid(iter.tgid, ns);
         if (pid) {
                 iter.tgid = pid_nr_ns(pid, ns);
-               iter.task = pid_task(pid, PIDTYPE_PID);
-               /* What we to know is if the pid we have find is the
-                * pid of a thread_group_leader.  Testing for task
-                * being a thread_group_leader is the obvious thing
-                * todo but there is a window when it fails, due to
-                * the pid transfer logic in de_thread.
-                *
-                * So we perform the straight forward test of seeing
-                * if the pid we have found is the pid of a thread
-                * group leader, and don't worry if the task we have
-                * found doesn't happen to be a thread group leader.
-                * As we don't care in the case of readdir.
-                */
-               if (!iter.task || !has_group_leader_pid(iter.task)) {
+               iter.task = pid_task(pid, PIDTYPE_TGID);
+               if (!iter.task) {
                         iter.tgid += 1;
                         goto retry;
                 }
@@@ -3383,20 -3376,21 +3388,21 @@@
   int proc_pid_readdir(struct file *file, struct dir_context *ctx)
   {
         struct tgid_iter iter;
-       struct pid_namespace *ns = proc_pid_ns(file_inode(file));
+       struct proc_fs_info *fs_info = proc_sb_info(file_inode(file)->i_sb);
+       struct pid_namespace *ns = proc_pid_ns(file_inode(file)->i_sb);
         loff_t pos = ctx->pos;
   
         if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
                 return 0;
   
         if (pos == TGID_OFFSET - 2) {
-               struct inode *inode = d_inode(ns->proc_self);
+               struct inode *inode = d_inode(fs_info->proc_self);
                 if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK))
                         return 0;
                 ctx->pos = pos = pos + 1;
         }
         if (pos == TGID_OFFSET - 1) {
-               struct inode *inode = d_inode(ns->proc_thread_self);
+               struct inode *inode = d_inode(fs_info->proc_thread_self);
                 if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK))
                         return 0;
                 ctx->pos = pos = pos + 1;
@@@ -3410,7 -3404,7 +3416,7 @@@
                 unsigned int len;
   
                 cond_resched();
-               if (!has_pid_permissions(ns, iter.task, HIDEPID_INVISIBLE))
+               if (!has_pid_permissions(fs_info, iter.task, HIDEPID_INVISIBLE))
                         continue;
   
                 len = snprintf(name, sizeof(name), "%u", iter.tgid);
@@@ -3610,6 -3604,7 +3616,7 @@@ static struct dentry *proc_task_lookup(
         struct task_struct *task;
         struct task_struct *leader = get_proc_task(dir);
         unsigned tid;
+       struct proc_fs_info *fs_info;
         struct pid_namespace *ns;
         struct dentry *result = ERR_PTR(-ENOENT);
   
@@@ -3620,7 -3615,8 +3627,8 @@@
         if (tid == ~0U)
                 goto out;
   
-       ns = dentry->d_sb->s_fs_info;
+       fs_info = proc_sb_info(dentry->d_sb);
+       ns = fs_info->pid_ns;
         rcu_read_lock();
         task = find_task_by_pid_ns(tid, ns);
         if (task)
@@@ -3734,7 -3730,7 +3742,7 @@@ static int proc_task_readdir(struct fil
         /* f_version caches the tgid value that the last readdir call couldn't
          * return. lseek aka telldir automagically resets f_version to 0.
          */
-       ns = proc_pid_ns(inode);
+       ns = proc_pid_ns(inode->i_sb);
         tid = (int)file->f_version;
         file->f_version = 0;
         for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
diff --combined fs/proc_namespace.c

index e4d70c0,9a8b624..3059a93
--- 1/fs/proc_namespace.c
--- 2/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@@ -37,23 -37,23 +37,23 @@@ static __poll_t mounts_poll(struct fil
         return res;
   }
   
- struct proc_fs_info {
+ struct proc_fs_opts {
         int flag;
         const char *str;
   };
   
   static int show_sb_opts(struct seq_file *m, struct super_block *sb)
   {
-       static const struct proc_fs_info fs_info[] = {
+       static const struct proc_fs_opts fs_opts[] = {
                 { SB_SYNCHRONOUS, ",sync" },
                 { SB_DIRSYNC, ",dirsync" },
                 { SB_MANDLOCK, ",mand" },
                 { SB_LAZYTIME, ",lazytime" },
                 { 0, NULL }
         };
-       const struct proc_fs_info *fs_infop;
+       const struct proc_fs_opts *fs_infop;
   
-       for (fs_infop = fs_info; fs_infop->flag; fs_infop++) {
+       for (fs_infop = fs_opts; fs_infop->flag; fs_infop++) {
                 if (sb->s_flags & fs_infop->flag)
                         seq_puts(m, fs_infop->str);
         }
@@@ -63,7 -63,7 +63,7 @@@
   
   static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
   {
-       static const struct proc_fs_info mnt_info[] = {
+       static const struct proc_fs_opts mnt_opts[] = {
                 { MNT_NOSUID, ",nosuid" },
                 { MNT_NODEV, ",nodev" },
                 { MNT_NOEXEC, ",noexec" },
@@@ -72,9 -72,9 +72,9 @@@
                 { MNT_RELATIME, ",relatime" },
                 { 0, NULL }
         };
-       const struct proc_fs_info *fs_infop;
+       const struct proc_fs_opts *fs_infop;
   
-       for (fs_infop = mnt_info; fs_infop->flag; fs_infop++) {
+       for (fs_infop = mnt_opts; fs_infop->flag; fs_infop++) {
                 if (mnt->mnt_flags & fs_infop->flag)
                         seq_puts(m, fs_infop->str);
         }
@@@ -279,8 -279,7 +279,8 @@@ static int mounts_open_common(struct in
         p->ns = ns;
         p->root = root;
         p->show = show;
- -      p->cached_event = ~0ULL;
+ +      INIT_LIST_HEAD(&p->cursor.mnt_list);
+ +      p->cursor.mnt.mnt_flags = MNT_CURSOR;
   
         return 0;
   
@@@ -297,7 -296,6 +297,7 @@@ static int mounts_release(struct inode 
         struct seq_file *m = file->private_data;
         struct proc_mounts *p = m->private;
         path_put(&p->root);
+ +      mnt_cursor_del(p->ns, &p->cursor);
         put_mnt_ns(p->ns);
         return seq_release_private(inode, file);
   }
diff --combined include/linux/pid.h

index 93543cb,2159ffc..176d6cf
--- 1/include/linux/pid.h
--- 2/include/linux/pid.h
+++ b/include/linux/pid.h
@@@ -102,15 -102,13 +102,16 @@@ extern void attach_pid(struct task_stru
   extern void detach_pid(struct task_struct *task, enum pid_type);
   extern void change_pid(struct task_struct *task, enum pid_type,
                         struct pid *pid);
+ extern void exchange_tids(struct task_struct *task, struct task_struct *old);
   extern void transfer_pid(struct task_struct *old, struct task_struct *new,
                          enum pid_type);
   
   struct pid_namespace;
   extern struct pid_namespace init_pid_ns;
   
+ +extern int pid_max;
+ +extern int pid_max_min, pid_max_max;
+ +
   /*
    * look up a PID in the hash table. Must be called with the tasklist_lock
    * or rcu_read_lock() held.
diff --combined include/linux/proc_fs.h

index ad4ff71,6ec524d..d1eed1b
--- 1/include/linux/proc_fs.h
--- 2/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@@ -42,6 -42,34 +42,34 @@@ struct proc_ops 
         unsigned long (*proc_get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
   } __randomize_layout;
   
+ /* definitions for hide_pid field */
+ enum proc_hidepid {
+       HIDEPID_OFF       = 0,
+       HIDEPID_NO_ACCESS = 1,
+       HIDEPID_INVISIBLE = 2,
+       HIDEPID_NOT_PTRACEABLE = 4, /* Limit pids to only ptraceable pids */
+ };
+ 
+ /* definitions for proc mount option pidonly */
+ enum proc_pidonly {
+       PROC_PIDONLY_OFF = 0,
+       PROC_PIDONLY_ON  = 1,
+ };
+ 
+ struct proc_fs_info {
+       struct pid_namespace *pid_ns;
+       struct dentry *proc_self;        /* For /proc/self */
+       struct dentry *proc_thread_self; /* For /proc/thread-self */
+       kgid_t pid_gid;
+       enum proc_hidepid hide_pid;
+       enum proc_pidonly pidonly;
+ };
+ 
+ static inline struct proc_fs_info *proc_sb_info(struct super_block *sb)
+ {
+       return sb->s_fs_info;
+ }
+ 
   #ifdef CONFIG_PROC_FS
   
   typedef int (*proc_write_t)(struct file *, char *, size_t);
@@@ -105,9 -133,6 +133,9 @@@ struct proc_dir_entry *proc_create_net_
                                                     void *data);
   extern struct pid *tgid_pidfd_to_pid(const struct file *file);
   
+ +extern int bpf_iter_init_seq_net(void *priv_data);
+ +extern void bpf_iter_fini_seq_net(void *priv_data);
+ +
   #ifdef CONFIG_PROC_PID_ARCH_STATUS
   /*
    * The architecture which selects CONFIG_PROC_PID_ARCH_STATUS must
@@@ -177,11 -202,9 +205,11 @@@ int open_related_ns(struct ns_common *n
                    struct ns_common *(*get_ns)(struct ns_common *ns));
   
   /* get the associated pid namespace for a file in procfs */
- static inline struct pid_namespace *proc_pid_ns(const struct inode *inode)
+ static inline struct pid_namespace *proc_pid_ns(struct super_block *sb)
   {
-       return inode->i_sb->s_fs_info;
+       return proc_sb_info(sb)->pid_ns;
   }
   
+ +bool proc_ns_file(const struct file *file);
+ +
   #endif /* _LINUX_PROC_FS_H */
diff --combined include/linux/rculist.h

index 7375bb3,67867e0..df587d1
--- 1/include/linux/rculist.h
--- 2/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@@ -371,7 -371,7 +371,7 @@@ static inline void list_splice_tail_ini
    * @pos:      the type * to use as a loop cursor.
    * @head:     the head for your list.
    * @member:   the name of the list_head within the struct.
- - * @cond...:  optional lockdep expression if called from non-RCU protection.
+ + * @cond:     optional lockdep expression if called from non-RCU protection.
    *
    * This list-traversal primitive may safely run concurrently with
    * the _rcu list-mutation primitives such as list_add_rcu()
@@@ -506,6 -506,27 +506,27 @@@ static inline void hlist_replace_rcu(st
         WRITE_ONCE(old->pprev, LIST_POISON2);
   }
   
+ /**
+  * hlists_swap_heads_rcu - swap the lists the hlist heads point to
+  * @left:  The hlist head on the left
+  * @right: The hlist head on the right
+  *
+  * The lists start out as [@left  ][node1 ... ] and
+  *                        [@right ][node2 ... ]
+  * The lists end up as    [@left  ][node2 ... ]
+  *                        [@right ][node1 ... ]
+  */
+ static inline void hlists_swap_heads_rcu(struct hlist_head *left, struct hlist_head *right)
+ {
+       struct hlist_node *node1 = left->first;
+       struct hlist_node *node2 = right->first;
+ 
+       rcu_assign_pointer(left->first, node2);
+       rcu_assign_pointer(right->first, node1);
+       WRITE_ONCE(node2->pprev, &left->first);
+       WRITE_ONCE(node1->pprev, &right->first);
+ }
+ 
   /*
    * return the first or the next element in an RCU protected hlist
    */
@@@ -646,7 -667,7 +667,7 @@@ static inline void hlist_add_behind_rcu
    * @pos:      the type * to use as a loop cursor.
    * @head:     the head for your list.
    * @member:   the name of the hlist_node within the struct.
- - * @cond...:  optional lockdep expression if called from non-RCU protection.
+ + * @cond:     optional lockdep expression if called from non-RCU protection.
    *
    * This list-traversal primitive may safely run concurrently with
    * the _rcu list-mutation primitives such as hlist_add_head_rcu()
diff --combined kernel/fork.c

index be98e94,e7bdacc..cefe874
--- 1/kernel/fork.c
--- 2/kernel/fork.c
+++ b/kernel/fork.c
@@@ -94,7 -94,6 +94,7 @@@
   #include <linux/thread_info.h>
   #include <linux/stackleak.h>
   #include <linux/kasan.h>
+ +#include <linux/scs.h>
   
   #include <asm/pgtable.h>
   #include <asm/pgalloc.h>
@@@ -457,8 -456,6 +457,8 @@@ void put_task_stack(struct task_struct 
   
   void free_task(struct task_struct *tsk)
   {
+ +      scs_release(tsk);
+ +
   #ifndef CONFIG_THREAD_INFO_IN_TASK
         /*
          * The task is finally done with both the stack and thread_info,
@@@ -843,8 -840,6 +843,8 @@@ void __init fork_init(void
                           NULL, free_vm_stack_cache);
   #endif
   
+ +      scs_init();
+ +
         lockdep_init_task(&init_task);
         uprobes_init();
   }
@@@ -904,10 -899,6 +904,10 @@@ static struct task_struct *dup_task_str
         if (err)
                 goto free_stack;
   
+ +      err = scs_prepare(tsk, node);
+ +      if (err)
+ +              goto free_stack;
+ +
   #ifdef CONFIG_SECCOMP
         /*
          * We must handle setting up seccomp filters once we're under
@@@ -1692,11 -1683,6 +1692,11 @@@ static inline void rcu_copy_process(str
         INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
         p->rcu_tasks_idle_cpu = -1;
   #endif /* #ifdef CONFIG_TASKS_RCU */
+ +#ifdef CONFIG_TASKS_TRACE_RCU
+ +      p->trc_reader_nesting = 0;
+ +      p->trc_reader_special.s = 0;
+ +      INIT_LIST_HEAD(&p->trc_holdout_list);
+ +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
   }
   
   struct pid *pidfd_pid(const struct file *file)
@@@ -1759,7 -1745,7 +1759,7 @@@ static void pidfd_show_fdinfo(struct se
         pid_t nr = -1;
   
         if (likely(pid_has_task(pid, PIDTYPE_PID))) {
-               ns = proc_pid_ns(file_inode(m->file));
+               ns = proc_pid_ns(file_inode(m->file)->i_sb);
                 nr = pid_nr_ns(pid, ns);
         }
   
@@@ -2500,11 -2486,11 +2500,11 @@@ long do_fork(unsigned long clone_flags
               int __user *child_tidptr)
   {
         struct kernel_clone_args args = {
- -              .flags          = (clone_flags & ~CSIGNAL),
+ +              .flags          = (lower_32_bits(clone_flags) & ~CSIGNAL),
                 .pidfd          = parent_tidptr,
                 .child_tid      = child_tidptr,
                 .parent_tid     = parent_tidptr,
- -              .exit_signal    = (clone_flags & CSIGNAL),
+ +              .exit_signal    = (lower_32_bits(clone_flags) & CSIGNAL),
                 .stack          = stack_start,
                 .stack_size     = stack_size,
         };
@@@ -2522,9 -2508,8 +2522,9 @@@
   pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
   {
         struct kernel_clone_args args = {
- -              .flags          = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
- -              .exit_signal    = (flags & CSIGNAL),
+ +              .flags          = ((lower_32_bits(flags) | CLONE_VM |
+ +                                  CLONE_UNTRACED) & ~CSIGNAL),
+ +              .exit_signal    = (lower_32_bits(flags) & CSIGNAL),
                 .stack          = (unsigned long)fn,
                 .stack_size     = (unsigned long)arg,
         };
@@@ -2585,11 -2570,11 +2585,11 @@@ SYSCALL_DEFINE5(clone, unsigned long, c
   #endif
   {
         struct kernel_clone_args args = {
- -              .flags          = (clone_flags & ~CSIGNAL),
+ +              .flags          = (lower_32_bits(clone_flags) & ~CSIGNAL),
                 .pidfd          = parent_tidptr,
                 .child_tid      = child_tidptr,
                 .parent_tid     = parent_tidptr,
- -              .exit_signal    = (clone_flags & CSIGNAL),
+ +              .exit_signal    = (lower_32_bits(clone_flags) & CSIGNAL),
                 .stack          = newsp,
                 .tls            = tls,
         };
@@@ -2620,14 -2605,6 +2620,14 @@@ noinline static int copy_clone_args_fro
         struct clone_args args;
         pid_t *kset_tid = kargs->set_tid;
   
+ +      BUILD_BUG_ON(offsetofend(struct clone_args, tls) !=
+ +                   CLONE_ARGS_SIZE_VER0);
+ +      BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) !=
+ +                   CLONE_ARGS_SIZE_VER1);
+ +      BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) !=
+ +                   CLONE_ARGS_SIZE_VER2);
+ +      BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2);
+ +
         if (unlikely(usize > PAGE_SIZE))
                 return -E2BIG;
         if (unlikely(usize < CLONE_ARGS_SIZE_VER0))
@@@ -2654,8 -2631,7 +2654,8 @@@
                      !valid_signal(args.exit_signal)))
                 return -EINVAL;
   
- -      if ((args.flags & CLONE_INTO_CGROUP) && args.cgroup < 0)
+ +      if ((args.flags & CLONE_INTO_CGROUP) &&
+ +          (args.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2))
                 return -EINVAL;
   
         *kargs = (struct kernel_clone_args){
author	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 4 Jun 2020 20:54:34 +0000 (13:54 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 4 Jun 2020 20:54:34 +0000 (13:54 -0700)
		1	2
Documentation/filesystems/proc.rst	patch \|	diff1 \|	diff2 \|	blob \| history
fs/exec.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/locks.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/proc/base.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/proc_namespace.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/pid.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/proc_fs.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/rculist.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/fork.c	patch \|	diff1 \|	diff2 \|	blob \| history