Merge tag 'trace-v6.4' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux...

author Linus Torvalds <torvalds@linux-foundation.org>

Fri, 28 Apr 2023 22:57:53 +0000 (15:57 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Fri, 28 Apr 2023 22:57:53 +0000 (15:57 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 28 Apr 2023 22:57:53 +0000 (15:57 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 28 Apr 2023 22:57:53 +0000 (15:57 -0700)
diff --combined Documentation/trace/ftrace.rst

index e8bca5f,aaebb82..a9c8bce
--- 1/Documentation/trace/ftrace.rst
--- 2/Documentation/trace/ftrace.rst
+++ b/Documentation/trace/ftrace.rst
@@@ -1027,6 -1027,7 +1027,7 @@@ To see what is available, simply cat th
         nohex
         nobin
         noblock
+       nofields
         trace_printk
         annotate
         nouserstacktrace
@@@ -1110,6 -1111,11 +1111,11 @@@ Here are the available options
     block
         When set, reading trace_pipe will not block when polled.
   
+   fields
+       Print the fields as described by their types. This is a better
+       option than using hex, bin or raw, as it gives a better parsing
+       of the content of the event.
+ 
     trace_printk
         Can disable trace_printk() from writing into the buffer.
   
@@@ -3510,7 -3516,7 +3516,7 @@@ directories, the rmdir will fail with E
   Stack trace
   -----------
   Since the kernel has a fixed sized stack, it is important not to
- -waste it in functions. A kernel developer must be conscience of
+ +waste it in functions. A kernel developer must be conscious of
   what they allocate on the stack. If they add too much, the system
   can be in danger of a stack overflow, and corruption will occur,
   usually leading to a system panic.
diff --combined fs/exec.c

index 87cf3a2,2b0042f..a466e79
--- 1/fs/exec.c
--- 2/fs/exec.c
+++ b/fs/exec.c
@@@ -65,6 -65,7 +65,7 @@@
   #include <linux/syscall_user_dispatch.h>
   #include <linux/coredump.h>
   #include <linux/time_namespace.h>
+ #include <linux/user_events.h>
   
   #include <linux/uaccess.h>
   #include <asm/mmu_context.h>
@@@ -1034,7 -1035,7 +1035,7 @@@ static int exec_mmap(struct mm_struct *
                 mmput(old_mm);
                 return 0;
         }
- -      mmdrop(active_mm);
+ +      mmdrop_lazy_tlb(active_mm);
         return 0;
   }
   
@@@ -1859,6 -1860,7 +1860,7 @@@ static int bprm_execve(struct linux_bin
         current->fs->in_exec = 0;
         current->in_execve = 0;
         rseq_execve(current);
+       user_events_execve(current);
         acct_update_integrals(current);
         task_numa_free(current, false);
         return retval;
diff --combined include/linux/ftrace.h

index 3e56cb6,327046f..6954e4e
--- 1/include/linux/ftrace.h
--- 2/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@@ -548,6 -548,7 +548,7 @@@ bool is_ftrace_trampoline(unsigned lon
    *  DIRECT   - there is a direct function to call
    *  CALL_OPS - the record can use callsite-specific ops
    *  CALL_OPS_EN - the function is set up to use callsite-specific ops
+  *  TOUCHED  - A callback was added since boot up
    *
    * When a new ftrace_ops is registered and wants a function to save
    * pt_regs, the rec->flags REGS is set. When the function has been
@@@ -567,9 -568,10 +568,10 @@@ enum 
         FTRACE_FL_DIRECT_EN     = (1UL << 23),
         FTRACE_FL_CALL_OPS      = (1UL << 22),
         FTRACE_FL_CALL_OPS_EN   = (1UL << 21),
+       FTRACE_FL_TOUCHED       = (1UL << 20),
   };
   
- #define FTRACE_REF_MAX_SHIFT  21
+ #define FTRACE_REF_MAX_SHIFT  20
   #define FTRACE_REF_MAX                ((1UL << FTRACE_REF_MAX_SHIFT) - 1)
   
   #define ftrace_rec_count(rec) ((rec)->flags & FTRACE_REF_MAX)
@@@ -628,6 -630,7 +630,7 @@@ enum 
         FTRACE_ITER_PROBE       = (1 << 4),
         FTRACE_ITER_MOD         = (1 << 5),
         FTRACE_ITER_ENABLED     = (1 << 6),
+       FTRACE_ITER_TOUCHED     = (1 << 7),
   };
   
   void arch_ftrace_update_code(int command);
@@@ -961,7 -964,7 +964,7 @@@ static inline void __ftrace_enabled_res
   #define CALLER_ADDR5 ((unsigned long)ftrace_return_address(5))
   #define CALLER_ADDR6 ((unsigned long)ftrace_return_address(6))
   
- -static inline unsigned long get_lock_parent_ip(void)
+ +static __always_inline unsigned long get_lock_parent_ip(void)
   {
         unsigned long addr = CALLER_ADDR0;
   
diff --combined include/linux/sched.h

index dc4ad4c,bf37846..eed5d65
--- 1/include/linux/sched.h
--- 2/include/linux/sched.h
+++ b/include/linux/sched.h
@@@ -36,7 -36,6 +36,7 @@@
   #include <linux/seqlock.h>
   #include <linux/kcsan.h>
   #include <linux/rv.h>
+ +#include <linux/livepatch_sched.h>
   #include <asm/kmap_size.h>
   
   /* task_struct member predeclarations (sorted alphabetically): */
@@@ -70,6 -69,7 +70,7 @@@ struct sighand_struct
   struct signal_struct;
   struct task_delay_info;
   struct task_group;
+ struct user_event_mm;
   
   /*
    * Task state bitmask. NOTE! These bits are also
@@@ -1314,14 -1314,16 +1315,14 @@@ struct task_struct 
   
   #ifdef CONFIG_SCHED_MM_CID
         int                             mm_cid;         /* Current cid in mm */
+ +      int                             last_mm_cid;    /* Most recent cid in mm */
+ +      int                             migrate_from_cpu;
         int                             mm_cid_active;  /* Whether cid bitmap is active */
+ +      struct callback_head            cid_work;
   #endif
   
         struct tlbflush_unmap_batch     tlb_ubc;
   
- -      union {
- -              refcount_t              rcu_users;
- -              struct rcu_head         rcu;
- -      };
- -
         /* Cache last used pipe for splice(): */
         struct pipe_inode_info          *splice_pipe;
   
@@@ -1458,8 -1460,6 +1459,8 @@@
         unsigned long                   saved_state_change;
   # endif
   #endif
+ +      struct rcu_head                 rcu;
+ +      refcount_t                      rcu_users;
         int                             pagefault_disabled;
   #ifdef CONFIG_MMU
         struct task_struct              *oom_reaper_list;
@@@ -1529,6 -1529,10 +1530,10 @@@
         union rv_task_monitor           rv[RV_PER_TASK_MONITORS];
   #endif
   
+ #ifdef CONFIG_USER_EVENTS
+       struct user_event_mm            *user_event_mm;
+ #endif
+ 
         /*
          * New fields for task_struct should be added above here, so that
          * they are included in the randomized portion of task_struct.
@@@ -1730,7 -1734,7 +1735,7 @@@ extern struct pid *cad_pid
   #define PF_MEMALLOC           0x00000800      /* Allocating memory */
   #define PF_NPROC_EXCEEDED     0x00001000      /* set_user() noticed that RLIMIT_NPROC was exceeded */
   #define PF_USED_MATH          0x00002000      /* If unset the fpu must be initialized before use */
- -#define PF__HOLE__00004000    0x00004000
+ +#define PF_USER_WORKER                0x00004000      /* Kernel thread cloned from userspace thread */
   #define PF_NOFREEZE           0x00008000      /* This thread should not be frozen */
   #define PF__HOLE__00010000    0x00010000
   #define PF_KSWAPD             0x00020000      /* I am kswapd */
@@@ -2071,9 -2075,6 +2076,9 @@@ extern int __cond_resched(void)
   
   #if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
   
+ +void sched_dynamic_klp_enable(void);
+ +void sched_dynamic_klp_disable(void);
+ +
   DECLARE_STATIC_CALL(cond_resched, __cond_resched);
   
   static __always_inline int _cond_resched(void)
@@@ -2082,7 -2083,6 +2087,7 @@@
   }
   
   #elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+ +
   extern int dynamic_cond_resched(void);
   
   static __always_inline int _cond_resched(void)
@@@ -2090,25 -2090,20 +2095,25 @@@
         return dynamic_cond_resched();
   }
   
- -#else
+ +#else /* !CONFIG_PREEMPTION */
   
   static inline int _cond_resched(void)
   {
+ +      klp_sched_try_switch();
         return __cond_resched();
   }
   
- -#endif /* CONFIG_PREEMPT_DYNAMIC */
+ +#endif /* PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
   
- -#else
+ +#else /* CONFIG_PREEMPTION && !CONFIG_PREEMPT_DYNAMIC */
   
- -static inline int _cond_resched(void) { return 0; }
+ +static inline int _cond_resched(void)
+ +{
+ +      klp_sched_try_switch();
+ +      return 0;
+ +}
   
- -#endif /* !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) */
+ +#endif /* !CONFIG_PREEMPTION || CONFIG_PREEMPT_DYNAMIC */
   
   #define cond_resched() ({                     \
         __might_resched(__FILE__, __LINE__, 0); \
diff --combined kernel/exit.c

index 86902cb,875d6a1..34b90e2
--- 1/kernel/exit.c
--- 2/kernel/exit.c
+++ b/kernel/exit.c
@@@ -68,6 -68,7 +68,7 @@@
   #include <linux/kprobes.h>
   #include <linux/rethook.h>
   #include <linux/sysfs.h>
+ #include <linux/user_events.h>
   
   #include <linux/uaccess.h>
   #include <asm/unistd.h>
@@@ -537,7 -538,7 +538,7 @@@ static void exit_mm(void
                 return;
         sync_mm_rss(mm);
         mmap_read_lock(mm);
- -      mmgrab(mm);
+ +      mmgrab_lazy_tlb(mm);
         BUG_ON(mm != current->active_mm);
         /* more a memory barrier than a real lock */
         task_lock(current);
@@@ -818,6 -819,7 +819,7 @@@ void __noreturn do_exit(long code
   
         coredump_task_exit(tsk);
         ptrace_event(PTRACE_EVENT_EXIT, code);
+       user_events_exit(tsk);
   
         validate_creds_for_do_exit(tsk);
   
diff --combined kernel/fork.c

index eccb35a,efb1f22..735d9f4
--- 1/kernel/fork.c
--- 2/kernel/fork.c
+++ b/kernel/fork.c
@@@ -97,6 -97,7 +97,7 @@@
   #include <linux/io_uring.h>
   #include <linux/bpf.h>
   #include <linux/stackprotector.h>
+ #include <linux/user_events.h>
   
   #include <asm/pgalloc.h>
   #include <linux/uaccess.h>
@@@ -451,49 -452,13 +452,49 @@@ static struct kmem_cache *vm_area_cache
   /* SLAB cache for mm_struct structures (tsk->mm) */
   static struct kmem_cache *mm_cachep;
   
+ +#ifdef CONFIG_PER_VMA_LOCK
+ +
+ +/* SLAB cache for vm_area_struct.lock */
+ +static struct kmem_cache *vma_lock_cachep;
+ +
+ +static bool vma_lock_alloc(struct vm_area_struct *vma)
+ +{
+ +      vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
+ +      if (!vma->vm_lock)
+ +              return false;
+ +
+ +      init_rwsem(&vma->vm_lock->lock);
+ +      vma->vm_lock_seq = -1;
+ +
+ +      return true;
+ +}
+ +
+ +static inline void vma_lock_free(struct vm_area_struct *vma)
+ +{
+ +      kmem_cache_free(vma_lock_cachep, vma->vm_lock);
+ +}
+ +
+ +#else /* CONFIG_PER_VMA_LOCK */
+ +
+ +static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; }
+ +static inline void vma_lock_free(struct vm_area_struct *vma) {}
+ +
+ +#endif /* CONFIG_PER_VMA_LOCK */
+ +
   struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
   {
         struct vm_area_struct *vma;
   
         vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
- -      if (vma)
- -              vma_init(vma, mm);
+ +      if (!vma)
+ +              return NULL;
+ +
+ +      vma_init(vma, mm);
+ +      if (!vma_lock_alloc(vma)) {
+ +              kmem_cache_free(vm_area_cachep, vma);
+ +              return NULL;
+ +      }
+ +
         return vma;
   }
   
@@@ -501,56 -466,26 +502,56 @@@ struct vm_area_struct *vm_area_dup(stru
   {
         struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
   
- -      if (new) {
- -              ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
- -              ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
- -              /*
- -               * orig->shared.rb may be modified concurrently, but the clone
- -               * will be reinitialized.
- -               */
- -              data_race(memcpy(new, orig, sizeof(*new)));
- -              INIT_LIST_HEAD(&new->anon_vma_chain);
- -              dup_anon_vma_name(orig, new);
+ +      if (!new)
+ +              return NULL;
+ +
+ +      ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
+ +      ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
+ +      /*
+ +       * orig->shared.rb may be modified concurrently, but the clone
+ +       * will be reinitialized.
+ +       */
+ +      data_race(memcpy(new, orig, sizeof(*new)));
+ +      if (!vma_lock_alloc(new)) {
+ +              kmem_cache_free(vm_area_cachep, new);
+ +              return NULL;
         }
+ +      INIT_LIST_HEAD(&new->anon_vma_chain);
+ +      vma_numab_state_init(new);
+ +      dup_anon_vma_name(orig, new);
+ +
         return new;
   }
   
- -void vm_area_free(struct vm_area_struct *vma)
+ +void __vm_area_free(struct vm_area_struct *vma)
   {
+ +      vma_numab_state_free(vma);
         free_anon_vma_name(vma);
+ +      vma_lock_free(vma);
         kmem_cache_free(vm_area_cachep, vma);
   }
   
+ +#ifdef CONFIG_PER_VMA_LOCK
+ +static void vm_area_free_rcu_cb(struct rcu_head *head)
+ +{
+ +      struct vm_area_struct *vma = container_of(head, struct vm_area_struct,
+ +                                                vm_rcu);
+ +
+ +      /* The vma should not be locked while being destroyed. */
+ +      VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma);
+ +      __vm_area_free(vma);
+ +}
+ +#endif
+ +
+ +void vm_area_free(struct vm_area_struct *vma)
+ +{
+ +#ifdef CONFIG_PER_VMA_LOCK
+ +      call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb);
+ +#else
+ +      __vm_area_free(vma);
+ +#endif
+ +}
+ +
   static void account_kernel_stack(struct task_struct *tsk, int account)
   {
         if (IS_ENABLED(CONFIG_VMAP_STACK)) {
@@@ -683,7 -618,6 +684,7 @@@ static __latent_entropy int dup_mmap(st
         if (retval)
                 goto out;
   
+ +      mt_clear_in_rcu(vmi.mas.tree);
         for_each_vma(old_vmi, mpnt) {
                 struct file *file;
   
@@@ -767,8 -701,6 +768,8 @@@
         retval = arch_dup_mmap(oldmm, mm);
   loop_out:
         vma_iter_free(&vmi);
+ +      if (!retval)
+ +              mt_set_in_rcu(vmi.mas.tree);
   out:
         mmap_write_unlock(mm);
         flush_tlb_mm(oldmm);
@@@ -824,6 -756,11 +825,6 @@@ static void check_mm(struct mm_struct *
         for (i = 0; i < NR_MM_COUNTERS; i++) {
                 long x = percpu_counter_sum(&mm->rss_stat[i]);
   
- -              if (likely(!x))
- -                      continue;
- -
- -              /* Making sure this is not due to race with CPU offlining. */
- -              x = percpu_counter_sum_all(&mm->rss_stat[i]);
                 if (unlikely(x))
                         pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
                                  mm, resident_page_types[i], x);
@@@ -841,67 -778,6 +842,67 @@@
   #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
   #define free_mm(mm)   (kmem_cache_free(mm_cachep, (mm)))
   
+ +static void do_check_lazy_tlb(void *arg)
+ +{
+ +      struct mm_struct *mm = arg;
+ +
+ +      WARN_ON_ONCE(current->active_mm == mm);
+ +}
+ +
+ +static void do_shoot_lazy_tlb(void *arg)
+ +{
+ +      struct mm_struct *mm = arg;
+ +
+ +      if (current->active_mm == mm) {
+ +              WARN_ON_ONCE(current->mm);
+ +              current->active_mm = &init_mm;
+ +              switch_mm(mm, &init_mm, current);
+ +      }
+ +}
+ +
+ +static void cleanup_lazy_tlbs(struct mm_struct *mm)
+ +{
+ +      if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
+ +              /*
+ +               * In this case, lazy tlb mms are refcounted and would not reach
+ +               * __mmdrop until all CPUs have switched away and mmdrop()ed.
+ +               */
+ +              return;
+ +      }
+ +
+ +      /*
+ +       * Lazy mm shootdown does not refcount "lazy tlb mm" usage, rather it
+ +       * requires lazy mm users to switch to another mm when the refcount
+ +       * drops to zero, before the mm is freed. This requires IPIs here to
+ +       * switch kernel threads to init_mm.
+ +       *
+ +       * archs that use IPIs to flush TLBs can piggy-back that lazy tlb mm
+ +       * switch with the final userspace teardown TLB flush which leaves the
+ +       * mm lazy on this CPU but no others, reducing the need for additional
+ +       * IPIs here. There are cases where a final IPI is still required here,
+ +       * such as the final mmdrop being performed on a different CPU than the
+ +       * one exiting, or kernel threads using the mm when userspace exits.
+ +       *
+ +       * IPI overheads have not been found to be expensive, but they could be
+ +       * reduced in a number of possible ways, for example (roughly
+ +       * increasing order of complexity):
+ +       * - The last lazy reference created by exit_mm() could instead switch
+ +       *   to init_mm, however it's probable this will run on the same CPU
+ +       *   immediately afterwards, so this may not reduce IPIs much.
+ +       * - A batch of mms requiring IPIs could be gathered and freed at once.
+ +       * - CPUs store active_mm where it can be remotely checked without a
+ +       *   lock, to filter out false-positives in the cpumask.
+ +       * - After mm_users or mm_count reaches zero, switching away from the
+ +       *   mm could clear mm_cpumask to reduce some IPIs, perhaps together
+ +       *   with some batching or delaying of the final IPIs.
+ +       * - A delayed freeing and RCU-like quiescing sequence based on mm
+ +       *   switching to avoid IPIs completely.
+ +       */
+ +      on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
+ +      if (IS_ENABLED(CONFIG_DEBUG_VM_SHOOT_LAZIES))
+ +              on_each_cpu(do_check_lazy_tlb, (void *)mm, 1);
+ +}
+ +
   /*
    * Called when the last reference to the mm
    * is dropped: either by a lazy thread or by
@@@ -913,10 -789,6 +914,10 @@@ void __mmdrop(struct mm_struct *mm
   
         BUG_ON(mm == &init_mm);
         WARN_ON_ONCE(mm == current->mm);
+ +
+ +      /* Ensure no CPUs are using this as their lazy tlb mm */
+ +      cleanup_lazy_tlbs(mm);
+ +
         WARN_ON_ONCE(mm == current->active_mm);
         mm_free_pgd(mm);
         destroy_context(mm);
@@@ -924,7 -796,6 +925,7 @@@
         check_mm(mm);
         put_user_ns(mm->user_ns);
         mm_pasid_drop(mm);
+ +      mm_destroy_cid(mm);
   
         for (i = 0; i < NR_MM_COUNTERS; i++)
                 percpu_counter_destroy(&mm->rss_stat[i]);
@@@ -1189,9 -1060,7 +1190,9 @@@ static struct task_struct *dup_task_str
   
   #ifdef CONFIG_SCHED_MM_CID
         tsk->mm_cid = -1;
+ +      tsk->last_mm_cid = -1;
         tsk->mm_cid_active = 0;
+ +      tsk->migrate_from_cpu = -1;
   #endif
         return tsk;
   
@@@ -1262,9 -1131,6 +1263,9 @@@ static struct mm_struct *mm_init(struc
         seqcount_init(&mm->write_protect_seq);
         mmap_init_lock(mm);
         INIT_LIST_HEAD(&mm->mmlist);
+ +#ifdef CONFIG_PER_VMA_LOCK
+ +      mm->mm_lock_seq = 0;
+ +#endif
         mm_pgtables_bytes_init(mm);
         mm->map_count = 0;
         mm->locked_vm = 0;
@@@ -1299,23 -1165,18 +1300,23 @@@
         if (init_new_context(p, mm))
                 goto fail_nocontext;
   
+ +      if (mm_alloc_cid(mm))
+ +              goto fail_cid;
+ +
         for (i = 0; i < NR_MM_COUNTERS; i++)
                 if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT))
                         goto fail_pcpu;
   
         mm->user_ns = get_user_ns(user_ns);
         lru_gen_init_mm(mm);
- -      mm_init_cid(mm);
         return mm;
   
   fail_pcpu:
         while (i > 0)
                 percpu_counter_destroy(&mm->rss_stat[--i]);
+ +      mm_destroy_cid(mm);
+ +fail_cid:
+ +      destroy_context(mm);
   fail_nocontext:
         mm_free_pgd(mm);
   fail_nopgd:
@@@ -1767,8 -1628,7 +1768,8 @@@ static int copy_fs(unsigned long clone_
         return 0;
   }
   
- -static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
+ +static int copy_files(unsigned long clone_flags, struct task_struct *tsk,
+ +                    int no_files)
   {
         struct files_struct *oldf, *newf;
         int error = 0;
@@@ -1780,11 -1640,6 +1781,11 @@@
         if (!oldf)
                 goto out;
   
+ +      if (no_files) {
+ +              tsk->files = NULL;
+ +              goto out;
+ +      }
+ +
         if (clone_flags & CLONE_FILES) {
                 atomic_inc(&oldf->count);
                 goto out;
@@@ -2102,91 -1957,6 +2103,91 @@@ const struct file_operations pidfd_fop
   #endif
   };
   
+ +/**
+ + * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
+ + * @pid:   the struct pid for which to create a pidfd
+ + * @flags: flags of the new @pidfd
+ + * @ret:   the new pidfd file, set only on success
+ + *
+ + * Allocate a new file that stashes @pid and reserve a new pidfd number in the
+ + * caller's file descriptor table. The pidfd is reserved but not installed yet.
+ + *
+ + * The helper doesn't perform checks on @pid which makes it useful for pidfds
+ + * created via CLONE_PIDFD where @pid has no task attached when the pidfd and
+ + * pidfd file are prepared.
+ + *
+ + * If this function returns successfully the caller is responsible to either
+ + * call fd_install() passing the returned pidfd and pidfd file as arguments in
+ + * order to install the pidfd into its file descriptor table or they must use
+ + * put_unused_fd() and fput() on the returned pidfd and pidfd file
+ + * respectively.
+ + *
+ + * This function is useful when a pidfd must already be reserved but there
+ + * might still be points of failure afterwards and the caller wants to ensure
+ + * that no pidfd is leaked into its file descriptor table.
+ + *
+ + * Return: On success, a reserved pidfd is returned from the function and a new
+ + *         pidfd file is returned in the last argument to the function. On
+ + *         error, a negative error code is returned from the function and the
+ + *         last argument remains unchanged.
+ + */
+ +static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
+ +{
+ +      int pidfd;
+ +      struct file *pidfd_file;
+ +
+ +      if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC))
+ +              return -EINVAL;
+ +
+ +      pidfd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+ +      if (pidfd < 0)
+ +              return pidfd;
+ +
+ +      pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
+ +                                      flags | O_RDWR | O_CLOEXEC);
+ +      if (IS_ERR(pidfd_file)) {
+ +              put_unused_fd(pidfd);
+ +              return PTR_ERR(pidfd_file);
+ +      }
+ +      get_pid(pid); /* held by pidfd_file now */
+ +      *ret = pidfd_file;
+ +      return pidfd;
+ +}
+ +
+ +/**
+ + * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
+ + * @pid:   the struct pid for which to create a pidfd
+ + * @flags: flags of the new @pidfd
+ + * @ret:   the new pidfd file, set only on success
+ + *
+ + * Allocate a new file that stashes @pid and reserve a new pidfd number in the
+ + * caller's file descriptor table. The pidfd is reserved but not installed yet.
+ + *
+ + * The helper verifies that @pid is used as a thread group leader.
+ + *
+ + * If this function returns successfully the caller is responsible to either
+ + * call fd_install() passing the returned pidfd and pidfd file as arguments in
+ + * order to install the pidfd into its file descriptor table or they must use
+ + * put_unused_fd() and fput() on the returned pidfd and pidfd file
+ + * respectively.
+ + *
+ + * This function is useful when a pidfd must already be reserved but there
+ + * might still be points of failure afterwards and the caller wants to ensure
+ + * that no pidfd is leaked into its file descriptor table.
+ + *
+ + * Return: On success, a reserved pidfd is returned from the function and a new
+ + *         pidfd file is returned in the last argument to the function. On
+ + *         error, a negative error code is returned from the function and the
+ + *         last argument remains unchanged.
+ + */
+ +int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
+ +{
+ +      if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
+ +              return -EINVAL;
+ +
+ +      return __pidfd_prepare(pid, flags, ret);
+ +}
+ +
   static void __delayed_free_task(struct rcu_head *rhp)
   {
         struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
@@@ -2241,7 -2011,7 +2242,7 @@@ static void rv_task_fork(struct task_st
    * parts of the process environment (as per the clone
    * flags). The actual kick-off is left to the caller.
    */
- -static __latent_entropy struct task_struct *copy_process(
+ +__latent_entropy struct task_struct *copy_process(
                                         struct pid *pid,
                                         int trace,
                                         int node,
@@@ -2334,8 -2104,6 +2335,8 @@@
         p->flags &= ~PF_KTHREAD;
         if (args->kthread)
                 p->flags |= PF_KTHREAD;
+ +      if (args->user_worker)
+ +              p->flags |= PF_USER_WORKER;
         if (args->io_thread) {
                 /*
                  * Mark us an IO worker, and block any signal that isn't
@@@ -2345,9 -2113,6 +2346,9 @@@
                 siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
         }
   
+ +      if (args->name)
+ +              strscpy_pad(p->comm, args->name, sizeof(p->comm));
+ +
         p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
         /*
          * Clear TID on mm_release()?
@@@ -2490,7 -2255,7 +2491,7 @@@
         retval = copy_semundo(clone_flags, p);
         if (retval)
                 goto bad_fork_cleanup_security;
- -      retval = copy_files(clone_flags, p);
+ +      retval = copy_files(clone_flags, p, args->no_files);
         if (retval)
                 goto bad_fork_cleanup_semundo;
         retval = copy_fs(clone_flags, p);
@@@ -2515,9 -2280,6 +2516,9 @@@
         if (retval)
                 goto bad_fork_cleanup_io;
   
+ +      if (args->ignore_signals)
+ +              ignore_signals(p);
+ +
         stackleak_task_init(p);
   
         if (pid != &init_struct_pid) {
@@@ -2535,12 -2297,21 +2536,12 @@@
          * if the fd table isn't shared).
          */
         if (clone_flags & CLONE_PIDFD) {
- -              retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+ +              /* Note that no task has been attached to @pid yet. */
+ +              retval = __pidfd_prepare(pid, O_RDWR | O_CLOEXEC, &pidfile);
                 if (retval < 0)
                         goto bad_fork_free_pid;
- -
                 pidfd = retval;
   
- -              pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
- -                                            O_RDWR | O_CLOEXEC);
- -              if (IS_ERR(pidfile)) {
- -                      put_unused_fd(pidfd);
- -                      retval = PTR_ERR(pidfile);
- -                      goto bad_fork_free_pid;
- -              }
- -              get_pid(pid);   /* held by pidfile now */
- -
                 retval = put_user(pidfd, args->pidfd);
                 if (retval)
                         goto bad_fork_put_pidfd;
@@@ -2735,6 -2506,7 +2736,7 @@@
   
         trace_task_newtask(p, clone_flags);
         uprobe_copy_process(p, clone_flags);
+       user_events_fork(p, clone_flags);
   
         copy_oom_score_adj(clone_flags, p);
   
@@@ -2857,7 -2629,6 +2859,7 @@@ struct task_struct *create_io_thread(in
                 .fn             = fn,
                 .fn_arg         = arg,
                 .io_thread      = 1,
+ +              .user_worker    = 1,
         };
   
         return copy_process(NULL, 0, node, &args);
@@@ -2961,8 -2732,7 +2963,8 @@@ pid_t kernel_clone(struct kernel_clone_
   /*
    * Create a kernel thread.
    */
- -pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
+ +pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
+ +                  unsigned long flags)
   {
         struct kernel_clone_args args = {
                 .flags          = ((lower_32_bits(flags) | CLONE_VM |
@@@ -2970,7 -2740,6 +2972,7 @@@
                 .exit_signal    = (lower_32_bits(flags) & CSIGNAL),
                 .fn             = fn,
                 .fn_arg         = arg,
+ +              .name           = name,
                 .kthread        = 1,
         };
   
@@@ -3300,9 -3069,6 +3302,9 @@@ void __init proc_caches_init(void
                         NULL);
   
         vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
+ +#ifdef CONFIG_PER_VMA_LOCK
+ +      vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT);
+ +#endif
         mmap_init();
         nsproxy_cache_init();
   }
diff --combined kernel/trace/bpf_trace.c

index bcf91bc,d804172..9a050e3
--- 1/kernel/trace/bpf_trace.c
--- 2/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@@ -1453,6 -1453,10 +1453,6 @@@ bpf_tracing_func_proto(enum bpf_func_i
                        NULL : &bpf_probe_read_compat_str_proto;
   #endif
   #ifdef CONFIG_CGROUPS
- -      case BPF_FUNC_get_current_cgroup_id:
- -              return &bpf_get_current_cgroup_id_proto;
- -      case BPF_FUNC_get_current_ancestor_cgroup_id:
- -              return &bpf_get_current_ancestor_cgroup_id_proto;
         case BPF_FUNC_cgrp_storage_get:
                 return &bpf_cgrp_storage_get_proto;
         case BPF_FUNC_cgrp_storage_delete:
@@@ -2640,9 -2644,20 +2640,20 @@@ kprobe_multi_link_prog_run(struct bpf_k
         return err;
   }
   
- static void
+ static int
   kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip,
-                         struct pt_regs *regs)
+                         struct pt_regs *regs, void *data)
+ {
+       struct bpf_kprobe_multi_link *link;
+ 
+       link = container_of(fp, struct bpf_kprobe_multi_link, fp);
+       kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs);
+       return 0;
+ }
+ 
+ static void
+ kprobe_multi_link_exit_handler(struct fprobe *fp, unsigned long fentry_ip,
+                              struct pt_regs *regs, void *data)
   {
         struct bpf_kprobe_multi_link *link;
   
@@@ -2844,7 -2859,7 +2855,7 @@@ int bpf_kprobe_multi_link_attach(const 
                 goto error;
   
         if (flags & BPF_F_KPROBE_MULTI_RETURN)
-               link->fp.exit_handler = kprobe_multi_link_handler;
+               link->fp.exit_handler = kprobe_multi_link_exit_handler;
         else
                 link->fp.entry_handler = kprobe_multi_link_handler;
   
diff --combined kernel/trace/ftrace.c

index 08155f6,db8532a..76973a7
--- 1/kernel/trace/ftrace.c
--- 2/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@@ -45,6 -45,9 +45,9 @@@
   #include "trace_output.h"
   #include "trace_stat.h"
   
+ /* Flags that do not get reset */
+ #define FTRACE_NOCLEAR_FLAGS  (FTRACE_FL_DISABLED | FTRACE_FL_TOUCHED)
+ 
   #define FTRACE_INVALID_FUNCTION               "__ftrace_invalid_address__"
   
   #define FTRACE_WARN_ON(cond)                  \
@@@ -2256,7 -2259,7 +2259,7 @@@ static int ftrace_check_record(struct d
                 flag ^= rec->flags & FTRACE_FL_ENABLED;
   
                 if (update) {
-                       rec->flags |= FTRACE_FL_ENABLED;
+                       rec->flags |= FTRACE_FL_ENABLED | FTRACE_FL_TOUCHED;
                         if (flag & FTRACE_FL_REGS) {
                                 if (rec->flags & FTRACE_FL_REGS)
                                         rec->flags |= FTRACE_FL_REGS_EN;
@@@ -2326,7 -2329,7 +2329,7 @@@
         if (update) {
                 /* If there's no more users, clear all flags */
                 if (!ftrace_rec_count(rec))
-                       rec->flags &= FTRACE_FL_DISABLED;
+                       rec->flags &= FTRACE_NOCLEAR_FLAGS;
                 else
                         /*
                          * Just disable the record, but keep the ops TRAMP
@@@ -3147,7 -3150,7 +3150,7 @@@ int ftrace_shutdown(struct ftrace_ops *
                 struct dyn_ftrace *rec;
   
                 do_for_each_ftrace_rec(pg, rec) {
-                       if (FTRACE_WARN_ON_ONCE(rec->flags & ~FTRACE_FL_DISABLED))
+                       if (FTRACE_WARN_ON_ONCE(rec->flags & ~FTRACE_NOCLEAR_FLAGS))
                                 pr_warn("  %pS flags:%lx\n",
                                         (void *)rec->ip, rec->flags);
                 } while_for_each_ftrace_rec();
@@@ -3598,7 -3601,10 +3601,10 @@@ t_func_next(struct seq_file *m, loff_t 
                      !ftrace_lookup_ip(iter->hash, rec->ip)) ||
   
                     ((iter->flags & FTRACE_ITER_ENABLED) &&
-                    !(rec->flags & FTRACE_FL_ENABLED))) {
+                    !(rec->flags & FTRACE_FL_ENABLED)) ||
+ 
+                   ((iter->flags & FTRACE_ITER_TOUCHED) &&
+                    !(rec->flags & FTRACE_FL_TOUCHED))) {
   
                         rec = NULL;
                         goto retry;
@@@ -3857,7 -3863,7 +3863,7 @@@ static int t_show(struct seq_file *m, v
                 return 0;
         }
   
-       if (iter->flags & FTRACE_ITER_ENABLED) {
+       if (iter->flags & (FTRACE_ITER_ENABLED | FTRACE_ITER_TOUCHED)) {
                 struct ftrace_ops *ops;
   
                 seq_printf(m, " (%ld)%s%s%s%s",
@@@ -3959,6 -3965,31 +3965,31 @@@ ftrace_enabled_open(struct inode *inode
         return 0;
   }
   
+ static int
+ ftrace_touched_open(struct inode *inode, struct file *file)
+ {
+       struct ftrace_iterator *iter;
+ 
+       /*
+        * This shows us what functions have ever been enabled
+        * (traced, direct, patched, etc). Not sure if we want lockdown
+        * to hide such critical information for an admin.
+        * Although, perhaps it can show information we don't
+        * want people to see, but if something had traced
+        * something, we probably want to know about it.
+        */
+ 
+       iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
+       if (!iter)
+               return -ENOMEM;
+ 
+       iter->pg = ftrace_pages_start;
+       iter->flags = FTRACE_ITER_TOUCHED;
+       iter->ops = &global_ops;
+ 
+       return 0;
+ }
+ 
   /**
    * ftrace_regex_open - initialize function tracer filter files
    * @ops: The ftrace_ops that hold the hash filters
@@@ -5872,6 -5903,13 +5903,13 @@@ static const struct file_operations ftr
         .release = seq_release_private,
   };
   
+ static const struct file_operations ftrace_touched_fops = {
+       .open = ftrace_touched_open,
+       .read = seq_read,
+       .llseek = seq_lseek,
+       .release = seq_release_private,
+ };
+ 
   static const struct file_operations ftrace_filter_fops = {
         .open = ftrace_filter_open,
         .read = seq_read,
@@@ -6336,6 -6374,9 +6374,9 @@@ static __init int ftrace_init_dyn_trace
         trace_create_file("enabled_functions", TRACE_MODE_READ,
                         d_tracer, NULL, &ftrace_enabled_fops);
   
+       trace_create_file("touched_functions", TRACE_MODE_READ,
+                       d_tracer, NULL, &ftrace_touched_fops);
+ 
         ftrace_create_filter_files(&global_ops, d_tracer);
   
   #ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@@ -8006,7 -8047,8 +8047,7 @@@ struct kallsyms_data 
    * and returns 1 in case we resolved all the requested symbols,
    * 0 otherwise.
    */
- -static int kallsyms_callback(void *data, const char *name,
- -                           struct module *mod, unsigned long addr)
+ +static int kallsyms_callback(void *data, const char *name, unsigned long addr)
   {
         struct kallsyms_data *args = data;
         const char **sym;
diff --combined kernel/trace/ring_buffer.c

index 76a2d91,0d748f1..834b361
--- 1/kernel/trace/ring_buffer.c
--- 2/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@@ -163,7 -163,7 +163,7 @@@ enum 
   #define extended_time(event) \
         (event->type_len >= RINGBUF_TYPE_TIME_EXTEND)
   
- static inline int rb_null_event(struct ring_buffer_event *event)
+ static inline bool rb_null_event(struct ring_buffer_event *event)
   {
         return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
   }
@@@ -363,11 -363,9 +363,9 @@@ static void free_buffer_page(struct buf
   /*
    * We need to fit the time_stamp delta into 27 bits.
    */
- static inline int test_time_stamp(u64 delta)
+ static inline bool test_time_stamp(u64 delta)
   {
-       if (delta & TS_DELTA_TEST)
-               return 1;
-       return 0;
+       return !!(delta & TS_DELTA_TEST);
   }
   
   #define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
@@@ -696,7 -694,7 +694,7 @@@ rb_time_read_cmpxchg(local_t *l, unsign
         return ret == expect;
   }
   
- static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
+ static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
   {
         unsigned long cnt, top, bottom, msb;
         unsigned long cnt2, top2, bottom2, msb2;
@@@ -1486,7 -1484,7 +1484,7 @@@ rb_set_head_page(struct ring_buffer_per
         return NULL;
   }
   
- static int rb_head_page_replace(struct buffer_page *old,
+ static bool rb_head_page_replace(struct buffer_page *old,
                                 struct buffer_page *new)
   {
         unsigned long *ptr = (unsigned long *)&old->list.prev->next;
@@@ -1565,15 -1563,12 +1563,12 @@@ static void rb_tail_page_update(struct 
         }
   }
   
- static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
+ static void rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
                           struct buffer_page *bpage)
   {
         unsigned long val = (unsigned long)bpage;
   
-       if (RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK))
-               return 1;
- 
-       return 0;
+       RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK);
   }
   
   /**
@@@ -1583,30 -1578,28 +1578,28 @@@
    * As a safety measure we check to make sure the data pages have not
    * been corrupted.
    */
- static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
+ static void rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
   {
         struct list_head *head = rb_list_head(cpu_buffer->pages);
         struct list_head *tmp;
   
         if (RB_WARN_ON(cpu_buffer,
                         rb_list_head(rb_list_head(head->next)->prev) != head))
-               return -1;
+               return;
   
         if (RB_WARN_ON(cpu_buffer,
                         rb_list_head(rb_list_head(head->prev)->next) != head))
-               return -1;
+               return;
   
         for (tmp = rb_list_head(head->next); tmp != head; tmp = rb_list_head(tmp->next)) {
                 if (RB_WARN_ON(cpu_buffer,
                                 rb_list_head(rb_list_head(tmp->next)->prev) != tmp))
-                       return -1;
+                       return;
   
                 if (RB_WARN_ON(cpu_buffer,
                                 rb_list_head(rb_list_head(tmp->prev)->next) != tmp))
-                       return -1;
+                       return;
         }
- 
-       return 0;
   }
   
   static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
@@@ -1774,6 -1767,8 +1767,8 @@@ static void rb_free_cpu_buffer(struct r
         struct list_head *head = cpu_buffer->pages;
         struct buffer_page *bpage, *tmp;
   
+       irq_work_sync(&cpu_buffer->irq_work.work);
+ 
         free_buffer_page(cpu_buffer->reader_page);
   
         if (head) {
@@@ -1880,6 -1875,8 +1875,8 @@@ ring_buffer_free(struct trace_buffer *b
   
         cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node);
   
+       irq_work_sync(&buffer->irq_work.work);
+ 
         for_each_buffer_cpu(buffer, cpu)
                 rb_free_cpu_buffer(buffer->buffers[cpu]);
   
@@@ -1918,7 -1915,7 +1915,7 @@@ static inline unsigned long rb_page_wri
         return local_read(&bpage->write) & RB_WRITE_MASK;
   }
   
- static int
+ static bool
   rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
   {
         struct list_head *tail_page, *to_remove, *next_page;
@@@ -2031,12 -2028,13 +2028,13 @@@
         return nr_removed == 0;
   }
   
- static int
+ static bool
   rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
   {
         struct list_head *pages = &cpu_buffer->new_pages;
-       int retries, success;
         unsigned long flags;
+       bool success;
+       int retries;
   
         /* Can be called at early boot up, where interrupts must not been enabled */
         raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
@@@ -2055,15 -2053,16 +2053,16 @@@
          * spinning.
          */
         retries = 10;
-       success = 0;
+       success = false;
         while (retries--) {
                 struct list_head *head_page, *prev_page, *r;
                 struct list_head *last_page, *first_page;
                 struct list_head *head_page_with_bit;
+               struct buffer_page *hpage = rb_set_head_page(cpu_buffer);
   
-               head_page = &rb_set_head_page(cpu_buffer)->list;
-               if (!head_page)
+               if (!hpage)
                         break;
+               head_page = &hpage->list;
                 prev_page = head_page->prev;
   
                 first_page = pages->next;
@@@ -2084,7 -2083,7 +2083,7 @@@
                          * pointer to point to end of list
                          */
                         head_page->prev = last_page;
-                       success = 1;
+                       success = true;
                         break;
                 }
         }
@@@ -2112,7 -2111,7 +2111,7 @@@
   
   static void rb_update_pages(struct ring_buffer_per_cpu *cpu_buffer)
   {
-       int success;
+       bool success;
   
         if (cpu_buffer->nr_pages_to_update > 0)
                 success = rb_insert_pages(cpu_buffer);
@@@ -2995,7 -2994,7 +2994,7 @@@ static u64 rb_time_delta(struct ring_bu
         }
   }
   
- static inline int
+ static inline bool
   rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
                   struct ring_buffer_event *event)
   {
@@@ -3016,7 -3015,7 +3015,7 @@@
         delta = rb_time_delta(event);
   
         if (!rb_time_read(&cpu_buffer->write_stamp, &write_stamp))
-               return 0;
+               return false;
   
         /* Make sure the write stamp is read before testing the location */
         barrier();
@@@ -3029,7 -3028,7 +3028,7 @@@
                 /* Something came in, can't discard */
                 if (!rb_time_cmpxchg(&cpu_buffer->write_stamp,
                                        write_stamp, write_stamp - delta))
-                       return 0;
+                       return false;
   
                 /*
                  * It's possible that the event time delta is zero
@@@ -3062,12 -3061,12 +3061,12 @@@
                 if (index == old_index) {
                         /* update counters */
                         local_sub(event_length, &cpu_buffer->entries_bytes);
-                       return 1;
+                       return true;
                 }
         }
   
         /* could not discard */
-       return 0;
+       return false;
   }
   
   static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
@@@ -3098,10 -3097,6 +3097,10 @@@ rb_set_commit_to_write(struct ring_buff
                 if (RB_WARN_ON(cpu_buffer,
                                rb_is_reader_page(cpu_buffer->tail_page)))
                         return;
+ +              /*
+ +               * No need for a memory barrier here, as the update
+ +               * of the tail_page did it for this page.
+ +               */
                 local_set(&cpu_buffer->commit_page->page->commit,
                           rb_page_write(cpu_buffer->commit_page));
                 rb_inc_page(&cpu_buffer->commit_page);
@@@ -3111,8 -3106,6 +3110,8 @@@
         while (rb_commit_index(cpu_buffer) !=
                rb_page_write(cpu_buffer->commit_page)) {
   
+ +              /* Make sure the readers see the content of what is committed. */
+ +              smp_wmb();
                 local_set(&cpu_buffer->commit_page->page->commit,
                           rb_page_write(cpu_buffer->commit_page));
                 RB_WARN_ON(cpu_buffer,
@@@ -3288,7 -3281,7 +3287,7 @@@ rb_wakeups(struct trace_buffer *buffer
    * Note: The TRANSITION bit only handles a single transition between context.
    */
   
- static __always_inline int
+ static __always_inline bool
   trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
   {
         unsigned int val = cpu_buffer->current_context;
@@@ -3305,14 -3298,14 +3304,14 @@@
                 bit = RB_CTX_TRANSITION;
                 if (val & (1 << (bit + cpu_buffer->nest))) {
                         do_ring_buffer_record_recursion();
-                       return 1;
+                       return true;
                 }
         }
   
         val |= (1 << (bit + cpu_buffer->nest));
         cpu_buffer->current_context = val;
   
-       return 0;
+       return false;
   }
   
   static __always_inline void
@@@ -4069,10 -4062,10 +4068,10 @@@ void ring_buffer_record_off(struct trac
         unsigned int rd;
         unsigned int new_rd;
   
+       rd = atomic_read(&buffer->record_disabled);
         do {
-               rd = atomic_read(&buffer->record_disabled);
                 new_rd = rd | RB_BUFFER_OFF;
-       } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd);
+       } while (!atomic_try_cmpxchg(&buffer->record_disabled, &rd, new_rd));
   }
   EXPORT_SYMBOL_GPL(ring_buffer_record_off);
   
@@@ -4092,10 -4085,10 +4091,10 @@@ void ring_buffer_record_on(struct trace
         unsigned int rd;
         unsigned int new_rd;
   
+       rd = atomic_read(&buffer->record_disabled);
         do {
-               rd = atomic_read(&buffer->record_disabled);
                 new_rd = rd & ~RB_BUFFER_OFF;
-       } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd);
+       } while (!atomic_try_cmpxchg(&buffer->record_disabled, &rd, new_rd));
   }
   EXPORT_SYMBOL_GPL(ring_buffer_record_on);
   
@@@ -4502,7 -4495,6 +4501,6 @@@ rb_update_read_stamp(struct ring_buffer
         default:
                 RB_WARN_ON(cpu_buffer, 1);
         }
-       return;
   }
   
   static void
@@@ -4533,7 -4525,6 +4531,6 @@@ rb_update_iter_read_stamp(struct ring_b
         default:
                 RB_WARN_ON(iter->cpu_buffer, 1);
         }
-       return;
   }
   
   static struct buffer_page *
@@@ -4543,7 -4534,7 +4540,7 @@@ rb_get_reader_page(struct ring_buffer_p
         unsigned long overwrite;
         unsigned long flags;
         int nr_loops = 0;
-       int ret;
+       bool ret;
   
         local_irq_save(flags);
         arch_spin_lock(&cpu_buffer->lock);
@@@ -4690,12 -4681,7 +4687,12 @@@
   
         /*
          * Make sure we see any padding after the write update
- -       * (see rb_reset_tail())
+ +       * (see rb_reset_tail()).
+ +       *
+ +       * In addition, a writer may be writing on the reader page
+ +       * if the page has not been fully filled, so the read barrier
+ +       * is also needed to make sure we see the content of what is
+ +       * committed by the writer (see rb_set_commit_to_write()).
          */
         smp_rmb();
   
@@@ -4953,7 -4939,6 +4950,6 @@@ rb_reader_unlock(struct ring_buffer_per
   {
         if (likely(locked))
                 raw_spin_unlock(&cpu_buffer->reader_lock);
-       return;
   }
   
   /**
@@@ -5345,6 -5330,9 +5341,9 @@@ void ring_buffer_reset_cpu(struct trace
   }
   EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
   
+ /* Flag to ensure proper resetting of atomic variables */
+ #define RESET_BIT     (1 << 30)
+ 
   /**
    * ring_buffer_reset_online_cpus - reset a ring buffer per CPU buffer
    * @buffer: The ring buffer to reset a per cpu buffer of
@@@ -5361,20 -5349,27 +5360,27 @@@ void ring_buffer_reset_online_cpus(stru
         for_each_online_buffer_cpu(buffer, cpu) {
                 cpu_buffer = buffer->buffers[cpu];
   
-               atomic_inc(&cpu_buffer->resize_disabled);
+               atomic_add(RESET_BIT, &cpu_buffer->resize_disabled);
                 atomic_inc(&cpu_buffer->record_disabled);
         }
   
         /* Make sure all commits have finished */
         synchronize_rcu();
   
-       for_each_online_buffer_cpu(buffer, cpu) {
+       for_each_buffer_cpu(buffer, cpu) {
                 cpu_buffer = buffer->buffers[cpu];
   
+               /*
+                * If a CPU came online during the synchronize_rcu(), then
+                * ignore it.
+                */
+               if (!(atomic_read(&cpu_buffer->resize_disabled) & RESET_BIT))
+                       continue;
+ 
                 reset_disabled_cpu_buffer(cpu_buffer);
   
                 atomic_dec(&cpu_buffer->record_disabled);
-               atomic_dec(&cpu_buffer->resize_disabled);
+               atomic_sub(RESET_BIT, &cpu_buffer->resize_disabled);
         }
   
         mutex_unlock(&buffer->mutex);
@@@ -5424,8 -5419,8 +5430,8 @@@ bool ring_buffer_empty(struct trace_buf
         struct ring_buffer_per_cpu *cpu_buffer;
         unsigned long flags;
         bool dolock;
+       bool ret;
         int cpu;
-       int ret;
   
         /* yes this is racy, but if you don't like the race, lock the buffer */
         for_each_buffer_cpu(buffer, cpu) {
@@@ -5454,7 -5449,7 +5460,7 @@@ bool ring_buffer_empty_cpu(struct trace
         struct ring_buffer_per_cpu *cpu_buffer;
         unsigned long flags;
         bool dolock;
-       int ret;
+       bool ret;
   
         if (!cpumask_test_cpu(cpu, buffer->cpumask))
                 return true;
diff --combined kernel/trace/trace.c

index 36a6037,076d893..427da23
--- 1/kernel/trace/trace.c
--- 2/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@@ -1149,22 -1149,22 +1149,22 @@@ static void tracing_snapshot_instance_c
         unsigned long flags;
   
         if (in_nmi()) {
- -              internal_trace_puts("*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n");
- -              internal_trace_puts("*** snapshot is being ignored        ***\n");
+ +              trace_array_puts(tr, "*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n");
+ +              trace_array_puts(tr, "*** snapshot is being ignored        ***\n");
                 return;
         }
   
         if (!tr->allocated_snapshot) {
- -              internal_trace_puts("*** SNAPSHOT NOT ALLOCATED ***\n");
- -              internal_trace_puts("*** stopping trace here!   ***\n");
- -              tracing_off();
+ +              trace_array_puts(tr, "*** SNAPSHOT NOT ALLOCATED ***\n");
+ +              trace_array_puts(tr, "*** stopping trace here!   ***\n");
+ +              tracer_tracing_off(tr);
                 return;
         }
   
         /* Note, snapshot can not be used when the tracer uses it */
         if (tracer->use_max_tr) {
- -              internal_trace_puts("*** LATENCY TRACER ACTIVE ***\n");
- -              internal_trace_puts("*** Can not use snapshot (sorry) ***\n");
+ +              trace_array_puts(tr, "*** LATENCY TRACER ACTIVE ***\n");
+ +              trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n");
                 return;
         }
   
@@@ -3726,7 -3726,7 +3726,7 @@@ __find_next_entry(struct trace_iterato
   #define STATIC_FMT_BUF_SIZE   128
   static char static_fmt_buf[STATIC_FMT_BUF_SIZE];
   
- static char *trace_iter_expand_format(struct trace_iterator *iter)
+ char *trace_iter_expand_format(struct trace_iterator *iter)
   {
         char *tmp;
   
@@@ -4446,8 -4446,11 +4446,11 @@@ static enum print_line_t print_trace_fm
         if (trace_seq_has_overflowed(s))
                 return TRACE_TYPE_PARTIAL_LINE;
   
-       if (event)
+       if (event) {
+               if (tr->trace_flags & TRACE_ITER_FIELDS)
+                       return print_event_fields(iter, event);
                 return event->funcs->trace(iter, sym_flags, event);
+       }
   
         trace_seq_printf(s, "Unknown type %d\n", entry->type);
   
@@@ -9516,7 -9519,6 +9519,7 @@@ static int __remove_instance(struct tra
         tracefs_remove(tr->dir);
         free_percpu(tr->last_func_repeats);
         free_trace_buffers(tr);
+ +      clear_tracing_err_log(tr);
   
         for (i = 0; i < tr->nr_topts; i++) {
                 kfree(tr->topts[i].topts);
@@@ -10394,20 -10396,19 +10397,20 @@@ out
   
   void __init ftrace_boot_snapshot(void)
   {
+ +#ifdef CONFIG_TRACER_MAX_TRACE
         struct trace_array *tr;
   
- -      if (snapshot_at_boot) {
- -              tracing_snapshot();
- -              internal_trace_puts("** Boot snapshot taken **\n");
- -      }
+ +      if (!snapshot_at_boot)
+ +              return;
   
         list_for_each_entry(tr, &ftrace_trace_arrays, list) {
- -              if (tr == &global_trace)
+ +              if (!tr->allocated_snapshot)
                         continue;
- -              trace_array_puts(tr, "** Boot snapshot taken **\n");
+ +
                 tracing_snapshot_instance(tr);
+ +              trace_array_puts(tr, "** Boot snapshot taken **\n");
         }
+ +#endif
   }
   
   void __init early_trace_init(void)
diff --combined lib/test_fprobe.c

index fd61538,0fe5273..079435a
--- 1/lib/test_fprobe.c
--- 2/lib/test_fprobe.c
+++ b/lib/test_fprobe.c
@@@ -17,8 -17,11 +17,11 @@@ static u32 rand1, entry_val, exit_val
   /* Use indirect calls to avoid inlining the target functions */
   static u32 (*target)(u32 value);
   static u32 (*target2)(u32 value);
+ static u32 (*target_nest)(u32 value, u32 (*nest)(u32));
   static unsigned long target_ip;
   static unsigned long target2_ip;
+ static unsigned long target_nest_ip;
+ static int entry_return_value;
   
   static noinline u32 fprobe_selftest_target(u32 value)
   {
@@@ -30,16 -33,31 +33,31 @@@ static noinline u32 fprobe_selftest_tar
         return (value / div_factor) + 1;
   }
   
- static notrace void fp_entry_handler(struct fprobe *fp, unsigned long ip, struct pt_regs *regs)
+ static noinline u32 fprobe_selftest_nest_target(u32 value, u32 (*nest)(u32))
+ {
+       return nest(value + 2);
+ }
+ 
+ static notrace int fp_entry_handler(struct fprobe *fp, unsigned long ip,
+                                    struct pt_regs *regs, void *data)
   {
         KUNIT_EXPECT_FALSE(current_test, preemptible());
         /* This can be called on the fprobe_selftest_target and the fprobe_selftest_target2 */
         if (ip != target_ip)
                 KUNIT_EXPECT_EQ(current_test, ip, target2_ip);
         entry_val = (rand1 / div_factor);
+       if (fp->entry_data_size) {
+               KUNIT_EXPECT_NOT_NULL(current_test, data);
+               if (data)
+                       *(u32 *)data = entry_val;
+       } else
+               KUNIT_EXPECT_NULL(current_test, data);
+ 
+       return entry_return_value;
   }
   
- static notrace void fp_exit_handler(struct fprobe *fp, unsigned long ip, struct pt_regs *regs)
+ static notrace void fp_exit_handler(struct fprobe *fp, unsigned long ip,
+                                   struct pt_regs *regs, void *data)
   {
         unsigned long ret = regs_return_value(regs);
   
@@@ -51,6 -69,26 +69,26 @@@
                 KUNIT_EXPECT_EQ(current_test, ret, (rand1 / div_factor));
         KUNIT_EXPECT_EQ(current_test, entry_val, (rand1 / div_factor));
         exit_val = entry_val + div_factor;
+       if (fp->entry_data_size) {
+               KUNIT_EXPECT_NOT_NULL(current_test, data);
+               if (data)
+                       KUNIT_EXPECT_EQ(current_test, *(u32 *)data, entry_val);
+       } else
+               KUNIT_EXPECT_NULL(current_test, data);
+ }
+ 
+ static notrace int nest_entry_handler(struct fprobe *fp, unsigned long ip,
+                                    struct pt_regs *regs, void *data)
+ {
+       KUNIT_EXPECT_FALSE(current_test, preemptible());
+       return 0;
+ }
+ 
+ static notrace void nest_exit_handler(struct fprobe *fp, unsigned long ip,
+                                   struct pt_regs *regs, void *data)
+ {
+       KUNIT_EXPECT_FALSE(current_test, preemptible());
+       KUNIT_EXPECT_EQ(current_test, ip, target_nest_ip);
   }
   
   /* Test entry only (no rethook) */
@@@ -132,6 -170,64 +170,64 @@@ static void test_fprobe_syms(struct kun
         KUNIT_EXPECT_EQ(test, 0, unregister_fprobe(&fp));
   }
   
+ /* Test private entry_data */
+ static void test_fprobe_data(struct kunit *test)
+ {
+       struct fprobe fp = {
+               .entry_handler = fp_entry_handler,
+               .exit_handler = fp_exit_handler,
+               .entry_data_size = sizeof(u32),
+       };
+ 
+       current_test = test;
+       KUNIT_EXPECT_EQ(test, 0, register_fprobe(&fp, "fprobe_selftest_target", NULL));
+ 
+       target(rand1);
+ 
+       KUNIT_EXPECT_EQ(test, 0, unregister_fprobe(&fp));
+ }
+ 
+ /* Test nr_maxactive */
+ static void test_fprobe_nest(struct kunit *test)
+ {
+       static const char *syms[] = {"fprobe_selftest_target", "fprobe_selftest_nest_target"};
+       struct fprobe fp = {
+               .entry_handler = nest_entry_handler,
+               .exit_handler = nest_exit_handler,
+               .nr_maxactive = 1,
+       };
+ 
+       current_test = test;
+       KUNIT_EXPECT_EQ(test, 0, register_fprobe_syms(&fp, syms, 2));
+ 
+       target_nest(rand1, target);
+       KUNIT_EXPECT_EQ(test, 1, fp.nmissed);
+ 
+       KUNIT_EXPECT_EQ(test, 0, unregister_fprobe(&fp));
+ }
+ 
+ static void test_fprobe_skip(struct kunit *test)
+ {
+       struct fprobe fp = {
+               .entry_handler = fp_entry_handler,
+               .exit_handler = fp_exit_handler,
+       };
+ 
+       current_test = test;
+       KUNIT_EXPECT_EQ(test, 0, register_fprobe(&fp, "fprobe_selftest_target", NULL));
+ 
+       entry_return_value = 1;
+       entry_val = 0;
+       exit_val = 0;
+       target(rand1);
+       KUNIT_EXPECT_NE(test, 0, entry_val);
+       KUNIT_EXPECT_EQ(test, 0, exit_val);
+       KUNIT_EXPECT_EQ(test, 0, fp.nmissed);
+       entry_return_value = 0;
+ 
+       KUNIT_EXPECT_EQ(test, 0, unregister_fprobe(&fp));
+ }
+ 
   static unsigned long get_ftrace_location(void *func)
   {
         unsigned long size, addr = (unsigned long)func;
@@@ -147,8 -243,10 +243,10 @@@ static int fprobe_test_init(struct kuni
         rand1 = get_random_u32_above(div_factor);
         target = fprobe_selftest_target;
         target2 = fprobe_selftest_target2;
+       target_nest = fprobe_selftest_nest_target;
         target_ip = get_ftrace_location(target);
         target2_ip = get_ftrace_location(target2);
+       target_nest_ip = get_ftrace_location(target_nest);
   
         return 0;
   }
@@@ -157,6 -255,9 +255,9 @@@ static struct kunit_case fprobe_testcas
         KUNIT_CASE(test_fprobe_entry),
         KUNIT_CASE(test_fprobe),
         KUNIT_CASE(test_fprobe_syms),
+       KUNIT_CASE(test_fprobe_data),
+       KUNIT_CASE(test_fprobe_nest),
+       KUNIT_CASE(test_fprobe_skip),
         {}
   };
   
@@@ -168,3 -269,4 +269,3 @@@ static struct kunit_suite fprobe_test_s
   
   kunit_test_suites(&fprobe_test_suite);
   
- -MODULE_LICENSE("GPL");
author	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 28 Apr 2023 22:57:53 +0000 (15:57 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 28 Apr 2023 22:57:53 +0000 (15:57 -0700)
		1	2
Documentation/trace/ftrace.rst	patch \|	diff1 \|	diff2 \|	blob \| history
fs/exec.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/ftrace.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/exit.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/fork.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/trace/bpf_trace.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/trace/ftrace.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/trace/ring_buffer.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/trace/trace.c	patch \|	diff1 \|	diff2 \|	blob \| history
lib/test_fprobe.c	patch \|	diff1 \|	diff2 \|	blob \| history