nohex
nobin
noblock
+ nofields
trace_printk
annotate
nouserstacktrace
block
When set, reading trace_pipe will not block when polled.
+ fields
+ Print the fields as described by their types. This is a better
+ option than using hex, bin or raw, as it gives a better parsing
+ of the content of the event.
+
trace_printk
Can disable trace_printk() from writing into the buffer.
Stack trace
-----------
Since the kernel has a fixed sized stack, it is important not to
-waste it in functions. A kernel developer must be conscience of
+waste it in functions. A kernel developer must be conscious of
what they allocate on the stack. If they add too much, the system
can be in danger of a stack overflow, and corruption will occur,
usually leading to a system panic.
#include <linux/syscall_user_dispatch.h>
#include <linux/coredump.h>
#include <linux/time_namespace.h>
+ #include <linux/user_events.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
mmput(old_mm);
return 0;
}
- mmdrop(active_mm);
+ mmdrop_lazy_tlb(active_mm);
return 0;
}
current->fs->in_exec = 0;
current->in_execve = 0;
rseq_execve(current);
+ user_events_execve(current);
acct_update_integrals(current);
task_numa_free(current, false);
return retval;
* DIRECT - there is a direct function to call
* CALL_OPS - the record can use callsite-specific ops
* CALL_OPS_EN - the function is set up to use callsite-specific ops
+ * TOUCHED - A callback was added since boot up
*
* When a new ftrace_ops is registered and wants a function to save
* pt_regs, the rec->flags REGS is set. When the function has been
FTRACE_FL_DIRECT_EN = (1UL << 23),
FTRACE_FL_CALL_OPS = (1UL << 22),
FTRACE_FL_CALL_OPS_EN = (1UL << 21),
+ FTRACE_FL_TOUCHED = (1UL << 20),
};
- #define FTRACE_REF_MAX_SHIFT 21
+ #define FTRACE_REF_MAX_SHIFT 20
#define FTRACE_REF_MAX ((1UL << FTRACE_REF_MAX_SHIFT) - 1)
#define ftrace_rec_count(rec) ((rec)->flags & FTRACE_REF_MAX)
FTRACE_ITER_PROBE = (1 << 4),
FTRACE_ITER_MOD = (1 << 5),
FTRACE_ITER_ENABLED = (1 << 6),
+ FTRACE_ITER_TOUCHED = (1 << 7),
};
void arch_ftrace_update_code(int command);
#define CALLER_ADDR5 ((unsigned long)ftrace_return_address(5))
#define CALLER_ADDR6 ((unsigned long)ftrace_return_address(6))
-static inline unsigned long get_lock_parent_ip(void)
+static __always_inline unsigned long get_lock_parent_ip(void)
{
unsigned long addr = CALLER_ADDR0;
#include <linux/seqlock.h>
#include <linux/kcsan.h>
#include <linux/rv.h>
+#include <linux/livepatch_sched.h>
#include <asm/kmap_size.h>
/* task_struct member predeclarations (sorted alphabetically): */
struct signal_struct;
struct task_delay_info;
struct task_group;
+ struct user_event_mm;
/*
* Task state bitmask. NOTE! These bits are also
#ifdef CONFIG_SCHED_MM_CID
int mm_cid; /* Current cid in mm */
+ int last_mm_cid; /* Most recent cid in mm */
+ int migrate_from_cpu;
int mm_cid_active; /* Whether cid bitmap is active */
+ struct callback_head cid_work;
#endif
struct tlbflush_unmap_batch tlb_ubc;
- union {
- refcount_t rcu_users;
- struct rcu_head rcu;
- };
-
/* Cache last used pipe for splice(): */
struct pipe_inode_info *splice_pipe;
unsigned long saved_state_change;
# endif
#endif
+ struct rcu_head rcu;
+ refcount_t rcu_users;
int pagefault_disabled;
#ifdef CONFIG_MMU
struct task_struct *oom_reaper_list;
union rv_task_monitor rv[RV_PER_TASK_MONITORS];
#endif
+ #ifdef CONFIG_USER_EVENTS
+ struct user_event_mm *user_event_mm;
+ #endif
+
/*
* New fields for task_struct should be added above here, so that
* they are included in the randomized portion of task_struct.
#define PF_MEMALLOC 0x00000800 /* Allocating memory */
#define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */
#define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */
-#define PF__HOLE__00004000 0x00004000
+#define PF_USER_WORKER 0x00004000 /* Kernel thread cloned from userspace thread */
#define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */
#define PF__HOLE__00010000 0x00010000
#define PF_KSWAPD 0x00020000 /* I am kswapd */
#if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+void sched_dynamic_klp_enable(void);
+void sched_dynamic_klp_disable(void);
+
DECLARE_STATIC_CALL(cond_resched, __cond_resched);
static __always_inline int _cond_resched(void)
}
#elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+
extern int dynamic_cond_resched(void);
static __always_inline int _cond_resched(void)
return dynamic_cond_resched();
}
-#else
+#else /* !CONFIG_PREEMPTION */
static inline int _cond_resched(void)
{
+ klp_sched_try_switch();
return __cond_resched();
}
-#endif /* CONFIG_PREEMPT_DYNAMIC */
+#endif /* PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
-#else
+#else /* CONFIG_PREEMPTION && !CONFIG_PREEMPT_DYNAMIC */
-static inline int _cond_resched(void) { return 0; }
+static inline int _cond_resched(void)
+{
+ klp_sched_try_switch();
+ return 0;
+}
-#endif /* !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) */
+#endif /* !CONFIG_PREEMPTION || CONFIG_PREEMPT_DYNAMIC */
#define cond_resched() ({ \
__might_resched(__FILE__, __LINE__, 0); \
#include <linux/kprobes.h>
#include <linux/rethook.h>
#include <linux/sysfs.h>
+ #include <linux/user_events.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
return;
sync_mm_rss(mm);
mmap_read_lock(mm);
- mmgrab(mm);
+ mmgrab_lazy_tlb(mm);
BUG_ON(mm != current->active_mm);
/* more a memory barrier than a real lock */
task_lock(current);
coredump_task_exit(tsk);
ptrace_event(PTRACE_EVENT_EXIT, code);
+ user_events_exit(tsk);
validate_creds_for_do_exit(tsk);
#include <linux/io_uring.h>
#include <linux/bpf.h>
#include <linux/stackprotector.h>
+ #include <linux/user_events.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;
+#ifdef CONFIG_PER_VMA_LOCK
+
+/* SLAB cache for vm_area_struct.lock */
+static struct kmem_cache *vma_lock_cachep;
+
+static bool vma_lock_alloc(struct vm_area_struct *vma)
+{
+ vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
+ if (!vma->vm_lock)
+ return false;
+
+ init_rwsem(&vma->vm_lock->lock);
+ vma->vm_lock_seq = -1;
+
+ return true;
+}
+
+static inline void vma_lock_free(struct vm_area_struct *vma)
+{
+ kmem_cache_free(vma_lock_cachep, vma->vm_lock);
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; }
+static inline void vma_lock_free(struct vm_area_struct *vma) {}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
{
struct vm_area_struct *vma;
vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
- if (vma)
- vma_init(vma, mm);
+ if (!vma)
+ return NULL;
+
+ vma_init(vma, mm);
+ if (!vma_lock_alloc(vma)) {
+ kmem_cache_free(vm_area_cachep, vma);
+ return NULL;
+ }
+
return vma;
}
{
struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
- if (new) {
- ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
- ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
- /*
- * orig->shared.rb may be modified concurrently, but the clone
- * will be reinitialized.
- */
- data_race(memcpy(new, orig, sizeof(*new)));
- INIT_LIST_HEAD(&new->anon_vma_chain);
- dup_anon_vma_name(orig, new);
+ if (!new)
+ return NULL;
+
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
+ /*
+ * orig->shared.rb may be modified concurrently, but the clone
+ * will be reinitialized.
+ */
+ data_race(memcpy(new, orig, sizeof(*new)));
+ if (!vma_lock_alloc(new)) {
+ kmem_cache_free(vm_area_cachep, new);
+ return NULL;
}
+ INIT_LIST_HEAD(&new->anon_vma_chain);
+ vma_numab_state_init(new);
+ dup_anon_vma_name(orig, new);
+
return new;
}
-void vm_area_free(struct vm_area_struct *vma)
+void __vm_area_free(struct vm_area_struct *vma)
{
+ vma_numab_state_free(vma);
free_anon_vma_name(vma);
+ vma_lock_free(vma);
kmem_cache_free(vm_area_cachep, vma);
}
+#ifdef CONFIG_PER_VMA_LOCK
+static void vm_area_free_rcu_cb(struct rcu_head *head)
+{
+ struct vm_area_struct *vma = container_of(head, struct vm_area_struct,
+ vm_rcu);
+
+ /* The vma should not be locked while being destroyed. */
+ VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma);
+ __vm_area_free(vma);
+}
+#endif
+
+void vm_area_free(struct vm_area_struct *vma)
+{
+#ifdef CONFIG_PER_VMA_LOCK
+ call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb);
+#else
+ __vm_area_free(vma);
+#endif
+}
+
static void account_kernel_stack(struct task_struct *tsk, int account)
{
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
if (retval)
goto out;
+ mt_clear_in_rcu(vmi.mas.tree);
for_each_vma(old_vmi, mpnt) {
struct file *file;
retval = arch_dup_mmap(oldmm, mm);
loop_out:
vma_iter_free(&vmi);
+ if (!retval)
+ mt_set_in_rcu(vmi.mas.tree);
out:
mmap_write_unlock(mm);
flush_tlb_mm(oldmm);
for (i = 0; i < NR_MM_COUNTERS; i++) {
long x = percpu_counter_sum(&mm->rss_stat[i]);
- if (likely(!x))
- continue;
-
- /* Making sure this is not due to race with CPU offlining. */
- x = percpu_counter_sum_all(&mm->rss_stat[i]);
if (unlikely(x))
pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
mm, resident_page_types[i], x);
#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
+static void do_check_lazy_tlb(void *arg)
+{
+ struct mm_struct *mm = arg;
+
+ WARN_ON_ONCE(current->active_mm == mm);
+}
+
+static void do_shoot_lazy_tlb(void *arg)
+{
+ struct mm_struct *mm = arg;
+
+ if (current->active_mm == mm) {
+ WARN_ON_ONCE(current->mm);
+ current->active_mm = &init_mm;
+ switch_mm(mm, &init_mm, current);
+ }
+}
+
+static void cleanup_lazy_tlbs(struct mm_struct *mm)
+{
+ if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
+ /*
+ * In this case, lazy tlb mms are refounted and would not reach
+ * __mmdrop until all CPUs have switched away and mmdrop()ed.
+ */
+ return;
+ }
+
+ /*
+ * Lazy mm shootdown does not refcount "lazy tlb mm" usage, rather it
+ * requires lazy mm users to switch to another mm when the refcount
+ * drops to zero, before the mm is freed. This requires IPIs here to
+ * switch kernel threads to init_mm.
+ *
+ * archs that use IPIs to flush TLBs can piggy-back that lazy tlb mm
+ * switch with the final userspace teardown TLB flush which leaves the
+ * mm lazy on this CPU but no others, reducing the need for additional
+ * IPIs here. There are cases where a final IPI is still required here,
+ * such as the final mmdrop being performed on a different CPU than the
+ * one exiting, or kernel threads using the mm when userspace exits.
+ *
+ * IPI overheads have not found to be expensive, but they could be
+ * reduced in a number of possible ways, for example (roughly
+ * increasing order of complexity):
+ * - The last lazy reference created by exit_mm() could instead switch
+ * to init_mm, however it's probable this will run on the same CPU
+ * immediately afterwards, so this may not reduce IPIs much.
+ * - A batch of mms requiring IPIs could be gathered and freed at once.
+ * - CPUs store active_mm where it can be remotely checked without a
+ * lock, to filter out false-positives in the cpumask.
+ * - After mm_users or mm_count reaches zero, switching away from the
+ * mm could clear mm_cpumask to reduce some IPIs, perhaps together
+ * with some batching or delaying of the final IPIs.
+ * - A delayed freeing and RCU-like quiescing sequence based on mm
+ * switching to avoid IPIs completely.
+ */
+ on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
+ if (IS_ENABLED(CONFIG_DEBUG_VM_SHOOT_LAZIES))
+ on_each_cpu(do_check_lazy_tlb, (void *)mm, 1);
+}
+
/*
* Called when the last reference to the mm
* is dropped: either by a lazy thread or by
BUG_ON(mm == &init_mm);
WARN_ON_ONCE(mm == current->mm);
+
+ /* Ensure no CPUs are using this as their lazy tlb mm */
+ cleanup_lazy_tlbs(mm);
+
WARN_ON_ONCE(mm == current->active_mm);
mm_free_pgd(mm);
destroy_context(mm);
check_mm(mm);
put_user_ns(mm->user_ns);
mm_pasid_drop(mm);
+ mm_destroy_cid(mm);
for (i = 0; i < NR_MM_COUNTERS; i++)
percpu_counter_destroy(&mm->rss_stat[i]);
#ifdef CONFIG_SCHED_MM_CID
tsk->mm_cid = -1;
+ tsk->last_mm_cid = -1;
tsk->mm_cid_active = 0;
+ tsk->migrate_from_cpu = -1;
#endif
return tsk;
seqcount_init(&mm->write_protect_seq);
mmap_init_lock(mm);
INIT_LIST_HEAD(&mm->mmlist);
+#ifdef CONFIG_PER_VMA_LOCK
+ mm->mm_lock_seq = 0;
+#endif
mm_pgtables_bytes_init(mm);
mm->map_count = 0;
mm->locked_vm = 0;
if (init_new_context(p, mm))
goto fail_nocontext;
+ if (mm_alloc_cid(mm))
+ goto fail_cid;
+
for (i = 0; i < NR_MM_COUNTERS; i++)
if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT))
goto fail_pcpu;
mm->user_ns = get_user_ns(user_ns);
lru_gen_init_mm(mm);
- mm_init_cid(mm);
return mm;
fail_pcpu:
while (i > 0)
percpu_counter_destroy(&mm->rss_stat[--i]);
+ mm_destroy_cid(mm);
+fail_cid:
+ destroy_context(mm);
fail_nocontext:
mm_free_pgd(mm);
fail_nopgd:
return 0;
}
-static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_files(unsigned long clone_flags, struct task_struct *tsk,
+ int no_files)
{
struct files_struct *oldf, *newf;
int error = 0;
if (!oldf)
goto out;
+ if (no_files) {
+ tsk->files = NULL;
+ goto out;
+ }
+
if (clone_flags & CLONE_FILES) {
atomic_inc(&oldf->count);
goto out;
#endif
};
+/**
+ * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
+ * @pid: the struct pid for which to create a pidfd
+ * @flags: flags of the new @pidfd
+ * @pidfd: the pidfd to return
+ *
+ * Allocate a new file that stashes @pid and reserve a new pidfd number in the
+ * caller's file descriptor table. The pidfd is reserved but not installed yet.
+
+ * The helper doesn't perform checks on @pid which makes it useful for pidfds
+ * created via CLONE_PIDFD where @pid has no task attached when the pidfd and
+ * pidfd file are prepared.
+ *
+ * If this function returns successfully the caller is responsible to either
+ * call fd_install() passing the returned pidfd and pidfd file as arguments in
+ * order to install the pidfd into its file descriptor table or they must use
+ * put_unused_fd() and fput() on the returned pidfd and pidfd file
+ * respectively.
+ *
+ * This function is useful when a pidfd must already be reserved but there
+ * might still be points of failure afterwards and the caller wants to ensure
+ * that no pidfd is leaked into its file descriptor table.
+ *
+ * Return: On success, a reserved pidfd is returned from the function and a new
+ * pidfd file is returned in the last argument to the function. On
+ * error, a negative error code is returned from the function and the
+ * last argument remains unchanged.
+ */
+static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
+{
+ int pidfd;
+ struct file *pidfd_file;
+
+ if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC))
+ return -EINVAL;
+
+ pidfd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+ if (pidfd < 0)
+ return pidfd;
+
+ pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
+ flags | O_RDWR | O_CLOEXEC);
+ if (IS_ERR(pidfd_file)) {
+ put_unused_fd(pidfd);
+ return PTR_ERR(pidfd_file);
+ }
+ get_pid(pid); /* held by pidfd_file now */
+ *ret = pidfd_file;
+ return pidfd;
+}
+
+/**
+ * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
+ * @pid: the struct pid for which to create a pidfd
+ * @flags: flags of the new @pidfd
+ * @pidfd: the pidfd to return
+ *
+ * Allocate a new file that stashes @pid and reserve a new pidfd number in the
+ * caller's file descriptor table. The pidfd is reserved but not installed yet.
+ *
+ * The helper verifies that @pid is used as a thread group leader.
+ *
+ * If this function returns successfully the caller is responsible to either
+ * call fd_install() passing the returned pidfd and pidfd file as arguments in
+ * order to install the pidfd into its file descriptor table or they must use
+ * put_unused_fd() and fput() on the returned pidfd and pidfd file
+ * respectively.
+ *
+ * This function is useful when a pidfd must already be reserved but there
+ * might still be points of failure afterwards and the caller wants to ensure
+ * that no pidfd is leaked into its file descriptor table.
+ *
+ * Return: On success, a reserved pidfd is returned from the function and a new
+ * pidfd file is returned in the last argument to the function. On
+ * error, a negative error code is returned from the function and the
+ * last argument remains unchanged.
+ */
+int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
+{
+ if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
+ return -EINVAL;
+
+ return __pidfd_prepare(pid, flags, ret);
+}
+
static void __delayed_free_task(struct rcu_head *rhp)
{
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
* parts of the process environment (as per the clone
* flags). The actual kick-off is left to the caller.
*/
-static __latent_entropy struct task_struct *copy_process(
+__latent_entropy struct task_struct *copy_process(
struct pid *pid,
int trace,
int node,
p->flags &= ~PF_KTHREAD;
if (args->kthread)
p->flags |= PF_KTHREAD;
+ if (args->user_worker)
+ p->flags |= PF_USER_WORKER;
if (args->io_thread) {
/*
* Mark us an IO worker, and block any signal that isn't
siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
}
+ if (args->name)
+ strscpy_pad(p->comm, args->name, sizeof(p->comm));
+
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
/*
* Clear TID on mm_release()?
retval = copy_semundo(clone_flags, p);
if (retval)
goto bad_fork_cleanup_security;
- retval = copy_files(clone_flags, p);
+ retval = copy_files(clone_flags, p, args->no_files);
if (retval)
goto bad_fork_cleanup_semundo;
retval = copy_fs(clone_flags, p);
if (retval)
goto bad_fork_cleanup_io;
+ if (args->ignore_signals)
+ ignore_signals(p);
+
stackleak_task_init(p);
if (pid != &init_struct_pid) {
* if the fd table isn't shared).
*/
if (clone_flags & CLONE_PIDFD) {
- retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+ /* Note that no task has been attached to @pid yet. */
+ retval = __pidfd_prepare(pid, O_RDWR | O_CLOEXEC, &pidfile);
if (retval < 0)
goto bad_fork_free_pid;
-
pidfd = retval;
- pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
- O_RDWR | O_CLOEXEC);
- if (IS_ERR(pidfile)) {
- put_unused_fd(pidfd);
- retval = PTR_ERR(pidfile);
- goto bad_fork_free_pid;
- }
- get_pid(pid); /* held by pidfile now */
-
retval = put_user(pidfd, args->pidfd);
if (retval)
goto bad_fork_put_pidfd;
trace_task_newtask(p, clone_flags);
uprobe_copy_process(p, clone_flags);
+ user_events_fork(p, clone_flags);
copy_oom_score_adj(clone_flags, p);
.fn = fn,
.fn_arg = arg,
.io_thread = 1,
+ .user_worker = 1,
};
return copy_process(NULL, 0, node, &args);
/*
* Create a kernel thread.
*/
-pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
+pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
+ unsigned long flags)
{
struct kernel_clone_args args = {
.flags = ((lower_32_bits(flags) | CLONE_VM |
.exit_signal = (lower_32_bits(flags) & CSIGNAL),
.fn = fn,
.fn_arg = arg,
+ .name = name,
.kthread = 1,
};
NULL);
vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
+#ifdef CONFIG_PER_VMA_LOCK
+ vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT);
+#endif
mmap_init();
nsproxy_cache_init();
}
NULL : &bpf_probe_read_compat_str_proto;
#endif
#ifdef CONFIG_CGROUPS
- case BPF_FUNC_get_current_cgroup_id:
- return &bpf_get_current_cgroup_id_proto;
- case BPF_FUNC_get_current_ancestor_cgroup_id:
- return &bpf_get_current_ancestor_cgroup_id_proto;
case BPF_FUNC_cgrp_storage_get:
return &bpf_cgrp_storage_get_proto;
case BPF_FUNC_cgrp_storage_delete:
return err;
}
- static void
+ static int
kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip,
- struct pt_regs *regs)
+ struct pt_regs *regs, void *data)
+ {
+ struct bpf_kprobe_multi_link *link;
+
+ link = container_of(fp, struct bpf_kprobe_multi_link, fp);
+ kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs);
+ return 0;
+ }
+
+ static void
+ kprobe_multi_link_exit_handler(struct fprobe *fp, unsigned long fentry_ip,
+ struct pt_regs *regs, void *data)
{
struct bpf_kprobe_multi_link *link;
goto error;
if (flags & BPF_F_KPROBE_MULTI_RETURN)
- link->fp.exit_handler = kprobe_multi_link_handler;
+ link->fp.exit_handler = kprobe_multi_link_exit_handler;
else
link->fp.entry_handler = kprobe_multi_link_handler;
#include "trace_output.h"
#include "trace_stat.h"
+ /* Flags that do not get reset */
+ #define FTRACE_NOCLEAR_FLAGS (FTRACE_FL_DISABLED | FTRACE_FL_TOUCHED)
+
#define FTRACE_INVALID_FUNCTION "__ftrace_invalid_address__"
#define FTRACE_WARN_ON(cond) \
flag ^= rec->flags & FTRACE_FL_ENABLED;
if (update) {
- rec->flags |= FTRACE_FL_ENABLED;
+ rec->flags |= FTRACE_FL_ENABLED | FTRACE_FL_TOUCHED;
if (flag & FTRACE_FL_REGS) {
if (rec->flags & FTRACE_FL_REGS)
rec->flags |= FTRACE_FL_REGS_EN;
if (update) {
/* If there's no more users, clear all flags */
if (!ftrace_rec_count(rec))
- rec->flags &= FTRACE_FL_DISABLED;
+ rec->flags &= FTRACE_NOCLEAR_FLAGS;
else
/*
* Just disable the record, but keep the ops TRAMP
struct dyn_ftrace *rec;
do_for_each_ftrace_rec(pg, rec) {
- if (FTRACE_WARN_ON_ONCE(rec->flags & ~FTRACE_FL_DISABLED))
+ if (FTRACE_WARN_ON_ONCE(rec->flags & ~FTRACE_NOCLEAR_FLAGS))
pr_warn(" %pS flags:%lx\n",
(void *)rec->ip, rec->flags);
} while_for_each_ftrace_rec();
!ftrace_lookup_ip(iter->hash, rec->ip)) ||
((iter->flags & FTRACE_ITER_ENABLED) &&
- !(rec->flags & FTRACE_FL_ENABLED))) {
+ !(rec->flags & FTRACE_FL_ENABLED)) ||
+
+ ((iter->flags & FTRACE_ITER_TOUCHED) &&
+ !(rec->flags & FTRACE_FL_TOUCHED))) {
rec = NULL;
goto retry;
return 0;
}
- if (iter->flags & FTRACE_ITER_ENABLED) {
+ if (iter->flags & (FTRACE_ITER_ENABLED | FTRACE_ITER_TOUCHED)) {
struct ftrace_ops *ops;
seq_printf(m, " (%ld)%s%s%s%s",
return 0;
}
+ static int
+ ftrace_touched_open(struct inode *inode, struct file *file)
+ {
+ struct ftrace_iterator *iter;
+
+ /*
+ * This shows us what functions have ever been enabled
+ * (traced, direct, patched, etc). Not sure if we want lockdown
+ * to hide such critical information for an admin.
+ * Although, perhaps it can show information we don't
+ * want people to see, but if something had traced
+ * something, we probably want to know about it.
+ */
+
+ iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
+ if (!iter)
+ return -ENOMEM;
+
+ iter->pg = ftrace_pages_start;
+ iter->flags = FTRACE_ITER_TOUCHED;
+ iter->ops = &global_ops;
+
+ return 0;
+ }
+
/**
* ftrace_regex_open - initialize function tracer filter files
* @ops: The ftrace_ops that hold the hash filters
.release = seq_release_private,
};
+ static const struct file_operations ftrace_touched_fops = {
+ .open = ftrace_touched_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+ };
+
static const struct file_operations ftrace_filter_fops = {
.open = ftrace_filter_open,
.read = seq_read,
trace_create_file("enabled_functions", TRACE_MODE_READ,
d_tracer, NULL, &ftrace_enabled_fops);
+ trace_create_file("touched_functions", TRACE_MODE_READ,
+ d_tracer, NULL, &ftrace_touched_fops);
+
ftrace_create_filter_files(&global_ops, d_tracer);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
* and returns 1 in case we resolved all the requested symbols,
* 0 otherwise.
*/
-static int kallsyms_callback(void *data, const char *name,
- struct module *mod, unsigned long addr)
+static int kallsyms_callback(void *data, const char *name, unsigned long addr)
{
struct kallsyms_data *args = data;
const char **sym;
#define extended_time(event) \
(event->type_len >= RINGBUF_TYPE_TIME_EXTEND)
- static inline int rb_null_event(struct ring_buffer_event *event)
+ static inline bool rb_null_event(struct ring_buffer_event *event)
{
return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
}
/*
* We need to fit the time_stamp delta into 27 bits.
*/
- static inline int test_time_stamp(u64 delta)
+ static inline bool test_time_stamp(u64 delta)
{
- if (delta & TS_DELTA_TEST)
- return 1;
- return 0;
+ return !!(delta & TS_DELTA_TEST);
}
#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
return ret == expect;
}
- static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
+ static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
{
unsigned long cnt, top, bottom, msb;
unsigned long cnt2, top2, bottom2, msb2;
return NULL;
}
- static int rb_head_page_replace(struct buffer_page *old,
+ static bool rb_head_page_replace(struct buffer_page *old,
struct buffer_page *new)
{
unsigned long *ptr = (unsigned long *)&old->list.prev->next;
}
}
- static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
+ static void rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *bpage)
{
unsigned long val = (unsigned long)bpage;
- if (RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK))
- return 1;
-
- return 0;
+ RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK);
}
/**
* As a safety measure we check to make sure the data pages have not
* been corrupted.
*/
- static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
+ static void rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
{
struct list_head *head = rb_list_head(cpu_buffer->pages);
struct list_head *tmp;
if (RB_WARN_ON(cpu_buffer,
rb_list_head(rb_list_head(head->next)->prev) != head))
- return -1;
+ return;
if (RB_WARN_ON(cpu_buffer,
rb_list_head(rb_list_head(head->prev)->next) != head))
- return -1;
+ return;
for (tmp = rb_list_head(head->next); tmp != head; tmp = rb_list_head(tmp->next)) {
if (RB_WARN_ON(cpu_buffer,
rb_list_head(rb_list_head(tmp->next)->prev) != tmp))
- return -1;
+ return;
if (RB_WARN_ON(cpu_buffer,
rb_list_head(rb_list_head(tmp->prev)->next) != tmp))
- return -1;
+ return;
}
-
- return 0;
}
static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
struct list_head *head = cpu_buffer->pages;
struct buffer_page *bpage, *tmp;
+ irq_work_sync(&cpu_buffer->irq_work.work);
+
free_buffer_page(cpu_buffer->reader_page);
if (head) {
cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node);
+ irq_work_sync(&buffer->irq_work.work);
+
for_each_buffer_cpu(buffer, cpu)
rb_free_cpu_buffer(buffer->buffers[cpu]);
return local_read(&bpage->write) & RB_WRITE_MASK;
}
- static int
+ static bool
rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
{
struct list_head *tail_page, *to_remove, *next_page;
return nr_removed == 0;
}
- static int
+ static bool
rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
{
struct list_head *pages = &cpu_buffer->new_pages;
- int retries, success;
unsigned long flags;
+ bool success;
+ int retries;
/* Can be called at early boot up, where interrupts must not been enabled */
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
* spinning.
*/
retries = 10;
- success = 0;
+ success = false;
while (retries--) {
struct list_head *head_page, *prev_page, *r;
struct list_head *last_page, *first_page;
struct list_head *head_page_with_bit;
+ struct buffer_page *hpage = rb_set_head_page(cpu_buffer);
- head_page = &rb_set_head_page(cpu_buffer)->list;
- if (!head_page)
+ if (!hpage)
break;
+ head_page = &hpage->list;
prev_page = head_page->prev;
first_page = pages->next;
* pointer to point to end of list
*/
head_page->prev = last_page;
- success = 1;
+ success = true;
break;
}
}
static void rb_update_pages(struct ring_buffer_per_cpu *cpu_buffer)
{
- int success;
+ bool success;
if (cpu_buffer->nr_pages_to_update > 0)
success = rb_insert_pages(cpu_buffer);
}
}
- static inline int
+ static inline bool
rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event)
{
delta = rb_time_delta(event);
if (!rb_time_read(&cpu_buffer->write_stamp, &write_stamp))
- return 0;
+ return false;
/* Make sure the write stamp is read before testing the location */
barrier();
/* Something came in, can't discard */
if (!rb_time_cmpxchg(&cpu_buffer->write_stamp,
write_stamp, write_stamp - delta))
- return 0;
+ return false;
/*
* It's possible that the event time delta is zero
if (index == old_index) {
/* update counters */
local_sub(event_length, &cpu_buffer->entries_bytes);
- return 1;
+ return true;
}
}
/* could not discard */
- return 0;
+ return false;
}
static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
if (RB_WARN_ON(cpu_buffer,
rb_is_reader_page(cpu_buffer->tail_page)))
return;
+ /*
+ * No need for a memory barrier here, as the update
+ * of the tail_page did it for this page.
+ */
local_set(&cpu_buffer->commit_page->page->commit,
rb_page_write(cpu_buffer->commit_page));
rb_inc_page(&cpu_buffer->commit_page);
while (rb_commit_index(cpu_buffer) !=
rb_page_write(cpu_buffer->commit_page)) {
+ /* Make sure the readers see the content of what is committed. */
+ smp_wmb();
local_set(&cpu_buffer->commit_page->page->commit,
rb_page_write(cpu_buffer->commit_page));
RB_WARN_ON(cpu_buffer,
* Note: The TRANSITION bit only handles a single transition between context.
*/
- static __always_inline int
+ static __always_inline bool
trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
{
unsigned int val = cpu_buffer->current_context;
bit = RB_CTX_TRANSITION;
if (val & (1 << (bit + cpu_buffer->nest))) {
do_ring_buffer_record_recursion();
- return 1;
+ return true;
}
}
val |= (1 << (bit + cpu_buffer->nest));
cpu_buffer->current_context = val;
- return 0;
+ return false;
}
static __always_inline void
unsigned int rd;
unsigned int new_rd;
+ rd = atomic_read(&buffer->record_disabled);
do {
- rd = atomic_read(&buffer->record_disabled);
new_rd = rd | RB_BUFFER_OFF;
- } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd);
+ } while (!atomic_try_cmpxchg(&buffer->record_disabled, &rd, new_rd));
}
EXPORT_SYMBOL_GPL(ring_buffer_record_off);
unsigned int rd;
unsigned int new_rd;
+ rd = atomic_read(&buffer->record_disabled);
do {
- rd = atomic_read(&buffer->record_disabled);
new_rd = rd & ~RB_BUFFER_OFF;
- } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd);
+ } while (!atomic_try_cmpxchg(&buffer->record_disabled, &rd, new_rd));
}
EXPORT_SYMBOL_GPL(ring_buffer_record_on);
default:
RB_WARN_ON(cpu_buffer, 1);
}
- return;
}
static void
default:
RB_WARN_ON(iter->cpu_buffer, 1);
}
- return;
}
static struct buffer_page *
unsigned long overwrite;
unsigned long flags;
int nr_loops = 0;
- int ret;
+ bool ret;
local_irq_save(flags);
arch_spin_lock(&cpu_buffer->lock);
/*
* Make sure we see any padding after the write update
- * (see rb_reset_tail())
+ * (see rb_reset_tail()).
+ *
+ * In addition, a writer may be writing on the reader page
+ * if the page has not been fully filled, so the read barrier
+ * is also needed to make sure we see the content of what is
+ * committed by the writer (see rb_set_commit_to_write()).
*/
smp_rmb();
{
if (likely(locked))
raw_spin_unlock(&cpu_buffer->reader_lock);
- return;
}
/**
}
EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
+ /* Flag to ensure proper resetting of atomic variables */
+ #define RESET_BIT (1 << 30)
+
/**
* ring_buffer_reset_online_cpus - reset a ring buffer per CPU buffer
* @buffer: The ring buffer to reset a per cpu buffer of
for_each_online_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
- atomic_inc(&cpu_buffer->resize_disabled);
+ atomic_add(RESET_BIT, &cpu_buffer->resize_disabled);
atomic_inc(&cpu_buffer->record_disabled);
}
/* Make sure all commits have finished */
synchronize_rcu();
- for_each_online_buffer_cpu(buffer, cpu) {
+ for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
+ /*
+ * If a CPU came online during the synchronize_rcu(), then
+ * ignore it.
+ */
+ if (!(atomic_read(&cpu_buffer->resize_disabled) & RESET_BIT))
+ continue;
+
reset_disabled_cpu_buffer(cpu_buffer);
atomic_dec(&cpu_buffer->record_disabled);
- atomic_dec(&cpu_buffer->resize_disabled);
+ atomic_sub(RESET_BIT, &cpu_buffer->resize_disabled);
}
mutex_unlock(&buffer->mutex);
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long flags;
bool dolock;
+ bool ret;
int cpu;
- int ret;
/* yes this is racy, but if you don't like the race, lock the buffer */
for_each_buffer_cpu(buffer, cpu) {
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long flags;
bool dolock;
- int ret;
+ bool ret;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return true;
unsigned long flags;
if (in_nmi()) {
- internal_trace_puts("*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n");
- internal_trace_puts("*** snapshot is being ignored ***\n");
+ trace_array_puts(tr, "*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n");
+ trace_array_puts(tr, "*** snapshot is being ignored ***\n");
return;
}
if (!tr->allocated_snapshot) {
- internal_trace_puts("*** SNAPSHOT NOT ALLOCATED ***\n");
- internal_trace_puts("*** stopping trace here! ***\n");
- tracing_off();
+ trace_array_puts(tr, "*** SNAPSHOT NOT ALLOCATED ***\n");
+ trace_array_puts(tr, "*** stopping trace here! ***\n");
+ tracer_tracing_off(tr);
return;
}
/* Note, snapshot can not be used when the tracer uses it */
if (tracer->use_max_tr) {
- internal_trace_puts("*** LATENCY TRACER ACTIVE ***\n");
- internal_trace_puts("*** Can not use snapshot (sorry) ***\n");
+ trace_array_puts(tr, "*** LATENCY TRACER ACTIVE ***\n");
+ trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n");
return;
}
#define STATIC_FMT_BUF_SIZE 128
static char static_fmt_buf[STATIC_FMT_BUF_SIZE];
- static char *trace_iter_expand_format(struct trace_iterator *iter)
+ char *trace_iter_expand_format(struct trace_iterator *iter)
{
char *tmp;
if (trace_seq_has_overflowed(s))
return TRACE_TYPE_PARTIAL_LINE;
- if (event)
+ if (event) {
+ if (tr->trace_flags & TRACE_ITER_FIELDS)
+ return print_event_fields(iter, event);
return event->funcs->trace(iter, sym_flags, event);
+ }
trace_seq_printf(s, "Unknown type %d\n", entry->type);
tracefs_remove(tr->dir);
free_percpu(tr->last_func_repeats);
free_trace_buffers(tr);
+ clear_tracing_err_log(tr);
for (i = 0; i < tr->nr_topts; i++) {
kfree(tr->topts[i].topts);
void __init ftrace_boot_snapshot(void)
{
+#ifdef CONFIG_TRACER_MAX_TRACE
struct trace_array *tr;
- if (snapshot_at_boot) {
- tracing_snapshot();
- internal_trace_puts("** Boot snapshot taken **\n");
- }
+ if (!snapshot_at_boot)
+ return;
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
- if (tr == &global_trace)
+ if (!tr->allocated_snapshot)
continue;
- trace_array_puts(tr, "** Boot snapshot taken **\n");
+
tracing_snapshot_instance(tr);
+ trace_array_puts(tr, "** Boot snapshot taken **\n");
}
+#endif
}
void __init early_trace_init(void)
/* Use indirect calls to avoid inlining the target functions */
static u32 (*target)(u32 value);
static u32 (*target2)(u32 value);
+ static u32 (*target_nest)(u32 value, u32 (*nest)(u32));
static unsigned long target_ip;
static unsigned long target2_ip;
+ static unsigned long target_nest_ip;
+ static int entry_return_value;
static noinline u32 fprobe_selftest_target(u32 value)
{
return (value / div_factor) + 1;
}
- static notrace void fp_entry_handler(struct fprobe *fp, unsigned long ip, struct pt_regs *regs)
+ static noinline u32 fprobe_selftest_nest_target(u32 value, u32 (*nest)(u32))
+ {
+ return nest(value + 2);
+ }
+
+ static notrace int fp_entry_handler(struct fprobe *fp, unsigned long ip,
+ struct pt_regs *regs, void *data)
{
KUNIT_EXPECT_FALSE(current_test, preemptible());
/* This can be called on the fprobe_selftest_target and the fprobe_selftest_target2 */
if (ip != target_ip)
KUNIT_EXPECT_EQ(current_test, ip, target2_ip);
entry_val = (rand1 / div_factor);
+ if (fp->entry_data_size) {
+ KUNIT_EXPECT_NOT_NULL(current_test, data);
+ if (data)
+ *(u32 *)data = entry_val;
+ } else
+ KUNIT_EXPECT_NULL(current_test, data);
+
+ return entry_return_value;
}
- static notrace void fp_exit_handler(struct fprobe *fp, unsigned long ip, struct pt_regs *regs)
+ static notrace void fp_exit_handler(struct fprobe *fp, unsigned long ip,
+ struct pt_regs *regs, void *data)
{
unsigned long ret = regs_return_value(regs);
KUNIT_EXPECT_EQ(current_test, ret, (rand1 / div_factor));
KUNIT_EXPECT_EQ(current_test, entry_val, (rand1 / div_factor));
exit_val = entry_val + div_factor;
+ if (fp->entry_data_size) {
+ KUNIT_EXPECT_NOT_NULL(current_test, data);
+ if (data)
+ KUNIT_EXPECT_EQ(current_test, *(u32 *)data, entry_val);
+ } else
+ KUNIT_EXPECT_NULL(current_test, data);
+ }
+
+ static notrace int nest_entry_handler(struct fprobe *fp, unsigned long ip,
+ struct pt_regs *regs, void *data)
+ {
+ KUNIT_EXPECT_FALSE(current_test, preemptible());
+ return 0;
+ }
+
+ static notrace void nest_exit_handler(struct fprobe *fp, unsigned long ip,
+ struct pt_regs *regs, void *data)
+ {
+ KUNIT_EXPECT_FALSE(current_test, preemptible());
+ KUNIT_EXPECT_EQ(current_test, ip, target_nest_ip);
}
/* Test entry only (no rethook) */
KUNIT_EXPECT_EQ(test, 0, unregister_fprobe(&fp));
}
+ /* Test private entry_data */
+ static void test_fprobe_data(struct kunit *test)
+ {
+ struct fprobe fp = {
+ .entry_handler = fp_entry_handler,
+ .exit_handler = fp_exit_handler,
+ .entry_data_size = sizeof(u32),
+ };
+
+ current_test = test;
+ KUNIT_EXPECT_EQ(test, 0, register_fprobe(&fp, "fprobe_selftest_target", NULL));
+
+ target(rand1);
+
+ KUNIT_EXPECT_EQ(test, 0, unregister_fprobe(&fp));
+ }
+
+ /* Test nr_maxactive */
+ static void test_fprobe_nest(struct kunit *test)
+ {
+ static const char *syms[] = {"fprobe_selftest_target", "fprobe_selftest_nest_target"};
+ struct fprobe fp = {
+ .entry_handler = nest_entry_handler,
+ .exit_handler = nest_exit_handler,
+ .nr_maxactive = 1,
+ };
+
+ current_test = test;
+ KUNIT_EXPECT_EQ(test, 0, register_fprobe_syms(&fp, syms, 2));
+
+ target_nest(rand1, target);
+ KUNIT_EXPECT_EQ(test, 1, fp.nmissed);
+
+ KUNIT_EXPECT_EQ(test, 0, unregister_fprobe(&fp));
+ }
+
+ static void test_fprobe_skip(struct kunit *test)
+ {
+ struct fprobe fp = {
+ .entry_handler = fp_entry_handler,
+ .exit_handler = fp_exit_handler,
+ };
+
+ current_test = test;
+ KUNIT_EXPECT_EQ(test, 0, register_fprobe(&fp, "fprobe_selftest_target", NULL));
+
+ entry_return_value = 1;
+ entry_val = 0;
+ exit_val = 0;
+ target(rand1);
+ KUNIT_EXPECT_NE(test, 0, entry_val);
+ KUNIT_EXPECT_EQ(test, 0, exit_val);
+ KUNIT_EXPECT_EQ(test, 0, fp.nmissed);
+ entry_return_value = 0;
+
+ KUNIT_EXPECT_EQ(test, 0, unregister_fprobe(&fp));
+ }
+
static unsigned long get_ftrace_location(void *func)
{
unsigned long size, addr = (unsigned long)func;
rand1 = get_random_u32_above(div_factor);
target = fprobe_selftest_target;
target2 = fprobe_selftest_target2;
+ target_nest = fprobe_selftest_nest_target;
target_ip = get_ftrace_location(target);
target2_ip = get_ftrace_location(target2);
+ target_nest_ip = get_ftrace_location(target_nest);
return 0;
}
KUNIT_CASE(test_fprobe_entry),
KUNIT_CASE(test_fprobe),
KUNIT_CASE(test_fprobe_syms),
+ KUNIT_CASE(test_fprobe_data),
+ KUNIT_CASE(test_fprobe_nest),
+ KUNIT_CASE(test_fprobe_skip),
{}
};
kunit_test_suites(&fprobe_test_suite);
-MODULE_LICENSE("GPL");