#include <linux/compat.h>
#include <linux/vmalloc.h>
#include <linux/io_uring.h>
+#include <linux/syscall_user_dispatch.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
/*
* Maps the mm_struct mm into the current task struct.
- * On success, this function returns with the mutex
- * exec_update_mutex locked.
+ * On success, this function returns with exec_update_lock
+ * held for writing.
*/
static int exec_mmap(struct mm_struct *mm)
{
if (old_mm)
sync_mm_rss(old_mm);
- ret = mutex_lock_killable(&tsk->signal->exec_update_mutex);
+ ret = down_write_killable(&tsk->signal->exec_update_lock);
if (ret)
return ret;
mmap_read_lock(old_mm);
if (unlikely(old_mm->core_state)) {
mmap_read_unlock(old_mm);
- mutex_unlock(&tsk->signal->exec_update_mutex);
+ up_write(&tsk->signal->exec_update_lock);
return -EINTR;
}
}
goto out;
/*
+ * Cancel any io_uring activity across execve
+ */
+ io_uring_task_cancel();
+
+ /* Ensure the files table is not shared. */
+ retval = unshare_files();
+ if (retval)
+ goto out;
+
+ /*
* Must be called _before_ exec_mmap() as bprm->mm is
* not visibile until then. This also enables the update
* to be lockless.
flush_thread();
me->personality &= ~bprm->per_clear;
+ clear_syscall_work_syscall_user_dispatch(me);
+
/*
* We have to apply CLOEXEC before we change whether the process is
* dumpable (in setup_new_exec) to avoid a race with a process in userspace
return 0;
out_unlock:
- mutex_unlock(&me->signal->exec_update_mutex);
+ up_write(&me->signal->exec_update_lock);
out:
return retval;
}
* some architectures like powerpc
*/
me->mm->task_size = TASK_SIZE;
- mutex_unlock(&me->signal->exec_update_mutex);
+ up_write(&me->signal->exec_update_lock);
mutex_unlock(&me->signal->cred_guard_mutex);
}
EXPORT_SYMBOL(setup_new_exec);
int fd, struct filename *filename, int flags)
{
struct file *file;
- struct files_struct *displaced;
int retval;
- /*
- * Cancel any io_uring activity across execve
- */
- io_uring_task_cancel();
-
- retval = unshare_files(&displaced);
- if (retval)
- return retval;
-
retval = prepare_bprm_creds(bprm);
if (retval)
- goto out_files;
+ return retval;
check_unsafe_exec(bprm);
current->in_execve = 1;
bprm->file = file;
/*
* Record that a name derived from an O_CLOEXEC fd will be
- * inaccessible after exec. Relies on having exclusive access to
- * current->files (due to unshare_files above).
+ * inaccessible after exec. This allows the code in exec to
+ * choose to fail when the executable is not mmaped into the
+ * interpreter and an open file descriptor is not passed to
+ * the interpreter. This makes for a better user experience
+ * than having the interpreter start and then immediately fail
+ * when it finds the executable is inaccessible.
*/
- if (bprm->fdpath &&
- close_on_exec(fd, rcu_dereference_raw(current->files->fdt)))
+ if (bprm->fdpath && get_close_on_exec(fd))
bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
/* Set the unchanging part of bprm->cred */
rseq_execve(current);
acct_update_integrals(current);
task_numa_free(current, false);
- if (displaced)
- put_files_struct(displaced);
return retval;
out:
current->fs->in_exec = 0;
current->in_execve = 0;
-out_files:
- if (displaced)
- reset_files_struct(displaced);
-
return retval;
}
* credential calculations
* (notably. ptrace)
* Deprecated do not use in new code.
- * Use exec_update_mutex instead.
- */
- struct mutex exec_update_mutex; /* Held while task_struct is being
- * updated during exec, and may have
- * inconsistent permissions.
+ * Use exec_update_lock instead.
*/
+ struct rw_semaphore exec_update_lock; /* Held while task_struct is
+ * being updated during exec,
+ * and may have inconsistent
+ * permissions.
+ */
} __randomize_layout;
/*
return -ERESTARTNOINTR;
}
-static inline int signal_pending(struct task_struct *p)
+static inline int task_sigpending(struct task_struct *p)
{
return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
}
+static inline int signal_pending(struct task_struct *p)
+{
+#if defined(TIF_NOTIFY_SIGNAL)
+ /*
+ * TIF_NOTIFY_SIGNAL isn't really a signal, but it requires the same
+ * behavior in terms of ensuring that we break out of wait loops
+ * so that notify signal callbacks can be processed.
+ */
+ if (unlikely(test_tsk_thread_flag(p, TIF_NOTIFY_SIGNAL)))
+ return 1;
+#endif
+ return task_sigpending(p);
+}
+
static inline int __fatal_signal_pending(struct task_struct *p)
{
return unlikely(sigismember(&p->pending.signal, SIGKILL));
static inline int fatal_signal_pending(struct task_struct *p)
{
- return signal_pending(p) && __fatal_signal_pending(p);
+ return task_sigpending(p) && __fatal_signal_pending(p);
}
static inline int signal_pending_state(long state, struct task_struct *p)
static inline void restore_saved_sigmask_unless(bool interrupted)
{
if (interrupted)
- WARN_ON(!test_thread_flag(TIF_SIGPENDING));
+ WARN_ON(!signal_pending(current));
else
restore_saved_sigmask();
}
#include <linux/proc_ns.h>
#include <linux/mount.h>
#include <linux/min_heap.h>
+#include <linux/highmem.h>
+#include <linux/pgtable.h>
#include "internal.h"
* function.
*
* Lock order:
- * exec_update_mutex
+ * exec_update_lock
* task_struct::perf_event_mutex
* perf_event_context::mutex
* perf_event::child_mutex;
if (sample_type & PERF_SAMPLE_CGROUP)
size += sizeof(data->cgroup);
+ if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+ size += sizeof(data->data_page_size);
+
+ if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
+ size += sizeof(data->code_page_size);
+
event->header_size = size;
}
if (sample_type & PERF_SAMPLE_CGROUP)
perf_output_put(handle, data->cgroup);
+ if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+ perf_output_put(handle, data->data_page_size);
+
+ if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
+ perf_output_put(handle, data->code_page_size);
+
if (sample_type & PERF_SAMPLE_AUX) {
perf_output_put(handle, data->aux_size);
return phys_addr;
}
+/*
+ * Return the pagetable size of a given virtual address.
+ */
+static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
+{
+ u64 size = 0;
+
+#ifdef CONFIG_HAVE_FAST_GUP
+ pgd_t *pgdp, pgd;
+ p4d_t *p4dp, p4d;
+ pud_t *pudp, pud;
+ pmd_t *pmdp, pmd;
+ pte_t *ptep, pte;
+
+ pgdp = pgd_offset(mm, addr);
+ pgd = READ_ONCE(*pgdp);
+ if (pgd_none(pgd))
+ return 0;
+
+ if (pgd_leaf(pgd))
+ return pgd_leaf_size(pgd);
+
+ p4dp = p4d_offset_lockless(pgdp, pgd, addr);
+ p4d = READ_ONCE(*p4dp);
+ if (!p4d_present(p4d))
+ return 0;
+
+ if (p4d_leaf(p4d))
+ return p4d_leaf_size(p4d);
+
+ pudp = pud_offset_lockless(p4dp, p4d, addr);
+ pud = READ_ONCE(*pudp);
+ if (!pud_present(pud))
+ return 0;
+
+ if (pud_leaf(pud))
+ return pud_leaf_size(pud);
+
+ pmdp = pmd_offset_lockless(pudp, pud, addr);
+ pmd = READ_ONCE(*pmdp);
+ if (!pmd_present(pmd))
+ return 0;
+
+ if (pmd_leaf(pmd))
+ return pmd_leaf_size(pmd);
+
+ ptep = pte_offset_map(&pmd, addr);
+ pte = ptep_get_lockless(ptep);
+ if (pte_present(pte))
+ size = pte_leaf_size(pte);
+ pte_unmap(ptep);
+#endif /* CONFIG_HAVE_FAST_GUP */
+
+ return size;
+}
+
+static u64 perf_get_page_size(unsigned long addr)
+{
+ struct mm_struct *mm;
+ unsigned long flags;
+ u64 size;
+
+ if (!addr)
+ return 0;
+
+ /*
+ * Software page-table walkers must disable IRQs,
+ * which prevents any tear down of the page tables.
+ */
+ local_irq_save(flags);
+
+ mm = current->mm;
+ if (!mm) {
+ /*
+ * For kernel threads and the like, use init_mm so that
+ * we can find kernel memory.
+ */
+ mm = &init_mm;
+ }
+
+ size = perf_get_pgtable_size(mm, addr);
+
+ local_irq_restore(flags);
+
+ return size;
+}
+
static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
struct perf_callchain_entry *
__perf_event_header__init_id(header, data, event);
- if (sample_type & PERF_SAMPLE_IP)
+ if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE))
data->ip = perf_instruction_pointer(regs);
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
}
#endif
+ /*
+ * PERF_DATA_PAGE_SIZE requires PERF_SAMPLE_ADDR. If the user doesn't
+ * require PERF_SAMPLE_ADDR, kernel implicitly retrieve the data->addr,
+ * but the value will not dump to the userspace.
+ */
+ if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+ data->data_page_size = perf_get_page_size(data->addr);
+
+ if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
+ data->code_page_size = perf_get_page_size(data->ip);
+
if (sample_type & PERF_SAMPLE_AUX) {
u64 size;
goto err_task;
}
- if (task) {
- err = down_read_interruptible(&task->signal->exec_update_lock);
- if (err)
- goto err_task;
-
- /*
- * Preserve ptrace permission check for backwards compatibility.
- *
- * We must hold exec_update_lock across this and any potential
- * perf_install_in_context() call for this new event to
- * serialize against exec() altering our credentials (and the
- * perf_event_exit_task() that could imply).
- */
- err = -EACCES;
- if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
- goto err_cred;
- }
-
if (flags & PERF_FLAG_PID_CGROUP)
cgroup_fd = pid;
NULL, NULL, cgroup_fd);
if (IS_ERR(event)) {
err = PTR_ERR(event);
- goto err_cred;
+ goto err_task;
}
if (is_sampling_event(event)) {
goto err_context;
}
- err = mutex_lock_interruptible(&task->signal->exec_update_mutex);
+ if (task) {
- * We must hold exec_update_mutex across this and any potential
++ err = down_read_interruptible(&task->signal->exec_update_lock);
+ if (err)
+ goto err_file;
+
+ /*
+ * Preserve ptrace permission check for backwards compatibility.
+ *
++ * We must hold exec_update_lock across this and any potential
+ * perf_install_in_context() call for this new event to
+ * serialize against exec() altering our credentials (and the
+ * perf_event_exit_task() that could imply).
+ */
+ err = -EACCES;
+ if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
+ goto err_cred;
+ }
+
if (move_group) {
gctx = __perf_event_ctx_lock_double(group_leader, ctx);
mutex_unlock(&ctx->mutex);
if (task) {
- mutex_unlock(&task->signal->exec_update_mutex);
+ up_read(&task->signal->exec_update_lock);
put_task_struct(task);
}
if (move_group)
perf_event_ctx_unlock(group_leader, gctx);
mutex_unlock(&ctx->mutex);
-/* err_file: */
+err_cred:
+ if (task)
- mutex_unlock(&task->signal->exec_update_mutex);
++ up_read(&task->signal->exec_update_lock);
+err_file:
fput(event_file);
err_context:
perf_unpin_context(ctx);
*/
if (!event_file)
free_event(event);
-err_cred:
- if (task)
- up_read(&task->signal->exec_update_lock);
err_task:
if (task)
put_task_struct(task);
/*
* When a child task exits, feed back event values to parent events.
*
- * Can be called with exec_update_mutex held when called from
+ * Can be called with exec_update_lock held when called from
* setup_new_exec().
*/
void perf_event_exit_task(struct task_struct *child)
mod_lruvec_page_state(vm->pages[0], NR_KERNEL_STACK_KB,
account * (THREAD_SIZE / 1024));
else
- mod_lruvec_slab_state(stack, NR_KERNEL_STACK_KB,
+ mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB,
account * (THREAD_SIZE / 1024));
}
for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
/*
- * If memcg_kmem_charge_page() fails, page->mem_cgroup
- * pointer is NULL, and memcg_kmem_uncharge_page() in
- * free_thread_stack() will ignore this page.
+ * If memcg_kmem_charge_page() fails, page's
+ * memory cgroup pointer is NULL, and
+ * memcg_kmem_uncharge_page() in free_thread_stack()
+ * will ignore this page.
*/
ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL,
0);
clear_user_return_notifier(tsk);
clear_tsk_need_resched(tsk);
set_task_stack_end_magic(tsk);
+ clear_syscall_work_syscall_user_dispatch(tsk);
#ifdef CONFIG_STACKPROTECTOR
tsk->stack_canary = get_random_canary();
account_kernel_stack(tsk, 1);
kcov_task_init(tsk);
+ kmap_local_fork(tsk);
#ifdef CONFIG_FAULT_INJECTION
tsk->fail_nth = 0;
mm->vmacache_seqnum = 0;
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
+ seqcount_init(&mm->write_protect_seq);
mmap_init_lock(mm);
INIT_LIST_HEAD(&mm->mmlist);
mm->core_state = NULL;
struct mm_struct *mm;
int err;
- err = mutex_lock_killable(&task->signal->exec_update_mutex);
+ err = down_read_killable(&task->signal->exec_update_lock);
if (err)
return ERR_PTR(err);
mmput(mm);
mm = ERR_PTR(-EACCES);
}
- mutex_unlock(&task->signal->exec_update_mutex);
+ up_read(&task->signal->exec_update_lock);
return mm;
}
sig->oom_score_adj_min = current->signal->oom_score_adj_min;
mutex_init(&sig->cred_guard_mutex);
- mutex_init(&sig->exec_update_mutex);
+ init_rwsem(&sig->exec_update_lock);
return 0;
}
* to manually enable the seccomp thread flag here.
*/
if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
- set_tsk_thread_flag(p, TIF_SECCOMP);
+ set_task_syscall_work(p, SECCOMP);
#endif
}
* child regardless of CLONE_PTRACE.
*/
user_disable_single_step(p);
- clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
-#ifdef TIF_SYSCALL_EMU
- clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
+ clear_task_syscall_work(p, SYSCALL_TRACE);
+#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU)
+ clear_task_syscall_work(p, SYSCALL_EMU);
#endif
clear_tsk_latency_tracing(p);
INIT_LIST_HEAD(&p->thread_group);
p->task_works = NULL;
+#ifdef CONFIG_KRETPROBES
+ p->kretprobe_instances.first = NULL;
+#endif
+
/*
* Ensure that the cgroup subsystem policies allow the new process to be
* forked. It should be noted that the new process's css_set can be changed
* the exec layer of the kernel.
*/
-int unshare_files(struct files_struct **displaced)
+int unshare_files(void)
{
struct task_struct *task = current;
- struct files_struct *copy = NULL;
+ struct files_struct *old, *copy = NULL;
int error;
error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, ©);
- if (error || !copy) {
- *displaced = NULL;
+ if (error || !copy)
return error;
- }
- *displaced = task->files;
+
+ old = task->files;
task_lock(task);
task->files = copy;
task_unlock(task);
+ put_files_struct(old);
return 0;
}
static struct file *
get_file_raw_ptr(struct task_struct *task, unsigned int idx)
{
- struct file *file = NULL;
+ struct file *file;
- task_lock(task);
rcu_read_lock();
-
- if (task->files)
- file = fcheck_files(task->files, idx);
-
+ file = task_lookup_fd_rcu(task, idx);
rcu_read_unlock();
- task_unlock(task);
return file;
}
- static void kcmp_unlock(struct mutex *m1, struct mutex *m2)
+ static void kcmp_unlock(struct rw_semaphore *l1, struct rw_semaphore *l2)
{
- if (likely(m2 != m1))
- mutex_unlock(m2);
- mutex_unlock(m1);
+ if (likely(l2 != l1))
+ up_read(l2);
+ up_read(l1);
}
- static int kcmp_lock(struct mutex *m1, struct mutex *m2)
+ static int kcmp_lock(struct rw_semaphore *l1, struct rw_semaphore *l2)
{
int err;
- if (m2 > m1)
- swap(m1, m2);
+ if (l2 > l1)
+ swap(l1, l2);
- err = mutex_lock_killable(m1);
- if (!err && likely(m1 != m2)) {
- err = mutex_lock_killable_nested(m2, SINGLE_DEPTH_NESTING);
+ err = down_read_killable(l1);
+ if (!err && likely(l1 != l2)) {
+ err = down_read_killable_nested(l2, SINGLE_DEPTH_NESTING);
if (err)
- mutex_unlock(m1);
+ up_read(l1);
}
return err;
{
struct file *filp, *filp_epoll, *filp_tgt;
struct kcmp_epoll_slot slot;
- struct files_struct *files;
if (copy_from_user(&slot, uslot, sizeof(slot)))
return -EFAULT;
if (!filp)
return -EBADF;
- files = get_files_struct(task2);
- if (!files)
+ filp_epoll = fget_task(task2, slot.efd);
+ if (!filp_epoll)
return -EBADF;
- spin_lock(&files->file_lock);
- filp_epoll = fcheck_files(files, slot.efd);
- if (filp_epoll)
- get_file(filp_epoll);
- else
- filp_tgt = ERR_PTR(-EBADF);
- spin_unlock(&files->file_lock);
- put_files_struct(files);
-
- if (filp_epoll) {
- filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff);
- fput(filp_epoll);
- }
+ filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff);
+ fput(filp_epoll);
if (IS_ERR(filp_tgt))
return PTR_ERR(filp_tgt);
/*
* One should have enough rights to inspect task details.
*/
- ret = kcmp_lock(&task1->signal->exec_update_mutex,
- &task2->signal->exec_update_mutex);
+ ret = kcmp_lock(&task1->signal->exec_update_lock,
+ &task2->signal->exec_update_lock);
if (ret)
goto err;
if (!ptrace_may_access(task1, PTRACE_MODE_READ_REALCREDS) ||
}
err_unlock:
- kcmp_unlock(&task1->signal->exec_update_mutex,
- &task2->signal->exec_update_mutex);
+ kcmp_unlock(&task1->signal->exec_update_lock,
+ &task2->signal->exec_update_lock);
err:
put_task_struct(task1);
put_task_struct(task2);
* the scheme scales to up to 4 million PIDs, runtime.
*/
struct pid_namespace init_pid_ns = {
- .kref = KREF_INIT(2),
+ .ns.count = REFCOUNT_INIT(2),
.idr = IDR_INIT(init_pid_ns.idr),
.pid_allocated = PIDNS_ADDING,
.level = 0,
struct file *file;
int ret;
- ret = mutex_lock_killable(&task->signal->exec_update_mutex);
+ ret = down_read_killable(&task->signal->exec_update_lock);
if (ret)
return ERR_PTR(ret);
else
file = ERR_PTR(-EPERM);
- mutex_unlock(&task->signal->exec_update_mutex);
+ up_read(&task->signal->exec_update_lock);
return file ?: ERR_PTR(-EBADF);
}