Merge branch 'core/urgent' into core/entry

author Thomas Gleixner <tglx@linutronix.de>

Wed, 4 Nov 2020 17:14:52 +0000 (18:14 +0100)

committer Thomas Gleixner <tglx@linutronix.de>

Wed, 4 Nov 2020 17:14:52 +0000 (18:14 +0100)
author Thomas Gleixner <tglx@linutronix.de>
Wed, 4 Nov 2020 17:14:52 +0000 (18:14 +0100)
committer Thomas Gleixner <tglx@linutronix.de>
Wed, 4 Nov 2020 17:14:52 +0000 (18:14 +0100)
diff --combined arch/x86/kernel/signal.c

index ec3b9c6,be0d7d4..ea794a0
--- 1/arch/x86/kernel/signal.c
--- 2/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@@ -726,7 -726,7 +726,7 @@@ handle_signal(struct ksignal *ksig, str
                                 regs->ax = -EINTR;
                                 break;
                         }
-               /* fallthrough */
+                       fallthrough;
                 case -ERESTARTNOINTR:
                         regs->ax = regs->orig_ax;
                         regs->ip -= 2;
@@@ -804,11 -804,11 +804,11 @@@ static inline unsigned long get_nr_rest
    * want to handle. Thus you cannot kill init even with a SIGKILL even by
    * mistake.
    */
- -void arch_do_signal(struct pt_regs *regs)
+ +void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal)
   {
         struct ksignal ksig;
   
- -      if (get_signal(&ksig)) {
+ +      if (has_signal && get_signal(&ksig)) {
                 /* Whee! Actually deliver the signal.  */
                 handle_signal(&ksig, regs);
                 return;
diff --combined include/linux/entry-common.h

index c7bfac4,474f296..b9711e8
--- 1/include/linux/entry-common.h
--- 2/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@@ -37,12 -37,8 +37,12 @@@
   # define _TIF_UPROBE                  (0)
   #endif
   
+ +#ifndef _TIF_NOTIFY_SIGNAL
+ +# define _TIF_NOTIFY_SIGNAL           (0)
+ +#endif
+ +
   /*
-  * TIF flags handled in syscall_enter_from_usermode()
+  * TIF flags handled in syscall_enter_from_user_mode()
    */
   #ifndef ARCH_SYSCALL_ENTER_WORK
   # define ARCH_SYSCALL_ENTER_WORK      (0)
@@@ -73,7 -69,7 +73,7 @@@
   
   #define EXIT_TO_USER_MODE_WORK                                                \
         (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |           \
- -       _TIF_NEED_RESCHED | _TIF_PATCH_PENDING |                       \
+ +       _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL |  \
          ARCH_EXIT_TO_USER_MODE_WORK)
   
   /**
@@@ -114,15 -110,30 +114,30 @@@ static inline __must_check int arch_sys
   #endif
   
   /**
-  * syscall_enter_from_user_mode - Check and handle work before invoking
-  *                             a syscall
+  * syscall_enter_from_user_mode_prepare - Establish state and enable interrupts
    * @regs:     Pointer to currents pt_regs
-  * @syscall:  The syscall number
    *
    * Invoked from architecture specific syscall entry code with interrupts
    * disabled. The calling code has to be non-instrumentable. When the
-  * function returns all state is correct and the subsequent functions can be
-  * instrumented.
+  * function returns all state is correct, interrupts are enabled and the
+  * subsequent functions can be instrumented.
+  *
+  * This handles lockdep, RCU (context tracking) and tracing state.
+  *
+  * This is invoked when there is extra architecture specific functionality
+  * to be done between establishing state and handling user mode entry work.
+  */
+ void syscall_enter_from_user_mode_prepare(struct pt_regs *regs);
+ 
+ /**
+  * syscall_enter_from_user_mode_work - Check and handle work before invoking
+  *                                   a syscall
+  * @regs:     Pointer to current's pt_regs
+  * @syscall:  The syscall number
+  *
+  * Invoked from architecture specific syscall entry code with interrupts
+  * enabled after invoking syscall_enter_from_user_mode_prepare() and extra
+  * architecture specific work.
    *
    * Returns: The original or a modified syscall number
    *
@@@ -131,12 -142,30 +146,30 @@@
    * syscall_set_return_value() first.  If neither of those are called and -1
    * is returned, then the syscall will fail with ENOSYS.
    *
-  * The following functionality is handled here:
+  * It handles the following work items:
    *
-  *  1) Establish state (lockdep, RCU (context tracking), tracing)
-  *  2) TIF flag dependent invocations of arch_syscall_enter_tracehook(),
+  *  1) TIF flag dependent invocations of arch_syscall_enter_tracehook(),
    *     __secure_computing(), trace_sys_enter()
-  *  3) Invocation of audit_syscall_entry()
+  *  2) Invocation of audit_syscall_entry()
+  */
+ long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall);
+ 
+ /**
+  * syscall_enter_from_user_mode - Establish state and check and handle work
+  *                              before invoking a syscall
+  * @regs:     Pointer to current's pt_regs
+  * @syscall:  The syscall number
+  *
+  * Invoked from architecture specific syscall entry code with interrupts
+  * disabled. The calling code has to be non-instrumentable. When the
+  * function returns all state is correct, interrupts are enabled and the
+  * subsequent functions can be instrumented.
+  *
+  * This is a combination of syscall_enter_from_user_mode_prepare() and
+  * syscall_enter_from_user_mode_work().
+  *
+  * Returns: The original or a modified syscall number. See
+  * syscall_enter_from_user_mode_work() for further explanation.
    */
   long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall);
   
@@@ -230,13 -259,12 +263,13 @@@ static __always_inline void arch_exit_t
   #endif
   
   /**
- - * arch_do_signal -  Architecture specific signal delivery function
+ + * arch_do_signal_or_restart -  Architecture specific signal delivery function
    * @regs:     Pointer to currents pt_regs
+ + * @has_signal:       true if there is an actual signal to handle
    *
    * Invoked from exit_to_user_mode_loop().
    */
- -void arch_do_signal(struct pt_regs *regs);
+ +void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal);
   
   /**
    * arch_syscall_exit_tracehook - Wrapper around tracehook_report_syscall_exit()
diff --combined include/linux/tracehook.h

index 1e8caca,b480e1a..f7d82e4
--- 1/include/linux/tracehook.h
--- 2/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@@ -178,9 -178,9 +178,9 @@@ static inline void set_notify_resume(st
    */
   static inline void tracehook_notify_resume(struct pt_regs *regs)
   {
+       clear_thread_flag(TIF_NOTIFY_RESUME);
         /*
-        * The caller just cleared TIF_NOTIFY_RESUME. This barrier
-        * pairs with task_work_add()->set_notify_resume() after
+        * This barrier pairs with task_work_add()->set_notify_resume() after
          * hlist_add_head(task->task_works);
          */
         smp_mb__after_atomic();
@@@ -198,31 -198,4 +198,31 @@@
         blkcg_maybe_throttle_current();
   }
   
+ +/*
+ + * called by exit_to_user_mode_loop() if ti_work & _TIF_NOTIFY_SIGNAL. This
+ + * is currently used by TWA_SIGNAL based task_work, which requires breaking
+ + * wait loops to ensure that task_work is noticed and run.
+ + */
+ +static inline void tracehook_notify_signal(void)
+ +{
+ +#if defined(TIF_NOTIFY_SIGNAL)
+ +      clear_thread_flag(TIF_NOTIFY_SIGNAL);
+ +      smp_mb__after_atomic();
+ +      if (current->task_works)
+ +              task_work_run();
+ +#endif
+ +}
+ +
+ +/*
+ + * Called when we have work to process from exit_to_user_mode_loop()
+ + */
+ +static inline void set_notify_signal(struct task_struct *task)
+ +{
+ +#if defined(TIF_NOTIFY_SIGNAL)
+ +      if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) &&
+ +          !wake_up_state(task, TASK_INTERRUPTIBLE))
+ +              kick_process(task);
+ +#endif
+ +}
+ +
   #endif        /* <linux/tracehook.h> */
diff --combined kernel/entry/common.c

index f7ed415,e9e2df3..3a1dfec
--- 1/kernel/entry/common.c
--- 2/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@@ -60,6 -60,9 +60,9 @@@ static long syscall_trace_enter(struct 
                         return ret;
         }
   
+       /* Either of the above might have changed the syscall number */
+       syscall = syscall_get_nr(current, regs);
+ 
         if (unlikely(ti_work & _TIF_SYSCALL_TRACEPOINT))
                 trace_sys_enter(regs, syscall);
   
@@@ -68,22 -71,45 +71,45 @@@
         return ret ? : syscall;
   }
   
- noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
+ static __always_inline long
+ __syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
   {
         unsigned long ti_work;
   
-       enter_from_user_mode(regs);
-       instrumentation_begin();
- 
-       local_irq_enable();
         ti_work = READ_ONCE(current_thread_info()->flags);
         if (ti_work & SYSCALL_ENTER_WORK)
                 syscall = syscall_trace_enter(regs, syscall, ti_work);
-       instrumentation_end();
   
         return syscall;
   }
   
+ long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
+ {
+       return __syscall_enter_from_user_work(regs, syscall);
+ }
+ 
+ noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
+ {
+       long ret;
+ 
+       enter_from_user_mode(regs);
+ 
+       instrumentation_begin();
+       local_irq_enable();
+       ret = __syscall_enter_from_user_work(regs, syscall);
+       instrumentation_end();
+ 
+       return ret;
+ }
+ 
+ noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
+ {
+       enter_from_user_mode(regs);
+       instrumentation_begin();
+       local_irq_enable();
+       instrumentation_end();
+ }
+ 
   /**
    * exit_to_user_mode - Fixup state when exiting to user mode
    *
@@@ -109,15 -135,7 +135,15 @@@ static __always_inline void exit_to_use
   }
   
   /* Workaround to allow gradual conversion of architecture code */
- -void __weak arch_do_signal(struct pt_regs *regs) { }
+ +void __weak arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { }
+ +
+ +static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work)
+ +{
+ +      if (ti_work & _TIF_NOTIFY_SIGNAL)
+ +              tracehook_notify_signal();
+ +
+ +      arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING);
+ +}
   
   static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
                                             unsigned long ti_work)
@@@ -139,11 -157,10 +165,10 @@@
                 if (ti_work & _TIF_PATCH_PENDING)
                         klp_update_patch_state(current);
   
- -              if (ti_work & _TIF_SIGPENDING)
- -                      arch_do_signal(regs);
+ +              if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
+ +                      handle_signal_work(regs, ti_work);
   
                 if (ti_work & _TIF_NOTIFY_RESUME) {
-                       clear_thread_flag(TIF_NOTIFY_RESUME);
                         tracehook_notify_resume(regs);
                         rseq_handle_notify_resume(NULL, regs);
                 }
@@@ -190,7 -207,7 +215,7 @@@ static inline bool report_single_step(u
   /*
    * If TIF_SYSCALL_EMU is set, then the only reason to report is when
    * TIF_SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP).  This syscall
-  * instruction has been already reported in syscall_enter_from_usermode().
+  * instruction has been already reported in syscall_enter_from_user_mode().
    */
   #define SYSEMU_STEP   (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)
   
@@@ -286,7 -303,7 +311,7 @@@ noinstr irqentry_state_t irqentry_enter
          * terminate a grace period, if and only if the timer interrupt is
          * not nested into another interrupt.
          *
-        * Checking for __rcu_is_watching() here would prevent the nesting
+        * Checking for rcu_is_watching() here would prevent the nesting
          * interrupt to invoke rcu_irq_enter(). If that nested interrupt is
          * the tick then rcu_flavor_sched_clock_irq() would wrongfully
          * assume that it is the first interupt and eventually claim
@@@ -302,7 -319,7 +327,7 @@@
                 /*
                  * If RCU is not watching then the same careful
                  * sequence vs. lockdep and tracing is required
- -               * as in irq_enter_from_user_mode().
+ +               * as in irqentry_enter_from_user_mode().
                  */
                 lockdep_hardirqs_off(CALLER_ADDR0);
                 rcu_irq_enter();
@@@ -320,10 -337,10 +345,10 @@@
          * already contains a warning when RCU is not watching, so no point
          * in having another one here.
          */
+       lockdep_hardirqs_off(CALLER_ADDR0);
         instrumentation_begin();
         rcu_irq_enter_check_tick();
-       /* Use the combo lockdep/tracing function */
-       trace_hardirqs_off();
+       trace_hardirqs_off_finish();
         instrumentation_end();
   
         return ret;
diff --combined kernel/entry/kvm.c

index b828a3d,b6678a5..49972ee
--- 1/kernel/entry/kvm.c
--- 2/kernel/entry/kvm.c
+++ b/kernel/entry/kvm.c
@@@ -8,9 -8,6 +8,9 @@@ static int xfer_to_guest_mode_work(stru
         do {
                 int ret;
   
+ +              if (ti_work & _TIF_NOTIFY_SIGNAL)
+ +                      tracehook_notify_signal();
+ +
                 if (ti_work & _TIF_SIGPENDING) {
                         kvm_handle_signal_exit(vcpu);
                         return -EINTR;
@@@ -19,10 -16,8 +19,8 @@@
                 if (ti_work & _TIF_NEED_RESCHED)
                         schedule();
   
-               if (ti_work & _TIF_NOTIFY_RESUME) {
-                       clear_thread_flag(TIF_NOTIFY_RESUME);
+               if (ti_work & _TIF_NOTIFY_RESUME)
                         tracehook_notify_resume(NULL);
-               }
   
                 ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work);
                 if (ret)
diff --combined kernel/events/uprobes.c

index edd0c98,00b0358..bf9edd8
--- 1/kernel/events/uprobes.c
--- 2/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@@ -205,7 -205,7 +205,7 @@@ static int __replace_page(struct vm_are
                 try_to_free_swap(old_page);
         page_vma_mapped_walk_done(&pvmw);
   
-       if (vma->vm_flags & VM_LOCKED)
+       if ((vma->vm_flags & VM_LOCKED) && !PageCompound(old_page))
                 munlock_vma_page(old_page);
         put_page(old_page);
   
@@@ -1823,7 -1823,7 +1823,7 @@@ void uprobe_copy_process(struct task_st
   
         t->utask->dup_xol_addr = area->vaddr;
         init_task_work(&t->utask->dup_xol_work, dup_xol_work);
-       task_work_add(t, &t->utask->dup_xol_work, true);
+       task_work_add(t, &t->utask->dup_xol_work, TWA_RESUME);
   }
   
   /*
@@@ -1973,7 -1973,7 +1973,7 @@@ bool uprobe_deny_signal(void
   
         WARN_ON_ONCE(utask->state != UTASK_SSTEP);
   
- -      if (signal_pending(t)) {
+ +      if (task_sigpending(t)) {
                 spin_lock_irq(&t->sighand->siglock);
                 clear_tsk_thread_flag(t, TIF_SIGPENDING);
                 spin_unlock_irq(&t->sighand->siglock);
diff --combined kernel/signal.c

index 61b377e,a38b3ed..f67ea9a
--- 1/kernel/signal.c
--- 2/kernel/signal.c
+++ b/kernel/signal.c
@@@ -851,7 -851,7 +851,7 @@@ static int check_kill_permission(int si
                          */
                         if (!sid || sid == task_session(current))
                                 break;
-                       /* fall through */
+                       fallthrough;
                 default:
                         return -EPERM;
                 }
@@@ -983,7 -983,7 +983,7 @@@ static inline bool wants_signal(int sig
         if (task_is_stopped_or_traced(p))
                 return false;
   
- -      return task_curr(p) || !signal_pending(p);
+ +      return task_curr(p) || !task_sigpending(p);
   }
   
   static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
@@@ -2529,20 -2529,6 +2529,20 @@@ bool get_signal(struct ksignal *ksig
         struct signal_struct *signal = current->signal;
         int signr;
   
+ +      /*
+ +       * For non-generic architectures, check for TIF_NOTIFY_SIGNAL so
+ +       * that the arch handlers don't all have to do it. If we get here
+ +       * without TIF_SIGPENDING, just exit after running signal work.
+ +       */
+ +#ifdef TIF_NOTIFY_SIGNAL
+ +      if (!IS_ENABLED(CONFIG_GENERIC_ENTRY)) {
+ +              if (test_thread_flag(TIF_NOTIFY_SIGNAL))
+ +                      tracehook_notify_signal();
+ +              if (!task_sigpending(current))
+ +                      return false;
+ +      }
+ +#endif
+ +
         if (unlikely(uprobe_deny_signal()))
                 return false;
   
@@@ -2836,7 -2822,7 +2836,7 @@@ static void retarget_shared_pending(str
                 /* Remove the signals this thread can handle. */
                 sigandsets(&retarget, &retarget, &t->blocked);
   
- -              if (!signal_pending(t))
+ +              if (!task_sigpending(t))
                         signal_wake_up(t, 0);
   
                 if (sigisemptyset(&retarget))
@@@ -2870,7 -2856,7 +2870,7 @@@ void exit_signals(struct task_struct *t
   
         cgroup_threadgroup_change_end(tsk);
   
- -      if (!signal_pending(tsk))
+ +      if (!task_sigpending(tsk))
                 goto out;
   
         unblocked = tsk->blocked;
@@@ -2914,7 -2900,7 +2914,7 @@@ long do_no_restart_syscall(struct resta
   
   static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset)
   {
- -      if (signal_pending(tsk) && !thread_group_empty(tsk)) {
+ +      if (task_sigpending(tsk) && !thread_group_empty(tsk)) {
                 sigset_t newblocked;
                 /* A set of now blocked but previously unblocked signals. */
                 sigandnsets(&newblocked, newset, &current->blocked);
diff --combined kernel/task_work.c

index ae05889,8d6e121..15b0872
--- 1/kernel/task_work.c
--- 2/kernel/task_work.c
+++ b/kernel/task_work.c
@@@ -5,57 -5,35 +5,62 @@@
   
   static struct callback_head work_exited; /* all we need is ->next == NULL */
   
+ +/*
+ + * TWA_SIGNAL signaling - use TIF_NOTIFY_SIGNAL, if available, as it's faster
+ + * than TIF_SIGPENDING as there's no dependency on ->sighand. The latter is
+ + * shared for threads, and can cause contention on sighand->lock. Even for
+ + * the non-threaded case TIF_NOTIFY_SIGNAL is more efficient, as no locking
+ + * or IRQ disabling is involved for notification (or running) purposes.
+ + */
+ +static void task_work_notify_signal(struct task_struct *task)
+ +{
+ +#if defined(TIF_NOTIFY_SIGNAL)
+ +      set_notify_signal(task);
+ +#else
+ +      unsigned long flags;
+ +
+ +      /*
+ +       * Only grab the sighand lock if we don't already have some
+ +       * task_work pending. This pairs with the smp_store_mb()
+ +       * in get_signal(), see comment there.
+ +       */
+ +      if (!(READ_ONCE(task->jobctl) & JOBCTL_TASK_WORK) &&
+ +          lock_task_sighand(task, &flags)) {
+ +              task->jobctl |= JOBCTL_TASK_WORK;
+ +              signal_wake_up(task, 0);
+ +              unlock_task_sighand(task, &flags);
+ +      }
+ +#endif
+ +}
+ +
   /**
    * task_work_add - ask the @task to execute @work->func()
    * @task: the task which should run the callback
    * @work: the callback to run
-  * @notify: send the notification if true
+  * @notify: how to notify the targeted task
    *
-  * Queue @work for task_work_run() below and notify the @task if @notify.
-  * Fails if the @task is exiting/exited and thus it can't process this @work.
-  * Otherwise @work->func() will be called when the @task returns from kernel
-  * mode or exits.
+  * Queue @work for task_work_run() below and notify the @task if @notify
+  * is @TWA_RESUME or @TWA_SIGNAL. @TWA_SIGNAL works like signals, in that
+  * it will interrupt the targeted task and run the task_work. @TWA_RESUME
+  * work is run only when the task exits the kernel and returns to user mode,
+  * or before entering guest mode. Fails if the @task is exiting/exited and thus
+  * it can't process this @work. Otherwise @work->func() will be called when the
+  * @task goes through one of the aforementioned transitions, or exits.
    *
-  * This is like the signal handler which runs in kernel mode, but it doesn't
-  * try to wake up the @task.
+  * If the targeted task is exiting, then an error is returned and the work item
+  * is not queued. It's up to the caller to arrange for an alternative mechanism
+  * in that case.
    *
-  * Note: there is no ordering guarantee on works queued here.
+  * Note: there is no ordering guarantee on works queued here. The task_work
+  * list is LIFO.
    *
    * RETURNS:
    * 0 if succeeds or -ESRCH.
    */
- int
- task_work_add(struct task_struct *task, struct callback_head *work, int notify)
+ int task_work_add(struct task_struct *task, struct callback_head *work,
+                 enum task_work_notify_mode notify)
   {
         struct callback_head *head;
- -      unsigned long flags;
   
         do {
                 head = READ_ONCE(task->task_works);
@@@ -65,12 -43,27 +70,17 @@@
         } while (cmpxchg(&task->task_works, head, work) != head);
   
         switch (notify) {
+       case TWA_NONE:
+               break;
         case TWA_RESUME:
                 set_notify_resume(task);
                 break;
         case TWA_SIGNAL:
- -              /*
- -               * Only grab the sighand lock if we don't already have some
- -               * task_work pending. This pairs with the smp_store_mb()
- -               * in get_signal(), see comment there.
- -               */
- -              if (!(READ_ONCE(task->jobctl) & JOBCTL_TASK_WORK) &&
- -                  lock_task_sighand(task, &flags)) {
- -                      task->jobctl |= JOBCTL_TASK_WORK;
- -                      signal_wake_up(task, 0);
- -                      unlock_task_sighand(task, &flags);
- -              }
+ +              task_work_notify_signal(task);
                 break;
+       default:
+               WARN_ON_ONCE(1);
+               break;
         }
   
         return 0;
author	Thomas Gleixner <tglx@linutronix.de>
	Wed, 4 Nov 2020 17:14:52 +0000 (18:14 +0100)
committer	Thomas Gleixner <tglx@linutronix.de>
	Wed, 4 Nov 2020 17:14:52 +0000 (18:14 +0100)
		1	2
arch/x86/kernel/signal.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/entry-common.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/tracehook.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/entry/common.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/entry/kvm.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/events/uprobes.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/signal.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/task_work.c	patch \|	diff1 \|	diff2 \|	blob \| history