// SPDX-License-Identifier: GPL-2.0

#include <linux/context_tracking.h>
#include <linux/entry-common.h>
#include <linux/livepatch.h>
#include <linux/audit.h>

#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>

/**
 * enter_from_user_mode - Establish state when coming from user mode
 *
 * Syscall/interrupt entry disables interrupts, but user mode is traced as
 * interrupts enabled. Also with NO_HZ_FULL RCU might be idle.
 *
 * 1) Tell lockdep that interrupts are disabled
 * 2) Invoke context tracking if enabled to reactivate RCU
 * 3) Trace interrupts off state
 */
static __always_inline void enter_from_user_mode(struct pt_regs *regs)
{
        arch_check_user_regs(regs);
        lockdep_hardirqs_off(CALLER_ADDR0);

        CT_WARN_ON(ct_state() != CONTEXT_USER);
        user_exit_irqoff();

        instrumentation_begin();
        trace_hardirqs_off_finish();
        instrumentation_end();
}

static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
{
        if (unlikely(audit_context())) {
                unsigned long args[6];

                syscall_get_arguments(current, regs, args);
                audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
        }
}

static long syscall_trace_enter(struct pt_regs *regs, long syscall,
                                unsigned long ti_work)
{
        long ret = 0;

        /* Handle ptrace */
        if (ti_work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) {
                ret = arch_syscall_enter_tracehook(regs);
                if (ret || (ti_work & _TIF_SYSCALL_EMU))
                        return -1L;
        }

        /* Do seccomp after ptrace, to catch any tracer changes. */
        if (ti_work & _TIF_SECCOMP) {
                ret = __secure_computing(NULL);
                if (ret == -1L)
                        return ret;
        }

        if (unlikely(ti_work & _TIF_SYSCALL_TRACEPOINT))
                trace_sys_enter(regs, syscall);

        syscall_enter_audit(regs, syscall);

        /* The above might have changed the syscall number */
        return ret ? : syscall_get_nr(current, regs);
}

static __always_inline long
__syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
{
        unsigned long ti_work;

        ti_work = READ_ONCE(current_thread_info()->flags);
        if (ti_work & SYSCALL_ENTER_WORK)
                syscall = syscall_trace_enter(regs, syscall, ti_work);

        return syscall;
}

long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
{
        return __syscall_enter_from_user_work(regs, syscall);
}

noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
{
        long ret;

        enter_from_user_mode(regs);

        instrumentation_begin();
        local_irq_enable();
        ret = __syscall_enter_from_user_work(regs, syscall);
        instrumentation_end();

        return ret;
}
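
/*
 * Usage sketch (not part of this file; the arch_* names below are
 * hypothetical): an architecture's syscall entry code is expected to pair
 * syscall_enter_from_user_mode() with syscall_exit_to_user_mode(), roughly:
 *
 *        __visible void arch_do_syscall(struct pt_regs *regs)
 *        {
 *                long nr = syscall_enter_from_user_mode(regs, arch_syscall_nr(regs));
 *
 *                if (nr >= 0 && nr < arch_nr_syscalls)
 *                        arch_invoke_syscall(regs, nr);
 *
 *                syscall_exit_to_user_mode(regs);
 *        }
 *
 * A negative return value (e.g. -1 after ptrace or seccomp intervention)
 * means the syscall must not be dispatched; the exit path still runs.
 */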

noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
{
        enter_from_user_mode(regs);
        instrumentation_begin();
        local_irq_enable();
        instrumentation_end();
}

/**
 * exit_to_user_mode - Fixup state when exiting to user mode
 *
 * Syscall/interrupt exit enables interrupts, but the kernel state is
 * interrupts disabled when this is invoked. Also tell RCU about it.
 *
 * 1) Trace interrupts on state
 * 2) Invoke context tracking if enabled to adjust RCU state
 * 3) Invoke architecture specific last minute exit code, e.g. speculation
 *    mitigations, etc.
 * 4) Tell lockdep that interrupts are enabled
 */
static __always_inline void exit_to_user_mode(void)
{
        instrumentation_begin();
        trace_hardirqs_on_prepare();
        lockdep_hardirqs_on_prepare(CALLER_ADDR0);
        instrumentation_end();

        user_enter_irqoff();
        arch_exit_to_user_mode();
        lockdep_hardirqs_on(CALLER_ADDR0);
}

/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal(struct pt_regs *regs) { }

static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
                                            unsigned long ti_work)
{
        /*
         * Before returning to user space ensure that all pending work
         * items have been completed.
         */
        while (ti_work & EXIT_TO_USER_MODE_WORK) {

                local_irq_enable_exit_to_user(ti_work);

                if (ti_work & _TIF_NEED_RESCHED)
                        schedule();

                if (ti_work & _TIF_UPROBE)
                        uprobe_notify_resume(regs);

                if (ti_work & _TIF_PATCH_PENDING)
                        klp_update_patch_state(current);

                if (ti_work & _TIF_SIGPENDING)
                        arch_do_signal(regs);

                if (ti_work & _TIF_NOTIFY_RESUME) {
                        clear_thread_flag(TIF_NOTIFY_RESUME);
                        tracehook_notify_resume(regs);
                        rseq_handle_notify_resume(NULL, regs);
                }

                /* Architecture specific TIF work */
                arch_exit_to_user_mode_work(regs, ti_work);

                /*
                 * Disable interrupts and reevaluate the work flags as they
                 * might have changed while interrupts and preemption were
                 * enabled above.
                 */
                local_irq_disable_exit_to_user();
                ti_work = READ_ONCE(current_thread_info()->flags);
        }

        /* Return the latest work state for arch_exit_to_user_mode() */
        return ti_work;
}

static void exit_to_user_mode_prepare(struct pt_regs *regs)
{
        unsigned long ti_work = READ_ONCE(current_thread_info()->flags);

        lockdep_assert_irqs_disabled();

        if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
                ti_work = exit_to_user_mode_loop(regs, ti_work);

        arch_exit_to_user_mode_prepare(regs, ti_work);

        /* Ensure that the address limit is intact and no locks are held */
        addr_limit_user_check();
        lockdep_assert_irqs_disabled();
        lockdep_sys_exit();
}

#ifndef _TIF_SINGLESTEP
static inline bool report_single_step(unsigned long ti_work)
{
        return false;
}
#else
/*
 * If TIF_SYSCALL_EMU is set, then the only reason to report is when
 * TIF_SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP).  This syscall
 * instruction has already been reported in syscall_enter_from_user_mode().
 */
#define SYSEMU_STEP     (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)

static inline bool report_single_step(unsigned long ti_work)
{
        return (ti_work & SYSEMU_STEP) == _TIF_SINGLESTEP;
}
#endif
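
/*
 * Worked example of the SYSEMU_STEP masking in report_single_step():
 *
 *        _TIF_SINGLESTEP only                  -> masked value == _TIF_SINGLESTEP -> report
 *        _TIF_SINGLESTEP | _TIF_SYSCALL_EMU    -> masked value == SYSEMU_STEP     -> no report
 *        neither flag                          -> masked value == 0               -> no report
 */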

static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work)
{
        bool step;

        audit_syscall_exit(regs);

        if (ti_work & _TIF_SYSCALL_TRACEPOINT)
                trace_sys_exit(regs, syscall_get_return_value(current, regs));

        step = report_single_step(ti_work);
        if (step || ti_work & _TIF_SYSCALL_TRACE)
                arch_syscall_exit_tracehook(regs, step);
}

/*
 * Syscall specific exit to user mode preparation. Runs with interrupts
 * enabled.
 */
static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
{
        u32 cached_flags = READ_ONCE(current_thread_info()->flags);
        unsigned long nr = syscall_get_nr(current, regs);

        CT_WARN_ON(ct_state() != CONTEXT_KERNEL);

        if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
                if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
                        local_irq_enable();
        }

        rseq_syscall(regs);

        /*
         * Do one-time syscall specific work. If these work items are
         * enabled, we want to run them exactly once per syscall exit with
         * interrupts enabled.
         */
        if (unlikely(cached_flags & SYSCALL_EXIT_WORK))
                syscall_exit_work(regs, cached_flags);
}

__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
{
        instrumentation_begin();
        syscall_exit_to_user_mode_prepare(regs);
        local_irq_disable_exit_to_user();
        exit_to_user_mode_prepare(regs);
        instrumentation_end();
        exit_to_user_mode();
}

noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
        enter_from_user_mode(regs);
}

noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
{
        instrumentation_begin();
        exit_to_user_mode_prepare(regs);
        instrumentation_end();
        exit_to_user_mode();
}

noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
{
        irqentry_state_t ret = {
                .exit_rcu = false,
        };

        if (user_mode(regs)) {
                irqentry_enter_from_user_mode(regs);
                return ret;
        }

        /*
         * If this entry hit the idle task invoke rcu_irq_enter() whether
         * RCU is watching or not.
         *
         * Interrupts can nest when the first interrupt invokes softirq
         * processing on return which enables interrupts.
         *
         * Scheduler ticks in the idle task can mark quiescent state and
         * terminate a grace period, if and only if the timer interrupt is
         * not nested into another interrupt.
         *
         * Checking for __rcu_is_watching() here would prevent the nesting
         * interrupt from invoking rcu_irq_enter(). If that nested interrupt
         * is the tick then rcu_flavor_sched_clock_irq() would wrongfully
         * assume that it is the first interrupt and eventually claim
         * quiescent state and end grace periods prematurely.
         *
         * Unconditionally invoke rcu_irq_enter() so RCU state stays
         * consistent.
         *
         * TINY_RCU does not support EQS, so let the compiler eliminate
         * this part when enabled.
         */
        if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
                /*
                 * If RCU is not watching then the same careful
                 * sequence vs. lockdep and tracing is required
                 * as in irqentry_enter_from_user_mode().
                 */
                lockdep_hardirqs_off(CALLER_ADDR0);
                rcu_irq_enter();
                instrumentation_begin();
                trace_hardirqs_off_finish();
                instrumentation_end();

                ret.exit_rcu = true;
                return ret;
        }

        /*
         * If RCU is watching then RCU only wants to check whether it needs
         * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
         * already contains a warning when RCU is not watching, so no point
         * in having another one here.
         */
        instrumentation_begin();
        rcu_irq_enter_check_tick();
        /* Use the combo lockdep/tracing function */
        trace_hardirqs_off();
        instrumentation_end();

        return ret;
}

void irqentry_exit_cond_resched(void)
{
        if (!preempt_count()) {
                /* Sanity check RCU and thread stack */
                rcu_irq_exit_check_preempt();
                if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
                        WARN_ON_ONCE(!on_thread_stack());
                if (need_resched())
                        preempt_schedule_irq();
        }
}

noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
{
        lockdep_assert_irqs_disabled();

        /* Check whether this returns to user mode */
        if (user_mode(regs)) {
                irqentry_exit_to_user_mode(regs);
        } else if (!regs_irqs_disabled(regs)) {
                /*
                 * If RCU was not watching on entry this needs to be done
                 * carefully and needs the same ordering of lockdep/tracing
                 * and RCU as the return to user mode path.
                 */
                if (state.exit_rcu) {
                        instrumentation_begin();
                        /* Tell the tracer that IRET will enable interrupts */
                        trace_hardirqs_on_prepare();
                        lockdep_hardirqs_on_prepare(CALLER_ADDR0);
                        instrumentation_end();
                        rcu_irq_exit();
                        lockdep_hardirqs_on(CALLER_ADDR0);
                        return;
                }

                instrumentation_begin();
                if (IS_ENABLED(CONFIG_PREEMPTION))
                        irqentry_exit_cond_resched();
                /* Covers both tracing and lockdep */
                trace_hardirqs_on();
                instrumentation_end();
        } else {
                /*
                 * IRQ flags state is correct already. Just tell RCU if it
                 * was not watching on entry.
                 */
                if (state.exit_rcu)
                        rcu_irq_exit();
        }
}
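
/*
 * Usage sketch (not part of this file; the arch_* names are hypothetical):
 * an architecture's interrupt/exception entry is expected to bracket its
 * handler with irqentry_enter()/irqentry_exit(), roughly:
 *
 *        __visible void arch_handle_irq(struct pt_regs *regs)
 *        {
 *                irqentry_state_t state = irqentry_enter(regs);
 *
 *                instrumentation_begin();
 *                arch_dispatch_irq(regs);
 *                instrumentation_end();
 *
 *                irqentry_exit(regs, state);
 *        }
 *
 * The state returned by irqentry_enter() must be handed back to
 * irqentry_exit() so that rcu_irq_exit() is only invoked when exit_rcu
 * was set on entry.
 */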