62fb67e5c59cad7c5a53dc7fd90778193abfa6f5
[platform/kernel/linux-rpi.git] / kernel / cpu.c
1 /* CPU control.
2  * (C) 2001, 2002, 2003, 2004 Rusty Russell
3  *
4  * This code is licenced under the GPL.
5  */
6 #include <linux/sched/mm.h>
7 #include <linux/proc_fs.h>
8 #include <linux/smp.h>
9 #include <linux/init.h>
10 #include <linux/notifier.h>
11 #include <linux/sched/signal.h>
12 #include <linux/sched/hotplug.h>
13 #include <linux/sched/isolation.h>
14 #include <linux/sched/task.h>
15 #include <linux/sched/smt.h>
16 #include <linux/unistd.h>
17 #include <linux/cpu.h>
18 #include <linux/oom.h>
19 #include <linux/rcupdate.h>
20 #include <linux/export.h>
21 #include <linux/bug.h>
22 #include <linux/kthread.h>
23 #include <linux/stop_machine.h>
24 #include <linux/mutex.h>
25 #include <linux/gfp.h>
26 #include <linux/suspend.h>
27 #include <linux/lockdep.h>
28 #include <linux/tick.h>
29 #include <linux/irq.h>
30 #include <linux/nmi.h>
31 #include <linux/smpboot.h>
32 #include <linux/relay.h>
33 #include <linux/slab.h>
34 #include <linux/percpu-rwsem.h>
35 #include <linux/cpuset.h>
36
37 #include <trace/events/power.h>
38 #define CREATE_TRACE_POINTS
39 #include <trace/events/cpuhp.h>
40
41 #include "smpboot.h"
42
43 /**
44  * struct cpuhp_cpu_state - Per cpu hotplug state storage
45  * @state:      The current cpu state
46  * @target:     The target state
47  * @fail:       Current CPU hotplug callback state
48  * @thread:     Pointer to the hotplug thread
49  * @should_run: Thread should execute
50  * @rollback:   Perform a rollback
51  * @single:     Single callback invocation
52  * @bringup:    Single callback bringup or teardown selector
53  * @cpu:        CPU number
54  * @node:       Remote CPU node; for multi-instance, do a
55  *              single entry callback for install/remove
56  * @last:       For multi-instance rollback, remember how far we got
57  * @cb_state:   The state for a single callback (install/uninstall)
58  * @result:     Result of the operation
59  * @done_up:    Signal completion to the issuer of the task for cpu-up
60  * @done_down:  Signal completion to the issuer of the task for cpu-down
61  */
62 struct cpuhp_cpu_state {
63         enum cpuhp_state        state;
64         enum cpuhp_state        target;
65         enum cpuhp_state        fail;
66 #ifdef CONFIG_SMP
67         struct task_struct      *thread;
68         bool                    should_run;
69         bool                    rollback;
70         bool                    single;
71         bool                    bringup;
72         int                     cpu;
73         struct hlist_node       *node;
74         struct hlist_node       *last;
75         enum cpuhp_state        cb_state;
76         int                     result;
77         struct completion       done_up;
78         struct completion       done_down;
79 #endif
80 };
81
82 static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = {
83         .fail = CPUHP_INVALID,
84 };
85
86 #ifdef CONFIG_SMP
87 cpumask_t cpus_booted_once_mask;
88 #endif
89
90 #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP)
91 static struct lockdep_map cpuhp_state_up_map =
92         STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map);
93 static struct lockdep_map cpuhp_state_down_map =
94         STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map);
95
96
97 static inline void cpuhp_lock_acquire(bool bringup)
98 {
99         lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
100 }
101
102 static inline void cpuhp_lock_release(bool bringup)
103 {
104         lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
105 }
106 #else
107
108 static inline void cpuhp_lock_acquire(bool bringup) { }
109 static inline void cpuhp_lock_release(bool bringup) { }
110
111 #endif
112
113 /**
114  * struct cpuhp_step - Hotplug state machine step
115  * @name:       Name of the step
116  * @startup:    Startup function of the step
117  * @teardown:   Teardown function of the step
118  * @cant_stop:  Bringup/teardown can't be stopped at this step
119  * @multi_instance:     State has multiple instances which get added afterwards
120  */
121 struct cpuhp_step {
122         const char              *name;
123         union {
124                 int             (*single)(unsigned int cpu);
125                 int             (*multi)(unsigned int cpu,
126                                          struct hlist_node *node);
127         } startup;
128         union {
129                 int             (*single)(unsigned int cpu);
130                 int             (*multi)(unsigned int cpu,
131                                          struct hlist_node *node);
132         } teardown;
133         /* private: */
134         struct hlist_head       list;
135         /* public: */
136         bool                    cant_stop;
137         bool                    multi_instance;
138 };
139
140 static DEFINE_MUTEX(cpuhp_state_mutex);
141 static struct cpuhp_step cpuhp_hp_states[];
142
143 static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
144 {
145         return cpuhp_hp_states + state;
146 }
147
148 static bool cpuhp_step_empty(bool bringup, struct cpuhp_step *step)
149 {
150         return bringup ? !step->startup.single : !step->teardown.single;
151 }
152
153 /**
154  * cpuhp_invoke_callback - Invoke the callbacks for a given state
155  * @cpu:        The cpu for which the callback should be invoked
156  * @state:      The state to do callbacks for
157  * @bringup:    True if the bringup callback should be invoked
158  * @node:       For multi-instance, do a single entry callback for install/remove
159  * @lastp:      For multi-instance rollback, remember how far we got
160  *
161  * Called from cpu hotplug and from the state register machinery.
162  *
163  * Return: %0 on success or a negative errno code
164  */
165 static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
166                                  bool bringup, struct hlist_node *node,
167                                  struct hlist_node **lastp)
168 {
169         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
170         struct cpuhp_step *step = cpuhp_get_step(state);
171         int (*cbm)(unsigned int cpu, struct hlist_node *node);
172         int (*cb)(unsigned int cpu);
173         int ret, cnt;
174
175         if (st->fail == state) {
176                 st->fail = CPUHP_INVALID;
177                 return -EAGAIN;
178         }
179
180         if (cpuhp_step_empty(bringup, step)) {
181                 WARN_ON_ONCE(1);
182                 return 0;
183         }
184
185         if (!step->multi_instance) {
186                 WARN_ON_ONCE(lastp && *lastp);
187                 cb = bringup ? step->startup.single : step->teardown.single;
188
189                 trace_cpuhp_enter(cpu, st->target, state, cb);
190                 ret = cb(cpu);
191                 trace_cpuhp_exit(cpu, st->state, state, ret);
192                 return ret;
193         }
194         cbm = bringup ? step->startup.multi : step->teardown.multi;
195
196         /* Single invocation for instance add/remove */
197         if (node) {
198                 WARN_ON_ONCE(lastp && *lastp);
199                 trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
200                 ret = cbm(cpu, node);
201                 trace_cpuhp_exit(cpu, st->state, state, ret);
202                 return ret;
203         }
204
205         /* State transition. Invoke on all instances */
206         cnt = 0;
207         hlist_for_each(node, &step->list) {
208                 if (lastp && node == *lastp)
209                         break;
210
211                 trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
212                 ret = cbm(cpu, node);
213                 trace_cpuhp_exit(cpu, st->state, state, ret);
214                 if (ret) {
215                         if (!lastp)
216                                 goto err;
217
218                         *lastp = node;
219                         return ret;
220                 }
221                 cnt++;
222         }
223         if (lastp)
224                 *lastp = NULL;
225         return 0;
226 err:
227         /* Rollback the instances if one failed */
228         cbm = !bringup ? step->startup.multi : step->teardown.multi;
229         if (!cbm)
230                 return ret;
231
232         hlist_for_each(node, &step->list) {
233                 if (!cnt--)
234                         break;
235
236                 trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
237                 ret = cbm(cpu, node);
238                 trace_cpuhp_exit(cpu, st->state, state, ret);
239                 /*
240                  * Rollback must not fail,
241                  */
242                 WARN_ON_ONCE(ret);
243         }
244         return ret;
245 }
246
247 #ifdef CONFIG_SMP
248 static bool cpuhp_is_ap_state(enum cpuhp_state state)
249 {
250         /*
251          * The extra check for CPUHP_TEARDOWN_CPU is only for documentation
252          * purposes as that state is handled explicitly in cpu_down.
253          */
254         return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU;
255 }
256
257 static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
258 {
259         struct completion *done = bringup ? &st->done_up : &st->done_down;
260         wait_for_completion(done);
261 }
262
263 static inline void complete_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
264 {
265         struct completion *done = bringup ? &st->done_up : &st->done_down;
266         complete(done);
267 }
268
269 /*
270  * The former STARTING/DYING states, ran with IRQs disabled and must not fail.
271  */
272 static bool cpuhp_is_atomic_state(enum cpuhp_state state)
273 {
274         return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE;
275 }
276
277 /* Serializes the updates to cpu_online_mask, cpu_present_mask */
278 static DEFINE_MUTEX(cpu_add_remove_lock);
279 bool cpuhp_tasks_frozen;
280 EXPORT_SYMBOL_GPL(cpuhp_tasks_frozen);
281
282 /*
283  * The following two APIs (cpu_maps_update_begin/done) must be used when
284  * attempting to serialize the updates to cpu_online_mask & cpu_present_mask.
285  */
286 void cpu_maps_update_begin(void)
287 {
288         mutex_lock(&cpu_add_remove_lock);
289 }
290
291 void cpu_maps_update_done(void)
292 {
293         mutex_unlock(&cpu_add_remove_lock);
294 }
295
296 /*
297  * If set, cpu_up and cpu_down will return -EBUSY and do nothing.
298  * Should always be manipulated under cpu_add_remove_lock
299  */
300 static int cpu_hotplug_disabled;
301
302 #ifdef CONFIG_HOTPLUG_CPU
303
304 DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock);
305
306 void cpus_read_lock(void)
307 {
308         percpu_down_read(&cpu_hotplug_lock);
309 }
310 EXPORT_SYMBOL_GPL(cpus_read_lock);
311
312 int cpus_read_trylock(void)
313 {
314         return percpu_down_read_trylock(&cpu_hotplug_lock);
315 }
316 EXPORT_SYMBOL_GPL(cpus_read_trylock);
317
318 void cpus_read_unlock(void)
319 {
320         percpu_up_read(&cpu_hotplug_lock);
321 }
322 EXPORT_SYMBOL_GPL(cpus_read_unlock);
323
324 void cpus_write_lock(void)
325 {
326         percpu_down_write(&cpu_hotplug_lock);
327 }
328
329 void cpus_write_unlock(void)
330 {
331         percpu_up_write(&cpu_hotplug_lock);
332 }
333
334 void lockdep_assert_cpus_held(void)
335 {
336         /*
337          * We can't have hotplug operations before userspace starts running,
338          * and some init codepaths will knowingly not take the hotplug lock.
339          * This is all valid, so mute lockdep until it makes sense to report
340          * unheld locks.
341          */
342         if (system_state < SYSTEM_RUNNING)
343                 return;
344
345         percpu_rwsem_assert_held(&cpu_hotplug_lock);
346 }
347
348 #ifdef CONFIG_LOCKDEP
349 int lockdep_is_cpus_held(void)
350 {
351         return percpu_rwsem_is_held(&cpu_hotplug_lock);
352 }
353 #endif
354
355 static void lockdep_acquire_cpus_lock(void)
356 {
357         rwsem_acquire(&cpu_hotplug_lock.dep_map, 0, 0, _THIS_IP_);
358 }
359
360 static void lockdep_release_cpus_lock(void)
361 {
362         rwsem_release(&cpu_hotplug_lock.dep_map, _THIS_IP_);
363 }
364
365 /*
366  * Wait for currently running CPU hotplug operations to complete (if any) and
367  * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects
368  * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the
369  * hotplug path before performing hotplug operations. So acquiring that lock
370  * guarantees mutual exclusion from any currently running hotplug operations.
371  */
372 void cpu_hotplug_disable(void)
373 {
374         cpu_maps_update_begin();
375         cpu_hotplug_disabled++;
376         cpu_maps_update_done();
377 }
378 EXPORT_SYMBOL_GPL(cpu_hotplug_disable);
379
380 static void __cpu_hotplug_enable(void)
381 {
382         if (WARN_ONCE(!cpu_hotplug_disabled, "Unbalanced cpu hotplug enable\n"))
383                 return;
384         cpu_hotplug_disabled--;
385 }
386
387 void cpu_hotplug_enable(void)
388 {
389         cpu_maps_update_begin();
390         __cpu_hotplug_enable();
391         cpu_maps_update_done();
392 }
393 EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
394
395 #else
396
397 static void lockdep_acquire_cpus_lock(void)
398 {
399 }
400
401 static void lockdep_release_cpus_lock(void)
402 {
403 }
404
405 #endif  /* CONFIG_HOTPLUG_CPU */
406
407 /*
408  * Architectures that need SMT-specific errata handling during SMT hotplug
409  * should override this.
410  */
411 void __weak arch_smt_update(void) { }
412
413 #ifdef CONFIG_HOTPLUG_SMT
414 enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;
415
416 void __init cpu_smt_disable(bool force)
417 {
418         if (!cpu_smt_possible())
419                 return;
420
421         if (force) {
422                 pr_info("SMT: Force disabled\n");
423                 cpu_smt_control = CPU_SMT_FORCE_DISABLED;
424         } else {
425                 pr_info("SMT: disabled\n");
426                 cpu_smt_control = CPU_SMT_DISABLED;
427         }
428 }
429
430 /*
431  * The decision whether SMT is supported can only be done after the full
432  * CPU identification. Called from architecture code.
433  */
434 void __init cpu_smt_check_topology(void)
435 {
436         if (!topology_smt_supported())
437                 cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
438 }
439
440 static int __init smt_cmdline_disable(char *str)
441 {
442         cpu_smt_disable(str && !strcmp(str, "force"));
443         return 0;
444 }
445 early_param("nosmt", smt_cmdline_disable);
446
447 static inline bool cpu_smt_allowed(unsigned int cpu)
448 {
449         if (cpu_smt_control == CPU_SMT_ENABLED)
450                 return true;
451
452         if (topology_is_primary_thread(cpu))
453                 return true;
454
455         /*
456          * On x86 it's required to boot all logical CPUs at least once so
457          * that the init code can get a chance to set CR4.MCE on each
458          * CPU. Otherwise, a broadcasted MCE observing CR4.MCE=0b on any
459          * core will shutdown the machine.
460          */
461         return !cpumask_test_cpu(cpu, &cpus_booted_once_mask);
462 }
463
464 /* Returns true if SMT is not supported of forcefully (irreversibly) disabled */
465 bool cpu_smt_possible(void)
466 {
467         return cpu_smt_control != CPU_SMT_FORCE_DISABLED &&
468                 cpu_smt_control != CPU_SMT_NOT_SUPPORTED;
469 }
470 EXPORT_SYMBOL_GPL(cpu_smt_possible);
471 #else
472 static inline bool cpu_smt_allowed(unsigned int cpu) { return true; }
473 #endif
474
475 static inline enum cpuhp_state
476 cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
477 {
478         enum cpuhp_state prev_state = st->state;
479         bool bringup = st->state < target;
480
481         st->rollback = false;
482         st->last = NULL;
483
484         st->target = target;
485         st->single = false;
486         st->bringup = bringup;
487         if (cpu_dying(st->cpu) != !bringup)
488                 set_cpu_dying(st->cpu, !bringup);
489
490         return prev_state;
491 }
492
493 static inline void
494 cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
495 {
496         bool bringup = !st->bringup;
497
498         st->target = prev_state;
499
500         /*
501          * Already rolling back. No need invert the bringup value or to change
502          * the current state.
503          */
504         if (st->rollback)
505                 return;
506
507         st->rollback = true;
508
509         /*
510          * If we have st->last we need to undo partial multi_instance of this
511          * state first. Otherwise start undo at the previous state.
512          */
513         if (!st->last) {
514                 if (st->bringup)
515                         st->state--;
516                 else
517                         st->state++;
518         }
519
520         st->bringup = bringup;
521         if (cpu_dying(st->cpu) != !bringup)
522                 set_cpu_dying(st->cpu, !bringup);
523 }
524
525 /* Regular hotplug invocation of the AP hotplug thread */
526 static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st)
527 {
528         if (!st->single && st->state == st->target)
529                 return;
530
531         st->result = 0;
532         /*
533          * Make sure the above stores are visible before should_run becomes
534          * true. Paired with the mb() above in cpuhp_thread_fun()
535          */
536         smp_mb();
537         st->should_run = true;
538         wake_up_process(st->thread);
539         wait_for_ap_thread(st, st->bringup);
540 }
541
542 static int cpuhp_kick_ap(struct cpuhp_cpu_state *st, enum cpuhp_state target)
543 {
544         enum cpuhp_state prev_state;
545         int ret;
546
547         prev_state = cpuhp_set_state(st, target);
548         __cpuhp_kick_ap(st);
549         if ((ret = st->result)) {
550                 cpuhp_reset_state(st, prev_state);
551                 __cpuhp_kick_ap(st);
552         }
553
554         return ret;
555 }
556
557 static int bringup_wait_for_ap(unsigned int cpu)
558 {
559         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
560
561         /* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */
562         wait_for_ap_thread(st, true);
563         if (WARN_ON_ONCE((!cpu_online(cpu))))
564                 return -ECANCELED;
565
566         /* Unpark the hotplug thread of the target cpu */
567         kthread_unpark(st->thread);
568
569         /*
570          * SMT soft disabling on X86 requires to bring the CPU out of the
571          * BIOS 'wait for SIPI' state in order to set the CR4.MCE bit.  The
572          * CPU marked itself as booted_once in notify_cpu_starting() so the
573          * cpu_smt_allowed() check will now return false if this is not the
574          * primary sibling.
575          */
576         if (!cpu_smt_allowed(cpu))
577                 return -ECANCELED;
578
579         if (st->target <= CPUHP_AP_ONLINE_IDLE)
580                 return 0;
581
582         return cpuhp_kick_ap(st, st->target);
583 }
584
585 static int bringup_cpu(unsigned int cpu)
586 {
587         struct task_struct *idle = idle_thread_get(cpu);
588         int ret;
589
590         /*
591          * Some architectures have to walk the irq descriptors to
592          * setup the vector space for the cpu which comes online.
593          * Prevent irq alloc/free across the bringup.
594          */
595         irq_lock_sparse();
596
597         /* Arch-specific enabling code. */
598         ret = __cpu_up(cpu, idle);
599         irq_unlock_sparse();
600         if (ret)
601                 return ret;
602         return bringup_wait_for_ap(cpu);
603 }
604
605 static int finish_cpu(unsigned int cpu)
606 {
607         struct task_struct *idle = idle_thread_get(cpu);
608         struct mm_struct *mm = idle->active_mm;
609
610         /*
611          * idle_task_exit() will have switched to &init_mm, now
612          * clean up any remaining active_mm state.
613          */
614         if (mm != &init_mm)
615                 idle->active_mm = &init_mm;
616         mmdrop(mm);
617         return 0;
618 }
619
620 /*
621  * Hotplug state machine related functions
622  */
623
624 /*
625  * Get the next state to run. Empty ones will be skipped. Returns true if a
626  * state must be run.
627  *
628  * st->state will be modified ahead of time, to match state_to_run, as if it
629  * has already ran.
630  */
631 static bool cpuhp_next_state(bool bringup,
632                              enum cpuhp_state *state_to_run,
633                              struct cpuhp_cpu_state *st,
634                              enum cpuhp_state target)
635 {
636         do {
637                 if (bringup) {
638                         if (st->state >= target)
639                                 return false;
640
641                         *state_to_run = ++st->state;
642                 } else {
643                         if (st->state <= target)
644                                 return false;
645
646                         *state_to_run = st->state--;
647                 }
648
649                 if (!cpuhp_step_empty(bringup, cpuhp_get_step(*state_to_run)))
650                         break;
651         } while (true);
652
653         return true;
654 }
655
656 static int cpuhp_invoke_callback_range(bool bringup,
657                                        unsigned int cpu,
658                                        struct cpuhp_cpu_state *st,
659                                        enum cpuhp_state target)
660 {
661         enum cpuhp_state state;
662         int err = 0;
663
664         while (cpuhp_next_state(bringup, &state, st, target)) {
665                 err = cpuhp_invoke_callback(cpu, state, bringup, NULL, NULL);
666                 if (err)
667                         break;
668         }
669
670         return err;
671 }
672
673 static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st)
674 {
675         if (IS_ENABLED(CONFIG_HOTPLUG_CPU))
676                 return true;
677         /*
678          * When CPU hotplug is disabled, then taking the CPU down is not
679          * possible because takedown_cpu() and the architecture and
680          * subsystem specific mechanisms are not available. So the CPU
681          * which would be completely unplugged again needs to stay around
682          * in the current state.
683          */
684         return st->state <= CPUHP_BRINGUP_CPU;
685 }
686
687 static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
688                               enum cpuhp_state target)
689 {
690         enum cpuhp_state prev_state = st->state;
691         int ret = 0;
692
693         ret = cpuhp_invoke_callback_range(true, cpu, st, target);
694         if (ret) {
695                 cpuhp_reset_state(st, prev_state);
696                 if (can_rollback_cpu(st))
697                         WARN_ON(cpuhp_invoke_callback_range(false, cpu, st,
698                                                             prev_state));
699         }
700         return ret;
701 }
702
703 /*
704  * The cpu hotplug threads manage the bringup and teardown of the cpus
705  */
706 static void cpuhp_create(unsigned int cpu)
707 {
708         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
709
710         init_completion(&st->done_up);
711         init_completion(&st->done_down);
712         st->cpu = cpu;
713 }
714
715 static int cpuhp_should_run(unsigned int cpu)
716 {
717         struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
718
719         return st->should_run;
720 }
721
722 /*
723  * Execute teardown/startup callbacks on the plugged cpu. Also used to invoke
724  * callbacks when a state gets [un]installed at runtime.
725  *
726  * Each invocation of this function by the smpboot thread does a single AP
727  * state callback.
728  *
729  * It has 3 modes of operation:
730  *  - single: runs st->cb_state
731  *  - up:     runs ++st->state, while st->state < st->target
732  *  - down:   runs st->state--, while st->state > st->target
733  *
734  * When complete or on error, should_run is cleared and the completion is fired.
735  */
736 static void cpuhp_thread_fun(unsigned int cpu)
737 {
738         struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
739         bool bringup = st->bringup;
740         enum cpuhp_state state;
741
742         if (WARN_ON_ONCE(!st->should_run))
743                 return;
744
745         /*
746          * ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures
747          * that if we see ->should_run we also see the rest of the state.
748          */
749         smp_mb();
750
751         /*
752          * The BP holds the hotplug lock, but we're now running on the AP,
753          * ensure that anybody asserting the lock is held, will actually find
754          * it so.
755          */
756         lockdep_acquire_cpus_lock();
757         cpuhp_lock_acquire(bringup);
758
759         if (st->single) {
760                 state = st->cb_state;
761                 st->should_run = false;
762         } else {
763                 st->should_run = cpuhp_next_state(bringup, &state, st, st->target);
764                 if (!st->should_run)
765                         goto end;
766         }
767
768         WARN_ON_ONCE(!cpuhp_is_ap_state(state));
769
770         if (cpuhp_is_atomic_state(state)) {
771                 local_irq_disable();
772                 st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
773                 local_irq_enable();
774
775                 /*
776                  * STARTING/DYING must not fail!
777                  */
778                 WARN_ON_ONCE(st->result);
779         } else {
780                 st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
781         }
782
783         if (st->result) {
784                 /*
785                  * If we fail on a rollback, we're up a creek without no
786                  * paddle, no way forward, no way back. We loose, thanks for
787                  * playing.
788                  */
789                 WARN_ON_ONCE(st->rollback);
790                 st->should_run = false;
791         }
792
793 end:
794         cpuhp_lock_release(bringup);
795         lockdep_release_cpus_lock();
796
797         if (!st->should_run)
798                 complete_ap_thread(st, bringup);
799 }
800
801 /* Invoke a single callback on a remote cpu */
802 static int
803 cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup,
804                          struct hlist_node *node)
805 {
806         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
807         int ret;
808
809         if (!cpu_online(cpu))
810                 return 0;
811
812         cpuhp_lock_acquire(false);
813         cpuhp_lock_release(false);
814
815         cpuhp_lock_acquire(true);
816         cpuhp_lock_release(true);
817
818         /*
819          * If we are up and running, use the hotplug thread. For early calls
820          * we invoke the thread function directly.
821          */
822         if (!st->thread)
823                 return cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
824
825         st->rollback = false;
826         st->last = NULL;
827
828         st->node = node;
829         st->bringup = bringup;
830         st->cb_state = state;
831         st->single = true;
832
833         __cpuhp_kick_ap(st);
834
835         /*
836          * If we failed and did a partial, do a rollback.
837          */
838         if ((ret = st->result) && st->last) {
839                 st->rollback = true;
840                 st->bringup = !bringup;
841
842                 __cpuhp_kick_ap(st);
843         }
844
845         /*
846          * Clean up the leftovers so the next hotplug operation wont use stale
847          * data.
848          */
849         st->node = st->last = NULL;
850         return ret;
851 }
852
853 static int cpuhp_kick_ap_work(unsigned int cpu)
854 {
855         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
856         enum cpuhp_state prev_state = st->state;
857         int ret;
858
859         cpuhp_lock_acquire(false);
860         cpuhp_lock_release(false);
861
862         cpuhp_lock_acquire(true);
863         cpuhp_lock_release(true);
864
865         trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work);
866         ret = cpuhp_kick_ap(st, st->target);
867         trace_cpuhp_exit(cpu, st->state, prev_state, ret);
868
869         return ret;
870 }
871
872 static struct smp_hotplug_thread cpuhp_threads = {
873         .store                  = &cpuhp_state.thread,
874         .create                 = &cpuhp_create,
875         .thread_should_run      = cpuhp_should_run,
876         .thread_fn              = cpuhp_thread_fun,
877         .thread_comm            = "cpuhp/%u",
878         .selfparking            = true,
879 };
880
881 void __init cpuhp_threads_init(void)
882 {
883         BUG_ON(smpboot_register_percpu_thread(&cpuhp_threads));
884         kthread_unpark(this_cpu_read(cpuhp_state.thread));
885 }
886
887 /*
888  *
889  * Serialize hotplug trainwrecks outside of the cpu_hotplug_lock
890  * protected region.
891  *
892  * The operation is still serialized against concurrent CPU hotplug via
893  * cpu_add_remove_lock, i.e. CPU map protection.  But it is _not_
894  * serialized against other hotplug related activity like adding or
895  * removing of state callbacks and state instances, which invoke either the
896  * startup or the teardown callback of the affected state.
897  *
898  * This is required for subsystems which are unfixable vs. CPU hotplug and
899  * evade lock inversion problems by scheduling work which has to be
900  * completed _before_ cpu_up()/_cpu_down() returns.
901  *
902  * Don't even think about adding anything to this for any new code or even
903  * drivers. It's only purpose is to keep existing lock order trainwrecks
904  * working.
905  *
906  * For cpu_down() there might be valid reasons to finish cleanups which are
907  * not required to be done under cpu_hotplug_lock, but that's a different
908  * story and would be not invoked via this.
909  */
910 static void cpu_up_down_serialize_trainwrecks(bool tasks_frozen)
911 {
912         /*
913          * cpusets delegate hotplug operations to a worker to "solve" the
914          * lock order problems. Wait for the worker, but only if tasks are
915          * _not_ frozen (suspend, hibernate) as that would wait forever.
916          *
917          * The wait is required because otherwise the hotplug operation
918          * returns with inconsistent state, which could even be observed in
919          * user space when a new CPU is brought up. The CPU plug uevent
920          * would be delivered and user space reacting on it would fail to
921          * move tasks to the newly plugged CPU up to the point where the
922          * work has finished because up to that point the newly plugged CPU
923          * is not assignable in cpusets/cgroups. On unplug that's not
924          * necessarily a visible issue, but it is still inconsistent state,
925          * which is the real problem which needs to be "fixed". This can't
926          * prevent the transient state between scheduling the work and
927          * returning from waiting for it.
928          */
929         if (!tasks_frozen)
930                 cpuset_wait_for_hotplug();
931 }
932
933 #ifdef CONFIG_HOTPLUG_CPU
934 #ifndef arch_clear_mm_cpumask_cpu
935 #define arch_clear_mm_cpumask_cpu(cpu, mm) cpumask_clear_cpu(cpu, mm_cpumask(mm))
936 #endif
937
938 /**
939  * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU
940  * @cpu: a CPU id
941  *
942  * This function walks all processes, finds a valid mm struct for each one and
943  * then clears a corresponding bit in mm's cpumask.  While this all sounds
944  * trivial, there are various non-obvious corner cases, which this function
945  * tries to solve in a safe manner.
946  *
947  * Also note that the function uses a somewhat relaxed locking scheme, so it may
948  * be called only for an already offlined CPU.
949  */
950 void clear_tasks_mm_cpumask(int cpu)
951 {
952         struct task_struct *p;
953
954         /*
955          * This function is called after the cpu is taken down and marked
956          * offline, so its not like new tasks will ever get this cpu set in
957          * their mm mask. -- Peter Zijlstra
958          * Thus, we may use rcu_read_lock() here, instead of grabbing
959          * full-fledged tasklist_lock.
960          */
961         WARN_ON(cpu_online(cpu));
962         rcu_read_lock();
963         for_each_process(p) {
964                 struct task_struct *t;
965
966                 /*
967                  * Main thread might exit, but other threads may still have
968                  * a valid mm. Find one.
969                  */
970                 t = find_lock_task_mm(p);
971                 if (!t)
972                         continue;
973                 arch_clear_mm_cpumask_cpu(cpu, t->mm);
974                 task_unlock(t);
975         }
976         rcu_read_unlock();
977 }
978
979 /* Take this CPU down. */
980 static int take_cpu_down(void *_param)
981 {
982         struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
983         enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE);
984         int err, cpu = smp_processor_id();
985         int ret;
986
987         /* Ensure this CPU doesn't handle any more interrupts. */
988         err = __cpu_disable();
989         if (err < 0)
990                 return err;
991
992         /*
993          * Must be called from CPUHP_TEARDOWN_CPU, which means, as we are going
994          * down, that the current state is CPUHP_TEARDOWN_CPU - 1.
995          */
996         WARN_ON(st->state != (CPUHP_TEARDOWN_CPU - 1));
997
998         /* Invoke the former CPU_DYING callbacks */
999         ret = cpuhp_invoke_callback_range(false, cpu, st, target);
1000
1001         /*
1002          * DYING must not fail!
1003          */
1004         WARN_ON_ONCE(ret);
1005
1006         /* Give up timekeeping duties */
1007         tick_handover_do_timer();
1008         /* Remove CPU from timer broadcasting */
1009         tick_offline_cpu(cpu);
1010         /* Park the stopper thread */
1011         stop_machine_park(cpu);
1012         return 0;
1013 }
1014
1015 static int takedown_cpu(unsigned int cpu)
1016 {
1017         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
1018         int err;
1019
1020         /* Park the smpboot threads */
1021         kthread_park(st->thread);
1022
1023         /*
1024          * Prevent irq alloc/free while the dying cpu reorganizes the
1025          * interrupt affinities.
1026          */
1027         irq_lock_sparse();
1028
1029         /*
1030          * So now all preempt/rcu users must observe !cpu_active().
1031          */
1032         err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu));
1033         if (err) {
1034                 /* CPU refused to die */
1035                 irq_unlock_sparse();
1036                 /* Unpark the hotplug thread so we can rollback there */
1037                 kthread_unpark(st->thread);
1038                 return err;
1039         }
1040         BUG_ON(cpu_online(cpu));
1041
1042         /*
1043          * The teardown callback for CPUHP_AP_SCHED_STARTING will have removed
1044          * all runnable tasks from the CPU, there's only the idle task left now
1045          * that the migration thread is done doing the stop_machine thing.
1046          *
1047          * Wait for the stop thread to go away.
1048          */
1049         wait_for_ap_thread(st, false);
1050         BUG_ON(st->state != CPUHP_AP_IDLE_DEAD);
1051
1052         /* Interrupts are moved away from the dying cpu, reenable alloc/free */
1053         irq_unlock_sparse();
1054
1055         hotplug_cpu__broadcast_tick_pull(cpu);
1056         /* This actually kills the CPU. */
1057         __cpu_die(cpu);
1058
1059         tick_cleanup_dead_cpu(cpu);
1060         rcutree_migrate_callbacks(cpu);
1061         return 0;
1062 }
1063
1064 static void cpuhp_complete_idle_dead(void *arg)
1065 {
1066         struct cpuhp_cpu_state *st = arg;
1067
1068         complete_ap_thread(st, false);
1069 }
1070
1071 void cpuhp_report_idle_dead(void)
1072 {
1073         struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
1074
1075         BUG_ON(st->state != CPUHP_AP_OFFLINE);
1076         rcu_report_dead(smp_processor_id());
1077         st->state = CPUHP_AP_IDLE_DEAD;
1078         /*
1079          * We cannot call complete after rcu_report_dead() so we delegate it
1080          * to an online cpu.
1081          */
1082         smp_call_function_single(cpumask_first(cpu_online_mask),
1083                                  cpuhp_complete_idle_dead, st, 0);
1084 }
1085
1086 static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
1087                                 enum cpuhp_state target)
1088 {
1089         enum cpuhp_state prev_state = st->state;
1090         int ret = 0;
1091
1092         ret = cpuhp_invoke_callback_range(false, cpu, st, target);
1093         if (ret) {
1094
1095                 cpuhp_reset_state(st, prev_state);
1096
1097                 if (st->state < prev_state)
1098                         WARN_ON(cpuhp_invoke_callback_range(true, cpu, st,
1099                                                             prev_state));
1100         }
1101
1102         return ret;
1103 }
1104
1105 /* Requires cpu_add_remove_lock to be held */
1106 static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
1107                            enum cpuhp_state target)
1108 {
1109         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
1110         int prev_state, ret = 0;
1111
1112         if (num_online_cpus() == 1)
1113                 return -EBUSY;
1114
1115         if (!cpu_present(cpu))
1116                 return -EINVAL;
1117
1118         cpus_write_lock();
1119
1120         cpuhp_tasks_frozen = tasks_frozen;
1121
1122         prev_state = cpuhp_set_state(st, target);
1123         /*
1124          * If the current CPU state is in the range of the AP hotplug thread,
1125          * then we need to kick the thread.
1126          */
1127         if (st->state > CPUHP_TEARDOWN_CPU) {
1128                 st->target = max((int)target, CPUHP_TEARDOWN_CPU);
1129                 ret = cpuhp_kick_ap_work(cpu);
1130                 /*
1131                  * The AP side has done the error rollback already. Just
1132                  * return the error code..
1133                  */
1134                 if (ret)
1135                         goto out;
1136
1137                 /*
1138                  * We might have stopped still in the range of the AP hotplug
1139                  * thread. Nothing to do anymore.
1140                  */
1141                 if (st->state > CPUHP_TEARDOWN_CPU)
1142                         goto out;
1143
1144                 st->target = target;
1145         }
1146         /*
1147          * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need
1148          * to do the further cleanups.
1149          */
1150         ret = cpuhp_down_callbacks(cpu, st, target);
1151         if (ret && st->state < prev_state) {
1152                 if (st->state == CPUHP_TEARDOWN_CPU) {
1153                         cpuhp_reset_state(st, prev_state);
1154                         __cpuhp_kick_ap(st);
1155                 } else {
1156                         WARN(1, "DEAD callback error for CPU%d", cpu);
1157                 }
1158         }
1159
1160 out:
1161         cpus_write_unlock();
1162         /*
1163          * Do post unplug cleanup. This is still protected against
1164          * concurrent CPU hotplug via cpu_add_remove_lock.
1165          */
1166         lockup_detector_cleanup();
1167         arch_smt_update();
1168         cpu_up_down_serialize_trainwrecks(tasks_frozen);
1169         return ret;
1170 }
1171
1172 static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
1173 {
1174         if (cpu_hotplug_disabled)
1175                 return -EBUSY;
1176         return _cpu_down(cpu, 0, target);
1177 }
1178
1179 static int cpu_down(unsigned int cpu, enum cpuhp_state target)
1180 {
1181         int err;
1182
1183         cpu_maps_update_begin();
1184         err = cpu_down_maps_locked(cpu, target);
1185         cpu_maps_update_done();
1186         return err;
1187 }
1188
1189 /**
1190  * cpu_device_down - Bring down a cpu device
1191  * @dev: Pointer to the cpu device to offline
1192  *
1193  * This function is meant to be used by device core cpu subsystem only.
1194  *
1195  * Other subsystems should use remove_cpu() instead.
1196  *
1197  * Return: %0 on success or a negative errno code
1198  */
1199 int cpu_device_down(struct device *dev)
1200 {
1201         return cpu_down(dev->id, CPUHP_OFFLINE);
1202 }
1203
1204 int remove_cpu(unsigned int cpu)
1205 {
1206         int ret;
1207
1208         lock_device_hotplug();
1209         ret = device_offline(get_cpu_device(cpu));
1210         unlock_device_hotplug();
1211
1212         return ret;
1213 }
1214 EXPORT_SYMBOL_GPL(remove_cpu);
1215
1216 void smp_shutdown_nonboot_cpus(unsigned int primary_cpu)
1217 {
1218         unsigned int cpu;
1219         int error;
1220
1221         cpu_maps_update_begin();
1222
1223         /*
1224          * Make certain the cpu I'm about to reboot on is online.
1225          *
1226          * This is inline to what migrate_to_reboot_cpu() already do.
1227          */
1228         if (!cpu_online(primary_cpu))
1229                 primary_cpu = cpumask_first(cpu_online_mask);
1230
1231         for_each_online_cpu(cpu) {
1232                 if (cpu == primary_cpu)
1233                         continue;
1234
1235                 error = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
1236                 if (error) {
1237                         pr_err("Failed to offline CPU%d - error=%d",
1238                                 cpu, error);
1239                         break;
1240                 }
1241         }
1242
1243         /*
1244          * Ensure all but the reboot CPU are offline.
1245          */
1246         BUG_ON(num_online_cpus() > 1);
1247
1248         /*
1249          * Make sure the CPUs won't be enabled by someone else after this
1250          * point. Kexec will reboot to a new kernel shortly resetting
1251          * everything along the way.
1252          */
1253         cpu_hotplug_disabled++;
1254
1255         cpu_maps_update_done();
1256 }
1257
1258 #else
1259 #define takedown_cpu            NULL
1260 #endif /*CONFIG_HOTPLUG_CPU*/
1261
1262 /**
1263  * notify_cpu_starting(cpu) - Invoke the callbacks on the starting CPU
1264  * @cpu: cpu that just started
1265  *
1266  * It must be called by the arch code on the new cpu, before the new cpu
1267  * enables interrupts and before the "boot" cpu returns from __cpu_up().
1268  */
1269 void notify_cpu_starting(unsigned int cpu)
1270 {
1271         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
1272         enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
1273         int ret;
1274
1275         rcu_cpu_starting(cpu);  /* Enables RCU usage on this CPU. */
1276         cpumask_set_cpu(cpu, &cpus_booted_once_mask);
1277         ret = cpuhp_invoke_callback_range(true, cpu, st, target);
1278
1279         /*
1280          * STARTING must not fail!
1281          */
1282         WARN_ON_ONCE(ret);
1283 }
1284
1285 /*
1286  * Called from the idle task. Wake up the controlling task which brings the
1287  * hotplug thread of the upcoming CPU up and then delegates the rest of the
1288  * online bringup to the hotplug thread.
1289  */
1290 void cpuhp_online_idle(enum cpuhp_state state)
1291 {
1292         struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
1293
1294         /* Happens for the boot cpu */
1295         if (state != CPUHP_AP_ONLINE_IDLE)
1296                 return;
1297
1298         /*
1299          * Unpart the stopper thread before we start the idle loop (and start
1300          * scheduling); this ensures the stopper task is always available.
1301          */
1302         stop_machine_unpark(smp_processor_id());
1303
1304         st->state = CPUHP_AP_ONLINE_IDLE;
1305         complete_ap_thread(st, true);
1306 }
1307
1308 /* Requires cpu_add_remove_lock to be held */
1309 static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
1310 {
1311         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
1312         struct task_struct *idle;
1313         int ret = 0;
1314
1315         cpus_write_lock();
1316
1317         if (!cpu_present(cpu)) {
1318                 ret = -EINVAL;
1319                 goto out;
1320         }
1321
1322         /*
1323          * The caller of cpu_up() might have raced with another
1324          * caller. Nothing to do.
1325          */
1326         if (st->state >= target)
1327                 goto out;
1328
1329         if (st->state == CPUHP_OFFLINE) {
1330                 /* Let it fail before we try to bring the cpu up */
1331                 idle = idle_thread_get(cpu);
1332                 if (IS_ERR(idle)) {
1333                         ret = PTR_ERR(idle);
1334                         goto out;
1335                 }
1336         }
1337
1338         cpuhp_tasks_frozen = tasks_frozen;
1339
1340         cpuhp_set_state(st, target);
1341         /*
1342          * If the current CPU state is in the range of the AP hotplug thread,
1343          * then we need to kick the thread once more.
1344          */
1345         if (st->state > CPUHP_BRINGUP_CPU) {
1346                 ret = cpuhp_kick_ap_work(cpu);
1347                 /*
1348                  * The AP side has done the error rollback already. Just
1349                  * return the error code..
1350                  */
1351                 if (ret)
1352                         goto out;
1353         }
1354
1355         /*
1356          * Try to reach the target state. We max out on the BP at
1357          * CPUHP_BRINGUP_CPU. After that the AP hotplug thread is
1358          * responsible for bringing it up to the target state.
1359          */
1360         target = min((int)target, CPUHP_BRINGUP_CPU);
1361         ret = cpuhp_up_callbacks(cpu, st, target);
1362 out:
1363         cpus_write_unlock();
1364         arch_smt_update();
1365         cpu_up_down_serialize_trainwrecks(tasks_frozen);
1366         return ret;
1367 }
1368
1369 static int cpu_up(unsigned int cpu, enum cpuhp_state target)
1370 {
1371         int err = 0;
1372
1373         if (!cpu_possible(cpu)) {
1374                 pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n",
1375                        cpu);
1376 #if defined(CONFIG_IA64)
1377                 pr_err("please check additional_cpus= boot parameter\n");
1378 #endif
1379                 return -EINVAL;
1380         }
1381
1382         err = try_online_node(cpu_to_node(cpu));
1383         if (err)
1384                 return err;
1385
1386         cpu_maps_update_begin();
1387
1388         if (cpu_hotplug_disabled) {
1389                 err = -EBUSY;
1390                 goto out;
1391         }
1392         if (!cpu_smt_allowed(cpu)) {
1393                 err = -EPERM;
1394                 goto out;
1395         }
1396
1397         err = _cpu_up(cpu, 0, target);
1398 out:
1399         cpu_maps_update_done();
1400         return err;
1401 }
1402
1403 /**
1404  * cpu_device_up - Bring up a cpu device
1405  * @dev: Pointer to the cpu device to online
1406  *
1407  * This function is meant to be used by device core cpu subsystem only.
1408  *
1409  * Other subsystems should use add_cpu() instead.
1410  *
1411  * Return: %0 on success or a negative errno code
1412  */
1413 int cpu_device_up(struct device *dev)
1414 {
1415         return cpu_up(dev->id, CPUHP_ONLINE);
1416 }
1417
1418 int add_cpu(unsigned int cpu)
1419 {
1420         int ret;
1421
1422         lock_device_hotplug();
1423         ret = device_online(get_cpu_device(cpu));
1424         unlock_device_hotplug();
1425
1426         return ret;
1427 }
1428 EXPORT_SYMBOL_GPL(add_cpu);
1429
1430 /**
1431  * bringup_hibernate_cpu - Bring up the CPU that we hibernated on
1432  * @sleep_cpu: The cpu we hibernated on and should be brought up.
1433  *
1434  * On some architectures like arm64, we can hibernate on any CPU, but on
1435  * wake up the CPU we hibernated on might be offline as a side effect of
1436  * using maxcpus= for example.
1437  *
1438  * Return: %0 on success or a negative errno code
1439  */
1440 int bringup_hibernate_cpu(unsigned int sleep_cpu)
1441 {
1442         int ret;
1443
1444         if (!cpu_online(sleep_cpu)) {
1445                 pr_info("Hibernated on a CPU that is offline! Bringing CPU up.\n");
1446                 ret = cpu_up(sleep_cpu, CPUHP_ONLINE);
1447                 if (ret) {
1448                         pr_err("Failed to bring hibernate-CPU up!\n");
1449                         return ret;
1450                 }
1451         }
1452         return 0;
1453 }
1454
1455 void bringup_nonboot_cpus(unsigned int setup_max_cpus)
1456 {
1457         unsigned int cpu;
1458
1459         for_each_present_cpu(cpu) {
1460                 if (num_online_cpus() >= setup_max_cpus)
1461                         break;
1462                 if (!cpu_online(cpu))
1463                         cpu_up(cpu, CPUHP_ONLINE);
1464         }
1465 }
1466
1467 #ifdef CONFIG_PM_SLEEP_SMP
1468 static cpumask_var_t frozen_cpus;
1469
1470 int freeze_secondary_cpus(int primary)
1471 {
1472         int cpu, error = 0;
1473
1474         cpu_maps_update_begin();
1475         if (primary == -1) {
1476                 primary = cpumask_first(cpu_online_mask);
1477                 if (!housekeeping_cpu(primary, HK_FLAG_TIMER))
1478                         primary = housekeeping_any_cpu(HK_FLAG_TIMER);
1479         } else {
1480                 if (!cpu_online(primary))
1481                         primary = cpumask_first(cpu_online_mask);
1482         }
1483
1484         /*
1485          * We take down all of the non-boot CPUs in one shot to avoid races
1486          * with the userspace trying to use the CPU hotplug at the same time
1487          */
1488         cpumask_clear(frozen_cpus);
1489
1490         pr_info("Disabling non-boot CPUs ...\n");
1491         for_each_online_cpu(cpu) {
1492                 if (cpu == primary)
1493                         continue;
1494
1495                 if (pm_wakeup_pending()) {
1496                         pr_info("Wakeup pending. Abort CPU freeze\n");
1497                         error = -EBUSY;
1498                         break;
1499                 }
1500
1501                 trace_suspend_resume(TPS("CPU_OFF"), cpu, true);
1502                 error = _cpu_down(cpu, 1, CPUHP_OFFLINE);
1503                 trace_suspend_resume(TPS("CPU_OFF"), cpu, false);
1504                 if (!error)
1505                         cpumask_set_cpu(cpu, frozen_cpus);
1506                 else {
1507                         pr_err("Error taking CPU%d down: %d\n", cpu, error);
1508                         break;
1509                 }
1510         }
1511
1512         if (!error)
1513                 BUG_ON(num_online_cpus() > 1);
1514         else
1515                 pr_err("Non-boot CPUs are not disabled\n");
1516
1517         /*
1518          * Make sure the CPUs won't be enabled by someone else. We need to do
1519          * this even in case of failure as all freeze_secondary_cpus() users are
1520          * supposed to do thaw_secondary_cpus() on the failure path.
1521          */
1522         cpu_hotplug_disabled++;
1523
1524         cpu_maps_update_done();
1525         return error;
1526 }
1527
1528 void __weak arch_thaw_secondary_cpus_begin(void)
1529 {
1530 }
1531
1532 void __weak arch_thaw_secondary_cpus_end(void)
1533 {
1534 }
1535
1536 void thaw_secondary_cpus(void)
1537 {
1538         int cpu, error;
1539
1540         /* Allow everyone to use the CPU hotplug again */
1541         cpu_maps_update_begin();
1542         __cpu_hotplug_enable();
1543         if (cpumask_empty(frozen_cpus))
1544                 goto out;
1545
1546         pr_info("Enabling non-boot CPUs ...\n");
1547
1548         arch_thaw_secondary_cpus_begin();
1549
1550         for_each_cpu(cpu, frozen_cpus) {
1551                 trace_suspend_resume(TPS("CPU_ON"), cpu, true);
1552                 error = _cpu_up(cpu, 1, CPUHP_ONLINE);
1553                 trace_suspend_resume(TPS("CPU_ON"), cpu, false);
1554                 if (!error) {
1555                         pr_info("CPU%d is up\n", cpu);
1556                         continue;
1557                 }
1558                 pr_warn("Error taking CPU%d up: %d\n", cpu, error);
1559         }
1560
1561         arch_thaw_secondary_cpus_end();
1562
1563         cpumask_clear(frozen_cpus);
1564 out:
1565         cpu_maps_update_done();
1566 }
1567
1568 static int __init alloc_frozen_cpus(void)
1569 {
1570         if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO))
1571                 return -ENOMEM;
1572         return 0;
1573 }
1574 core_initcall(alloc_frozen_cpus);
1575
1576 /*
1577  * When callbacks for CPU hotplug notifications are being executed, we must
1578  * ensure that the state of the system with respect to the tasks being frozen
1579  * or not, as reported by the notification, remains unchanged *throughout the
1580  * duration* of the execution of the callbacks.
1581  * Hence we need to prevent the freezer from racing with regular CPU hotplug.
1582  *
1583  * This synchronization is implemented by mutually excluding regular CPU
1584  * hotplug and Suspend/Hibernate call paths by hooking onto the Suspend/
1585  * Hibernate notifications.
1586  */
1587 static int
1588 cpu_hotplug_pm_callback(struct notifier_block *nb,
1589                         unsigned long action, void *ptr)
1590 {
1591         switch (action) {
1592
1593         case PM_SUSPEND_PREPARE:
1594         case PM_HIBERNATION_PREPARE:
1595                 cpu_hotplug_disable();
1596                 break;
1597
1598         case PM_POST_SUSPEND:
1599         case PM_POST_HIBERNATION:
1600                 cpu_hotplug_enable();
1601                 break;
1602
1603         default:
1604                 return NOTIFY_DONE;
1605         }
1606
1607         return NOTIFY_OK;
1608 }
1609
1610
1611 static int __init cpu_hotplug_pm_sync_init(void)
1612 {
1613         /*
1614          * cpu_hotplug_pm_callback has higher priority than x86
1615          * bsp_pm_callback which depends on cpu_hotplug_pm_callback
1616          * to disable cpu hotplug to avoid cpu hotplug race.
1617          */
1618         pm_notifier(cpu_hotplug_pm_callback, 0);
1619         return 0;
1620 }
1621 core_initcall(cpu_hotplug_pm_sync_init);
1622
1623 #endif /* CONFIG_PM_SLEEP_SMP */
1624
1625 int __boot_cpu_id;
1626
1627 #endif /* CONFIG_SMP */
1628
1629 /* Boot processor state steps */
1630 static struct cpuhp_step cpuhp_hp_states[] = {
1631         [CPUHP_OFFLINE] = {
1632                 .name                   = "offline",
1633                 .startup.single         = NULL,
1634                 .teardown.single        = NULL,
1635         },
1636 #ifdef CONFIG_SMP
1637         [CPUHP_CREATE_THREADS]= {
1638                 .name                   = "threads:prepare",
1639                 .startup.single         = smpboot_create_threads,
1640                 .teardown.single        = NULL,
1641                 .cant_stop              = true,
1642         },
1643         [CPUHP_PERF_PREPARE] = {
1644                 .name                   = "perf:prepare",
1645                 .startup.single         = perf_event_init_cpu,
1646                 .teardown.single        = perf_event_exit_cpu,
1647         },
1648         [CPUHP_WORKQUEUE_PREP] = {
1649                 .name                   = "workqueue:prepare",
1650                 .startup.single         = workqueue_prepare_cpu,
1651                 .teardown.single        = NULL,
1652         },
1653         [CPUHP_HRTIMERS_PREPARE] = {
1654                 .name                   = "hrtimers:prepare",
1655                 .startup.single         = hrtimers_prepare_cpu,
1656                 .teardown.single        = hrtimers_dead_cpu,
1657         },
1658         [CPUHP_SMPCFD_PREPARE] = {
1659                 .name                   = "smpcfd:prepare",
1660                 .startup.single         = smpcfd_prepare_cpu,
1661                 .teardown.single        = smpcfd_dead_cpu,
1662         },
1663         [CPUHP_RELAY_PREPARE] = {
1664                 .name                   = "relay:prepare",
1665                 .startup.single         = relay_prepare_cpu,
1666                 .teardown.single        = NULL,
1667         },
1668         [CPUHP_SLAB_PREPARE] = {
1669                 .name                   = "slab:prepare",
1670                 .startup.single         = slab_prepare_cpu,
1671                 .teardown.single        = slab_dead_cpu,
1672         },
1673         [CPUHP_RCUTREE_PREP] = {
1674                 .name                   = "RCU/tree:prepare",
1675                 .startup.single         = rcutree_prepare_cpu,
1676                 .teardown.single        = rcutree_dead_cpu,
1677         },
1678         /*
1679          * On the tear-down path, timers_dead_cpu() must be invoked
1680          * before blk_mq_queue_reinit_notify() from notify_dead(),
1681          * otherwise a RCU stall occurs.
1682          */
1683         [CPUHP_TIMERS_PREPARE] = {
1684                 .name                   = "timers:prepare",
1685                 .startup.single         = timers_prepare_cpu,
1686                 .teardown.single        = timers_dead_cpu,
1687         },
1688         /* Kicks the plugged cpu into life */
1689         [CPUHP_BRINGUP_CPU] = {
1690                 .name                   = "cpu:bringup",
1691                 .startup.single         = bringup_cpu,
1692                 .teardown.single        = finish_cpu,
1693                 .cant_stop              = true,
1694         },
1695         /* Final state before CPU kills itself */
1696         [CPUHP_AP_IDLE_DEAD] = {
1697                 .name                   = "idle:dead",
1698         },
1699         /*
1700          * Last state before CPU enters the idle loop to die. Transient state
1701          * for synchronization.
1702          */
1703         [CPUHP_AP_OFFLINE] = {
1704                 .name                   = "ap:offline",
1705                 .cant_stop              = true,
1706         },
1707         /* First state is scheduler control. Interrupts are disabled */
1708         [CPUHP_AP_SCHED_STARTING] = {
1709                 .name                   = "sched:starting",
1710                 .startup.single         = sched_cpu_starting,
1711                 .teardown.single        = sched_cpu_dying,
1712         },
1713         [CPUHP_AP_RCUTREE_DYING] = {
1714                 .name                   = "RCU/tree:dying",
1715                 .startup.single         = NULL,
1716                 .teardown.single        = rcutree_dying_cpu,
1717         },
1718         [CPUHP_AP_SMPCFD_DYING] = {
1719                 .name                   = "smpcfd:dying",
1720                 .startup.single         = NULL,
1721                 .teardown.single        = smpcfd_dying_cpu,
1722         },
1723         /* Entry state on starting. Interrupts enabled from here on. Transient
1724          * state for synchronsization */
1725         [CPUHP_AP_ONLINE] = {
1726                 .name                   = "ap:online",
1727         },
1728         /*
1729          * Handled on control processor until the plugged processor manages
1730          * this itself.
1731          */
1732         [CPUHP_TEARDOWN_CPU] = {
1733                 .name                   = "cpu:teardown",
1734                 .startup.single         = NULL,
1735                 .teardown.single        = takedown_cpu,
1736                 .cant_stop              = true,
1737         },
1738
1739         [CPUHP_AP_SCHED_WAIT_EMPTY] = {
1740                 .name                   = "sched:waitempty",
1741                 .startup.single         = NULL,
1742                 .teardown.single        = sched_cpu_wait_empty,
1743         },
1744
1745         /* Handle smpboot threads park/unpark */
1746         [CPUHP_AP_SMPBOOT_THREADS] = {
1747                 .name                   = "smpboot/threads:online",
1748                 .startup.single         = smpboot_unpark_threads,
1749                 .teardown.single        = smpboot_park_threads,
1750         },
1751         [CPUHP_AP_IRQ_AFFINITY_ONLINE] = {
1752                 .name                   = "irq/affinity:online",
1753                 .startup.single         = irq_affinity_online_cpu,
1754                 .teardown.single        = NULL,
1755         },
1756         [CPUHP_AP_PERF_ONLINE] = {
1757                 .name                   = "perf:online",
1758                 .startup.single         = perf_event_init_cpu,
1759                 .teardown.single        = perf_event_exit_cpu,
1760         },
1761         [CPUHP_AP_WATCHDOG_ONLINE] = {
1762                 .name                   = "lockup_detector:online",
1763                 .startup.single         = lockup_detector_online_cpu,
1764                 .teardown.single        = lockup_detector_offline_cpu,
1765         },
1766         [CPUHP_AP_WORKQUEUE_ONLINE] = {
1767                 .name                   = "workqueue:online",
1768                 .startup.single         = workqueue_online_cpu,
1769                 .teardown.single        = workqueue_offline_cpu,
1770         },
1771         [CPUHP_AP_RCUTREE_ONLINE] = {
1772                 .name                   = "RCU/tree:online",
1773                 .startup.single         = rcutree_online_cpu,
1774                 .teardown.single        = rcutree_offline_cpu,
1775         },
1776 #endif
1777         /*
1778          * The dynamically registered state space is here
1779          */
1780
1781 #ifdef CONFIG_SMP
1782         /* Last state is scheduler control setting the cpu active */
1783         [CPUHP_AP_ACTIVE] = {
1784                 .name                   = "sched:active",
1785                 .startup.single         = sched_cpu_activate,
1786                 .teardown.single        = sched_cpu_deactivate,
1787         },
1788 #endif
1789
1790         /* CPU is fully up and running. */
1791         [CPUHP_ONLINE] = {
1792                 .name                   = "online",
1793                 .startup.single         = NULL,
1794                 .teardown.single        = NULL,
1795         },
1796 };
1797
1798 /* Sanity check for callbacks */
1799 static int cpuhp_cb_check(enum cpuhp_state state)
1800 {
1801         if (state <= CPUHP_OFFLINE || state >= CPUHP_ONLINE)
1802                 return -EINVAL;
1803         return 0;
1804 }
1805
1806 /*
1807  * Returns a free for dynamic slot assignment of the Online state. The states
1808  * are protected by the cpuhp_slot_states mutex and an empty slot is identified
1809  * by having no name assigned.
1810  */
1811 static int cpuhp_reserve_state(enum cpuhp_state state)
1812 {
1813         enum cpuhp_state i, end;
1814         struct cpuhp_step *step;
1815
1816         switch (state) {
1817         case CPUHP_AP_ONLINE_DYN:
1818                 step = cpuhp_hp_states + CPUHP_AP_ONLINE_DYN;
1819                 end = CPUHP_AP_ONLINE_DYN_END;
1820                 break;
1821         case CPUHP_BP_PREPARE_DYN:
1822                 step = cpuhp_hp_states + CPUHP_BP_PREPARE_DYN;
1823                 end = CPUHP_BP_PREPARE_DYN_END;
1824                 break;
1825         default:
1826                 return -EINVAL;
1827         }
1828
1829         for (i = state; i <= end; i++, step++) {
1830                 if (!step->name)
1831                         return i;
1832         }
1833         WARN(1, "No more dynamic states available for CPU hotplug\n");
1834         return -ENOSPC;
1835 }
1836
1837 static int cpuhp_store_callbacks(enum cpuhp_state state, const char *name,
1838                                  int (*startup)(unsigned int cpu),
1839                                  int (*teardown)(unsigned int cpu),
1840                                  bool multi_instance)
1841 {
1842         /* (Un)Install the callbacks for further cpu hotplug operations */
1843         struct cpuhp_step *sp;
1844         int ret = 0;
1845
1846         /*
1847          * If name is NULL, then the state gets removed.
1848          *
1849          * CPUHP_AP_ONLINE_DYN and CPUHP_BP_PREPARE_DYN are handed out on
1850          * the first allocation from these dynamic ranges, so the removal
1851          * would trigger a new allocation and clear the wrong (already
1852          * empty) state, leaving the callbacks of the to be cleared state
1853          * dangling, which causes wreckage on the next hotplug operation.
1854          */
1855         if (name && (state == CPUHP_AP_ONLINE_DYN ||
1856                      state == CPUHP_BP_PREPARE_DYN)) {
1857                 ret = cpuhp_reserve_state(state);
1858                 if (ret < 0)
1859                         return ret;
1860                 state = ret;
1861         }
1862         sp = cpuhp_get_step(state);
1863         if (name && sp->name)
1864                 return -EBUSY;
1865
1866         sp->startup.single = startup;
1867         sp->teardown.single = teardown;
1868         sp->name = name;
1869         sp->multi_instance = multi_instance;
1870         INIT_HLIST_HEAD(&sp->list);
1871         return ret;
1872 }
1873
1874 static void *cpuhp_get_teardown_cb(enum cpuhp_state state)
1875 {
1876         return cpuhp_get_step(state)->teardown.single;
1877 }
1878
1879 /*
1880  * Call the startup/teardown function for a step either on the AP or
1881  * on the current CPU.
1882  */
1883 static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,
1884                             struct hlist_node *node)
1885 {
1886         struct cpuhp_step *sp = cpuhp_get_step(state);
1887         int ret;
1888
1889         /*
1890          * If there's nothing to do, we done.
1891          * Relies on the union for multi_instance.
1892          */
1893         if (cpuhp_step_empty(bringup, sp))
1894                 return 0;
1895         /*
1896          * The non AP bound callbacks can fail on bringup. On teardown
1897          * e.g. module removal we crash for now.
1898          */
1899 #ifdef CONFIG_SMP
1900         if (cpuhp_is_ap_state(state))
1901                 ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node);
1902         else
1903                 ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
1904 #else
1905         ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
1906 #endif
1907         BUG_ON(ret && !bringup);
1908         return ret;
1909 }
1910
1911 /*
1912  * Called from __cpuhp_setup_state on a recoverable failure.
1913  *
1914  * Note: The teardown callbacks for rollback are not allowed to fail!
1915  */
1916 static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state,
1917                                    struct hlist_node *node)
1918 {
1919         int cpu;
1920
1921         /* Roll back the already executed steps on the other cpus */
1922         for_each_present_cpu(cpu) {
1923                 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
1924                 int cpustate = st->state;
1925
1926                 if (cpu >= failedcpu)
1927                         break;
1928
1929                 /* Did we invoke the startup call on that cpu ? */
1930                 if (cpustate >= state)
1931                         cpuhp_issue_call(cpu, state, false, node);
1932         }
1933 }
1934
1935 int __cpuhp_state_add_instance_cpuslocked(enum cpuhp_state state,
1936                                           struct hlist_node *node,
1937                                           bool invoke)
1938 {
1939         struct cpuhp_step *sp;
1940         int cpu;
1941         int ret;
1942
1943         lockdep_assert_cpus_held();
1944
1945         sp = cpuhp_get_step(state);
1946         if (sp->multi_instance == false)
1947                 return -EINVAL;
1948
1949         mutex_lock(&cpuhp_state_mutex);
1950
1951         if (!invoke || !sp->startup.multi)
1952                 goto add_node;
1953
1954         /*
1955          * Try to call the startup callback for each present cpu
1956          * depending on the hotplug state of the cpu.
1957          */
1958         for_each_present_cpu(cpu) {
1959                 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
1960                 int cpustate = st->state;
1961
1962                 if (cpustate < state)
1963                         continue;
1964
1965                 ret = cpuhp_issue_call(cpu, state, true, node);
1966                 if (ret) {
1967                         if (sp->teardown.multi)
1968                                 cpuhp_rollback_install(cpu, state, node);
1969                         goto unlock;
1970                 }
1971         }
1972 add_node:
1973         ret = 0;
1974         hlist_add_head(node, &sp->list);
1975 unlock:
1976         mutex_unlock(&cpuhp_state_mutex);
1977         return ret;
1978 }
1979
1980 int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node,
1981                                bool invoke)
1982 {
1983         int ret;
1984
1985         cpus_read_lock();
1986         ret = __cpuhp_state_add_instance_cpuslocked(state, node, invoke);
1987         cpus_read_unlock();
1988         return ret;
1989 }
1990 EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance);
1991
1992 /**
1993  * __cpuhp_setup_state_cpuslocked - Setup the callbacks for an hotplug machine state
1994  * @state:              The state to setup
1995  * @name:               Name of the step
1996  * @invoke:             If true, the startup function is invoked for cpus where
1997  *                      cpu state >= @state
1998  * @startup:            startup callback function
1999  * @teardown:           teardown callback function
2000  * @multi_instance:     State is set up for multiple instances which get
2001  *                      added afterwards.
2002  *
2003  * The caller needs to hold cpus read locked while calling this function.
2004  * Return:
2005  *   On success:
2006  *      Positive state number if @state is CPUHP_AP_ONLINE_DYN;
2007  *      0 for all other states
2008  *   On failure: proper (negative) error code
2009  */
2010 int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state,
2011                                    const char *name, bool invoke,
2012                                    int (*startup)(unsigned int cpu),
2013                                    int (*teardown)(unsigned int cpu),
2014                                    bool multi_instance)
2015 {
2016         int cpu, ret = 0;
2017         bool dynstate;
2018
2019         lockdep_assert_cpus_held();
2020
2021         if (cpuhp_cb_check(state) || !name)
2022                 return -EINVAL;
2023
2024         mutex_lock(&cpuhp_state_mutex);
2025
2026         ret = cpuhp_store_callbacks(state, name, startup, teardown,
2027                                     multi_instance);
2028
2029         dynstate = state == CPUHP_AP_ONLINE_DYN;
2030         if (ret > 0 && dynstate) {
2031                 state = ret;
2032                 ret = 0;
2033         }
2034
2035         if (ret || !invoke || !startup)
2036                 goto out;
2037
2038         /*
2039          * Try to call the startup callback for each present cpu
2040          * depending on the hotplug state of the cpu.
2041          */
2042         for_each_present_cpu(cpu) {
2043                 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
2044                 int cpustate = st->state;
2045
2046                 if (cpustate < state)
2047                         continue;
2048
2049                 ret = cpuhp_issue_call(cpu, state, true, NULL);
2050                 if (ret) {
2051                         if (teardown)
2052                                 cpuhp_rollback_install(cpu, state, NULL);
2053                         cpuhp_store_callbacks(state, NULL, NULL, NULL, false);
2054                         goto out;
2055                 }
2056         }
2057 out:
2058         mutex_unlock(&cpuhp_state_mutex);
2059         /*
2060          * If the requested state is CPUHP_AP_ONLINE_DYN, return the
2061          * dynamically allocated state in case of success.
2062          */
2063         if (!ret && dynstate)
2064                 return state;
2065         return ret;
2066 }
2067 EXPORT_SYMBOL(__cpuhp_setup_state_cpuslocked);
2068
2069 int __cpuhp_setup_state(enum cpuhp_state state,
2070                         const char *name, bool invoke,
2071                         int (*startup)(unsigned int cpu),
2072                         int (*teardown)(unsigned int cpu),
2073                         bool multi_instance)
2074 {
2075         int ret;
2076
2077         cpus_read_lock();
2078         ret = __cpuhp_setup_state_cpuslocked(state, name, invoke, startup,
2079                                              teardown, multi_instance);
2080         cpus_read_unlock();
2081         return ret;
2082 }
2083 EXPORT_SYMBOL(__cpuhp_setup_state);
2084
2085 int __cpuhp_state_remove_instance(enum cpuhp_state state,
2086                                   struct hlist_node *node, bool invoke)
2087 {
2088         struct cpuhp_step *sp = cpuhp_get_step(state);
2089         int cpu;
2090
2091         BUG_ON(cpuhp_cb_check(state));
2092
2093         if (!sp->multi_instance)
2094                 return -EINVAL;
2095
2096         cpus_read_lock();
2097         mutex_lock(&cpuhp_state_mutex);
2098
2099         if (!invoke || !cpuhp_get_teardown_cb(state))
2100                 goto remove;
2101         /*
2102          * Call the teardown callback for each present cpu depending
2103          * on the hotplug state of the cpu. This function is not
2104          * allowed to fail currently!
2105          */
2106         for_each_present_cpu(cpu) {
2107                 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
2108                 int cpustate = st->state;
2109
2110                 if (cpustate >= state)
2111                         cpuhp_issue_call(cpu, state, false, node);
2112         }
2113
2114 remove:
2115         hlist_del(node);
2116         mutex_unlock(&cpuhp_state_mutex);
2117         cpus_read_unlock();
2118
2119         return 0;
2120 }
2121 EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance);
2122
2123 /**
2124  * __cpuhp_remove_state_cpuslocked - Remove the callbacks for an hotplug machine state
2125  * @state:      The state to remove
2126  * @invoke:     If true, the teardown function is invoked for cpus where
2127  *              cpu state >= @state
2128  *
2129  * The caller needs to hold cpus read locked while calling this function.
2130  * The teardown callback is currently not allowed to fail. Think
2131  * about module removal!
2132  */
2133 void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke)
2134 {
2135         struct cpuhp_step *sp = cpuhp_get_step(state);
2136         int cpu;
2137
2138         BUG_ON(cpuhp_cb_check(state));
2139
2140         lockdep_assert_cpus_held();
2141
2142         mutex_lock(&cpuhp_state_mutex);
2143         if (sp->multi_instance) {
2144                 WARN(!hlist_empty(&sp->list),
2145                      "Error: Removing state %d which has instances left.\n",
2146                      state);
2147                 goto remove;
2148         }
2149
2150         if (!invoke || !cpuhp_get_teardown_cb(state))
2151                 goto remove;
2152
2153         /*
2154          * Call the teardown callback for each present cpu depending
2155          * on the hotplug state of the cpu. This function is not
2156          * allowed to fail currently!
2157          */
2158         for_each_present_cpu(cpu) {
2159                 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
2160                 int cpustate = st->state;
2161
2162                 if (cpustate >= state)
2163                         cpuhp_issue_call(cpu, state, false, NULL);
2164         }
2165 remove:
2166         cpuhp_store_callbacks(state, NULL, NULL, NULL, false);
2167         mutex_unlock(&cpuhp_state_mutex);
2168 }
2169 EXPORT_SYMBOL(__cpuhp_remove_state_cpuslocked);
2170
2171 void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
2172 {
2173         cpus_read_lock();
2174         __cpuhp_remove_state_cpuslocked(state, invoke);
2175         cpus_read_unlock();
2176 }
2177 EXPORT_SYMBOL(__cpuhp_remove_state);
2178
2179 #ifdef CONFIG_HOTPLUG_SMT
2180 static void cpuhp_offline_cpu_device(unsigned int cpu)
2181 {
2182         struct device *dev = get_cpu_device(cpu);
2183
2184         dev->offline = true;
2185         /* Tell user space about the state change */
2186         kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
2187 }
2188
2189 static void cpuhp_online_cpu_device(unsigned int cpu)
2190 {
2191         struct device *dev = get_cpu_device(cpu);
2192
2193         dev->offline = false;
2194         /* Tell user space about the state change */
2195         kobject_uevent(&dev->kobj, KOBJ_ONLINE);
2196 }
2197
2198 int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
2199 {
2200         int cpu, ret = 0;
2201
2202         cpu_maps_update_begin();
2203         for_each_online_cpu(cpu) {
2204                 if (topology_is_primary_thread(cpu))
2205                         continue;
2206                 ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
2207                 if (ret)
2208                         break;
2209                 /*
2210                  * As this needs to hold the cpu maps lock it's impossible
2211                  * to call device_offline() because that ends up calling
2212                  * cpu_down() which takes cpu maps lock. cpu maps lock
2213                  * needs to be held as this might race against in kernel
2214                  * abusers of the hotplug machinery (thermal management).
2215                  *
2216                  * So nothing would update device:offline state. That would
2217                  * leave the sysfs entry stale and prevent onlining after
2218                  * smt control has been changed to 'off' again. This is
2219                  * called under the sysfs hotplug lock, so it is properly
2220                  * serialized against the regular offline usage.
2221                  */
2222                 cpuhp_offline_cpu_device(cpu);
2223         }
2224         if (!ret)
2225                 cpu_smt_control = ctrlval;
2226         cpu_maps_update_done();
2227         return ret;
2228 }
2229
2230 int cpuhp_smt_enable(void)
2231 {
2232         int cpu, ret = 0;
2233
2234         cpu_maps_update_begin();
2235         cpu_smt_control = CPU_SMT_ENABLED;
2236         for_each_present_cpu(cpu) {
2237                 /* Skip online CPUs and CPUs on offline nodes */
2238                 if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
2239                         continue;
2240                 ret = _cpu_up(cpu, 0, CPUHP_ONLINE);
2241                 if (ret)
2242                         break;
2243                 /* See comment in cpuhp_smt_disable() */
2244                 cpuhp_online_cpu_device(cpu);
2245         }
2246         cpu_maps_update_done();
2247         return ret;
2248 }
2249 #endif
2250
2251 #if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU)
2252 static ssize_t show_cpuhp_state(struct device *dev,
2253                                 struct device_attribute *attr, char *buf)
2254 {
2255         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
2256
2257         return sprintf(buf, "%d\n", st->state);
2258 }
2259 static DEVICE_ATTR(state, 0444, show_cpuhp_state, NULL);
2260
2261 static ssize_t write_cpuhp_target(struct device *dev,
2262                                   struct device_attribute *attr,
2263                                   const char *buf, size_t count)
2264 {
2265         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
2266         struct cpuhp_step *sp;
2267         int target, ret;
2268
2269         ret = kstrtoint(buf, 10, &target);
2270         if (ret)
2271                 return ret;
2272
2273 #ifdef CONFIG_CPU_HOTPLUG_STATE_CONTROL
2274         if (target < CPUHP_OFFLINE || target > CPUHP_ONLINE)
2275                 return -EINVAL;
2276 #else
2277         if (target != CPUHP_OFFLINE && target != CPUHP_ONLINE)
2278                 return -EINVAL;
2279 #endif
2280
2281         ret = lock_device_hotplug_sysfs();
2282         if (ret)
2283                 return ret;
2284
2285         mutex_lock(&cpuhp_state_mutex);
2286         sp = cpuhp_get_step(target);
2287         ret = !sp->name || sp->cant_stop ? -EINVAL : 0;
2288         mutex_unlock(&cpuhp_state_mutex);
2289         if (ret)
2290                 goto out;
2291
2292         if (st->state < target)
2293                 ret = cpu_up(dev->id, target);
2294         else
2295                 ret = cpu_down(dev->id, target);
2296 out:
2297         unlock_device_hotplug();
2298         return ret ? ret : count;
2299 }
2300
2301 static ssize_t show_cpuhp_target(struct device *dev,
2302                                  struct device_attribute *attr, char *buf)
2303 {
2304         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
2305
2306         return sprintf(buf, "%d\n", st->target);
2307 }
2308 static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target);
2309
2310
2311 static ssize_t write_cpuhp_fail(struct device *dev,
2312                                 struct device_attribute *attr,
2313                                 const char *buf, size_t count)
2314 {
2315         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
2316         struct cpuhp_step *sp;
2317         int fail, ret;
2318
2319         ret = kstrtoint(buf, 10, &fail);
2320         if (ret)
2321                 return ret;
2322
2323         if (fail == CPUHP_INVALID) {
2324                 st->fail = fail;
2325                 return count;
2326         }
2327
2328         if (fail < CPUHP_OFFLINE || fail > CPUHP_ONLINE)
2329                 return -EINVAL;
2330
2331         /*
2332          * Cannot fail STARTING/DYING callbacks.
2333          */
2334         if (cpuhp_is_atomic_state(fail))
2335                 return -EINVAL;
2336
2337         /*
2338          * DEAD callbacks cannot fail...
2339          * ... neither can CPUHP_BRINGUP_CPU during hotunplug. The latter
2340          * triggering STARTING callbacks, a failure in this state would
2341          * hinder rollback.
2342          */
2343         if (fail <= CPUHP_BRINGUP_CPU && st->state > CPUHP_BRINGUP_CPU)
2344                 return -EINVAL;
2345
2346         /*
2347          * Cannot fail anything that doesn't have callbacks.
2348          */
2349         mutex_lock(&cpuhp_state_mutex);
2350         sp = cpuhp_get_step(fail);
2351         if (!sp->startup.single && !sp->teardown.single)
2352                 ret = -EINVAL;
2353         mutex_unlock(&cpuhp_state_mutex);
2354         if (ret)
2355                 return ret;
2356
2357         st->fail = fail;
2358
2359         return count;
2360 }
2361
2362 static ssize_t show_cpuhp_fail(struct device *dev,
2363                                struct device_attribute *attr, char *buf)
2364 {
2365         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
2366
2367         return sprintf(buf, "%d\n", st->fail);
2368 }
2369
2370 static DEVICE_ATTR(fail, 0644, show_cpuhp_fail, write_cpuhp_fail);
2371
2372 static struct attribute *cpuhp_cpu_attrs[] = {
2373         &dev_attr_state.attr,
2374         &dev_attr_target.attr,
2375         &dev_attr_fail.attr,
2376         NULL
2377 };
2378
2379 static const struct attribute_group cpuhp_cpu_attr_group = {
2380         .attrs = cpuhp_cpu_attrs,
2381         .name = "hotplug",
2382         NULL
2383 };
2384
2385 static ssize_t show_cpuhp_states(struct device *dev,
2386                                  struct device_attribute *attr, char *buf)
2387 {
2388         ssize_t cur, res = 0;
2389         int i;
2390
2391         mutex_lock(&cpuhp_state_mutex);
2392         for (i = CPUHP_OFFLINE; i <= CPUHP_ONLINE; i++) {
2393                 struct cpuhp_step *sp = cpuhp_get_step(i);
2394
2395                 if (sp->name) {
2396                         cur = sprintf(buf, "%3d: %s\n", i, sp->name);
2397                         buf += cur;
2398                         res += cur;
2399                 }
2400         }
2401         mutex_unlock(&cpuhp_state_mutex);
2402         return res;
2403 }
2404 static DEVICE_ATTR(states, 0444, show_cpuhp_states, NULL);
2405
2406 static struct attribute *cpuhp_cpu_root_attrs[] = {
2407         &dev_attr_states.attr,
2408         NULL
2409 };
2410
2411 static const struct attribute_group cpuhp_cpu_root_attr_group = {
2412         .attrs = cpuhp_cpu_root_attrs,
2413         .name = "hotplug",
2414         NULL
2415 };
2416
2417 #ifdef CONFIG_HOTPLUG_SMT
2418
2419 static ssize_t
2420 __store_smt_control(struct device *dev, struct device_attribute *attr,
2421                     const char *buf, size_t count)
2422 {
2423         int ctrlval, ret;
2424
2425         if (sysfs_streq(buf, "on"))
2426                 ctrlval = CPU_SMT_ENABLED;
2427         else if (sysfs_streq(buf, "off"))
2428                 ctrlval = CPU_SMT_DISABLED;
2429         else if (sysfs_streq(buf, "forceoff"))
2430                 ctrlval = CPU_SMT_FORCE_DISABLED;
2431         else
2432                 return -EINVAL;
2433
2434         if (cpu_smt_control == CPU_SMT_FORCE_DISABLED)
2435                 return -EPERM;
2436
2437         if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
2438                 return -ENODEV;
2439
2440         ret = lock_device_hotplug_sysfs();
2441         if (ret)
2442                 return ret;
2443
2444         if (ctrlval != cpu_smt_control) {
2445                 switch (ctrlval) {
2446                 case CPU_SMT_ENABLED:
2447                         ret = cpuhp_smt_enable();
2448                         break;
2449                 case CPU_SMT_DISABLED:
2450                 case CPU_SMT_FORCE_DISABLED:
2451                         ret = cpuhp_smt_disable(ctrlval);
2452                         break;
2453                 }
2454         }
2455
2456         unlock_device_hotplug();
2457         return ret ? ret : count;
2458 }
2459
2460 #else /* !CONFIG_HOTPLUG_SMT */
2461 static ssize_t
2462 __store_smt_control(struct device *dev, struct device_attribute *attr,
2463                     const char *buf, size_t count)
2464 {
2465         return -ENODEV;
2466 }
2467 #endif /* CONFIG_HOTPLUG_SMT */
2468
2469 static const char *smt_states[] = {
2470         [CPU_SMT_ENABLED]               = "on",
2471         [CPU_SMT_DISABLED]              = "off",
2472         [CPU_SMT_FORCE_DISABLED]        = "forceoff",
2473         [CPU_SMT_NOT_SUPPORTED]         = "notsupported",
2474         [CPU_SMT_NOT_IMPLEMENTED]       = "notimplemented",
2475 };
2476
2477 static ssize_t
2478 show_smt_control(struct device *dev, struct device_attribute *attr, char *buf)
2479 {
2480         const char *state = smt_states[cpu_smt_control];
2481
2482         return snprintf(buf, PAGE_SIZE - 2, "%s\n", state);
2483 }
2484
2485 static ssize_t
2486 store_smt_control(struct device *dev, struct device_attribute *attr,
2487                   const char *buf, size_t count)
2488 {
2489         return __store_smt_control(dev, attr, buf, count);
2490 }
2491 static DEVICE_ATTR(control, 0644, show_smt_control, store_smt_control);
2492
2493 static ssize_t
2494 show_smt_active(struct device *dev, struct device_attribute *attr, char *buf)
2495 {
2496         return snprintf(buf, PAGE_SIZE - 2, "%d\n", sched_smt_active());
2497 }
2498 static DEVICE_ATTR(active, 0444, show_smt_active, NULL);
2499
2500 static struct attribute *cpuhp_smt_attrs[] = {
2501         &dev_attr_control.attr,
2502         &dev_attr_active.attr,
2503         NULL
2504 };
2505
2506 static const struct attribute_group cpuhp_smt_attr_group = {
2507         .attrs = cpuhp_smt_attrs,
2508         .name = "smt",
2509         NULL
2510 };
2511
2512 static int __init cpu_smt_sysfs_init(void)
2513 {
2514         return sysfs_create_group(&cpu_subsys.dev_root->kobj,
2515                                   &cpuhp_smt_attr_group);
2516 }
2517
2518 static int __init cpuhp_sysfs_init(void)
2519 {
2520         int cpu, ret;
2521
2522         ret = cpu_smt_sysfs_init();
2523         if (ret)
2524                 return ret;
2525
2526         ret = sysfs_create_group(&cpu_subsys.dev_root->kobj,
2527                                  &cpuhp_cpu_root_attr_group);
2528         if (ret)
2529                 return ret;
2530
2531         for_each_possible_cpu(cpu) {
2532                 struct device *dev = get_cpu_device(cpu);
2533
2534                 if (!dev)
2535                         continue;
2536                 ret = sysfs_create_group(&dev->kobj, &cpuhp_cpu_attr_group);
2537                 if (ret)
2538                         return ret;
2539         }
2540         return 0;
2541 }
2542 device_initcall(cpuhp_sysfs_init);
2543 #endif /* CONFIG_SYSFS && CONFIG_HOTPLUG_CPU */
2544
2545 /*
2546  * cpu_bit_bitmap[] is a special, "compressed" data structure that
2547  * represents all NR_CPUS bits binary values of 1<<nr.
2548  *
2549  * It is used by cpumask_of() to get a constant address to a CPU
2550  * mask value that has a single bit set only.
2551  */
2552
2553 /* cpu_bit_bitmap[0] is empty - so we can back into it */
2554 #define MASK_DECLARE_1(x)       [x+1][0] = (1UL << (x))
2555 #define MASK_DECLARE_2(x)       MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
2556 #define MASK_DECLARE_4(x)       MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
2557 #define MASK_DECLARE_8(x)       MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
2558
2559 const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {
2560
2561         MASK_DECLARE_8(0),      MASK_DECLARE_8(8),
2562         MASK_DECLARE_8(16),     MASK_DECLARE_8(24),
2563 #if BITS_PER_LONG > 32
2564         MASK_DECLARE_8(32),     MASK_DECLARE_8(40),
2565         MASK_DECLARE_8(48),     MASK_DECLARE_8(56),
2566 #endif
2567 };
2568 EXPORT_SYMBOL_GPL(cpu_bit_bitmap);
2569
2570 const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
2571 EXPORT_SYMBOL(cpu_all_bits);
2572
2573 #ifdef CONFIG_INIT_ALL_POSSIBLE
2574 struct cpumask __cpu_possible_mask __read_mostly
2575         = {CPU_BITS_ALL};
2576 #else
2577 struct cpumask __cpu_possible_mask __read_mostly;
2578 #endif
2579 EXPORT_SYMBOL(__cpu_possible_mask);
2580
2581 struct cpumask __cpu_online_mask __read_mostly;
2582 EXPORT_SYMBOL(__cpu_online_mask);
2583
2584 struct cpumask __cpu_present_mask __read_mostly;
2585 EXPORT_SYMBOL(__cpu_present_mask);
2586
2587 struct cpumask __cpu_active_mask __read_mostly;
2588 EXPORT_SYMBOL(__cpu_active_mask);
2589
2590 struct cpumask __cpu_dying_mask __read_mostly;
2591 EXPORT_SYMBOL(__cpu_dying_mask);
2592
2593 atomic_t __num_online_cpus __read_mostly;
2594 EXPORT_SYMBOL(__num_online_cpus);
2595
2596 void init_cpu_present(const struct cpumask *src)
2597 {
2598         cpumask_copy(&__cpu_present_mask, src);
2599 }
2600
2601 void init_cpu_possible(const struct cpumask *src)
2602 {
2603         cpumask_copy(&__cpu_possible_mask, src);
2604 }
2605
2606 void init_cpu_online(const struct cpumask *src)
2607 {
2608         cpumask_copy(&__cpu_online_mask, src);
2609 }
2610
2611 void set_cpu_online(unsigned int cpu, bool online)
2612 {
2613         /*
2614          * atomic_inc/dec() is required to handle the horrid abuse of this
2615          * function by the reboot and kexec code which invoke it from
2616          * IPI/NMI broadcasts when shutting down CPUs. Invocation from
2617          * regular CPU hotplug is properly serialized.
2618          *
2619          * Note, that the fact that __num_online_cpus is of type atomic_t
2620          * does not protect readers which are not serialized against
2621          * concurrent hotplug operations.
2622          */
2623         if (online) {
2624                 if (!cpumask_test_and_set_cpu(cpu, &__cpu_online_mask))
2625                         atomic_inc(&__num_online_cpus);
2626         } else {
2627                 if (cpumask_test_and_clear_cpu(cpu, &__cpu_online_mask))
2628                         atomic_dec(&__num_online_cpus);
2629         }
2630 }
2631
2632 /*
2633  * Activate the first processor.
2634  */
2635 void __init boot_cpu_init(void)
2636 {
2637         int cpu = smp_processor_id();
2638
2639         /* Mark the boot cpu "present", "online" etc for SMP and UP case */
2640         set_cpu_online(cpu, true);
2641         set_cpu_active(cpu, true);
2642         set_cpu_present(cpu, true);
2643         set_cpu_possible(cpu, true);
2644
2645 #ifdef CONFIG_SMP
2646         __boot_cpu_id = cpu;
2647 #endif
2648 }
2649
2650 /*
2651  * Must be called _AFTER_ setting up the per_cpu areas
2652  */
2653 void __init boot_cpu_hotplug_init(void)
2654 {
2655 #ifdef CONFIG_SMP
2656         cpumask_set_cpu(smp_processor_id(), &cpus_booted_once_mask);
2657 #endif
2658         this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
2659 }
2660
2661 /*
2662  * These are used for a global "mitigations=" cmdline option for toggling
2663  * optional CPU mitigations.
2664  */
2665 enum cpu_mitigations {
2666         CPU_MITIGATIONS_OFF,
2667         CPU_MITIGATIONS_AUTO,
2668         CPU_MITIGATIONS_AUTO_NOSMT,
2669 };
2670
2671 static enum cpu_mitigations cpu_mitigations __ro_after_init =
2672         CPU_MITIGATIONS_AUTO;
2673
2674 static int __init mitigations_parse_cmdline(char *arg)
2675 {
2676         if (!strcmp(arg, "off"))
2677                 cpu_mitigations = CPU_MITIGATIONS_OFF;
2678         else if (!strcmp(arg, "auto"))
2679                 cpu_mitigations = CPU_MITIGATIONS_AUTO;
2680         else if (!strcmp(arg, "auto,nosmt"))
2681                 cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT;
2682         else
2683                 pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n",
2684                         arg);
2685
2686         return 0;
2687 }
2688 early_param("mitigations", mitigations_parse_cmdline);
2689
2690 /* mitigations=off */
2691 bool cpu_mitigations_off(void)
2692 {
2693         return cpu_mitigations == CPU_MITIGATIONS_OFF;
2694 }
2695 EXPORT_SYMBOL_GPL(cpu_mitigations_off);
2696
2697 /* mitigations=auto,nosmt */
2698 bool cpu_mitigations_auto_nosmt(void)
2699 {
2700         return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT;
2701 }
2702 EXPORT_SYMBOL_GPL(cpu_mitigations_auto_nosmt);