kernel/cpu.c

   1 /* CPU control.
   2  * (C) 2001, 2002, 2003, 2004 Rusty Russell
   3  *
   4  * This code is licenced under the GPL.
   5  */
   6 #include <linux/sched/mm.h>
   7 #include <linux/proc_fs.h>
   8 #include <linux/smp.h>
   9 #include <linux/init.h>
  10 #include <linux/notifier.h>
  11 #include <linux/sched/signal.h>
  12 #include <linux/sched/hotplug.h>
  13 #include <linux/sched/isolation.h>
  14 #include <linux/sched/task.h>
  15 #include <linux/sched/smt.h>
  16 #include <linux/unistd.h>
  17 #include <linux/cpu.h>
  18 #include <linux/oom.h>
  19 #include <linux/rcupdate.h>
  20 #include <linux/export.h>
  21 #include <linux/bug.h>
  22 #include <linux/kthread.h>
  23 #include <linux/stop_machine.h>
  24 #include <linux/mutex.h>
  25 #include <linux/gfp.h>
  26 #include <linux/suspend.h>
  27 #include <linux/lockdep.h>
  28 #include <linux/tick.h>
  29 #include <linux/irq.h>
  30 #include <linux/nmi.h>
  31 #include <linux/smpboot.h>
  32 #include <linux/relay.h>
  33 #include <linux/slab.h>
  34 #include <linux/percpu-rwsem.h>
  35 #include <linux/cpuset.h>
  36
  37 #include <trace/events/power.h>
  38 #define CREATE_TRACE_POINTS
  39 #include <trace/events/cpuhp.h>
  40
  41 #include "smpboot.h"
  42
  43 /**
  44  * struct cpuhp_cpu_state - Per cpu hotplug state storage
  45  * @state:      The current cpu state
  46  * @target:     The target state
  47  * @fail:       Current CPU hotplug callback state
  48  * @thread:     Pointer to the hotplug thread
  49  * @should_run: Thread should execute
  50  * @rollback:   Perform a rollback
  51  * @single:     Single callback invocation
  52  * @bringup:    Single callback bringup or teardown selector
  53  * @cpu:        CPU number
  54  * @node:       Remote CPU node; for multi-instance, do a
  55  *              single entry callback for install/remove
  56  * @last:       For multi-instance rollback, remember how far we got
  57  * @cb_state:   The state for a single callback (install/uninstall)
  58  * @result:     Result of the operation
  59  * @done_up:    Signal completion to the issuer of the task for cpu-up
  60  * @done_down:  Signal completion to the issuer of the task for cpu-down
  61  */
  62 struct cpuhp_cpu_state {
  63         enum cpuhp_state        state;
  64         enum cpuhp_state        target;
  65         enum cpuhp_state        fail;
  66 #ifdef CONFIG_SMP
  67         struct task_struct      *thread;
  68         bool                    should_run;
  69         bool                    rollback;
  70         bool                    single;
  71         bool                    bringup;
  72         int                     cpu;
  73         struct hlist_node       *node;
  74         struct hlist_node       *last;
  75         enum cpuhp_state        cb_state;
  76         int                     result;
  77         struct completion       done_up;
  78         struct completion       done_down;
  79 #endif
  80 };
  81
  82 static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = {
  83         .fail = CPUHP_INVALID,
  84 };
  85
  86 #ifdef CONFIG_SMP
  87 cpumask_t cpus_booted_once_mask;
  88 #endif
  89
  90 #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP)
  91 static struct lockdep_map cpuhp_state_up_map =
  92         STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map);
  93 static struct lockdep_map cpuhp_state_down_map =
  94         STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map);
  95
  96
  97 static inline void cpuhp_lock_acquire(bool bringup)
  98 {
  99         lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
 100 }
 101
 102 static inline void cpuhp_lock_release(bool bringup)
 103 {
 104         lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
 105 }
 106 #else
 107
 108 static inline void cpuhp_lock_acquire(bool bringup) { }
 109 static inline void cpuhp_lock_release(bool bringup) { }
 110
 111 #endif
 112
 113 /**
 114  * struct cpuhp_step - Hotplug state machine step
 115  * @name:       Name of the step
 116  * @startup:    Startup function of the step
 117  * @teardown:   Teardown function of the step
 118  * @cant_stop:  Bringup/teardown can't be stopped at this step
 119  * @multi_instance:     State has multiple instances which get added afterwards
 120  */
 121 struct cpuhp_step {
 122         const char              *name;
 123         union {
 124                 int             (*single)(unsigned int cpu);
 125                 int             (*multi)(unsigned int cpu,
 126                                          struct hlist_node *node);
 127         } startup;
 128         union {
 129                 int             (*single)(unsigned int cpu);
 130                 int             (*multi)(unsigned int cpu,
 131                                          struct hlist_node *node);
 132         } teardown;
 133         /* private: */
 134         struct hlist_head       list;
 135         /* public: */
 136         bool                    cant_stop;
 137         bool                    multi_instance;
 138 };
 139
 140 static DEFINE_MUTEX(cpuhp_state_mutex);
 141 static struct cpuhp_step cpuhp_hp_states[];
 142
 143 static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
 144 {
 145         return cpuhp_hp_states + state;
 146 }
 147
 148 static bool cpuhp_step_empty(bool bringup, struct cpuhp_step *step)
 149 {
 150         return bringup ? !step->startup.single : !step->teardown.single;
 151 }
 152
 153 /**
 154  * cpuhp_invoke_callback - Invoke the callbacks for a given state
 155  * @cpu:        The cpu for which the callback should be invoked
 156  * @state:      The state to do callbacks for
 157  * @bringup:    True if the bringup callback should be invoked
 158  * @node:       For multi-instance, do a single entry callback for install/remove
 159  * @lastp:      For multi-instance rollback, remember how far we got
 160  *
 161  * Called from cpu hotplug and from the state register machinery.
 162  *
 163  * Return: %0 on success or a negative errno code
 164  */
 165 static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
 166                                  bool bringup, struct hlist_node *node,
 167                                  struct hlist_node **lastp)
 168 {
 169         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
 170         struct cpuhp_step *step = cpuhp_get_step(state);
 171         int (*cbm)(unsigned int cpu, struct hlist_node *node);
 172         int (*cb)(unsigned int cpu);
 173         int ret, cnt;
 174
 175         if (st->fail == state) {
 176                 st->fail = CPUHP_INVALID;
 177                 return -EAGAIN;
 178         }
 179
 180         if (cpuhp_step_empty(bringup, step)) {
 181                 WARN_ON_ONCE(1);
 182                 return 0;
 183         }
 184
 185         if (!step->multi_instance) {
 186                 WARN_ON_ONCE(lastp && *lastp);
 187                 cb = bringup ? step->startup.single : step->teardown.single;
 188
 189                 trace_cpuhp_enter(cpu, st->target, state, cb);
 190                 ret = cb(cpu);
 191                 trace_cpuhp_exit(cpu, st->state, state, ret);
 192                 return ret;
 193         }
 194         cbm = bringup ? step->startup.multi : step->teardown.multi;
 195
 196         /* Single invocation for instance add/remove */
 197         if (node) {
 198                 WARN_ON_ONCE(lastp && *lastp);
 199                 trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
 200                 ret = cbm(cpu, node);
 201                 trace_cpuhp_exit(cpu, st->state, state, ret);
 202                 return ret;
 203         }
 204
 205         /* State transition. Invoke on all instances */
 206         cnt = 0;
 207         hlist_for_each(node, &step->list) {
 208                 if (lastp && node == *lastp)
 209                         break;
 210
 211                 trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
 212                 ret = cbm(cpu, node);
 213                 trace_cpuhp_exit(cpu, st->state, state, ret);
 214                 if (ret) {
 215                         if (!lastp)
 216                                 goto err;
 217
 218                         *lastp = node;
 219                         return ret;
 220                 }
 221                 cnt++;
 222         }
 223         if (lastp)
 224                 *lastp = NULL;
 225         return 0;
 226 err:
 227         /* Rollback the instances if one failed */
 228         cbm = !bringup ? step->startup.multi : step->teardown.multi;
 229         if (!cbm)
 230                 return ret;
 231
 232         hlist_for_each(node, &step->list) {
 233                 if (!cnt--)
 234                         break;
 235
 236                 trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
 237                 ret = cbm(cpu, node);
 238                 trace_cpuhp_exit(cpu, st->state, state, ret);
 239                 /*
 240                  * Rollback must not fail,
 241                  */
 242                 WARN_ON_ONCE(ret);
 243         }
 244         return ret;
 245 }
 246
 247 #ifdef CONFIG_SMP
 248 static bool cpuhp_is_ap_state(enum cpuhp_state state)
 249 {
 250         /*
 251          * The extra check for CPUHP_TEARDOWN_CPU is only for documentation
 252          * purposes as that state is handled explicitly in cpu_down.
 253          */
 254         return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU;
 255 }
 256
 257 static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
 258 {
 259         struct completion *done = bringup ? &st->done_up : &st->done_down;
 260         wait_for_completion(done);
 261 }
 262
 263 static inline void complete_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
 264 {
 265         struct completion *done = bringup ? &st->done_up : &st->done_down;
 266         complete(done);
 267 }
 268
 269 /*
 270  * The former STARTING/DYING states, ran with IRQs disabled and must not fail.
 271  */
 272 static bool cpuhp_is_atomic_state(enum cpuhp_state state)
 273 {
 274         return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE;
 275 }
 276
 277 /* Serializes the updates to cpu_online_mask, cpu_present_mask */
 278 static DEFINE_MUTEX(cpu_add_remove_lock);
 279 bool cpuhp_tasks_frozen;
 280 EXPORT_SYMBOL_GPL(cpuhp_tasks_frozen);
 281
 282 /*
 283  * The following two APIs (cpu_maps_update_begin/done) must be used when
 284  * attempting to serialize the updates to cpu_online_mask & cpu_present_mask.
 285  */
 286 void cpu_maps_update_begin(void)
 287 {
 288         mutex_lock(&cpu_add_remove_lock);
 289 }
 290
 291 void cpu_maps_update_done(void)
 292 {
 293         mutex_unlock(&cpu_add_remove_lock);
 294 }
 295
 296 /*
 297  * If set, cpu_up and cpu_down will return -EBUSY and do nothing.
 298  * Should always be manipulated under cpu_add_remove_lock
 299  */
 300 static int cpu_hotplug_disabled;
 301
 302 #ifdef CONFIG_HOTPLUG_CPU
 303
 304 DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock);
 305
 306 void cpus_read_lock(void)
 307 {
 308         percpu_down_read(&cpu_hotplug_lock);
 309 }
 310 EXPORT_SYMBOL_GPL(cpus_read_lock);
 311
 312 int cpus_read_trylock(void)
 313 {
 314         return percpu_down_read_trylock(&cpu_hotplug_lock);
 315 }
 316 EXPORT_SYMBOL_GPL(cpus_read_trylock);
 317
 318 void cpus_read_unlock(void)
 319 {
 320         percpu_up_read(&cpu_hotplug_lock);
 321 }
 322 EXPORT_SYMBOL_GPL(cpus_read_unlock);
 323
 324 void cpus_write_lock(void)
 325 {
 326         percpu_down_write(&cpu_hotplug_lock);
 327 }
 328
 329 void cpus_write_unlock(void)
 330 {
 331         percpu_up_write(&cpu_hotplug_lock);
 332 }
 333
 334 void lockdep_assert_cpus_held(void)
 335 {
 336         /*
 337          * We can't have hotplug operations before userspace starts running,
 338          * and some init codepaths will knowingly not take the hotplug lock.
 339          * This is all valid, so mute lockdep until it makes sense to report
 340          * unheld locks.
 341          */
 342         if (system_state < SYSTEM_RUNNING)
 343                 return;
 344
 345         percpu_rwsem_assert_held(&cpu_hotplug_lock);
 346 }
 347
 348 #ifdef CONFIG_LOCKDEP
 349 int lockdep_is_cpus_held(void)
 350 {
 351         return percpu_rwsem_is_held(&cpu_hotplug_lock);
 352 }
 353 #endif
 354
 355 static void lockdep_acquire_cpus_lock(void)
 356 {
 357         rwsem_acquire(&cpu_hotplug_lock.dep_map, 0, 0, _THIS_IP_);
 358 }
 359
 360 static void lockdep_release_cpus_lock(void)
 361 {
 362         rwsem_release(&cpu_hotplug_lock.dep_map, _THIS_IP_);
 363 }
 364
 365 /*
 366  * Wait for currently running CPU hotplug operations to complete (if any) and
 367  * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects
 368  * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the
 369  * hotplug path before performing hotplug operations. So acquiring that lock
 370  * guarantees mutual exclusion from any currently running hotplug operations.
 371  */
 372 void cpu_hotplug_disable(void)
 373 {
 374         cpu_maps_update_begin();
 375         cpu_hotplug_disabled++;
 376         cpu_maps_update_done();
 377 }
 378 EXPORT_SYMBOL_GPL(cpu_hotplug_disable);
 379
 380 static void __cpu_hotplug_enable(void)
 381 {
 382         if (WARN_ONCE(!cpu_hotplug_disabled, "Unbalanced cpu hotplug enable\n"))
 383                 return;
 384         cpu_hotplug_disabled--;
 385 }
 386
 387 void cpu_hotplug_enable(void)
 388 {
 389         cpu_maps_update_begin();
 390         __cpu_hotplug_enable();
 391         cpu_maps_update_done();
 392 }
 393 EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
 394
 395 #else
 396
 397 static void lockdep_acquire_cpus_lock(void)
 398 {
 399 }
 400
 401 static void lockdep_release_cpus_lock(void)
 402 {
 403 }
 404
 405 #endif  /* CONFIG_HOTPLUG_CPU */
 406
 407 /*
 408  * Architectures that need SMT-specific errata handling during SMT hotplug
 409  * should override this.
 410  */
 411 void __weak arch_smt_update(void) { }
 412
 413 #ifdef CONFIG_HOTPLUG_SMT
 414 enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;
 415
 416 void __init cpu_smt_disable(bool force)
 417 {
 418         if (!cpu_smt_possible())
 419                 return;
 420
 421         if (force) {
 422                 pr_info("SMT: Force disabled\n");
 423                 cpu_smt_control = CPU_SMT_FORCE_DISABLED;
 424         } else {
 425                 pr_info("SMT: disabled\n");
 426                 cpu_smt_control = CPU_SMT_DISABLED;
 427         }
 428 }
 429
 430 /*
 431  * The decision whether SMT is supported can only be done after the full
 432  * CPU identification. Called from architecture code.
 433  */
 434 void __init cpu_smt_check_topology(void)
 435 {
 436         if (!topology_smt_supported())
 437                 cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
 438 }
 439
 440 static int __init smt_cmdline_disable(char *str)
 441 {
 442         cpu_smt_disable(str && !strcmp(str, "force"));
 443         return 0;
 444 }
 445 early_param("nosmt", smt_cmdline_disable);
 446
 447 static inline bool cpu_smt_allowed(unsigned int cpu)
 448 {
 449         if (cpu_smt_control == CPU_SMT_ENABLED)
 450                 return true;
 451
 452         if (topology_is_primary_thread(cpu))
 453                 return true;
 454
 455         /*
 456          * On x86 it's required to boot all logical CPUs at least once so
 457          * that the init code can get a chance to set CR4.MCE on each
 458          * CPU. Otherwise, a broadcasted MCE observing CR4.MCE=0b on any
 459          * core will shutdown the machine.
 460          */
 461         return !cpumask_test_cpu(cpu, &cpus_booted_once_mask);
 462 }
 463
 464 /* Returns true if SMT is not supported of forcefully (irreversibly) disabled */
 465 bool cpu_smt_possible(void)
 466 {
 467         return cpu_smt_control != CPU_SMT_FORCE_DISABLED &&
 468                 cpu_smt_control != CPU_SMT_NOT_SUPPORTED;
 469 }
 470 EXPORT_SYMBOL_GPL(cpu_smt_possible);
 471 #else
 472 static inline bool cpu_smt_allowed(unsigned int cpu) { return true; }
 473 #endif
 474
 475 static inline enum cpuhp_state
 476 cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
 477 {
 478         enum cpuhp_state prev_state = st->state;
 479         bool bringup = st->state < target;
 480
 481         st->rollback = false;
 482         st->last = NULL;
 483
 484         st->target = target;
 485         st->single = false;
 486         st->bringup = bringup;
 487         if (cpu_dying(st->cpu) != !bringup)
 488                 set_cpu_dying(st->cpu, !bringup);
 489
 490         return prev_state;
 491 }
 492
 493 static inline void
 494 cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
 495 {
 496         bool bringup = !st->bringup;
 497
 498         st->target = prev_state;
 499
 500         /*
 501          * Already rolling back. No need invert the bringup value or to change
 502          * the current state.
 503          */
 504         if (st->rollback)
 505                 return;
 506
 507         st->rollback = true;
 508
 509         /*
 510          * If we have st->last we need to undo partial multi_instance of this
 511          * state first. Otherwise start undo at the previous state.
 512          */
 513         if (!st->last) {
 514                 if (st->bringup)
 515                         st->state--;
 516                 else
 517                         st->state++;
 518         }
 519
 520         st->bringup = bringup;
 521         if (cpu_dying(st->cpu) != !bringup)
 522                 set_cpu_dying(st->cpu, !bringup);
 523 }
 524
 525 /* Regular hotplug invocation of the AP hotplug thread */
 526 static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st)
 527 {
 528         if (!st->single && st->state == st->target)
 529                 return;
 530
 531         st->result = 0;
 532         /*
 533          * Make sure the above stores are visible before should_run becomes
 534          * true. Paired with the mb() above in cpuhp_thread_fun()
 535          */
 536         smp_mb();
 537         st->should_run = true;
 538         wake_up_process(st->thread);
 539         wait_for_ap_thread(st, st->bringup);
 540 }
 541
 542 static int cpuhp_kick_ap(struct cpuhp_cpu_state *st, enum cpuhp_state target)
 543 {
 544         enum cpuhp_state prev_state;
 545         int ret;
 546
 547         prev_state = cpuhp_set_state(st, target);
 548         __cpuhp_kick_ap(st);
 549         if ((ret = st->result)) {
 550                 cpuhp_reset_state(st, prev_state);
 551                 __cpuhp_kick_ap(st);
 552         }
 553
 554         return ret;
 555 }
 556
 557 static int bringup_wait_for_ap(unsigned int cpu)
 558 {
 559         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
 560
 561         /* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */
 562         wait_for_ap_thread(st, true);
 563         if (WARN_ON_ONCE((!cpu_online(cpu))))
 564                 return -ECANCELED;
 565
 566         /* Unpark the hotplug thread of the target cpu */
 567         kthread_unpark(st->thread);
 568
 569         /*
 570          * SMT soft disabling on X86 requires to bring the CPU out of the
 571          * BIOS 'wait for SIPI' state in order to set the CR4.MCE bit.  The
 572          * CPU marked itself as booted_once in notify_cpu_starting() so the
 573          * cpu_smt_allowed() check will now return false if this is not the
 574          * primary sibling.
 575          */
 576         if (!cpu_smt_allowed(cpu))
 577                 return -ECANCELED;
 578
 579         if (st->target <= CPUHP_AP_ONLINE_IDLE)
 580                 return 0;
 581
 582         return cpuhp_kick_ap(st, st->target);
 583 }
 584
 585 static int bringup_cpu(unsigned int cpu)
 586 {
 587         struct task_struct *idle = idle_thread_get(cpu);
 588         int ret;
 589
 590         /*
 591          * Some architectures have to walk the irq descriptors to
 592          * setup the vector space for the cpu which comes online.
 593          * Prevent irq alloc/free across the bringup.
 594          */
 595         irq_lock_sparse();
 596
 597         /* Arch-specific enabling code. */
 598         ret = __cpu_up(cpu, idle);
 599         irq_unlock_sparse();
 600         if (ret)
 601                 return ret;
 602         return bringup_wait_for_ap(cpu);
 603 }
 604
 605 static int finish_cpu(unsigned int cpu)
 606 {
 607         struct task_struct *idle = idle_thread_get(cpu);
 608         struct mm_struct *mm = idle->active_mm;
 609
 610         /*
 611          * idle_task_exit() will have switched to &init_mm, now
 612          * clean up any remaining active_mm state.
 613          */
 614         if (mm != &init_mm)
 615                 idle->active_mm = &init_mm;
 616         mmdrop(mm);
 617         return 0;
 618 }
 619
 620 /*
 621  * Hotplug state machine related functions
 622  */
 623
 624 /*
 625  * Get the next state to run. Empty ones will be skipped. Returns true if a
 626  * state must be run.
 627  *
 628  * st->state will be modified ahead of time, to match state_to_run, as if it
 629  * has already ran.
 630  */
 631 static bool cpuhp_next_state(bool bringup,
 632                              enum cpuhp_state *state_to_run,
 633                              struct cpuhp_cpu_state *st,
 634                              enum cpuhp_state target)
 635 {
 636         do {
 637                 if (bringup) {
 638                         if (st->state >= target)
 639                                 return false;
 640
 641                         *state_to_run = ++st->state;
 642                 } else {
 643                         if (st->state <= target)
 644                                 return false;
 645
 646                         *state_to_run = st->state--;
 647                 }
 648
 649                 if (!cpuhp_step_empty(bringup, cpuhp_get_step(*state_to_run)))
 650                         break;
 651         } while (true);
 652
 653         return true;
 654 }
 655
 656 static int cpuhp_invoke_callback_range(bool bringup,
 657                                        unsigned int cpu,
 658                                        struct cpuhp_cpu_state *st,
 659                                        enum cpuhp_state target)
 660 {
 661         enum cpuhp_state state;
 662         int err = 0;
 663
 664         while (cpuhp_next_state(bringup, &state, st, target)) {
 665                 err = cpuhp_invoke_callback(cpu, state, bringup, NULL, NULL);
 666                 if (err)
 667                         break;
 668         }
 669
 670         return err;
 671 }
 672
 673 static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st)
 674 {
 675         if (IS_ENABLED(CONFIG_HOTPLUG_CPU))
 676                 return true;
 677         /*
 678          * When CPU hotplug is disabled, then taking the CPU down is not
 679          * possible because takedown_cpu() and the architecture and
 680          * subsystem specific mechanisms are not available. So the CPU
 681          * which would be completely unplugged again needs to stay around
 682          * in the current state.
 683          */
 684         return st->state <= CPUHP_BRINGUP_CPU;
 685 }
 686
 687 static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
 688                               enum cpuhp_state target)
 689 {
 690         enum cpuhp_state prev_state = st->state;
 691         int ret = 0;
 692
 693         ret = cpuhp_invoke_callback_range(true, cpu, st, target);
 694         if (ret) {
 695                 cpuhp_reset_state(st, prev_state);
 696                 if (can_rollback_cpu(st))
 697                         WARN_ON(cpuhp_invoke_callback_range(false, cpu, st,
 698                                                             prev_state));
 699         }
 700         return ret;
 701 }
 702
 703 /*
 704  * The cpu hotplug threads manage the bringup and teardown of the cpus
 705  */
 706 static void cpuhp_create(unsigned int cpu)
 707 {
 708         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
 709
 710         init_completion(&st->done_up);
 711         init_completion(&st->done_down);
 712         st->cpu = cpu;
 713 }
 714
 715 static int cpuhp_should_run(unsigned int cpu)
 716 {
 717         struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
 718
 719         return st->should_run;
 720 }
 721
 722 /*
 723  * Execute teardown/startup callbacks on the plugged cpu. Also used to invoke
 724  * callbacks when a state gets [un]installed at runtime.
 725  *
 726  * Each invocation of this function by the smpboot thread does a single AP
 727  * state callback.
 728  *
 729  * It has 3 modes of operation:
 730  *  - single: runs st->cb_state
 731  *  - up:     runs ++st->state, while st->state < st->target
 732  *  - down:   runs st->state--, while st->state > st->target
 733  *
 734  * When complete or on error, should_run is cleared and the completion is fired.
 735  */
 736 static void cpuhp_thread_fun(unsigned int cpu)
 737 {
 738         struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
 739         bool bringup = st->bringup;
 740         enum cpuhp_state state;
 741
 742         if (WARN_ON_ONCE(!st->should_run))
 743                 return;
 744
 745         /*
 746          * ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures
 747          * that if we see ->should_run we also see the rest of the state.
 748          */
 749         smp_mb();
 750
 751         /*
 752          * The BP holds the hotplug lock, but we're now running on the AP,
 753          * ensure that anybody asserting the lock is held, will actually find
 754          * it so.
 755          */
 756         lockdep_acquire_cpus_lock();
 757         cpuhp_lock_acquire(bringup);
 758
 759         if (st->single) {
 760                 state = st->cb_state;
 761                 st->should_run = false;
 762         } else {
 763                 st->should_run = cpuhp_next_state(bringup, &state, st, st->target);
 764                 if (!st->should_run)
 765                         goto end;
 766         }
 767
 768         WARN_ON_ONCE(!cpuhp_is_ap_state(state));
 769
 770         if (cpuhp_is_atomic_state(state)) {
 771                 local_irq_disable();
 772                 st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
 773                 local_irq_enable();
 774
 775                 /*
 776                  * STARTING/DYING must not fail!
 777                  */
 778                 WARN_ON_ONCE(st->result);
 779         } else {
 780                 st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
 781         }
 782
 783         if (st->result) {
 784                 /*
 785                  * If we fail on a rollback, we're up a creek without no
 786                  * paddle, no way forward, no way back. We loose, thanks for
 787                  * playing.
 788                  */
 789                 WARN_ON_ONCE(st->rollback);
 790                 st->should_run = false;
 791         }
 792
 793 end:
 794         cpuhp_lock_release(bringup);
 795         lockdep_release_cpus_lock();
 796
 797         if (!st->should_run)
 798                 complete_ap_thread(st, bringup);
 799 }
 800
 801 /* Invoke a single callback on a remote cpu */
 802 static int
 803 cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup,
 804                          struct hlist_node *node)
 805 {
 806         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
 807         int ret;
 808
 809         if (!cpu_online(cpu))
 810                 return 0;
 811
 812         cpuhp_lock_acquire(false);
 813         cpuhp_lock_release(false);
 814
 815         cpuhp_lock_acquire(true);
 816         cpuhp_lock_release(true);
 817
 818         /*
 819          * If we are up and running, use the hotplug thread. For early calls
 820          * we invoke the thread function directly.
 821          */
 822         if (!st->thread)
 823                 return cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
 824
 825         st->rollback = false;
 826         st->last = NULL;
 827
 828         st->node = node;
 829         st->bringup = bringup;
 830         st->cb_state = state;
 831         st->single = true;
 832
 833         __cpuhp_kick_ap(st);
 834
 835         /*
 836          * If we failed and did a partial, do a rollback.
 837          */
 838         if ((ret = st->result) && st->last) {
 839                 st->rollback = true;
 840                 st->bringup = !bringup;
 841
 842                 __cpuhp_kick_ap(st);
 843         }
 844
 845         /*
 846          * Clean up the leftovers so the next hotplug operation wont use stale
 847          * data.
 848          */
 849         st->node = st->last = NULL;
 850         return ret;
 851 }
 852
 853 static int cpuhp_kick_ap_work(unsigned int cpu)
 854 {
 855         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
 856         enum cpuhp_state prev_state = st->state;
 857         int ret;
 858
 859         cpuhp_lock_acquire(false);
 860         cpuhp_lock_release(false);
 861
 862         cpuhp_lock_acquire(true);
 863         cpuhp_lock_release(true);
 864
 865         trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work);
 866         ret = cpuhp_kick_ap(st, st->target);
 867         trace_cpuhp_exit(cpu, st->state, prev_state, ret);
 868
 869         return ret;
 870 }
 871
 872 static struct smp_hotplug_thread cpuhp_threads = {
 873         .store                  = &cpuhp_state.thread,
 874         .create                 = &cpuhp_create,
 875         .thread_should_run      = cpuhp_should_run,
 876         .thread_fn              = cpuhp_thread_fun,
 877         .thread_comm            = "cpuhp/%u",
 878         .selfparking            = true,
 879 };
 880
 881 void __init cpuhp_threads_init(void)
 882 {
 883         BUG_ON(smpboot_register_percpu_thread(&cpuhp_threads));
 884         kthread_unpark(this_cpu_read(cpuhp_state.thread));
 885 }
 886
 887 /*
 888  *
 889  * Serialize hotplug trainwrecks outside of the cpu_hotplug_lock
 890  * protected region.
 891  *
 892  * The operation is still serialized against concurrent CPU hotplug via
 893  * cpu_add_remove_lock, i.e. CPU map protection.  But it is _not_
 894  * serialized against other hotplug related activity like adding or
 895  * removing of state callbacks and state instances, which invoke either the
 896  * startup or the teardown callback of the affected state.
 897  *
 898  * This is required for subsystems which are unfixable vs. CPU hotplug and
 899  * evade lock inversion problems by scheduling work which has to be
 900  * completed _before_ cpu_up()/_cpu_down() returns.
 901  *
 902  * Don't even think about adding anything to this for any new code or even
 903  * drivers. It's only purpose is to keep existing lock order trainwrecks
 904  * working.
 905  *
 906  * For cpu_down() there might be valid reasons to finish cleanups which are
 907  * not required to be done under cpu_hotplug_lock, but that's a different
 908  * story and would be not invoked via this.
 909  */
 910 static void cpu_up_down_serialize_trainwrecks(bool tasks_frozen)
 911 {
 912         /*
 913          * cpusets delegate hotplug operations to a worker to "solve" the
 914          * lock order problems. Wait for the worker, but only if tasks are
 915          * _not_ frozen (suspend, hibernate) as that would wait forever.
 916          *
 917          * The wait is required because otherwise the hotplug operation
 918          * returns with inconsistent state, which could even be observed in
 919          * user space when a new CPU is brought up. The CPU plug uevent
 920          * would be delivered and user space reacting on it would fail to
 921          * move tasks to the newly plugged CPU up to the point where the
 922          * work has finished because up to that point the newly plugged CPU
 923          * is not assignable in cpusets/cgroups. On unplug that's not
 924          * necessarily a visible issue, but it is still inconsistent state,
 925          * which is the real problem which needs to be "fixed". This can't
 926          * prevent the transient state between scheduling the work and
 927          * returning from waiting for it.
 928          */
 929         if (!tasks_frozen)
 930                 cpuset_wait_for_hotplug();
 931 }
 932
 933 #ifdef CONFIG_HOTPLUG_CPU
 934 #ifndef arch_clear_mm_cpumask_cpu
 935 #define arch_clear_mm_cpumask_cpu(cpu, mm) cpumask_clear_cpu(cpu, mm_cpumask(mm))
 936 #endif
 937
 938 /**
 939  * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU
 940  * @cpu: a CPU id
 941  *
 942  * This function walks all processes, finds a valid mm struct for each one and
 943  * then clears a corresponding bit in mm's cpumask.  While this all sounds
 944  * trivial, there are various non-obvious corner cases, which this function
 945  * tries to solve in a safe manner.
 946  *
 947  * Also note that the function uses a somewhat relaxed locking scheme, so it may
 948  * be called only for an already offlined CPU.
 949  */
 950 void clear_tasks_mm_cpumask(int cpu)
 951 {
 952         struct task_struct *p;
 953
 954         /*
 955          * This function is called after the cpu is taken down and marked
 956          * offline, so its not like new tasks will ever get this cpu set in
 957          * their mm mask. -- Peter Zijlstra
 958          * Thus, we may use rcu_read_lock() here, instead of grabbing
 959          * full-fledged tasklist_lock.
 960          */
 961         WARN_ON(cpu_online(cpu));
 962         rcu_read_lock();
 963         for_each_process(p) {
 964                 struct task_struct *t;
 965
 966                 /*
 967                  * Main thread might exit, but other threads may still have
 968                  * a valid mm. Find one.
 969                  */
 970                 t = find_lock_task_mm(p);
 971                 if (!t)
 972                         continue;
 973                 arch_clear_mm_cpumask_cpu(cpu, t->mm);
 974                 task_unlock(t);
 975         }
 976         rcu_read_unlock();
 977 }
 978
 979 /* Take this CPU down. */
 980 static int take_cpu_down(void *_param)
 981 {
 982         struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
 983         enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE);
 984         int err, cpu = smp_processor_id();
 985         int ret;
 986
 987         /* Ensure this CPU doesn't handle any more interrupts. */
 988         err = __cpu_disable();
 989         if (err < 0)
 990                 return err;
 991
 992         /*
 993          * Must be called from CPUHP_TEARDOWN_CPU, which means, as we are going
 994          * down, that the current state is CPUHP_TEARDOWN_CPU - 1.
 995          */
 996         WARN_ON(st->state != (CPUHP_TEARDOWN_CPU - 1));
 997
 998         /* Invoke the former CPU_DYING callbacks */
 999         ret = cpuhp_invoke_callback_range(false, cpu, st, target);
1000
1001         /*
1002          * DYING must not fail!
1003          */
1004         WARN_ON_ONCE(ret);
1005
1006         /* Give up timekeeping duties */
1007         tick_handover_do_timer();
1008         /* Remove CPU from timer broadcasting */
1009         tick_offline_cpu(cpu);
1010         /* Park the stopper thread */
1011         stop_machine_park(cpu);
1012         return 0;
1013 }
1014
1015 static int takedown_cpu(unsigned int cpu)
1016 {
1017         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
1018         int err;
1019
1020         /* Park the smpboot threads */
1021         kthread_park(st->thread);
1022
1023         /*
1024          * Prevent irq alloc/free while the dying cpu reorganizes the
1025          * interrupt affinities.
1026          */
1027         irq_lock_sparse();
1028
1029         /*
1030          * So now all preempt/rcu users must observe !cpu_active().
1031          */
1032         err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu));
1033         if (err) {
1034                 /* CPU refused to die */
1035                 irq_unlock_sparse();
1036                 /* Unpark the hotplug thread so we can rollback there */
1037                 kthread_unpark(st->thread);
1038                 return err;
1039         }
1040         BUG_ON(cpu_online(cpu));
1041
1042         /*
1043          * The teardown callback for CPUHP_AP_SCHED_STARTING will have removed
1044          * all runnable tasks from the CPU, there's only the idle task left now
1045          * that the migration thread is done doing the stop_machine thing.
1046          *
1047          * Wait for the stop thread to go away.
1048          */
1049         wait_for_ap_thread(st, false);
1050         BUG_ON(st->state != CPUHP_AP_IDLE_DEAD);
1051
1052         /* Interrupts are moved away from the dying cpu, reenable alloc/free */
1053         irq_unlock_sparse();
1054
1055         hotplug_cpu__broadcast_tick_pull(cpu);
1056         /* This actually kills the CPU. */
1057         __cpu_die(cpu);
1058
1059         tick_cleanup_dead_cpu(cpu);
1060         rcutree_migrate_callbacks(cpu);
1061         return 0;
1062 }
1063
1064 static void cpuhp_complete_idle_dead(void *arg)
1065 {
1066         struct cpuhp_cpu_state *st = arg;
1067
1068         complete_ap_thread(st, false);
1069 }
1070
1071 void cpuhp_report_idle_dead(void)
1072 {
1073         struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
1074
1075         BUG_ON(st->state != CPUHP_AP_OFFLINE);
1076         rcu_report_dead(smp_processor_id());
1077         st->state = CPUHP_AP_IDLE_DEAD;
1078         /*
1079          * We cannot call complete after rcu_report_dead() so we delegate it
1080          * to an online cpu.
1081          */
1082         smp_call_function_single(cpumask_first(cpu_online_mask),
1083                                  cpuhp_complete_idle_dead, st, 0);
1084 }
1085
1086 static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
1087                                 enum cpuhp_state target)
1088 {
1089         enum cpuhp_state prev_state = st->state;
1090         int ret = 0;
1091
1092         ret = cpuhp_invoke_callback_range(false, cpu, st, target);
1093         if (ret) {
1094
1095                 cpuhp_reset_state(st, prev_state);
1096
1097                 if (st->state < prev_state)
1098                         WARN_ON(cpuhp_invoke_callback_range(true, cpu, st,
1099                                                             prev_state));
1100         }
1101
1102         return ret;
1103 }
1104
1105 /* Requires cpu_add_remove_lock to be held */
1106 static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
1107                            enum cpuhp_state target)
1108 {
1109         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
1110         int prev_state, ret = 0;
1111
1112         if (num_online_cpus() == 1)
1113                 return -EBUSY;
1114
1115         if (!cpu_present(cpu))
1116                 return -EINVAL;
1117
1118         cpus_write_lock();
1119
1120         cpuhp_tasks_frozen = tasks_frozen;
1121
1122         prev_state = cpuhp_set_state(st, target);
1123         /*
1124          * If the current CPU state is in the range of the AP hotplug thread,
1125          * then we need to kick the thread.
1126          */
1127         if (st->state > CPUHP_TEARDOWN_CPU) {
1128                 st->target = max((int)target, CPUHP_TEARDOWN_CPU);
1129                 ret = cpuhp_kick_ap_work(cpu);
1130                 /*
1131                  * The AP side has done the error rollback already. Just
1132                  * return the error code..
1133                  */
1134                 if (ret)
1135                         goto out;
1136
1137                 /*
1138                  * We might have stopped still in the range of the AP hotplug
1139                  * thread. Nothing to do anymore.
1140                  */
1141                 if (st->state > CPUHP_TEARDOWN_CPU)
1142                         goto out;
1143
1144                 st->target = target;
1145         }
1146         /*
1147          * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need
1148          * to do the further cleanups.
1149          */
1150         ret = cpuhp_down_callbacks(cpu, st, target);
1151         if (ret && st->state < prev_state) {
1152                 if (st->state == CPUHP_TEARDOWN_CPU) {
1153                         cpuhp_reset_state(st, prev_state);
1154                         __cpuhp_kick_ap(st);
1155                 } else {
1156                         WARN(1, "DEAD callback error for CPU%d", cpu);
1157                 }
1158         }
1159
1160 out:
1161         cpus_write_unlock();
1162         /*
1163          * Do post unplug cleanup. This is still protected against
1164          * concurrent CPU hotplug via cpu_add_remove_lock.
1165          */
1166         lockup_detector_cleanup();
1167         arch_smt_update();
1168         cpu_up_down_serialize_trainwrecks(tasks_frozen);
1169         return ret;
1170 }
1171
1172 static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
1173 {
1174         if (cpu_hotplug_disabled)
1175                 return -EBUSY;
1176         return _cpu_down(cpu, 0, target);
1177 }
1178
1179 static int cpu_down(unsigned int cpu, enum cpuhp_state target)
1180 {
1181         int err;
1182
1183         cpu_maps_update_begin();
1184         err = cpu_down_maps_locked(cpu, target);
1185         cpu_maps_update_done();
1186         return err;
1187 }
1188
1189 /**
1190  * cpu_device_down - Bring down a cpu device
1191  * @dev: Pointer to the cpu device to offline
1192  *
1193  * This function is meant to be used by device core cpu subsystem only.
1194  *
1195  * Other subsystems should use remove_cpu() instead.
1196  *
1197  * Return: %0 on success or a negative errno code
1198  */
1199 int cpu_device_down(struct device *dev)
1200 {
1201         return cpu_down(dev->id, CPUHP_OFFLINE);
1202 }
1203
1204 int remove_cpu(unsigned int cpu)
1205 {
1206         int ret;
1207
1208         lock_device_hotplug();
1209         ret = device_offline(get_cpu_device(cpu));
1210         unlock_device_hotplug();
1211
1212         return ret;
1213 }
1214 EXPORT_SYMBOL_GPL(remove_cpu);
1215
1216 void smp_shutdown_nonboot_cpus(unsigned int primary_cpu)
1217 {
1218         unsigned int cpu;
1219         int error;
1220
1221         cpu_maps_update_begin();
1222
1223         /*
1224          * Make certain the cpu I'm about to reboot on is online.
1225          *
1226          * This is inline to what migrate_to_reboot_cpu() already do.
1227          */
1228         if (!cpu_online(primary_cpu))
1229                 primary_cpu = cpumask_first(cpu_online_mask);
1230
1231         for_each_online_cpu(cpu) {
1232                 if (cpu == primary_cpu)
1233                         continue;
1234
1235                 error = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
1236                 if (error) {
1237                         pr_err("Failed to offline CPU%d - error=%d",
1238                                 cpu, error);
1239                         break;
1240                 }
1241         }
1242
1243         /*
1244          * Ensure all but the reboot CPU are offline.
1245          */
1246         BUG_ON(num_online_cpus() > 1);
1247
1248         /*
1249          * Make sure the CPUs won't be enabled by someone else after this
1250          * point. Kexec will reboot to a new kernel shortly resetting
1251          * everything along the way.
1252          */
1253         cpu_hotplug_disabled++;
1254
1255         cpu_maps_update_done();
1256 }
1257
1258 #else
1259 #define takedown_cpu            NULL
1260 #endif /*CONFIG_HOTPLUG_CPU*/
1261
1262 /**
1263  * notify_cpu_starting(cpu) - Invoke the callbacks on the starting CPU
1264  * @cpu: cpu that just started
1265  *
1266  * It must be called by the arch code on the new cpu, before the new cpu
1267  * enables interrupts and before the "boot" cpu returns from __cpu_up().
1268  */
1269 void notify_cpu_starting(unsigned int cpu)
1270 {
1271         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
1272         enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
1273         int ret;
1274
1275         rcu_cpu_starting(cpu);  /* Enables RCU usage on this CPU. */
1276         cpumask_set_cpu(cpu, &cpus_booted_once_mask);
1277         ret = cpuhp_invoke_callback_range(true, cpu, st, target);
1278
1279         /*
1280          * STARTING must not fail!
1281          */
1282         WARN_ON_ONCE(ret);
1283 }
1284
1285 /*
1286  * Called from the idle task. Wake up the controlling task which brings the
1287  * hotplug thread of the upcoming CPU up and then delegates the rest of the
1288  * online bringup to the hotplug thread.
1289  */
1290 void cpuhp_online_idle(enum cpuhp_state state)
1291 {
1292         struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
1293
1294         /* Happens for the boot cpu */
1295         if (state != CPUHP_AP_ONLINE_IDLE)
1296                 return;
1297
1298         /*
1299          * Unpart the stopper thread before we start the idle loop (and start
1300          * scheduling); this ensures the stopper task is always available.
1301          */
1302         stop_machine_unpark(smp_processor_id());
1303
1304         st->state = CPUHP_AP_ONLINE_IDLE;
1305         complete_ap_thread(st, true);
1306 }
1307
1308 /* Requires cpu_add_remove_lock to be held */
1309 static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
1310 {
1311         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
1312         struct task_struct *idle;
1313         int ret = 0;
1314
1315         cpus_write_lock();
1316
1317         if (!cpu_present(cpu)) {
1318                 ret = -EINVAL;
1319                 goto out;
1320         }
1321
1322         /*
1323          * The caller of cpu_up() might have raced with another
1324          * caller. Nothing to do.
1325          */
1326         if (st->state >= target)
1327                 goto out;
1328
1329         if (st->state == CPUHP_OFFLINE) {
1330                 /* Let it fail before we try to bring the cpu up */
1331                 idle = idle_thread_get(cpu);
1332                 if (IS_ERR(idle)) {
1333                         ret = PTR_ERR(idle);
1334                         goto out;
1335                 }
1336         }
1337
1338         cpuhp_tasks_frozen = tasks_frozen;
1339
1340         cpuhp_set_state(st, target);
1341         /*
1342          * If the current CPU state is in the range of the AP hotplug thread,
1343          * then we need to kick the thread once more.
1344          */
1345         if (st->state > CPUHP_BRINGUP_CPU) {
1346                 ret = cpuhp_kick_ap_work(cpu);
1347                 /*
1348                  * The AP side has done the error rollback already. Just
1349                  * return the error code..
1350                  */
1351                 if (ret)
1352                         goto out;
1353         }
1354
1355         /*
1356          * Try to reach the target state. We max out on the BP at
1357          * CPUHP_BRINGUP_CPU. After that the AP hotplug thread is
1358          * responsible for bringing it up to the target state.
1359          */
1360         target = min((int)target, CPUHP_BRINGUP_CPU);
1361         ret = cpuhp_up_callbacks(cpu, st, target);
1362 out:
1363         cpus_write_unlock();
1364         arch_smt_update();
1365         cpu_up_down_serialize_trainwrecks(tasks_frozen);
1366         return ret;
1367 }
1368
1369 static int cpu_up(unsigned int cpu, enum cpuhp_state target)
1370 {
1371         int err = 0;
1372
1373         if (!cpu_possible(cpu)) {
1374                 pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n",
1375                        cpu);
1376 #if defined(CONFIG_IA64)
1377                 pr_err("please check additional_cpus= boot parameter\n");
1378 #endif
1379                 return -EINVAL;
1380         }
1381
1382         err = try_online_node(cpu_to_node(cpu));
1383         if (err)
1384                 return err;
1385
1386         cpu_maps_update_begin();
1387
1388         if (cpu_hotplug_disabled) {
1389                 err = -EBUSY;
1390                 goto out;
1391         }
1392         if (!cpu_smt_allowed(cpu)) {
1393                 err = -EPERM;
1394                 goto out;
1395         }
1396
1397         err = _cpu_up(cpu, 0, target);
1398 out:
1399         cpu_maps_update_done();
1400         return err;
1401 }
1402
1403 /**
1404  * cpu_device_up - Bring up a cpu device
1405  * @dev: Pointer to the cpu device to online
1406  *
1407  * This function is meant to be used by device core cpu subsystem only.
1408  *
1409  * Other subsystems should use add_cpu() instead.
1410  *
1411  * Return: %0 on success or a negative errno code
1412  */
1413 int cpu_device_up(struct device *dev)
1414 {
1415         return cpu_up(dev->id, CPUHP_ONLINE);
1416 }
1417
1418 int add_cpu(unsigned int cpu)
1419 {
1420         int ret;
1421
1422         lock_device_hotplug();
1423         ret = device_online(get_cpu_device(cpu));
1424         unlock_device_hotplug();
1425
1426         return ret;
1427 }
1428 EXPORT_SYMBOL_GPL(add_cpu);
1429
1430 /**
1431  * bringup_hibernate_cpu - Bring up the CPU that we hibernated on
1432  * @sleep_cpu: The cpu we hibernated on and should be brought up.
1433  *
1434  * On some architectures like arm64, we can hibernate on any CPU, but on
1435  * wake up the CPU we hibernated on might be offline as a side effect of
1436  * using maxcpus= for example.
1437  *
1438  * Return: %0 on success or a negative errno code
1439  */
1440 int bringup_hibernate_cpu(unsigned int sleep_cpu)
1441 {
1442         int ret;
1443
1444         if (!cpu_online(sleep_cpu)) {
1445                 pr_info("Hibernated on a CPU that is offline! Bringing CPU up.\n");
1446                 ret = cpu_up(sleep_cpu, CPUHP_ONLINE);
1447                 if (ret) {
1448                         pr_err("Failed to bring hibernate-CPU up!\n");
1449                         return ret;
1450                 }
1451         }
1452         return 0;
1453 }
1454
1455 void bringup_nonboot_cpus(unsigned int setup_max_cpus)
1456 {
1457         unsigned int cpu;
1458
1459         for_each_present_cpu(cpu) {
1460                 if (num_online_cpus() >= setup_max_cpus)
1461                         break;
1462                 if (!cpu_online(cpu))
1463                         cpu_up(cpu, CPUHP_ONLINE);
1464         }
1465 }
1466
1467 #ifdef CONFIG_PM_SLEEP_SMP
1468 static cpumask_var_t frozen_cpus;
1469
1470 int freeze_secondary_cpus(int primary)
1471 {
1472         int cpu, error = 0;
1473
1474         cpu_maps_update_begin();
1475         if (primary == -1) {
1476                 primary = cpumask_first(cpu_online_mask);
1477                 if (!housekeeping_cpu(primary, HK_FLAG_TIMER))
1478                         primary = housekeeping_any_cpu(HK_FLAG_TIMER);
1479         } else {
1480                 if (!cpu_online(primary))
1481                         primary = cpumask_first(cpu_online_mask);
1482         }
1483
1484         /*
1485          * We take down all of the non-boot CPUs in one shot to avoid races
1486          * with the userspace trying to use the CPU hotplug at the same time
1487          */
1488         cpumask_clear(frozen_cpus);
1489
1490         pr_info("Disabling non-boot CPUs ...\n");
1491         for_each_online_cpu(cpu) {
1492                 if (cpu == primary)
1493                         continue;
1494
1495                 if (pm_wakeup_pending()) {
1496                         pr_info("Wakeup pending. Abort CPU freeze\n");
1497                         error = -EBUSY;
1498                         break;
1499                 }
1500
1501                 trace_suspend_resume(TPS("CPU_OFF"), cpu, true);
1502                 error = _cpu_down(cpu, 1, CPUHP_OFFLINE);
1503                 trace_suspend_resume(TPS("CPU_OFF"), cpu, false);
1504                 if (!error)
1505                         cpumask_set_cpu(cpu, frozen_cpus);
1506                 else {
1507                         pr_err("Error taking CPU%d down: %d\n", cpu, error);
1508                         break;
1509                 }
1510         }
1511
1512         if (!error)
1513                 BUG_ON(num_online_cpus() > 1);
1514         else
1515                 pr_err("Non-boot CPUs are not disabled\n");
1516
1517         /*
1518          * Make sure the CPUs won't be enabled by someone else. We need to do
1519          * this even in case of failure as all freeze_secondary_cpus() users are
1520          * supposed to do thaw_secondary_cpus() on the failure path.
1521          */
1522         cpu_hotplug_disabled++;
1523
1524         cpu_maps_update_done();
1525         return error;
1526 }
1527
1528 void __weak arch_thaw_secondary_cpus_begin(void)
1529 {
1530 }
1531
1532 void __weak arch_thaw_secondary_cpus_end(void)
1533 {
1534 }
1535
1536 void thaw_secondary_cpus(void)
1537 {
1538         int cpu, error;
1539
1540         /* Allow everyone to use the CPU hotplug again */
1541         cpu_maps_update_begin();
1542         __cpu_hotplug_enable();
1543         if (cpumask_empty(frozen_cpus))
1544                 goto out;
1545
1546         pr_info("Enabling non-boot CPUs ...\n");
1547
1548         arch_thaw_secondary_cpus_begin();
1549
1550         for_each_cpu(cpu, frozen_cpus) {
1551                 trace_suspend_resume(TPS("CPU_ON"), cpu, true);
1552                 error = _cpu_up(cpu, 1, CPUHP_ONLINE);
1553                 trace_suspend_resume(TPS("CPU_ON"), cpu, false);
1554                 if (!error) {
1555                         pr_info("CPU%d is up\n", cpu);
1556                         continue;
1557                 }
1558                 pr_warn("Error taking CPU%d up: %d\n", cpu, error);
1559         }
1560
1561         arch_thaw_secondary_cpus_end();
1562
1563         cpumask_clear(frozen_cpus);
1564 out:
1565         cpu_maps_update_done();
1566 }
1567
1568 static int __init alloc_frozen_cpus(void)
1569 {
1570         if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO))
1571                 return -ENOMEM;
1572         return 0;
1573 }
1574 core_initcall(alloc_frozen_cpus);
1575
1576 /*
1577  * When callbacks for CPU hotplug notifications are being executed, we must
1578  * ensure that the state of the system with respect to the tasks being frozen
1579  * or not, as reported by the notification, remains unchanged *throughout the
1580  * duration* of the execution of the callbacks.
1581  * Hence we need to prevent the freezer from racing with regular CPU hotplug.
1582  *
1583  * This synchronization is implemented by mutually excluding regular CPU
1584  * hotplug and Suspend/Hibernate call paths by hooking onto the Suspend/
1585  * Hibernate notifications.
1586  */
1587 static int
1588 cpu_hotplug_pm_callback(struct notifier_block *nb,
1589                         unsigned long action, void *ptr)
1590 {
1591         switch (action) {
1592
1593         case PM_SUSPEND_PREPARE:
1594         case PM_HIBERNATION_PREPARE:
1595                 cpu_hotplug_disable();
1596                 break;
1597
1598         case PM_POST_SUSPEND:
1599         case PM_POST_HIBERNATION:
1600                 cpu_hotplug_enable();
1601                 break;
1602
1603         default:
1604                 return NOTIFY_DONE;
1605         }
1606
1607         return NOTIFY_OK;
1608 }
1609
1610
1611 static int __init cpu_hotplug_pm_sync_init(void)
1612 {
1613         /*
1614          * cpu_hotplug_pm_callback has higher priority than x86
1615          * bsp_pm_callback which depends on cpu_hotplug_pm_callback
1616          * to disable cpu hotplug to avoid cpu hotplug race.
1617          */
1618         pm_notifier(cpu_hotplug_pm_callback, 0);
1619         return 0;
1620 }
1621 core_initcall(cpu_hotplug_pm_sync_init);
1622
1623 #endif /* CONFIG_PM_SLEEP_SMP */
1624
1625 int __boot_cpu_id;
1626
1627 #endif /* CONFIG_SMP */
1628
1629 /* Boot processor state steps */
1630 static struct cpuhp_step cpuhp_hp_states[] = {
1631         [CPUHP_OFFLINE] = {
1632                 .name                   = "offline",
1633                 .startup.single         = NULL,
1634                 .teardown.single        = NULL,
1635         },
1636 #ifdef CONFIG_SMP
1637         [CPUHP_CREATE_THREADS]= {
1638                 .name                   = "threads:prepare",
1639                 .startup.single         = smpboot_create_threads,
1640                 .teardown.single        = NULL,
1641                 .cant_stop              = true,
1642         },
1643         [CPUHP_PERF_PREPARE] = {
1644                 .name                   = "perf:prepare",
1645                 .startup.single         = perf_event_init_cpu,
1646                 .teardown.single        = perf_event_exit_cpu,
1647         },
1648         [CPUHP_WORKQUEUE_PREP] = {
1649                 .name                   = "workqueue:prepare",
1650                 .startup.single         = workqueue_prepare_cpu,
1651                 .teardown.single        = NULL,
1652         },
1653         [CPUHP_HRTIMERS_PREPARE] = {
1654                 .name                   = "hrtimers:prepare",
1655                 .startup.single         = hrtimers_prepare_cpu,
1656                 .teardown.single        = hrtimers_dead_cpu,
1657         },
1658         [CPUHP_SMPCFD_PREPARE] = {
1659                 .name                   = "smpcfd:prepare",
1660                 .startup.single         = smpcfd_prepare_cpu,
1661                 .teardown.single        = smpcfd_dead_cpu,
1662         },
1663         [CPUHP_RELAY_PREPARE] = {
1664                 .name                   = "relay:prepare",
1665                 .startup.single         = relay_prepare_cpu,
1666                 .teardown.single        = NULL,
1667         },
1668         [CPUHP_SLAB_PREPARE] = {
1669                 .name                   = "slab:prepare",
1670                 .startup.single         = slab_prepare_cpu,
1671                 .teardown.single        = slab_dead_cpu,
1672         },
1673         [CPUHP_RCUTREE_PREP] = {
1674                 .name                   = "RCU/tree:prepare",
1675                 .startup.single         = rcutree_prepare_cpu,
1676                 .teardown.single        = rcutree_dead_cpu,
1677         },
1678         /*
1679          * On the tear-down path, timers_dead_cpu() must be invoked
1680          * before blk_mq_queue_reinit_notify() from notify_dead(),
1681          * otherwise a RCU stall occurs.
1682          */
1683         [CPUHP_TIMERS_PREPARE] = {
1684                 .name                   = "timers:prepare",
1685                 .startup.single         = timers_prepare_cpu,
1686                 .teardown.single        = timers_dead_cpu,
1687         },
1688         /* Kicks the plugged cpu into life */
1689         [CPUHP_BRINGUP_CPU] = {
1690                 .name                   = "cpu:bringup",
1691                 .startup.single         = bringup_cpu,
1692                 .teardown.single        = finish_cpu,
1693                 .cant_stop              = true,
1694         },
1695         /* Final state before CPU kills itself */
1696         [CPUHP_AP_IDLE_DEAD] = {
1697                 .name                   = "idle:dead",
1698         },
1699         /*
1700          * Last state before CPU enters the idle loop to die. Transient state
1701          * for synchronization.
1702          */
1703         [CPUHP_AP_OFFLINE] = {
1704                 .name                   = "ap:offline",
1705                 .cant_stop              = true,
1706         },
1707         /* First state is scheduler control. Interrupts are disabled */
1708         [CPUHP_AP_SCHED_STARTING] = {
1709                 .name                   = "sched:starting",
1710                 .startup.single         = sched_cpu_starting,
1711                 .teardown.single        = sched_cpu_dying,
1712         },
1713         [CPUHP_AP_RCUTREE_DYING] = {
1714                 .name                   = "RCU/tree:dying",
1715                 .startup.single         = NULL,
1716                 .teardown.single        = rcutree_dying_cpu,
1717         },
1718         [CPUHP_AP_SMPCFD_DYING] = {
1719                 .name                   = "smpcfd:dying",
1720                 .startup.single         = NULL,
1721                 .teardown.single        = smpcfd_dying_cpu,
1722         },
1723         /* Entry state on starting. Interrupts enabled from here on. Transient
1724          * state for synchronsization */
1725         [CPUHP_AP_ONLINE] = {
1726                 .name                   = "ap:online",
1727         },
1728         /*
1729          * Handled on control processor until the plugged processor manages
1730          * this itself.
1731          */
1732         [CPUHP_TEARDOWN_CPU] = {
1733                 .name                   = "cpu:teardown",
1734                 .startup.single         = NULL,
1735                 .teardown.single        = takedown_cpu,
1736                 .cant_stop              = true,
1737         },
1738
1739         [CPUHP_AP_SCHED_WAIT_EMPTY] = {
1740                 .name                   = "sched:waitempty",
1741                 .startup.single         = NULL,
1742                 .teardown.single        = sched_cpu_wait_empty,
1743         },
1744
1745         /* Handle smpboot threads park/unpark */
1746         [CPUHP_AP_SMPBOOT_THREADS] = {
1747                 .name                   = "smpboot/threads:online",
1748                 .startup.single         = smpboot_unpark_threads,
1749                 .teardown.single        = smpboot_park_threads,
1750         },
1751         [CPUHP_AP_IRQ_AFFINITY_ONLINE] = {
1752                 .name                   = "irq/affinity:online",
1753                 .startup.single         = irq_affinity_online_cpu,
1754                 .teardown.single        = NULL,
1755         },
1756         [CPUHP_AP_PERF_ONLINE] = {
1757                 .name                   = "perf:online",
1758                 .startup.single         = perf_event_init_cpu,
1759                 .teardown.single        = perf_event_exit_cpu,
1760         },
1761         [CPUHP_AP_WATCHDOG_ONLINE] = {
1762                 .name                   = "lockup_detector:online",
1763                 .startup.single         = lockup_detector_online_cpu,
1764                 .teardown.single        = lockup_detector_offline_cpu,
1765         },
1766         [CPUHP_AP_WORKQUEUE_ONLINE] = {
1767                 .name                   = "workqueue:online",
1768                 .startup.single         = workqueue_online_cpu,
1769                 .teardown.single        = workqueue_offline_cpu,
1770         },
1771         [CPUHP_AP_RCUTREE_ONLINE] = {
1772                 .name                   = "RCU/tree:online",
1773                 .startup.single         = rcutree_online_cpu,
1774                 .teardown.single        = rcutree_offline_cpu,
1775         },
1776 #endif
1777         /*
1778          * The dynamically registered state space is here
1779          */
1780
1781 #ifdef CONFIG_SMP
1782         /* Last state is scheduler control setting the cpu active */
1783         [CPUHP_AP_ACTIVE] = {
1784                 .name                   = "sched:active",
1785                 .startup.single         = sched_cpu_activate,
1786                 .teardown.single        = sched_cpu_deactivate,
1787         },
1788 #endif
1789
1790         /* CPU is fully up and running. */
1791         [CPUHP_ONLINE] = {
1792                 .name                   = "online",
1793                 .startup.single         = NULL,
1794                 .teardown.single        = NULL,
1795         },
1796 };
1797
1798 /* Sanity check for callbacks */
1799 static int cpuhp_cb_check(enum cpuhp_state state)
1800 {
1801         if (state <= CPUHP_OFFLINE || state >= CPUHP_ONLINE)
1802                 return -EINVAL;
1803         return 0;
1804 }
1805
1806 /*
1807  * Returns a free for dynamic slot assignment of the Online state. The states
1808  * are protected by the cpuhp_slot_states mutex and an empty slot is identified
1809  * by having no name assigned.
1810  */
1811 static int cpuhp_reserve_state(enum cpuhp_state state)
1812 {
1813         enum cpuhp_state i, end;
1814         struct cpuhp_step *step;
1815
1816         switch (state) {
1817         case CPUHP_AP_ONLINE_DYN:
1818                 step = cpuhp_hp_states + CPUHP_AP_ONLINE_DYN;
1819                 end = CPUHP_AP_ONLINE_DYN_END;
1820                 break;
1821         case CPUHP_BP_PREPARE_DYN:
1822                 step = cpuhp_hp_states + CPUHP_BP_PREPARE_DYN;
1823                 end = CPUHP_BP_PREPARE_DYN_END;
1824                 break;
1825         default:
1826                 return -EINVAL;
1827         }
1828
1829         for (i = state; i <= end; i++, step++) {
1830                 if (!step->name)
1831                         return i;
1832         }
1833         WARN(1, "No more dynamic states available for CPU hotplug\n");
1834         return -ENOSPC;
1835 }
1836
1837 static int cpuhp_store_callbacks(enum cpuhp_state state, const char *name,
1838                                  int (*startup)(unsigned int cpu),
1839                                  int (*teardown)(unsigned int cpu),
1840                                  bool multi_instance)
1841 {
1842         /* (Un)Install the callbacks for further cpu hotplug operations */
1843         struct cpuhp_step *sp;
1844         int ret = 0;
1845
1846         /*
1847          * If name is NULL, then the state gets removed.
1848          *
1849          * CPUHP_AP_ONLINE_DYN and CPUHP_BP_PREPARE_DYN are handed out on
1850          * the first allocation from these dynamic ranges, so the removal
1851          * would trigger a new allocation and clear the wrong (already
1852          * empty) state, leaving the callbacks of the to be cleared state
1853          * dangling, which causes wreckage on the next hotplug operation.
1854          */
1855         if (name && (state == CPUHP_AP_ONLINE_DYN ||
1856                      state == CPUHP_BP_PREPARE_DYN)) {
1857                 ret = cpuhp_reserve_state(state);
1858                 if (ret < 0)
1859                         return ret;
1860                 state = ret;
1861         }
1862         sp = cpuhp_get_step(state);
1863         if (name && sp->name)
1864                 return -EBUSY;
1865
1866         sp->startup.single = startup;
1867         sp->teardown.single = teardown;
1868         sp->name = name;
1869         sp->multi_instance = multi_instance;
1870         INIT_HLIST_HEAD(&sp->list);
1871         return ret;
1872 }
1873
1874 static void *cpuhp_get_teardown_cb(enum cpuhp_state state)
1875 {
1876         return cpuhp_get_step(state)->teardown.single;
1877 }
1878
1879 /*
1880  * Call the startup/teardown function for a step either on the AP or
1881  * on the current CPU.
1882  */
1883 static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,
1884                             struct hlist_node *node)
1885 {
1886         struct cpuhp_step *sp = cpuhp_get_step(state);
1887         int ret;
1888
1889         /*
1890          * If there's nothing to do, we done.
1891          * Relies on the union for multi_instance.
1892          */
1893         if (cpuhp_step_empty(bringup, sp))
1894                 return 0;
1895         /*
1896          * The non AP bound callbacks can fail on bringup. On teardown
1897          * e.g. module removal we crash for now.
1898          */
1899 #ifdef CONFIG_SMP
1900         if (cpuhp_is_ap_state(state))
1901                 ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node);
1902         else
1903                 ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
1904 #else
1905         ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
1906 #endif
1907         BUG_ON(ret && !bringup);
1908         return ret;
1909 }
1910
1911 /*
1912  * Called from __cpuhp_setup_state on a recoverable failure.
1913  *
1914  * Note: The teardown callbacks for rollback are not allowed to fail!
1915  */
1916 static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state,
1917                                    struct hlist_node *node)
1918 {
1919         int cpu;
1920
1921         /* Roll back the already executed steps on the other cpus */
1922         for_each_present_cpu(cpu) {
1923                 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
1924                 int cpustate = st->state;
1925
1926                 if (cpu >= failedcpu)
1927                         break;
1928
1929                 /* Did we invoke the startup call on that cpu ? */
1930                 if (cpustate >= state)
1931                         cpuhp_issue_call(cpu, state, false, node);
1932         }
1933 }
1934
1935 int __cpuhp_state_add_instance_cpuslocked(enum cpuhp_state state,
1936                                           struct hlist_node *node,
1937                                           bool invoke)
1938 {
1939         struct cpuhp_step *sp;
1940         int cpu;
1941         int ret;
1942
1943         lockdep_assert_cpus_held();
1944
1945         sp = cpuhp_get_step(state);
1946         if (sp->multi_instance == false)
1947                 return -EINVAL;
1948
1949         mutex_lock(&cpuhp_state_mutex);
1950
1951         if (!invoke || !sp->startup.multi)
1952                 goto add_node;
1953
1954         /*
1955          * Try to call the startup callback for each present cpu
1956          * depending on the hotplug state of the cpu.
1957          */
1958         for_each_present_cpu(cpu) {
1959                 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
1960                 int cpustate = st->state;
1961
1962                 if (cpustate < state)
1963                         continue;
1964
1965                 ret = cpuhp_issue_call(cpu, state, true, node);
1966                 if (ret) {
1967                         if (sp->teardown.multi)
1968                                 cpuhp_rollback_install(cpu, state, node);
1969                         goto unlock;
1970                 }
1971         }
1972 add_node:
1973         ret = 0;
1974         hlist_add_head(node, &sp->list);
1975 unlock:
1976         mutex_unlock(&cpuhp_state_mutex);
1977         return ret;
1978 }
1979
1980 int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node,
1981                                bool invoke)
1982 {
1983         int ret;
1984
1985         cpus_read_lock();
1986         ret = __cpuhp_state_add_instance_cpuslocked(state, node, invoke);
1987         cpus_read_unlock();
1988         return ret;
1989 }
1990 EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance);
1991
1992 /**
1993  * __cpuhp_setup_state_cpuslocked - Setup the callbacks for an hotplug machine state
1994  * @state:              The state to setup
1995  * @name:               Name of the step
1996  * @invoke:             If true, the startup function is invoked for cpus where
1997  *                      cpu state >= @state
1998  * @startup:            startup callback function
1999  * @teardown:           teardown callback function
2000  * @multi_instance:     State is set up for multiple instances which get
2001  *                      added afterwards.
2002  *
2003  * The caller needs to hold cpus read locked while calling this function.
2004  * Return:
2005  *   On success:
2006  *      Positive state number if @state is CPUHP_AP_ONLINE_DYN;
2007  *      0 for all other states
2008  *   On failure: proper (negative) error code
2009  */
2010 int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state,
2011                                    const char *name, bool invoke,
2012                                    int (*startup)(unsigned int cpu),
2013                                    int (*teardown)(unsigned int cpu),
2014                                    bool multi_instance)
2015 {
2016         int cpu, ret = 0;
2017         bool dynstate;
2018
2019         lockdep_assert_cpus_held();
2020
2021         if (cpuhp_cb_check(state) || !name)
2022                 return -EINVAL;
2023
2024         mutex_lock(&cpuhp_state_mutex);
2025
2026         ret = cpuhp_store_callbacks(state, name, startup, teardown,
2027                                     multi_instance);
2028
2029         dynstate = state == CPUHP_AP_ONLINE_DYN;
2030         if (ret > 0 && dynstate) {
2031                 state = ret;
2032                 ret = 0;
2033         }
2034
2035         if (ret || !invoke || !startup)
2036                 goto out;
2037
2038         /*
2039          * Try to call the startup callback for each present cpu
2040          * depending on the hotplug state of the cpu.
2041          */
2042         for_each_present_cpu(cpu) {
2043                 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
2044                 int cpustate = st->state;
2045
2046                 if (cpustate < state)
2047                         continue;
2048
2049                 ret = cpuhp_issue_call(cpu, state, true, NULL);
2050                 if (ret) {
2051                         if (teardown)
2052                                 cpuhp_rollback_install(cpu, state, NULL);
2053                         cpuhp_store_callbacks(state, NULL, NULL, NULL, false);
2054                         goto out;
2055                 }
2056         }
2057 out:
2058         mutex_unlock(&cpuhp_state_mutex);
2059         /*
2060          * If the requested state is CPUHP_AP_ONLINE_DYN, return the
2061          * dynamically allocated state in case of success.
2062          */
2063         if (!ret && dynstate)
2064                 return state;
2065         return ret;
2066 }
2067 EXPORT_SYMBOL(__cpuhp_setup_state_cpuslocked);
2068
2069 int __cpuhp_setup_state(enum cpuhp_state state,
2070                         const char *name, bool invoke,
2071                         int (*startup)(unsigned int cpu),
2072                         int (*teardown)(unsigned int cpu),
2073                         bool multi_instance)
2074 {
2075         int ret;
2076
2077         cpus_read_lock();
2078         ret = __cpuhp_setup_state_cpuslocked(state, name, invoke, startup,
2079                                              teardown, multi_instance);
2080         cpus_read_unlock();
2081         return ret;
2082 }
2083 EXPORT_SYMBOL(__cpuhp_setup_state);
2084
2085 int __cpuhp_state_remove_instance(enum cpuhp_state state,
2086                                   struct hlist_node *node, bool invoke)
2087 {
2088         struct cpuhp_step *sp = cpuhp_get_step(state);
2089         int cpu;
2090
2091         BUG_ON(cpuhp_cb_check(state));
2092
2093         if (!sp->multi_instance)
2094                 return -EINVAL;
2095
2096         cpus_read_lock();
2097         mutex_lock(&cpuhp_state_mutex);
2098
2099         if (!invoke || !cpuhp_get_teardown_cb(state))
2100                 goto remove;
2101         /*
2102          * Call the teardown callback for each present cpu depending
2103          * on the hotplug state of the cpu. This function is not
2104          * allowed to fail currently!
2105          */
2106         for_each_present_cpu(cpu) {
2107                 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
2108                 int cpustate = st->state;
2109
2110                 if (cpustate >= state)
2111                         cpuhp_issue_call(cpu, state, false, node);
2112         }
2113
2114 remove:
2115         hlist_del(node);
2116         mutex_unlock(&cpuhp_state_mutex);
2117         cpus_read_unlock();
2118
2119         return 0;
2120 }
2121 EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance);
2122
2123 /**
2124  * __cpuhp_remove_state_cpuslocked - Remove the callbacks for an hotplug machine state
2125  * @state:      The state to remove
2126  * @invoke:     If true, the teardown function is invoked for cpus where
2127  *              cpu state >= @state
2128  *
2129  * The caller needs to hold cpus read locked while calling this function.
2130  * The teardown callback is currently not allowed to fail. Think
2131  * about module removal!
2132  */
2133 void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke)
2134 {
2135         struct cpuhp_step *sp = cpuhp_get_step(state);
2136         int cpu;
2137
2138         BUG_ON(cpuhp_cb_check(state));
2139
2140         lockdep_assert_cpus_held();
2141
2142         mutex_lock(&cpuhp_state_mutex);
2143         if (sp->multi_instance) {
2144                 WARN(!hlist_empty(&sp->list),
2145                      "Error: Removing state %d which has instances left.\n",
2146                      state);
2147                 goto remove;
2148         }
2149
2150         if (!invoke || !cpuhp_get_teardown_cb(state))
2151                 goto remove;
2152
2153         /*
2154          * Call the teardown callback for each present cpu depending
2155          * on the hotplug state of the cpu. This function is not
2156          * allowed to fail currently!
2157          */
2158         for_each_present_cpu(cpu) {
2159                 struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
2160                 int cpustate = st->state;
2161
2162                 if (cpustate >= state)
2163                         cpuhp_issue_call(cpu, state, false, NULL);
2164         }
2165 remove:
2166         cpuhp_store_callbacks(state, NULL, NULL, NULL, false);
2167         mutex_unlock(&cpuhp_state_mutex);
2168 }
2169 EXPORT_SYMBOL(__cpuhp_remove_state_cpuslocked);
2170
2171 void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
2172 {
2173         cpus_read_lock();
2174         __cpuhp_remove_state_cpuslocked(state, invoke);
2175         cpus_read_unlock();
2176 }
2177 EXPORT_SYMBOL(__cpuhp_remove_state);
2178
2179 #ifdef CONFIG_HOTPLUG_SMT
2180 static void cpuhp_offline_cpu_device(unsigned int cpu)
2181 {
2182         struct device *dev = get_cpu_device(cpu);
2183
2184         dev->offline = true;
2185         /* Tell user space about the state change */
2186         kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
2187 }
2188
2189 static void cpuhp_online_cpu_device(unsigned int cpu)
2190 {
2191         struct device *dev = get_cpu_device(cpu);
2192
2193         dev->offline = false;
2194         /* Tell user space about the state change */
2195         kobject_uevent(&dev->kobj, KOBJ_ONLINE);
2196 }
2197
2198 int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
2199 {
2200         int cpu, ret = 0;
2201
2202         cpu_maps_update_begin();
2203         for_each_online_cpu(cpu) {
2204                 if (topology_is_primary_thread(cpu))
2205                         continue;
2206                 ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
2207                 if (ret)
2208                         break;
2209                 /*
2210                  * As this needs to hold the cpu maps lock it's impossible
2211                  * to call device_offline() because that ends up calling
2212                  * cpu_down() which takes cpu maps lock. cpu maps lock
2213                  * needs to be held as this might race against in kernel
2214                  * abusers of the hotplug machinery (thermal management).
2215                  *
2216                  * So nothing would update device:offline state. That would
2217                  * leave the sysfs entry stale and prevent onlining after
2218                  * smt control has been changed to 'off' again. This is
2219                  * called under the sysfs hotplug lock, so it is properly
2220                  * serialized against the regular offline usage.
2221                  */
2222                 cpuhp_offline_cpu_device(cpu);
2223         }
2224         if (!ret)
2225                 cpu_smt_control = ctrlval;
2226         cpu_maps_update_done();
2227         return ret;
2228 }
2229
2230 int cpuhp_smt_enable(void)
2231 {
2232         int cpu, ret = 0;
2233
2234         cpu_maps_update_begin();
2235         cpu_smt_control = CPU_SMT_ENABLED;
2236         for_each_present_cpu(cpu) {
2237                 /* Skip online CPUs and CPUs on offline nodes */
2238                 if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
2239                         continue;
2240                 ret = _cpu_up(cpu, 0, CPUHP_ONLINE);
2241                 if (ret)
2242                         break;
2243                 /* See comment in cpuhp_smt_disable() */
2244                 cpuhp_online_cpu_device(cpu);
2245         }
2246         cpu_maps_update_done();
2247         return ret;
2248 }
2249 #endif
2250
2251 #if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU)
2252 static ssize_t show_cpuhp_state(struct device *dev,
2253                                 struct device_attribute *attr, char *buf)
2254 {
2255         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
2256
2257         return sprintf(buf, "%d\n", st->state);
2258 }
2259 static DEVICE_ATTR(state, 0444, show_cpuhp_state, NULL);
2260
2261 static ssize_t write_cpuhp_target(struct device *dev,
2262                                   struct device_attribute *attr,
2263                                   const char *buf, size_t count)
2264 {
2265         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
2266         struct cpuhp_step *sp;
2267         int target, ret;
2268
2269         ret = kstrtoint(buf, 10, &target);
2270         if (ret)
2271                 return ret;
2272
2273 #ifdef CONFIG_CPU_HOTPLUG_STATE_CONTROL
2274         if (target < CPUHP_OFFLINE || target > CPUHP_ONLINE)
2275                 return -EINVAL;
2276 #else
2277         if (target != CPUHP_OFFLINE && target != CPUHP_ONLINE)
2278                 return -EINVAL;
2279 #endif
2280
2281         ret = lock_device_hotplug_sysfs();
2282         if (ret)
2283                 return ret;
2284
2285         mutex_lock(&cpuhp_state_mutex);
2286         sp = cpuhp_get_step(target);
2287         ret = !sp->name || sp->cant_stop ? -EINVAL : 0;
2288         mutex_unlock(&cpuhp_state_mutex);
2289         if (ret)
2290                 goto out;
2291
2292         if (st->state < target)
2293                 ret = cpu_up(dev->id, target);
2294         else
2295                 ret = cpu_down(dev->id, target);
2296 out:
2297         unlock_device_hotplug();
2298         return ret ? ret : count;
2299 }
2300
2301 static ssize_t show_cpuhp_target(struct device *dev,
2302                                  struct device_attribute *attr, char *buf)
2303 {
2304         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
2305
2306         return sprintf(buf, "%d\n", st->target);
2307 }
2308 static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target);
2309
2310
2311 static ssize_t write_cpuhp_fail(struct device *dev,
2312                                 struct device_attribute *attr,
2313                                 const char *buf, size_t count)
2314 {
2315         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
2316         struct cpuhp_step *sp;
2317         int fail, ret;
2318
2319         ret = kstrtoint(buf, 10, &fail);
2320         if (ret)
2321                 return ret;
2322
2323         if (fail == CPUHP_INVALID) {
2324                 st->fail = fail;
2325                 return count;
2326         }
2327
2328         if (fail < CPUHP_OFFLINE || fail > CPUHP_ONLINE)
2329                 return -EINVAL;
2330
2331         /*
2332          * Cannot fail STARTING/DYING callbacks.
2333          */
2334         if (cpuhp_is_atomic_state(fail))
2335                 return -EINVAL;
2336
2337         /*
2338          * DEAD callbacks cannot fail...
2339          * ... neither can CPUHP_BRINGUP_CPU during hotunplug. The latter
2340          * triggering STARTING callbacks, a failure in this state would
2341          * hinder rollback.
2342          */
2343         if (fail <= CPUHP_BRINGUP_CPU && st->state > CPUHP_BRINGUP_CPU)
2344                 return -EINVAL;
2345
2346         /*
2347          * Cannot fail anything that doesn't have callbacks.
2348          */
2349         mutex_lock(&cpuhp_state_mutex);
2350         sp = cpuhp_get_step(fail);
2351         if (!sp->startup.single && !sp->teardown.single)
2352                 ret = -EINVAL;
2353         mutex_unlock(&cpuhp_state_mutex);
2354         if (ret)
2355                 return ret;
2356
2357         st->fail = fail;
2358
2359         return count;
2360 }
2361
2362 static ssize_t show_cpuhp_fail(struct device *dev,
2363                                struct device_attribute *attr, char *buf)
2364 {
2365         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
2366
2367         return sprintf(buf, "%d\n", st->fail);
2368 }
2369
2370 static DEVICE_ATTR(fail, 0644, show_cpuhp_fail, write_cpuhp_fail);
2371
2372 static struct attribute *cpuhp_cpu_attrs[] = {
2373         &dev_attr_state.attr,
2374         &dev_attr_target.attr,
2375         &dev_attr_fail.attr,
2376         NULL
2377 };
2378
2379 static const struct attribute_group cpuhp_cpu_attr_group = {
2380         .attrs = cpuhp_cpu_attrs,
2381         .name = "hotplug",
2382         NULL
2383 };
2384
2385 static ssize_t show_cpuhp_states(struct device *dev,
2386                                  struct device_attribute *attr, char *buf)
2387 {
2388         ssize_t cur, res = 0;
2389         int i;
2390
2391         mutex_lock(&cpuhp_state_mutex);
2392         for (i = CPUHP_OFFLINE; i <= CPUHP_ONLINE; i++) {
2393                 struct cpuhp_step *sp = cpuhp_get_step(i);
2394
2395                 if (sp->name) {
2396                         cur = sprintf(buf, "%3d: %s\n", i, sp->name);
2397                         buf += cur;
2398                         res += cur;
2399                 }
2400         }
2401         mutex_unlock(&cpuhp_state_mutex);
2402         return res;
2403 }
2404 static DEVICE_ATTR(states, 0444, show_cpuhp_states, NULL);
2405
2406 static struct attribute *cpuhp_cpu_root_attrs[] = {
2407         &dev_attr_states.attr,
2408         NULL
2409 };
2410
2411 static const struct attribute_group cpuhp_cpu_root_attr_group = {
2412         .attrs = cpuhp_cpu_root_attrs,
2413         .name = "hotplug",
2414         NULL
2415 };
2416
2417 #ifdef CONFIG_HOTPLUG_SMT
2418
2419 static ssize_t
2420 __store_smt_control(struct device *dev, struct device_attribute *attr,
2421                     const char *buf, size_t count)
2422 {
2423         int ctrlval, ret;
2424
2425         if (sysfs_streq(buf, "on"))
2426                 ctrlval = CPU_SMT_ENABLED;
2427         else if (sysfs_streq(buf, "off"))
2428                 ctrlval = CPU_SMT_DISABLED;
2429         else if (sysfs_streq(buf, "forceoff"))
2430                 ctrlval = CPU_SMT_FORCE_DISABLED;
2431         else
2432                 return -EINVAL;
2433
2434         if (cpu_smt_control == CPU_SMT_FORCE_DISABLED)
2435                 return -EPERM;
2436
2437         if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
2438                 return -ENODEV;
2439
2440         ret = lock_device_hotplug_sysfs();
2441         if (ret)
2442                 return ret;
2443
2444         if (ctrlval != cpu_smt_control) {
2445                 switch (ctrlval) {
2446                 case CPU_SMT_ENABLED:
2447                         ret = cpuhp_smt_enable();
2448                         break;
2449                 case CPU_SMT_DISABLED:
2450                 case CPU_SMT_FORCE_DISABLED:
2451                         ret = cpuhp_smt_disable(ctrlval);
2452                         break;
2453                 }
2454         }
2455
2456         unlock_device_hotplug();
2457         return ret ? ret : count;
2458 }
2459
2460 #else /* !CONFIG_HOTPLUG_SMT */
2461 static ssize_t
2462 __store_smt_control(struct device *dev, struct device_attribute *attr,
2463                     const char *buf, size_t count)
2464 {
2465         return -ENODEV;
2466 }
2467 #endif /* CONFIG_HOTPLUG_SMT */
2468
2469 static const char *smt_states[] = {
2470         [CPU_SMT_ENABLED]               = "on",
2471         [CPU_SMT_DISABLED]              = "off",
2472         [CPU_SMT_FORCE_DISABLED]        = "forceoff",
2473         [CPU_SMT_NOT_SUPPORTED]         = "notsupported",
2474         [CPU_SMT_NOT_IMPLEMENTED]       = "notimplemented",
2475 };
2476
2477 static ssize_t
2478 show_smt_control(struct device *dev, struct device_attribute *attr, char *buf)
2479 {
2480         const char *state = smt_states[cpu_smt_control];
2481
2482         return snprintf(buf, PAGE_SIZE - 2, "%s\n", state);
2483 }
2484
2485 static ssize_t
2486 store_smt_control(struct device *dev, struct device_attribute *attr,
2487                   const char *buf, size_t count)
2488 {
2489         return __store_smt_control(dev, attr, buf, count);
2490 }
2491 static DEVICE_ATTR(control, 0644, show_smt_control, store_smt_control);
2492
2493 static ssize_t
2494 show_smt_active(struct device *dev, struct device_attribute *attr, char *buf)
2495 {
2496         return snprintf(buf, PAGE_SIZE - 2, "%d\n", sched_smt_active());
2497 }
2498 static DEVICE_ATTR(active, 0444, show_smt_active, NULL);
2499
2500 static struct attribute *cpuhp_smt_attrs[] = {
2501         &dev_attr_control.attr,
2502         &dev_attr_active.attr,
2503         NULL
2504 };
2505
2506 static const struct attribute_group cpuhp_smt_attr_group = {
2507         .attrs = cpuhp_smt_attrs,
2508         .name = "smt",
2509         NULL
2510 };
2511
2512 static int __init cpu_smt_sysfs_init(void)
2513 {
2514         return sysfs_create_group(&cpu_subsys.dev_root->kobj,
2515                                   &cpuhp_smt_attr_group);
2516 }
2517
2518 static int __init cpuhp_sysfs_init(void)
2519 {
2520         int cpu, ret;
2521
2522         ret = cpu_smt_sysfs_init();
2523         if (ret)
2524                 return ret;
2525
2526         ret = sysfs_create_group(&cpu_subsys.dev_root->kobj,
2527                                  &cpuhp_cpu_root_attr_group);
2528         if (ret)
2529                 return ret;
2530
2531         for_each_possible_cpu(cpu) {
2532                 struct device *dev = get_cpu_device(cpu);
2533
2534                 if (!dev)
2535                         continue;
2536                 ret = sysfs_create_group(&dev->kobj, &cpuhp_cpu_attr_group);
2537                 if (ret)
2538                         return ret;
2539         }
2540         return 0;
2541 }
2542 device_initcall(cpuhp_sysfs_init);
2543 #endif /* CONFIG_SYSFS && CONFIG_HOTPLUG_CPU */
2544
2545 /*
2546  * cpu_bit_bitmap[] is a special, "compressed" data structure that
2547  * represents all NR_CPUS bits binary values of 1<<nr.
2548  *
2549  * It is used by cpumask_of() to get a constant address to a CPU
2550  * mask value that has a single bit set only.
2551  */
2552
2553 /* cpu_bit_bitmap[0] is empty - so we can back into it */
2554 #define MASK_DECLARE_1(x)       [x+1][0] = (1UL << (x))
2555 #define MASK_DECLARE_2(x)       MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
2556 #define MASK_DECLARE_4(x)       MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
2557 #define MASK_DECLARE_8(x)       MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
2558
2559 const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {
2560
2561         MASK_DECLARE_8(0),      MASK_DECLARE_8(8),
2562         MASK_DECLARE_8(16),     MASK_DECLARE_8(24),
2563 #if BITS_PER_LONG > 32
2564         MASK_DECLARE_8(32),     MASK_DECLARE_8(40),
2565         MASK_DECLARE_8(48),     MASK_DECLARE_8(56),
2566 #endif
2567 };
2568 EXPORT_SYMBOL_GPL(cpu_bit_bitmap);
2569
2570 const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
2571 EXPORT_SYMBOL(cpu_all_bits);
2572
2573 #ifdef CONFIG_INIT_ALL_POSSIBLE
2574 struct cpumask __cpu_possible_mask __read_mostly
2575         = {CPU_BITS_ALL};
2576 #else
2577 struct cpumask __cpu_possible_mask __read_mostly;
2578 #endif
2579 EXPORT_SYMBOL(__cpu_possible_mask);
2580
2581 struct cpumask __cpu_online_mask __read_mostly;
2582 EXPORT_SYMBOL(__cpu_online_mask);
2583
2584 struct cpumask __cpu_present_mask __read_mostly;
2585 EXPORT_SYMBOL(__cpu_present_mask);
2586
2587 struct cpumask __cpu_active_mask __read_mostly;
2588 EXPORT_SYMBOL(__cpu_active_mask);
2589
2590 struct cpumask __cpu_dying_mask __read_mostly;
2591 EXPORT_SYMBOL(__cpu_dying_mask);
2592
2593 atomic_t __num_online_cpus __read_mostly;
2594 EXPORT_SYMBOL(__num_online_cpus);
2595
2596 void init_cpu_present(const struct cpumask *src)
2597 {
2598         cpumask_copy(&__cpu_present_mask, src);
2599 }
2600
2601 void init_cpu_possible(const struct cpumask *src)
2602 {
2603         cpumask_copy(&__cpu_possible_mask, src);
2604 }
2605
2606 void init_cpu_online(const struct cpumask *src)
2607 {
2608         cpumask_copy(&__cpu_online_mask, src);
2609 }
2610
2611 void set_cpu_online(unsigned int cpu, bool online)
2612 {
2613         /*
2614          * atomic_inc/dec() is required to handle the horrid abuse of this
2615          * function by the reboot and kexec code which invoke it from
2616          * IPI/NMI broadcasts when shutting down CPUs. Invocation from
2617          * regular CPU hotplug is properly serialized.
2618          *
2619          * Note, that the fact that __num_online_cpus is of type atomic_t
2620          * does not protect readers which are not serialized against
2621          * concurrent hotplug operations.
2622          */
2623         if (online) {
2624                 if (!cpumask_test_and_set_cpu(cpu, &__cpu_online_mask))
2625                         atomic_inc(&__num_online_cpus);
2626         } else {
2627                 if (cpumask_test_and_clear_cpu(cpu, &__cpu_online_mask))
2628                         atomic_dec(&__num_online_cpus);
2629         }
2630 }
2631
2632 /*
2633  * Activate the first processor.
2634  */
2635 void __init boot_cpu_init(void)
2636 {
2637         int cpu = smp_processor_id();
2638
2639         /* Mark the boot cpu "present", "online" etc for SMP and UP case */
2640         set_cpu_online(cpu, true);
2641         set_cpu_active(cpu, true);
2642         set_cpu_present(cpu, true);
2643         set_cpu_possible(cpu, true);
2644
2645 #ifdef CONFIG_SMP
2646         __boot_cpu_id = cpu;
2647 #endif
2648 }
2649
2650 /*
2651  * Must be called _AFTER_ setting up the per_cpu areas
2652  */
2653 void __init boot_cpu_hotplug_init(void)
2654 {
2655 #ifdef CONFIG_SMP
2656         cpumask_set_cpu(smp_processor_id(), &cpus_booted_once_mask);
2657 #endif
2658         this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
2659 }
2660
2661 /*
2662  * These are used for a global "mitigations=" cmdline option for toggling
2663  * optional CPU mitigations.
2664  */
2665 enum cpu_mitigations {
2666         CPU_MITIGATIONS_OFF,
2667         CPU_MITIGATIONS_AUTO,
2668         CPU_MITIGATIONS_AUTO_NOSMT,
2669 };
2670
2671 static enum cpu_mitigations cpu_mitigations __ro_after_init =
2672         CPU_MITIGATIONS_AUTO;
2673
2674 static int __init mitigations_parse_cmdline(char *arg)
2675 {
2676         if (!strcmp(arg, "off"))
2677                 cpu_mitigations = CPU_MITIGATIONS_OFF;
2678         else if (!strcmp(arg, "auto"))
2679                 cpu_mitigations = CPU_MITIGATIONS_AUTO;
2680         else if (!strcmp(arg, "auto,nosmt"))
2681                 cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT;
2682         else
2683                 pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n",
2684                         arg);
2685
2686         return 0;
2687 }
2688 early_param("mitigations", mitigations_parse_cmdline);
2689
2690 /* mitigations=off */
2691 bool cpu_mitigations_off(void)
2692 {
2693         return cpu_mitigations == CPU_MITIGATIONS_OFF;
2694 }
2695 EXPORT_SYMBOL_GPL(cpu_mitigations_off);
2696
2697 /* mitigations=auto,nosmt */
2698 bool cpu_mitigations_auto_nosmt(void)
2699 {
2700         return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT;
2701 }
2702 EXPORT_SYMBOL_GPL(cpu_mitigations_auto_nosmt);