kernel/smp.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Generic helpers for smp ipi calls
   4  *
   5  * (C) Jens Axboe <jens.axboe@oracle.com> 2008
   6  */
   7
   8 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   9
  10 #include <linux/irq_work.h>
  11 #include <linux/rcupdate.h>
  12 #include <linux/rculist.h>
  13 #include <linux/kernel.h>
  14 #include <linux/export.h>
  15 #include <linux/percpu.h>
  16 #include <linux/init.h>
  17 #include <linux/interrupt.h>
  18 #include <linux/gfp.h>
  19 #include <linux/smp.h>
  20 #include <linux/cpu.h>
  21 #include <linux/sched.h>
  22 #include <linux/sched/idle.h>
  23 #include <linux/hypervisor.h>
  24 #include <linux/sched/clock.h>
  25 #include <linux/nmi.h>
  26 #include <linux/sched/debug.h>
  27 #include <linux/jump_label.h>
  28
  29 #include <trace/events/ipi.h>
  30 #define CREATE_TRACE_POINTS
  31 #include <trace/events/csd.h>
  32 #undef CREATE_TRACE_POINTS
  33
  34 #include "smpboot.h"
  35 #include "sched/smp.h"
  36
     /* Extract the type bits of a csd's flags word (u_flags & CSD_FLAG_TYPE_MASK). */
  37 #define CSD_TYPE(_csd)  ((_csd)->node.u_flags & CSD_FLAG_TYPE_MASK)
  38
     /*
      * Per-CPU bookkeeping for calls that target several CPUs at once.
      * Allocated by smpcfd_prepare_cpu() and released by smpcfd_dead_cpu().
      */
  39 struct call_function_data {
             /* Per-CPU csd storage from alloc_percpu(); looked up per target CPU. */
  40         call_single_data_t      __percpu *csd;
             /*
              * Online CPUs of the caller's mask, minus the sending CPU and
              * minus any CPU rejected by the caller's cond_func.
              */
  41         cpumask_var_t           cpumask;
             /* Targets for which llist_add() returned true; these get the IPI. */
  42         cpumask_var_t           cpumask_ipi;
  43 };
  44
     /* Per-CPU call_function_data used by smp_call_function_many_cond(). */
  45 static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data);
  46
     /* Per-CPU llist of csd nodes queued to run on that CPU. */
  47 static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
  48
     /* Forward declaration: smpcfd_dying_cpu() calls this before its definition. */
  49 static void __flush_smp_call_function_queue(bool warn_cpu_offline);
  50
  51 int smpcfd_prepare_cpu(unsigned int cpu)
  52 {
  53         struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
  54
  55         if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
  56                                      cpu_to_node(cpu)))
  57                 return -ENOMEM;
  58         if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
  59                                      cpu_to_node(cpu))) {
  60                 free_cpumask_var(cfd->cpumask);
  61                 return -ENOMEM;
  62         }
  63         cfd->csd = alloc_percpu(call_single_data_t);
  64         if (!cfd->csd) {
  65                 free_cpumask_var(cfd->cpumask);
  66                 free_cpumask_var(cfd->cpumask_ipi);
  67                 return -ENOMEM;
  68         }
  69
  70         return 0;
  71 }
  72
  73 int smpcfd_dead_cpu(unsigned int cpu)
  74 {
  75         struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
  76
  77         free_cpumask_var(cfd->cpumask);
  78         free_cpumask_var(cfd->cpumask_ipi);
  79         free_percpu(cfd->csd);
  80         return 0;
  81 }
  82
  83 int smpcfd_dying_cpu(unsigned int cpu)
  84 {
  85         /*
  86          * The IPIs for the smp-call-function callbacks queued by other
  87          * CPUs might arrive late, either due to hardware latencies or
  88          * because this CPU disabled interrupts (inside stop-machine)
  89          * before the IPIs were sent. So flush out any pending callbacks
  90          * explicitly (without waiting for the IPIs to arrive), to
  91          * ensure that the outgoing CPU doesn't go offline with work
  92          * still pending.
  93          */
  94         __flush_smp_call_function_queue(false);
  95         irq_work_run();
  96         return 0;
  97 }
  98
  99 void __init call_function_init(void)
 100 {
 101         int i;
 102
 103         for_each_possible_cpu(i)
 104                 init_llist_head(&per_cpu(call_single_queue, i));
 105
 106         smpcfd_prepare_cpu(smp_processor_id());
 107 }
 108
 109 static __always_inline void
 110 send_call_function_single_ipi(int cpu)
 111 {
 112         if (call_function_single_prep_ipi(cpu)) {
 113                 trace_ipi_send_cpu(cpu, _RET_IP_,
 114                                    generic_smp_call_function_single_interrupt);
 115                 arch_send_call_function_single_ipi(cpu);
 116         }
 117 }
 118
 119 static __always_inline void
 120 send_call_function_ipi_mask(struct cpumask *mask)
 121 {
 122         trace_ipi_send_cpumask(mask, _RET_IP_,
 123                                generic_smp_call_function_single_interrupt);
 124         arch_send_call_function_ipi_mask(mask);
 125 }
 126
 127 static __always_inline void
 128 csd_do_func(smp_call_func_t func, void *info, struct __call_single_data *csd)
 129 {
 130         trace_csd_function_entry(func, csd);
 131         func(info);
 132         trace_csd_function_exit(func, csd);
 133 }
 134
 135 #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
 136
 137 static DEFINE_STATIC_KEY_MAYBE(CONFIG_CSD_LOCK_WAIT_DEBUG_DEFAULT, csdlock_debug_enabled);
 138
 139 /*
 140  * Parse the csdlock_debug= kernel boot parameter.
 141  *
 142  * If you need to restore the old "ext" value that once provided
 143  * additional debugging information, reapply the following commits:
 144  *
 145  * de7b09ef658d ("locking/csd_lock: Prepare more CSD lock debugging")
 146  * a5aabace5fb8 ("locking/csd_lock: Add more data to CSD lock debugging")
 147  */
 148 static int __init csdlock_debug(char *str)
 149 {
 150         int ret;
 151         unsigned int val = 0;
 152
 153         ret = get_option(&str, &val);
 154         if (ret) {
 155                 if (val)
 156                         static_branch_enable(&csdlock_debug_enabled);
 157                 else
 158                         static_branch_disable(&csdlock_debug_enabled);
 159         }
 160
 161         return 1;
 162 }
 163 __setup("csdlock_debug=", csdlock_debug);
 164
     /* What each CPU is currently handling: the csd plus its function and argument. */
 165 static DEFINE_PER_CPU(call_single_data_t *, cur_csd);
 166 static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func);
 167 static DEFINE_PER_CPU(void *, cur_csd_info);
 168
 169 static ulong csd_lock_timeout = 5000;  /* CSD lock timeout in milliseconds. */
 170 module_param(csd_lock_timeout, ulong, 0444);
 171
     /* Counter from which csd_lock_wait_toolong() draws the "#N" ids it prints. */
 172 static atomic_t csd_bug_count = ATOMIC_INIT(0);
 173
 174 /* Record current CSD work for current CPU, NULL to erase. */
     /*
      * For a non-NULL @csd the function and its argument are written before
      * the csd pointer itself (separated by smp_wmb()), pairing with the
      * smp_load_acquire() of cur_csd in csd_lock_wait_toolong(), which reads
      * cur_csd_func/cur_csd_info only after it has loaded cur_csd.
      */
 175 static void __csd_lock_record(struct __call_single_data *csd)
 176 {
 177         if (!csd) {
 178                 smp_mb(); /* NULL cur_csd after unlock. */
 179                 __this_cpu_write(cur_csd, NULL);
 180                 return;
 181         }
 182         __this_cpu_write(cur_csd_func, csd->func);
 183         __this_cpu_write(cur_csd_info, csd->info);
 184         smp_wmb(); /* func and info before csd. */
 185         __this_cpu_write(cur_csd, csd);
 186         smp_mb(); /* Update cur_csd before function call. */
 187                   /* Or before unlock, as the case may be. */
 188 }
 189
 190 static __always_inline void csd_lock_record(struct __call_single_data *csd)
 191 {
 192         if (static_branch_unlikely(&csdlock_debug_enabled))
 193                 __csd_lock_record(csd);
 194 }
 195
 196 static int csd_lock_wait_getcpu(struct __call_single_data *csd)
 197 {
 198         unsigned int csd_type;
 199
 200         csd_type = CSD_TYPE(csd);
 201         if (csd_type == CSD_TYPE_ASYNC || csd_type == CSD_TYPE_SYNC)
 202                 return csd->node.dst; /* Other CSD_TYPE_ values might not have ->dst. */
 203         return -1;
 204 }
 205
 206 /*
 207  * Complain if too much time spent waiting.  Note that only
 208  * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU,
 209  * so waiting on other types gets much less information.
      *
      * @csd:    the csd whose CSD_FLAG_LOCK is being waited on
      * @ts0:    sched_clock() timestamp taken when the wait began
      * @ts1:    timestamp of the last complaint (initially the start of the
      *          wait); advanced to "now" every time a complaint is printed
      * @bug_id: 0 until the first complaint, then the id used in all messages
      *
      * Returns true once the lock has been released, so the caller can stop
      * waiting, and false while the caller has to keep spinning.
 210  */
 211 static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *ts1, int *bug_id)
 212 {
 213         int cpu = -1;
 214         int cpux;
 215         bool firsttime;
 216         u64 ts2, ts_delta;
 217         call_single_data_t *cpu_cur_csd;
 218         unsigned int flags = READ_ONCE(csd->node.u_flags);
 219         unsigned long long csd_lock_timeout_ns = csd_lock_timeout * NSEC_PER_MSEC;
 220
             /* Lock released: done. Only announce it if we complained earlier. */
 221         if (!(flags & CSD_FLAG_LOCK)) {
 222                 if (!unlikely(*bug_id))
 223                         return true;
 224                 cpu = csd_lock_wait_getcpu(csd);
 225                 pr_alert("csd: CSD lock (#%d) got unstuck on CPU#%02d, CPU#%02d released the lock.\n",
 226                          *bug_id, raw_smp_processor_id(), cpu);
 227                 return true;
 228         }
 229
             /*
              * Still locked: stay quiet until a full timeout has elapsed since
              * *ts1. A timeout of 0 disables the complaints altogether.
              */
 230         ts2 = sched_clock();
 231         ts_delta = ts2 - *ts1;
 232         if (likely(ts_delta <= csd_lock_timeout_ns || csd_lock_timeout_ns == 0))
 233                 return false;
 234
             /* First complaint for this wait: allocate a fresh bug id. */
 235         firsttime = !*bug_id;
 236         if (firsttime)
 237                 *bug_id = atomic_inc_return(&csd_bug_count);
 238         cpu = csd_lock_wait_getcpu(csd);
             /* cpux is a valid per_cpu() index even if the recorded CPU is not. */
 239         if (WARN_ONCE(cpu < 0 || cpu >= nr_cpu_ids, "%s: cpu = %d\n", __func__, cpu))
 240                 cpux = 0;
 241         else
 242                 cpux = cpu;
 243         cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */
 244         pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %llu ns for CPU#%02d %pS(%ps).\n",
 245                  firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts2 - ts0,
 246                  cpu, csd->func, csd->info);
             /* Say whether the target is busy with another csd, with this one, or with none. */
 247         if (cpu_cur_csd && csd != cpu_cur_csd) {
 248                 pr_alert("\tcsd: CSD lock (#%d) handling prior %pS(%ps) request.\n",
 249                          *bug_id, READ_ONCE(per_cpu(cur_csd_func, cpux)),
 250                          READ_ONCE(per_cpu(cur_csd_info, cpux)));
 251         } else {
 252                 pr_alert("\tcsd: CSD lock (#%d) %s.\n",
 253                          *bug_id, !cpu_cur_csd ? "unresponsive" : "handling this request");
 254         }
             /*
              * Dump the target CPU's task and, if that CPU has no csd recorded
              * as in progress, send the IPI once more.
              */
 255         if (cpu >= 0) {
 256                 dump_cpu_task(cpu);
 257                 if (!cpu_cur_csd) {
 258                         pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu);
 259                         arch_send_call_function_single_ipi(cpu);
 260                 }
 261         }
 262         dump_stack();
             /* Restart the timeout so the next complaint comes one period from now. */
 263         *ts1 = ts2;
 264
 265         return false;
 266 }
 267
 268 /*
 269  * csd_lock/csd_unlock used to serialize access to per-cpu csd resources
 270  *
 271  * For non-synchronous ipi calls the csd can still be in use by the
 272  * previous function call. For multi-cpu calls its even more interesting
 273  * as we'll have to ensure no other cpu is observing our csd.
 274  */
 275 static void __csd_lock_wait(struct __call_single_data *csd)
 276 {
 277         int bug_id = 0;
 278         u64 ts0, ts1;
 279
 280         ts1 = ts0 = sched_clock();
 281         for (;;) {
 282                 if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id))
 283                         break;
 284                 cpu_relax();
 285         }
 286         smp_acquire__after_ctrl_dep();
 287 }
 288
 289 static __always_inline void csd_lock_wait(struct __call_single_data *csd)
 290 {
 291         if (static_branch_unlikely(&csdlock_debug_enabled)) {
 292                 __csd_lock_wait(csd);
 293                 return;
 294         }
 295
 296         smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
 297 }
 298 #else
     /* CONFIG_CSD_LOCK_WAIT_DEBUG is off: recording the current csd is a no-op. */
 299 static void csd_lock_record(struct __call_single_data *csd)
 300 {
 301 }
 302
     /* Spin, with acquire ordering, until CSD_FLAG_LOCK is clear in @csd's flags. */
 303 static __always_inline void csd_lock_wait(struct __call_single_data *csd)
 304 {
 305         smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
 306 }
 307 #endif
 308
     /*
      * Take ownership of @csd: wait for the previous user to release
      * CSD_FLAG_LOCK, then set the flag again before the caller fills in the
      * other fields.
      */
 309 static __always_inline void csd_lock(struct __call_single_data *csd)
 310 {
 311         csd_lock_wait(csd);
 312         csd->node.u_flags |= CSD_FLAG_LOCK;
 313
 314         /*
 315          * prevent CPU from reordering the above assignment
 316          * to ->flags with any subsequent assignments to other
 317          * fields of the specified call_single_data_t structure:
 318          */
 319         smp_wmb();
 320 }
 321
     /*
      * Release @csd by storing 0 to its flags word with release semantics,
      * which clears CSD_FLAG_LOCK together with the type bits. Warns if the
      * csd was not locked.
      */
 322 static __always_inline void csd_unlock(struct __call_single_data *csd)
 323 {
 324         WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK));
 325
 326         /*
 327          * ensure we're all done before releasing data:
 328          */
 329         smp_store_release(&csd->node.u_flags, 0);
 330 }
 331
     /* Per-CPU csd that smp_call_function_single() uses for its !wait calls. */
 332 static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
 333
 334 void __smp_call_single_queue(int cpu, struct llist_node *node)
 335 {
 336         /*
 337          * We have to check the type of the CSD before queueing it, because
 338          * once queued it can have its flags cleared by
 339          *   flush_smp_call_function_queue()
 340          * even if we haven't sent the smp_call IPI yet (e.g. the stopper
 341          * executes migration_cpu_stop() on the remote CPU).
 342          */
 343         if (trace_csd_queue_cpu_enabled()) {
 344                 call_single_data_t *csd;
 345                 smp_call_func_t func;
 346
 347                 csd = container_of(node, call_single_data_t, node.llist);
 348                 func = CSD_TYPE(csd) == CSD_TYPE_TTWU ?
 349                         sched_ttwu_pending : csd->func;
 350
 351                 trace_csd_queue_cpu(cpu, _RET_IP_, func, csd);
 352         }
 353
 354         /*
 355          * The list addition should be visible to the target CPU when it pops
 356          * the head of the list to pull the entry off it in the IPI handler
 357          * because of normal cache coherency rules implied by the underlying
 358          * llist ops.
 359          *
 360          * If IPIs can go out of order to the cache coherency protocol
 361          * in an architecture, sufficient synchronisation should be added
 362          * to arch code to make it appear to obey cache coherency WRT
 363          * locking and barrier primitives. Generic code isn't really
 364          * equipped to do the right thing...
 365          */
 366         if (llist_add(node, &per_cpu(call_single_queue, cpu)))
 367                 send_call_function_single_ipi(cpu);
 368 }
 369
 370 /*
 371  * Insert a previously allocated call_single_data_t element
 372  * for execution on the given CPU. data must already have
 373  * ->func, ->info, and ->flags set.
 374  */
 375 static int generic_exec_single(int cpu, struct __call_single_data *csd)
 376 {
 377         if (cpu == smp_processor_id()) {
 378                 smp_call_func_t func = csd->func;
 379                 void *info = csd->info;
 380                 unsigned long flags;
 381
 382                 /*
 383                  * We can unlock early even for the synchronous on-stack case,
 384                  * since we're doing this from the same CPU..
 385                  */
 386                 csd_lock_record(csd);
 387                 csd_unlock(csd);
 388                 local_irq_save(flags);
 389                 csd_do_func(func, info, NULL);
 390                 csd_lock_record(NULL);
 391                 local_irq_restore(flags);
 392                 return 0;
 393         }
 394
 395         if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) {
 396                 csd_unlock(csd);
 397                 return -ENXIO;
 398         }
 399
 400         __smp_call_single_queue(cpu, &csd->node.llist);
 401
 402         return 0;
 403 }
 404
 405 /**
 406  * generic_smp_call_function_single_interrupt - Execute SMP IPI callbacks
 407  *
 408  * Invoked by arch to handle an IPI for call function single.
 409  * Must be called with interrupts disabled.
 410  */
 411 void generic_smp_call_function_single_interrupt(void)
 412 {
 413         __flush_smp_call_function_queue(true);
 414 }
 415
 416 /**
 417  * __flush_smp_call_function_queue - Flush pending smp-call-function callbacks
 418  *
 419  * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an
 420  *                    offline CPU. Skip this check if set to 'false'.
 421  *
 422  * Flush any pending smp-call-function callbacks queued on this CPU. This is
 423  * invoked by the generic IPI handler, as well as by a CPU about to go offline,
 424  * to ensure that all pending IPI callbacks are run before it goes completely
 425  * offline.
 426  *
 427  * Loop through the call_single_queue and run all the queued callbacks.
 428  * Must be called with interrupts disabled.
      *
      * The queue is detached in one go and then processed in three passes:
      * CSD_TYPE_SYNC entries first, then the remaining non-TTWU entries, and
      * finally whatever is left (the TTWU entries) in a single call to
      * sched_ttwu_pending().
 429  */
 430 static void __flush_smp_call_function_queue(bool warn_cpu_offline)
 431 {
 432         call_single_data_t *csd, *csd_next;
 433         struct llist_node *entry, *prev;
 434         struct llist_head *head;
             /* Latch: the offline-CPU report below is skipped once this is set. */
 435         static bool warned;
 436
 437         lockdep_assert_irqs_disabled();
 438
             /* Detach the whole pending list and work on the private copy. */
 439         head = this_cpu_ptr(&call_single_queue);
 440         entry = llist_del_all(head);
 441         entry = llist_reverse_order(entry);
 442
 443         /* There shouldn't be any pending callbacks on an offline CPU. */
 444         if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) &&
 445                      !warned && entry != NULL)) {
 446                 warned = true;
 447                 WARN(1, "IPI on offline CPU %d\n", smp_processor_id());
 448
 449                 /*
 450                  * We don't have to use the _safe() variant here
 451                  * because we are not invoking the IPI handlers yet.
 452                  */
 453                 llist_for_each_entry(csd, entry, node.llist) {
 454                         switch (CSD_TYPE(csd)) {
 455                         case CSD_TYPE_ASYNC:
 456                         case CSD_TYPE_SYNC:
 457                         case CSD_TYPE_IRQ_WORK:
 458                                 pr_warn("IPI callback %pS sent to offline CPU\n",
 459                                         csd->func);
 460                                 break;
 461
 462                         case CSD_TYPE_TTWU:
 463                                 pr_warn("IPI task-wakeup sent to offline CPU\n");
 464                                 break;
 465
 466                         default:
 467                                 pr_warn("IPI callback, unknown type %d, sent to offline CPU\n",
 468                                         CSD_TYPE(csd));
 469                                 break;
 470                         }
 471                 }
 472         }
 473
 474         /*
 475          * First; run all SYNC callbacks, people are waiting for us.
 476          */
 477         prev = NULL;
 478         llist_for_each_entry_safe(csd, csd_next, entry, node.llist) {
 479                 /* Do we wait until *after* callback? */
 480                 if (CSD_TYPE(csd) == CSD_TYPE_SYNC) {
 481                         smp_call_func_t func = csd->func;
 482                         void *info = csd->info;
 483
                             /*
                              * Unlink this entry from the local list; with no
                              * kept predecessor it is the current list head.
                              */
 484                         if (prev) {
 485                                 prev->next = &csd_next->node.llist;
 486                         } else {
 487                                 entry = &csd_next->node.llist;
 488                         }
 489
                             /* SYNC: the csd is only unlocked after the callback ran. */
 490                         csd_lock_record(csd);
 491                         csd_do_func(func, info, csd);
 492                         csd_unlock(csd);
 493                         csd_lock_record(NULL);
 494                 } else {
 495                         prev = &csd->node.llist;
 496                 }
 497         }
 498
 499         if (!entry)
 500                 return;
 501
 502         /*
 503          * Second; run all !SYNC callbacks.
 504          */
 505         prev = NULL;
 506         llist_for_each_entry_safe(csd, csd_next, entry, node.llist) {
 507                 int type = CSD_TYPE(csd);
 508
 509                 if (type != CSD_TYPE_TTWU) {
 510                         if (prev) {
 511                                 prev->next = &csd_next->node.llist;
 512                         } else {
 513                                 entry = &csd_next->node.llist;
 514                         }
 515
 516                         if (type == CSD_TYPE_ASYNC) {
 517                                 smp_call_func_t func = csd->func;
 518                                 void *info = csd->info;
 519
                                     /*
                                      * ASYNC: the csd is unlocked before the
                                      * callback runs, so func and info were
                                      * copied out of it above.
                                      */
 520                                 csd_lock_record(csd);
 521                                 csd_unlock(csd);
 522                                 csd_do_func(func, info, csd);
 523                                 csd_lock_record(NULL);
 524                         } else if (type == CSD_TYPE_IRQ_WORK) {
 525                                 irq_work_single(csd);
 526                         }
 527
 528                 } else {
 529                         prev = &csd->node.llist;
 530                 }
 531         }
 532
 533         /*
 534          * Third; only CSD_TYPE_TTWU is left, issue those.
 535          */
             /* The remaining list is handed over as a whole, via its head node. */
 536         if (entry) {
 537                 csd = llist_entry(entry, typeof(*csd), node.llist);
 538                 csd_do_func(sched_ttwu_pending, entry, csd);
 539         }
 540 }
 541
 542
 543 /**
 544  * flush_smp_call_function_queue - Flush pending smp-call-function callbacks
 545  *                                 from task context (idle, migration thread)
 546  *
 547  * When TIF_POLLING_NRFLAG is supported and a CPU is in idle and has it
 548  * set, then remote CPUs can avoid sending IPIs and wake the idle CPU by
 549  * setting TIF_NEED_RESCHED. The idle task on the woken up CPU has to
 550  * handle queued SMP function calls before scheduling.
 551  *
 552  * The migration thread has to ensure that an eventually pending wakeup has
 553  * been handled before it migrates a task.
 554  */
 555 void flush_smp_call_function_queue(void)
 556 {
 557         unsigned int was_pending;
 558         unsigned long flags;
 559
 560         if (llist_empty(this_cpu_ptr(&call_single_queue)))
 561                 return;
 562
 563         local_irq_save(flags);
 564         /* Get the already pending soft interrupts for RT enabled kernels */
 565         was_pending = local_softirq_pending();
 566         __flush_smp_call_function_queue(true);
 567         if (local_softirq_pending())
 568                 do_softirq_post_smp_call_flush(was_pending);
 569
 570         local_irq_restore(flags);
 571 }
 572
 573 /*
 574  * smp_call_function_single - Run a function on a specific CPU
 575  * @func: The function to run. This must be fast and non-blocking.
 576  * @info: An arbitrary pointer to pass to the function.
 577  * @wait: If true, wait until function has completed on other CPUs.
 578  *
 579  * Returns 0 on success, else a negative status code.
 580  */
 581 int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
 582                              int wait)
 583 {
 584         call_single_data_t *csd;
 585         call_single_data_t csd_stack = {
 586                 .node = { .u_flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC, },
 587         };
 588         int this_cpu;
 589         int err;
 590
 591         /*
 592          * prevent preemption and reschedule on another processor,
 593          * as well as CPU removal
 594          */
 595         this_cpu = get_cpu();
 596
 597         /*
 598          * Can deadlock when called with interrupts disabled.
 599          * We allow cpu's that are not yet online though, as no one else can
 600          * send smp call function interrupt to this cpu and as such deadlocks
 601          * can't happen.
 602          */
 603         WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
 604                      && !oops_in_progress);
 605
 606         /*
 607          * When @wait we can deadlock when we interrupt between llist_add() and
 608          * arch_send_call_function_ipi*(); when !@wait we can deadlock due to
 609          * csd_lock() on because the interrupt context uses the same csd
 610          * storage.
 611          */
 612         WARN_ON_ONCE(!in_task());
 613
 614         csd = &csd_stack;
 615         if (!wait) {
 616                 csd = this_cpu_ptr(&csd_data);
 617                 csd_lock(csd);
 618         }
 619
 620         csd->func = func;
 621         csd->info = info;
 622 #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
 623         csd->node.src = smp_processor_id();
 624         csd->node.dst = cpu;
 625 #endif
 626
 627         err = generic_exec_single(cpu, csd);
 628
 629         if (wait)
 630                 csd_lock_wait(csd);
 631
 632         put_cpu();
 633
 634         return err;
 635 }
 636 EXPORT_SYMBOL(smp_call_function_single);
 637
 638 /**
 639  * smp_call_function_single_async() - Run an asynchronous function on a
 640  *                               specific CPU.
 641  * @cpu: The CPU to run on.
 642  * @csd: Pre-allocated and setup data structure
 643  *
 644  * Like smp_call_function_single(), but the call is asynchonous and
 645  * can thus be done from contexts with disabled interrupts.
 646  *
 647  * The caller passes his own pre-allocated data structure
 648  * (ie: embedded in an object) and is responsible for synchronizing it
 649  * such that the IPIs performed on the @csd are strictly serialized.
 650  *
 651  * If the function is called with one csd which has not yet been
 652  * processed by previous call to smp_call_function_single_async(), the
 653  * function will return immediately with -EBUSY showing that the csd
 654  * object is still in progress.
 655  *
 656  * NOTE: Be careful, there is unfortunately no current debugging facility to
 657  * validate the correctness of this serialization.
 658  *
 659  * Return: %0 on success or negative errno value on error
 660  */
 661 int smp_call_function_single_async(int cpu, struct __call_single_data *csd)
 662 {
 663         int err = 0;
 664
 665         preempt_disable();
 666
 667         if (csd->node.u_flags & CSD_FLAG_LOCK) {
 668                 err = -EBUSY;
 669                 goto out;
 670         }
 671
 672         csd->node.u_flags = CSD_FLAG_LOCK;
 673         smp_wmb();
 674
 675         err = generic_exec_single(cpu, csd);
 676
 677 out:
 678         preempt_enable();
 679
 680         return err;
 681 }
 682 EXPORT_SYMBOL_GPL(smp_call_function_single_async);
 683
 684 /*
 685  * smp_call_function_any - Run a function on any of the given cpus
 686  * @mask: The mask of cpus it can run on.
 687  * @func: The function to run. This must be fast and non-blocking.
 688  * @info: An arbitrary pointer to pass to the function.
 689  * @wait: If true, wait until function has completed.
 690  *
 691  * Returns 0 on success, else a negative status code (if no cpus were online).
 692  *
 693  * Selection preference:
 694  *      1) current cpu if in @mask
 695  *      2) any cpu of current node if in @mask
 696  *      3) any other online cpu in @mask
 697  */
 698 int smp_call_function_any(const struct cpumask *mask,
 699                           smp_call_func_t func, void *info, int wait)
 700 {
 701         unsigned int cpu;
 702         const struct cpumask *nodemask;
 703         int ret;
 704
 705         /* Try for same CPU (cheapest) */
 706         cpu = get_cpu();
 707         if (cpumask_test_cpu(cpu, mask))
 708                 goto call;
 709
 710         /* Try for same node. */
 711         nodemask = cpumask_of_node(cpu_to_node(cpu));
 712         for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids;
 713              cpu = cpumask_next_and(cpu, nodemask, mask)) {
 714                 if (cpu_online(cpu))
 715                         goto call;
 716         }
 717
 718         /* Any online will do: smp_call_function_single handles nr_cpu_ids. */
 719         cpu = cpumask_any_and(mask, cpu_online_mask);
 720 call:
 721         ret = smp_call_function_single(cpu, func, info, wait);
 722         put_cpu();
 723         return ret;
 724 }
 725 EXPORT_SYMBOL_GPL(smp_call_function_any);
 726
 727 /*
 728  * Flags to be used as scf_flags argument of smp_call_function_many_cond().
 729  *
 730  * %SCF_WAIT:           Wait until function execution is completed
 731  * %SCF_RUN_LOCAL:      Run also locally if local cpu is set in cpumask
      *
      * The two flags are independent bits and may be combined.
      * smp_call_function_many() passes either %SCF_WAIT alone or no flag.
 732  */
 733 #define SCF_WAIT        (1U << 0)
 734 #define SCF_RUN_LOCAL   (1U << 1)
 735
     /*
      * Run @func(@info) on the online CPUs of @mask other than the current one,
      * and also on the current CPU if SCF_RUN_LOCAL is set and it is in @mask.
      *
      * @mask:      CPUs the caller wants the function to run on
      * @func:      function to run
      * @info:      argument passed to @func (and to @cond_func)
      * @scf_flags: combination of SCF_WAIT and SCF_RUN_LOCAL
      * @cond_func: optional filter; when non-NULL, a CPU is only included if
      *             cond_func(cpu, info) returns true
      *
      * With SCF_WAIT set, returns only after every targeted remote CPU has
      * released its csd. Preemption must be disabled by the caller.
      */
 736 static void smp_call_function_many_cond(const struct cpumask *mask,
 737                                         smp_call_func_t func, void *info,
 738                                         unsigned int scf_flags,
 739                                         smp_cond_func_t cond_func)
 740 {
             /* last_cpu is only read when nr_cpus == 1, i.e. after it was set. */
 741         int cpu, last_cpu, this_cpu = smp_processor_id();
 742         struct call_function_data *cfd;
 743         bool wait = scf_flags & SCF_WAIT;
 744         int nr_cpus = 0;
 745         bool run_remote = false;
 746         bool run_local = false;
 747
 748         lockdep_assert_preemption_disabled();
 749
 750         /*
 751          * Can deadlock when called with interrupts disabled.
 752          * We allow cpu's that are not yet online though, as no one else can
 753          * send smp call function interrupt to this cpu and as such deadlocks
 754          * can't happen.
 755          */
 756         if (cpu_online(this_cpu) && !oops_in_progress &&
 757             !early_boot_irqs_disabled)
 758                 lockdep_assert_irqs_enabled();
 759
 760         /*
 761          * When @wait we can deadlock when we interrupt between llist_add() and
 762          * arch_send_call_function_ipi*(); when !@wait we can deadlock due to
 763          * csd_lock() on because the interrupt context uses the same csd
 764          * storage.
 765          */
 766         WARN_ON_ONCE(!in_task());
 767
 768         /* Check if we need local execution. */
 769         if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask))
 770                 run_local = true;
 771
 772         /* Check if we need remote execution, i.e., any CPU excluding this one. */
 773         cpu = cpumask_first_and(mask, cpu_online_mask);
 774         if (cpu == this_cpu)
 775                 cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
 776         if (cpu < nr_cpu_ids)
 777                 run_remote = true;
 778
 779         if (run_remote) {
                     /* Build the target set: online CPUs of @mask without ourselves. */
 780                 cfd = this_cpu_ptr(&cfd_data);
 781                 cpumask_and(cfd->cpumask, mask, cpu_online_mask);
 782                 __cpumask_clear_cpu(this_cpu, cfd->cpumask);
 783
 784                 cpumask_clear(cfd->cpumask_ipi);
 785                 for_each_cpu(cpu, cfd->cpumask) {
 786                         call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu);
 787
                             /*
                              * Rejected by @cond_func: drop the CPU from the
                              * target set so the wait loop below skips it too.
                              */
 788                         if (cond_func && !cond_func(cpu, info)) {
 789                                 __cpumask_clear_cpu(cpu, cfd->cpumask);
 790                                 continue;
 791                         }
 792
 793                         csd_lock(csd);
 794                         if (wait)
 795                                 csd->node.u_flags |= CSD_TYPE_SYNC;
 796                         csd->func = func;
 797                         csd->info = info;
 798 #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
 799                         csd->node.src = smp_processor_id();
 800                         csd->node.dst = cpu;
 801 #endif
 802                         trace_csd_queue_cpu(cpu, _RET_IP_, func, csd);
 803
                             /* Only CPUs for which llist_add() returns true get an IPI. */
 804                         if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) {
 805                                 __cpumask_set_cpu(cpu, cfd->cpumask_ipi);
 806                                 nr_cpus++;
 807                                 last_cpu = cpu;
 808                         }
 809                 }
 810
 811                 /*
 812                  * Choose the most efficient way to send an IPI. Note that the
 813                  * number of CPUs might be zero due to concurrent changes to the
 814                  * provided mask.
 815                  */
 816                 if (nr_cpus == 1)
 817                         send_call_function_single_ipi(last_cpu);
 818                 else if (likely(nr_cpus > 1))
 819                         send_call_function_ipi_mask(cfd->cpumask_ipi);
 820         }
 821
             /* Local run happens after the IPIs went out, with interrupts disabled. */
 822         if (run_local && (!cond_func || cond_func(this_cpu, info))) {
 823                 unsigned long flags;
 824
 825                 local_irq_save(flags);
 826                 csd_do_func(func, info, NULL);
 827                 local_irq_restore(flags);
 828         }
 829
             /* SCF_WAIT: wait for every remote target to release its csd. */
 830         if (run_remote && wait) {
 831                 for_each_cpu(cpu, cfd->cpumask) {
 832                         call_single_data_t *csd;
 833
 834                         csd = per_cpu_ptr(cfd->csd, cpu);
 835                         csd_lock_wait(csd);
 836                 }
 837         }
 838 }
 839
 840 /**
 841  * smp_call_function_many(): Run a function on a set of CPUs.
 842  * @mask: The set of cpus to run on (only runs on online subset).
 843  * @func: The function to run. This must be fast and non-blocking.
 844  * @info: An arbitrary pointer to pass to the function.
 845  * @wait: Bitmask that controls the operation. If %SCF_WAIT is set, wait
 846  *        (atomically) until function has completed on other CPUs. If
 847  *        %SCF_RUN_LOCAL is set, the function will also be run locally
 848  *        if the local CPU is set in the @cpumask.
 849  *
 850  * If @wait is true, then returns once @func has returned.
 851  *
 852  * You must not call this function with disabled interrupts or from a
 853  * hardware interrupt handler or from a bottom half handler. Preemption
 854  * must be disabled when calling this function.
 855  */
 856 void smp_call_function_many(const struct cpumask *mask,
 857                             smp_call_func_t func, void *info, bool wait)
 858 {
 859         smp_call_function_many_cond(mask, func, info, wait * SCF_WAIT, NULL);
 860 }
 861 EXPORT_SYMBOL(smp_call_function_many);
 862
 863 /**
 864  * smp_call_function(): Run a function on all other CPUs.
 865  * @func: The function to run. This must be fast and non-blocking.
 866  * @info: An arbitrary pointer to pass to the function.
 867  * @wait: If true, wait (atomically) until function has completed
 868  *        on other CPUs.
 869  *
 870  * Returns 0.
 871  *
 872  * If @wait is true, then returns once @func has returned; otherwise
 873  * it returns just before the target cpu calls @func.
 874  *
 875  * You must not call this function with disabled interrupts or from a
 876  * hardware interrupt handler or from a bottom half handler.
 877  */
 878 void smp_call_function(smp_call_func_t func, void *info, int wait)
 879 {
 880         preempt_disable();
 881         smp_call_function_many(cpu_online_mask, func, info, wait);
 882         preempt_enable();
 883 }
 884 EXPORT_SYMBOL(smp_call_function);
 885
/*
 * Configured maximum number of CPUs to activate. Defaults to NR_CPUS and is
 * overwritten by the "nosmp" and "maxcpus=" early parameters below; smp_init()
 * hands it to bringup_nonboot_cpus() and smp_cpus_done().
 */
unsigned int setup_max_cpus = NR_CPUS;
EXPORT_SYMBOL(setup_max_cpus);
 889
 890
/*
 * Setup routine for controlling SMP activation
 *
 * Command-line option of "nosmp" or "maxcpus=0" will disable SMP
 * activation entirely (the MPS table probe still happens, though).
 *
 * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer
 * greater than 0, limits the maximum number of CPUs activated in
 * SMP mode to <NUM>.
 */

/*
 * Weak default that does nothing; being __weak, it can be replaced by an
 * architecture-provided definition. Called from the "nosmp" handler and
 * from the "maxcpus=" handler when the limit is zero.
 */
void __weak __init arch_disable_smp_support(void) { }
 903
/*
 * Handler for the "nosmp" early parameter: set the CPU activation limit to
 * zero and let the architecture disable its SMP support. @str is not used.
 * Always returns 0.
 */
static int __init nosmp(char *str)
{
        setup_max_cpus = 0;
        arch_disable_smp_support();

        return 0;
}

early_param("nosmp", nosmp);
 913
 914 /* this is hard limit */
 915 static int __init nrcpus(char *str)
 916 {
 917         int nr_cpus;
 918
 919         if (get_option(&str, &nr_cpus) && nr_cpus > 0 && nr_cpus < nr_cpu_ids)
 920                 set_nr_cpu_ids(nr_cpus);
 921
 922         return 0;
 923 }
 924
 925 early_param("nr_cpus", nrcpus);
 926
/*
 * Handler for the "maxcpus=" early parameter: parse the value directly into
 * setup_max_cpus and disable the architecture's SMP support when the
 * resulting limit is zero. Always returns 0.
 */
static int __init maxcpus(char *str)
{
        /*
         * NOTE(review): the return value of get_option() is not checked, so
         * whatever it leaves in setup_max_cpus is used as-is; confirm this
         * is the intended handling of malformed input.
         */
        get_option(&str, &setup_max_cpus);
        if (setup_max_cpus == 0)
                arch_disable_smp_support();

        return 0;
}

early_param("maxcpus", maxcpus);
 937
/*
 * nr_cpu_ids is only defined as a variable here when more than one CPU is
 * configured and CONFIG_FORCE_NR_CPUS is not set.
 */
#if (NR_CPUS > 1) && !defined(CONFIG_FORCE_NR_CPUS)
/* Number of possible processor ids; starts out as NR_CPUS */
unsigned int nr_cpu_ids __read_mostly = NR_CPUS;
EXPORT_SYMBOL(nr_cpu_ids);
#endif
 943
 944 /* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */
 945 void __init setup_nr_cpu_ids(void)
 946 {
 947         set_nr_cpu_ids(find_last_bit(cpumask_bits(cpu_possible_mask), NR_CPUS) + 1);
 948 }
 949
 950 /* Called by boot processor to activate the rest. */
 951 void __init smp_init(void)
 952 {
 953         int num_nodes, num_cpus;
 954
 955         idle_threads_init();
 956         cpuhp_threads_init();
 957
 958         pr_info("Bringing up secondary CPUs ...\n");
 959
 960         bringup_nonboot_cpus(setup_max_cpus);
 961
 962         num_nodes = num_online_nodes();
 963         num_cpus  = num_online_cpus();
 964         pr_info("Brought up %d node%s, %d CPU%s\n",
 965                 num_nodes, (num_nodes > 1 ? "s" : ""),
 966                 num_cpus,  (num_cpus  > 1 ? "s" : ""));
 967
 968         /* Any cleanup work */
 969         smp_cpus_done(setup_max_cpus);
 970 }
 971
 972 /*
 973  * on_each_cpu_cond(): Call a function on each processor for which
 974  * the supplied function cond_func returns true, optionally waiting
 975  * for all the required CPUs to finish. This may include the local
 976  * processor.
 977  * @cond_func:  A callback function that is passed a cpu id and
 978  *              the info parameter. The function is called
 979  *              with preemption disabled. The function should
 980  *              return a blooean value indicating whether to IPI
 981  *              the specified CPU.
 982  * @func:       The function to run on all applicable CPUs.
 983  *              This must be fast and non-blocking.
 984  * @info:       An arbitrary pointer to pass to both functions.
 985  * @wait:       If true, wait (atomically) until function has
 986  *              completed on other CPUs.
 987  *
 988  * Preemption is disabled to protect against CPUs going offline but not online.
 989  * CPUs going online during the call will not be seen or sent an IPI.
 990  *
 991  * You must not call this function with disabled interrupts or
 992  * from a hardware interrupt handler or from a bottom half handler.
 993  */
 994 void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
 995                            void *info, bool wait, const struct cpumask *mask)
 996 {
 997         unsigned int scf_flags = SCF_RUN_LOCAL;
 998
 999         if (wait)
1000                 scf_flags |= SCF_WAIT;
1001
1002         preempt_disable();
1003         smp_call_function_many_cond(mask, func, info, scf_flags, cond_func);
1004         preempt_enable();
1005 }
1006 EXPORT_SYMBOL(on_each_cpu_cond_mask);
1007
/*
 * Intentionally empty IPI callback; kick_all_cpus_sync() uses it when it
 * only needs the remote CPUs to run a callback, not to do any work.
 */
static void do_nothing(void *unused)
{
}
1011
/**
 * kick_all_cpus_sync - Force all cpus out of idle
 *
 * Used to synchronize the update of pm_idle function pointer. It's
 * called after the pointer is updated and returns after the dummy
 * callback function has been executed on all cpus. The execution of
 * the function can only happen on the remote cpus after they have
 * left the idle function which had been called via pm_idle function
 * pointer. So it's guaranteed that nothing uses the previous pointer
 * anymore.
 */
void kick_all_cpus_sync(void)
{
        /* Make sure the change is visible before we kick the cpus */
        smp_mb();
        /* Empty callback with wait == 1: return only after it has run. */
        smp_call_function(do_nothing, NULL, 1);
}
EXPORT_SYMBOL_GPL(kick_all_cpus_sync);
1030
1031 /**
1032  * wake_up_all_idle_cpus - break all cpus out of idle
1033  * wake_up_all_idle_cpus try to break all cpus which is in idle state even
1034  * including idle polling cpus, for non-idle cpus, we will do nothing
1035  * for them.
1036  */
1037 void wake_up_all_idle_cpus(void)
1038 {
1039         int cpu;
1040
1041         for_each_possible_cpu(cpu) {
1042                 preempt_disable();
1043                 if (cpu != smp_processor_id() && cpu_online(cpu))
1044                         wake_up_if_idle(cpu);
1045                 preempt_enable();
1046         }
1047 }
1048 EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus);
1049
/**
 * struct smp_call_on_cpu_struct - Request to call a function on a specific CPU
 * @work: &work_struct queued on the target CPU
 * @done: &completion signalled by the work callback once @func has returned
 * @func: function to call
 * @data: function's data argument
 * @ret: return value from @func
 * @cpu: CPU passed to hypervisor_pin_vcpu() around the call when it is
 *       non-negative; a negative value (%-1) means no pinning is requested
 *
 * Used to call a function on a specific cpu and wait for it to return.
 * Optionally make sure the call is done on a specified physical cpu via vcpu
 * pinning in order to support virtualized environments.
 */
struct smp_call_on_cpu_struct {
        struct work_struct      work;
        struct completion       done;
        int                     (*func)(void *);
        void                    *data;
        int                     ret;
        int                     cpu;
};
1071
1072 static void smp_call_on_cpu_callback(struct work_struct *work)
1073 {
1074         struct smp_call_on_cpu_struct *sscs;
1075
1076         sscs = container_of(work, struct smp_call_on_cpu_struct, work);
1077         if (sscs->cpu >= 0)
1078                 hypervisor_pin_vcpu(sscs->cpu);
1079         sscs->ret = sscs->func(sscs->data);
1080         if (sscs->cpu >= 0)
1081                 hypervisor_pin_vcpu(-1);
1082
1083         complete(&sscs->done);
1084 }
1085
1086 int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys)
1087 {
1088         struct smp_call_on_cpu_struct sscs = {
1089                 .done = COMPLETION_INITIALIZER_ONSTACK(sscs.done),
1090                 .func = func,
1091                 .data = par,
1092                 .cpu  = phys ? cpu : -1,
1093         };
1094
1095         INIT_WORK_ONSTACK(&sscs.work, smp_call_on_cpu_callback);
1096
1097         if (cpu >= nr_cpu_ids || !cpu_online(cpu))
1098                 return -ENXIO;
1099
1100         queue_work_on(cpu, system_wq, &sscs.work);
1101         wait_for_completion(&sscs.done);
1102
1103         return sscs.ret;
1104 }
1105 EXPORT_SYMBOL_GPL(smp_call_on_cpu);