kernel/cgroup/cpuset.c

   1 /*
   2  *  kernel/cpuset.c
   3  *
   4  *  Processor and Memory placement constraints for sets of tasks.
   5  *
   6  *  Copyright (C) 2003 BULL SA.
   7  *  Copyright (C) 2004-2007 Silicon Graphics, Inc.
   8  *  Copyright (C) 2006 Google, Inc
   9  *
  10  *  Portions derived from Patrick Mochel's sysfs code.
  11  *  sysfs is Copyright (c) 2001-3 Patrick Mochel
  12  *
  13  *  2003-10-10 Written by Simon Derr.
  14  *  2003-10-22 Updates by Stephen Hemminger.
  15  *  2004 May-July Rework by Paul Jackson.
  16  *  2006 Rework by Paul Menage to use generic cgroups
  17  *  2008 Rework of the scheduler domains and CPU hotplug handling
  18  *       by Max Krasnyansky
  19  *
  20  *  This file is subject to the terms and conditions of the GNU General Public
  21  *  License.  See the file COPYING in the main directory of the Linux
  22  *  distribution for more details.
  23  */
  24
  25 #include <linux/cpu.h>
  26 #include <linux/cpumask.h>
  27 #include <linux/cpuset.h>
  28 #include <linux/err.h>
  29 #include <linux/errno.h>
  30 #include <linux/file.h>
  31 #include <linux/fs.h>
  32 #include <linux/init.h>
  33 #include <linux/interrupt.h>
  34 #include <linux/kernel.h>
  35 #include <linux/kmod.h>
  36 #include <linux/kthread.h>
  37 #include <linux/list.h>
  38 #include <linux/mempolicy.h>
  39 #include <linux/mm.h>
  40 #include <linux/memory.h>
  41 #include <linux/export.h>
  42 #include <linux/mount.h>
  43 #include <linux/fs_context.h>
  44 #include <linux/namei.h>
  45 #include <linux/pagemap.h>
  46 #include <linux/proc_fs.h>
  47 #include <linux/rcupdate.h>
  48 #include <linux/sched.h>
  49 #include <linux/sched/deadline.h>
  50 #include <linux/sched/mm.h>
  51 #include <linux/sched/task.h>
  52 #include <linux/seq_file.h>
  53 #include <linux/security.h>
  54 #include <linux/slab.h>
  55 #include <linux/spinlock.h>
  56 #include <linux/stat.h>
  57 #include <linux/string.h>
  58 #include <linux/time.h>
  59 #include <linux/time64.h>
  60 #include <linux/backing-dev.h>
  61 #include <linux/sort.h>
  62 #include <linux/oom.h>
  63 #include <linux/sched/isolation.h>
  64 #include <linux/uaccess.h>
  65 #include <linux/atomic.h>
  66 #include <linux/mutex.h>
  67 #include <linux/cgroup.h>
  68 #include <linux/wait.h>
  69
  70 DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
  71 DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
  72
  73 /* See "Frequency meter" comments, below. */
  74
  75 struct fmeter {
  76         int cnt;                /* unprocessed events count */
  77         int val;                /* most recent output value */
  78         time64_t time;          /* clock (secs) when val computed */
  79         spinlock_t lock;        /* guards read or write of above */
  80 };
  81
  82 struct cpuset {
  83         struct cgroup_subsys_state css;
  84
  85         unsigned long flags;            /* "unsigned long" so bitops work */
  86
  87         /*
  88          * On default hierarchy:
  89          *
  90          * The user-configured masks can only be changed by writing to
  91          * cpuset.cpus and cpuset.mems, and won't be limited by the
  92          * parent masks.
  93          *
  94          * The effective masks is the real masks that apply to the tasks
  95          * in the cpuset. They may be changed if the configured masks are
  96          * changed or hotplug happens.
  97          *
  98          * effective_mask == configured_mask & parent's effective_mask,
  99          * and if it ends up empty, it will inherit the parent's mask.
 100          *
 101          *
 102          * On legacy hierarchy:
 103          *
 104          * The user-configured masks are always the same with effective masks.
 105          */
 106
 107         /* user-configured CPUs and Memory Nodes allow to tasks */
 108         cpumask_var_t cpus_allowed;
 109         nodemask_t mems_allowed;
 110
 111         /* effective CPUs and Memory Nodes allow to tasks */
 112         cpumask_var_t effective_cpus;
 113         nodemask_t effective_mems;
 114
 115         /*
 116          * CPUs allocated to child sub-partitions (default hierarchy only)
 117          * - CPUs granted by the parent = effective_cpus U subparts_cpus
 118          * - effective_cpus and subparts_cpus are mutually exclusive.
 119          *
 120          * effective_cpus contains only onlined CPUs, but subparts_cpus
 121          * may have offlined ones.
 122          */
 123         cpumask_var_t subparts_cpus;
 124
 125         /*
 126          * This is old Memory Nodes tasks took on.
 127          *
 128          * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
 129          * - A new cpuset's old_mems_allowed is initialized when some
 130          *   task is moved into it.
 131          * - old_mems_allowed is used in cpuset_migrate_mm() when we change
 132          *   cpuset.mems_allowed and have tasks' nodemask updated, and
 133          *   then old_mems_allowed is updated to mems_allowed.
 134          */
 135         nodemask_t old_mems_allowed;
 136
 137         struct fmeter fmeter;           /* memory_pressure filter */
 138
 139         /*
 140          * Tasks are being attached to this cpuset.  Used to prevent
 141          * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
 142          */
 143         int attach_in_progress;
 144
 145         /* partition number for rebuild_sched_domains() */
 146         int pn;
 147
 148         /* for custom sched domain */
 149         int relax_domain_level;
 150
 151         /* number of CPUs in subparts_cpus */
 152         int nr_subparts_cpus;
 153
 154         /* partition root state */
 155         int partition_root_state;
 156
 157         /*
 158          * Default hierarchy only:
 159          * use_parent_ecpus - set if using parent's effective_cpus
 160          * child_ecpus_count - # of children with use_parent_ecpus set
 161          */
 162         int use_parent_ecpus;
 163         int child_ecpus_count;
 164
 165         /* Handle for cpuset.cpus.partition */
 166         struct cgroup_file partition_file;
 167 };
 168
 169 /*
 170  * Partition root states:
 171  *
 172  *   0 - not a partition root
 173  *
 174  *   1 - partition root
 175  *
 176  *  -1 - invalid partition root
 177  *       None of the cpus in cpus_allowed can be put into the parent's
 178  *       subparts_cpus. In this case, the cpuset is not a real partition
 179  *       root anymore.  However, the CPU_EXCLUSIVE bit will still be set
 180  *       and the cpuset can be restored back to a partition root if the
 181  *       parent cpuset can give more CPUs back to this child cpuset.
 182  */
 183 #define PRS_DISABLED            0
 184 #define PRS_ENABLED             1
 185 #define PRS_ERROR               -1
 186
 187 /*
 188  * Temporary cpumasks for working with partitions that are passed among
 189  * functions to avoid memory allocation in inner functions.
 190  */
 191 struct tmpmasks {
 192         cpumask_var_t addmask, delmask; /* For partition root */
 193         cpumask_var_t new_cpus;         /* For update_cpumasks_hier() */
 194 };
 195
 196 static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
 197 {
 198         return css ? container_of(css, struct cpuset, css) : NULL;
 199 }
 200
 201 /* Retrieve the cpuset for a task */
 202 static inline struct cpuset *task_cs(struct task_struct *task)
 203 {
 204         return css_cs(task_css(task, cpuset_cgrp_id));
 205 }
 206
 207 static inline struct cpuset *parent_cs(struct cpuset *cs)
 208 {
 209         return css_cs(cs->css.parent);
 210 }
 211
 212 /* bits in struct cpuset flags field */
 213 typedef enum {
 214         CS_ONLINE,
 215         CS_CPU_EXCLUSIVE,
 216         CS_MEM_EXCLUSIVE,
 217         CS_MEM_HARDWALL,
 218         CS_MEMORY_MIGRATE,
 219         CS_SCHED_LOAD_BALANCE,
 220         CS_SPREAD_PAGE,
 221         CS_SPREAD_SLAB,
 222 } cpuset_flagbits_t;
 223
 224 /* convenient tests for these bits */
 225 static inline bool is_cpuset_online(struct cpuset *cs)
 226 {
 227         return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
 228 }
 229
 230 static inline int is_cpu_exclusive(const struct cpuset *cs)
 231 {
 232         return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
 233 }
 234
 235 static inline int is_mem_exclusive(const struct cpuset *cs)
 236 {
 237         return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
 238 }
 239
 240 static inline int is_mem_hardwall(const struct cpuset *cs)
 241 {
 242         return test_bit(CS_MEM_HARDWALL, &cs->flags);
 243 }
 244
 245 static inline int is_sched_load_balance(const struct cpuset *cs)
 246 {
 247         return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
 248 }
 249
 250 static inline int is_memory_migrate(const struct cpuset *cs)
 251 {
 252         return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
 253 }
 254
 255 static inline int is_spread_page(const struct cpuset *cs)
 256 {
 257         return test_bit(CS_SPREAD_PAGE, &cs->flags);
 258 }
 259
 260 static inline int is_spread_slab(const struct cpuset *cs)
 261 {
 262         return test_bit(CS_SPREAD_SLAB, &cs->flags);
 263 }
 264
 265 static inline int is_partition_root(const struct cpuset *cs)
 266 {
 267         return cs->partition_root_state > 0;
 268 }
 269
 270 /*
 271  * Send notification event of whenever partition_root_state changes.
 272  */
 273 static inline void notify_partition_change(struct cpuset *cs,
 274                                            int old_prs, int new_prs)
 275 {
 276         if (old_prs != new_prs)
 277                 cgroup_file_notify(&cs->partition_file);
 278 }
 279
 280 static struct cpuset top_cpuset = {
 281         .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
 282                   (1 << CS_MEM_EXCLUSIVE)),
 283         .partition_root_state = PRS_ENABLED,
 284 };
 285
 286 /**
 287  * cpuset_for_each_child - traverse online children of a cpuset
 288  * @child_cs: loop cursor pointing to the current child
 289  * @pos_css: used for iteration
 290  * @parent_cs: target cpuset to walk children of
 291  *
 292  * Walk @child_cs through the online children of @parent_cs.  Must be used
 293  * with RCU read locked.
 294  */
 295 #define cpuset_for_each_child(child_cs, pos_css, parent_cs)             \
 296         css_for_each_child((pos_css), &(parent_cs)->css)                \
 297                 if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
 298
 299 /**
 300  * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
 301  * @des_cs: loop cursor pointing to the current descendant
 302  * @pos_css: used for iteration
 303  * @root_cs: target cpuset to walk ancestor of
 304  *
 305  * Walk @des_cs through the online descendants of @root_cs.  Must be used
 306  * with RCU read locked.  The caller may modify @pos_css by calling
 307  * css_rightmost_descendant() to skip subtree.  @root_cs is included in the
 308  * iteration and the first node to be visited.
 309  */
 310 #define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs)        \
 311         css_for_each_descendant_pre((pos_css), &(root_cs)->css)         \
 312                 if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
 313
 314 /*
 315  * There are two global locks guarding cpuset structures - cpuset_rwsem and
 316  * callback_lock. We also require taking task_lock() when dereferencing a
 317  * task's cpuset pointer. See "The task_lock() exception", at the end of this
 318  * comment.  The cpuset code uses only cpuset_rwsem write lock.  Other
 319  * kernel subsystems can use cpuset_read_lock()/cpuset_read_unlock() to
 320  * prevent change to cpuset structures.
 321  *
 322  * A task must hold both locks to modify cpusets.  If a task holds
 323  * cpuset_rwsem, it blocks others wanting that rwsem, ensuring that it
 324  * is the only task able to also acquire callback_lock and be able to
 325  * modify cpusets.  It can perform various checks on the cpuset structure
 326  * first, knowing nothing will change.  It can also allocate memory while
 327  * just holding cpuset_rwsem.  While it is performing these checks, various
 328  * callback routines can briefly acquire callback_lock to query cpusets.
 329  * Once it is ready to make the changes, it takes callback_lock, blocking
 330  * everyone else.
 331  *
 332  * Calls to the kernel memory allocator can not be made while holding
 333  * callback_lock, as that would risk double tripping on callback_lock
 334  * from one of the callbacks into the cpuset code from within
 335  * __alloc_pages().
 336  *
 337  * If a task is only holding callback_lock, then it has read-only
 338  * access to cpusets.
 339  *
 340  * Now, the task_struct fields mems_allowed and mempolicy may be changed
 341  * by other task, we use alloc_lock in the task_struct fields to protect
 342  * them.
 343  *
 344  * The cpuset_common_file_read() handlers only hold callback_lock across
 345  * small pieces of code, such as when reading out possibly multi-word
 346  * cpumasks and nodemasks.
 347  *
 348  * Accessing a task's cpuset should be done in accordance with the
 349  * guidelines for accessing subsystem state in kernel/cgroup.c
 350  */
 351
 352 DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem);
 353
 354 void cpuset_read_lock(void)
 355 {
 356         percpu_down_read(&cpuset_rwsem);
 357 }
 358
 359 void cpuset_read_unlock(void)
 360 {
 361         percpu_up_read(&cpuset_rwsem);
 362 }
 363
 364 static DEFINE_SPINLOCK(callback_lock);
 365
 366 static struct workqueue_struct *cpuset_migrate_mm_wq;
 367
 368 /*
 369  * CPU / memory hotplug is handled asynchronously.
 370  */
 371 static void cpuset_hotplug_workfn(struct work_struct *work);
 372 static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
 373
 374 static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
 375
 376 /*
 377  * Cgroup v2 behavior is used on the "cpus" and "mems" control files when
 378  * on default hierarchy or when the cpuset_v2_mode flag is set by mounting
 379  * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option.
 380  * With v2 behavior, "cpus" and "mems" are always what the users have
 381  * requested and won't be changed by hotplug events. Only the effective
 382  * cpus or mems will be affected.
 383  */
 384 static inline bool is_in_v2_mode(void)
 385 {
 386         return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
 387               (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
 388 }
 389
 390 /*
 391  * Return in pmask the portion of a task's cpusets's cpus_allowed that
 392  * are online and are capable of running the task.  If none are found,
 393  * walk up the cpuset hierarchy until we find one that does have some
 394  * appropriate cpus.
 395  *
 396  * One way or another, we guarantee to return some non-empty subset
 397  * of cpu_online_mask.
 398  *
 399  * Call with callback_lock or cpuset_rwsem held.
 400  */
 401 static void guarantee_online_cpus(struct task_struct *tsk,
 402                                   struct cpumask *pmask)
 403 {
 404         const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
 405         struct cpuset *cs;
 406
 407         if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask)))
 408                 cpumask_copy(pmask, cpu_online_mask);
 409
 410         rcu_read_lock();
 411         cs = task_cs(tsk);
 412
 413         while (!cpumask_intersects(cs->effective_cpus, pmask)) {
 414                 cs = parent_cs(cs);
 415                 if (unlikely(!cs)) {
 416                         /*
 417                          * The top cpuset doesn't have any online cpu as a
 418                          * consequence of a race between cpuset_hotplug_work
 419                          * and cpu hotplug notifier.  But we know the top
 420                          * cpuset's effective_cpus is on its way to be
 421                          * identical to cpu_online_mask.
 422                          */
 423                         goto out_unlock;
 424                 }
 425         }
 426         cpumask_and(pmask, pmask, cs->effective_cpus);
 427
 428 out_unlock:
 429         rcu_read_unlock();
 430 }
 431
 432 /*
 433  * Return in *pmask the portion of a cpusets's mems_allowed that
 434  * are online, with memory.  If none are online with memory, walk
 435  * up the cpuset hierarchy until we find one that does have some
 436  * online mems.  The top cpuset always has some mems online.
 437  *
 438  * One way or another, we guarantee to return some non-empty subset
 439  * of node_states[N_MEMORY].
 440  *
 441  * Call with callback_lock or cpuset_rwsem held.
 442  */
 443 static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
 444 {
 445         while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
 446                 cs = parent_cs(cs);
 447         nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
 448 }
 449
 450 /*
 451  * update task's spread flag if cpuset's page/slab spread flag is set
 452  *
 453  * Call with callback_lock or cpuset_rwsem held.
 454  */
 455 static void cpuset_update_task_spread_flag(struct cpuset *cs,
 456                                         struct task_struct *tsk)
 457 {
 458         if (is_spread_page(cs))
 459                 task_set_spread_page(tsk);
 460         else
 461                 task_clear_spread_page(tsk);
 462
 463         if (is_spread_slab(cs))
 464                 task_set_spread_slab(tsk);
 465         else
 466                 task_clear_spread_slab(tsk);
 467 }
 468
 469 /*
 470  * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 471  *
 472  * One cpuset is a subset of another if all its allowed CPUs and
 473  * Memory Nodes are a subset of the other, and its exclusive flags
 474  * are only set if the other's are set.  Call holding cpuset_rwsem.
 475  */
 476
 477 static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
 478 {
 479         return  cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
 480                 nodes_subset(p->mems_allowed, q->mems_allowed) &&
 481                 is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
 482                 is_mem_exclusive(p) <= is_mem_exclusive(q);
 483 }
 484
 485 /**
 486  * alloc_cpumasks - allocate three cpumasks for cpuset
 487  * @cs:  the cpuset that have cpumasks to be allocated.
 488  * @tmp: the tmpmasks structure pointer
 489  * Return: 0 if successful, -ENOMEM otherwise.
 490  *
 491  * Only one of the two input arguments should be non-NULL.
 492  */
 493 static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
 494 {
 495         cpumask_var_t *pmask1, *pmask2, *pmask3;
 496
 497         if (cs) {
 498                 pmask1 = &cs->cpus_allowed;
 499                 pmask2 = &cs->effective_cpus;
 500                 pmask3 = &cs->subparts_cpus;
 501         } else {
 502                 pmask1 = &tmp->new_cpus;
 503                 pmask2 = &tmp->addmask;
 504                 pmask3 = &tmp->delmask;
 505         }
 506
 507         if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
 508                 return -ENOMEM;
 509
 510         if (!zalloc_cpumask_var(pmask2, GFP_KERNEL))
 511                 goto free_one;
 512
 513         if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
 514                 goto free_two;
 515
 516         return 0;
 517
 518 free_two:
 519         free_cpumask_var(*pmask2);
 520 free_one:
 521         free_cpumask_var(*pmask1);
 522         return -ENOMEM;
 523 }
 524
 525 /**
 526  * free_cpumasks - free cpumasks in a tmpmasks structure
 527  * @cs:  the cpuset that have cpumasks to be free.
 528  * @tmp: the tmpmasks structure pointer
 529  */
 530 static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
 531 {
 532         if (cs) {
 533                 free_cpumask_var(cs->cpus_allowed);
 534                 free_cpumask_var(cs->effective_cpus);
 535                 free_cpumask_var(cs->subparts_cpus);
 536         }
 537         if (tmp) {
 538                 free_cpumask_var(tmp->new_cpus);
 539                 free_cpumask_var(tmp->addmask);
 540                 free_cpumask_var(tmp->delmask);
 541         }
 542 }
 543
 544 /**
 545  * alloc_trial_cpuset - allocate a trial cpuset
 546  * @cs: the cpuset that the trial cpuset duplicates
 547  */
 548 static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
 549 {
 550         struct cpuset *trial;
 551
 552         trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
 553         if (!trial)
 554                 return NULL;
 555
 556         if (alloc_cpumasks(trial, NULL)) {
 557                 kfree(trial);
 558                 return NULL;
 559         }
 560
 561         cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
 562         cpumask_copy(trial->effective_cpus, cs->effective_cpus);
 563         return trial;
 564 }
 565
 566 /**
 567  * free_cpuset - free the cpuset
 568  * @cs: the cpuset to be freed
 569  */
 570 static inline void free_cpuset(struct cpuset *cs)
 571 {
 572         free_cpumasks(cs, NULL);
 573         kfree(cs);
 574 }
 575
 576 /*
 577  * validate_change() - Used to validate that any proposed cpuset change
 578  *                     follows the structural rules for cpusets.
 579  *
 580  * If we replaced the flag and mask values of the current cpuset
 581  * (cur) with those values in the trial cpuset (trial), would
 582  * our various subset and exclusive rules still be valid?  Presumes
 583  * cpuset_rwsem held.
 584  *
 585  * 'cur' is the address of an actual, in-use cpuset.  Operations
 586  * such as list traversal that depend on the actual address of the
 587  * cpuset in the list must use cur below, not trial.
 588  *
 589  * 'trial' is the address of bulk structure copy of cur, with
 590  * perhaps one or more of the fields cpus_allowed, mems_allowed,
 591  * or flags changed to new, trial values.
 592  *
 593  * Return 0 if valid, -errno if not.
 594  */
 595
 596 static int validate_change(struct cpuset *cur, struct cpuset *trial)
 597 {
 598         struct cgroup_subsys_state *css;
 599         struct cpuset *c, *par;
 600         int ret;
 601
 602         rcu_read_lock();
 603
 604         /* Each of our child cpusets must be a subset of us */
 605         ret = -EBUSY;
 606         cpuset_for_each_child(c, css, cur)
 607                 if (!is_cpuset_subset(c, trial))
 608                         goto out;
 609
 610         /* Remaining checks don't apply to root cpuset */
 611         ret = 0;
 612         if (cur == &top_cpuset)
 613                 goto out;
 614
 615         par = parent_cs(cur);
 616
 617         /* On legacy hierarchy, we must be a subset of our parent cpuset. */
 618         ret = -EACCES;
 619         if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
 620                 goto out;
 621
 622         /*
 623          * If either I or some sibling (!= me) is exclusive, we can't
 624          * overlap
 625          */
 626         ret = -EINVAL;
 627         cpuset_for_each_child(c, css, par) {
 628                 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
 629                     c != cur &&
 630                     cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
 631                         goto out;
 632                 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
 633                     c != cur &&
 634                     nodes_intersects(trial->mems_allowed, c->mems_allowed))
 635                         goto out;
 636         }
 637
 638         /*
 639          * Cpusets with tasks - existing or newly being attached - can't
 640          * be changed to have empty cpus_allowed or mems_allowed.
 641          */
 642         ret = -ENOSPC;
 643         if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
 644                 if (!cpumask_empty(cur->cpus_allowed) &&
 645                     cpumask_empty(trial->cpus_allowed))
 646                         goto out;
 647                 if (!nodes_empty(cur->mems_allowed) &&
 648                     nodes_empty(trial->mems_allowed))
 649                         goto out;
 650         }
 651
 652         /*
 653          * We can't shrink if we won't have enough room for SCHED_DEADLINE
 654          * tasks.
 655          */
 656         ret = -EBUSY;
 657         if (is_cpu_exclusive(cur) &&
 658             !cpuset_cpumask_can_shrink(cur->cpus_allowed,
 659                                        trial->cpus_allowed))
 660                 goto out;
 661
 662         ret = 0;
 663 out:
 664         rcu_read_unlock();
 665         return ret;
 666 }
 667
 668 #ifdef CONFIG_SMP
 669 /*
 670  * Helper routine for generate_sched_domains().
 671  * Do cpusets a, b have overlapping effective cpus_allowed masks?
 672  */
 673 static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
 674 {
 675         return cpumask_intersects(a->effective_cpus, b->effective_cpus);
 676 }
 677
 678 static void
 679 update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
 680 {
 681         if (dattr->relax_domain_level < c->relax_domain_level)
 682                 dattr->relax_domain_level = c->relax_domain_level;
 683         return;
 684 }
 685
 686 static void update_domain_attr_tree(struct sched_domain_attr *dattr,
 687                                     struct cpuset *root_cs)
 688 {
 689         struct cpuset *cp;
 690         struct cgroup_subsys_state *pos_css;
 691
 692         rcu_read_lock();
 693         cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
 694                 /* skip the whole subtree if @cp doesn't have any CPU */
 695                 if (cpumask_empty(cp->cpus_allowed)) {
 696                         pos_css = css_rightmost_descendant(pos_css);
 697                         continue;
 698                 }
 699
 700                 if (is_sched_load_balance(cp))
 701                         update_domain_attr(dattr, cp);
 702         }
 703         rcu_read_unlock();
 704 }
 705
 706 /* Must be called with cpuset_rwsem held.  */
 707 static inline int nr_cpusets(void)
 708 {
 709         /* jump label reference count + the top-level cpuset */
 710         return static_key_count(&cpusets_enabled_key.key) + 1;
 711 }
 712
 713 /*
 714  * generate_sched_domains()
 715  *
 716  * This function builds a partial partition of the systems CPUs
 717  * A 'partial partition' is a set of non-overlapping subsets whose
 718  * union is a subset of that set.
 719  * The output of this function needs to be passed to kernel/sched/core.c
 720  * partition_sched_domains() routine, which will rebuild the scheduler's
 721  * load balancing domains (sched domains) as specified by that partial
 722  * partition.
 723  *
 724  * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst
 725  * for a background explanation of this.
 726  *
 727  * Does not return errors, on the theory that the callers of this
 728  * routine would rather not worry about failures to rebuild sched
 729  * domains when operating in the severe memory shortage situations
 730  * that could cause allocation failures below.
 731  *
 732  * Must be called with cpuset_rwsem held.
 733  *
 734  * The three key local variables below are:
 735  *    cp - cpuset pointer, used (together with pos_css) to perform a
 736  *         top-down scan of all cpusets. For our purposes, rebuilding
 737  *         the schedulers sched domains, we can ignore !is_sched_load_
 738  *         balance cpusets.
 739  *  csa  - (for CpuSet Array) Array of pointers to all the cpusets
 740  *         that need to be load balanced, for convenient iterative
 741  *         access by the subsequent code that finds the best partition,
 742  *         i.e the set of domains (subsets) of CPUs such that the
 743  *         cpus_allowed of every cpuset marked is_sched_load_balance
 744  *         is a subset of one of these domains, while there are as
 745  *         many such domains as possible, each as small as possible.
 746  * doms  - Conversion of 'csa' to an array of cpumasks, for passing to
 747  *         the kernel/sched/core.c routine partition_sched_domains() in a
 748  *         convenient format, that can be easily compared to the prior
 749  *         value to determine what partition elements (sched domains)
 750  *         were changed (added or removed.)
 751  *
 752  * Finding the best partition (set of domains):
 753  *      The triple nested loops below over i, j, k scan over the
 754  *      load balanced cpusets (using the array of cpuset pointers in
 755  *      csa[]) looking for pairs of cpusets that have overlapping
 756  *      cpus_allowed, but which don't have the same 'pn' partition
 757  *      number and gives them in the same partition number.  It keeps
 758  *      looping on the 'restart' label until it can no longer find
 759  *      any such pairs.
 760  *
 761  *      The union of the cpus_allowed masks from the set of
 762  *      all cpusets having the same 'pn' value then form the one
 763  *      element of the partition (one sched domain) to be passed to
 764  *      partition_sched_domains().
 765  */
 766 static int generate_sched_domains(cpumask_var_t **domains,
 767                         struct sched_domain_attr **attributes)
 768 {
 769         struct cpuset *cp;      /* top-down scan of cpusets */
 770         struct cpuset **csa;    /* array of all cpuset ptrs */
 771         int csn;                /* how many cpuset ptrs in csa so far */
 772         int i, j, k;            /* indices for partition finding loops */
 773         cpumask_var_t *doms;    /* resulting partition; i.e. sched domains */
 774         struct sched_domain_attr *dattr;  /* attributes for custom domains */
 775         int ndoms = 0;          /* number of sched domains in result */
 776         int nslot;              /* next empty doms[] struct cpumask slot */
 777         struct cgroup_subsys_state *pos_css;
 778         bool root_load_balance = is_sched_load_balance(&top_cpuset);
 779
 780         doms = NULL;
 781         dattr = NULL;
 782         csa = NULL;
 783
 784         /* Special case for the 99% of systems with one, full, sched domain */
 785         if (root_load_balance && !top_cpuset.nr_subparts_cpus) {
 786                 ndoms = 1;
 787                 doms = alloc_sched_domains(ndoms);
 788                 if (!doms)
 789                         goto done;
 790
 791                 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
 792                 if (dattr) {
 793                         *dattr = SD_ATTR_INIT;
 794                         update_domain_attr_tree(dattr, &top_cpuset);
 795                 }
 796                 cpumask_and(doms[0], top_cpuset.effective_cpus,
 797                             housekeeping_cpumask(HK_FLAG_DOMAIN));
 798
 799                 goto done;
 800         }
 801
 802         csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
 803         if (!csa)
 804                 goto done;
 805         csn = 0;
 806
 807         rcu_read_lock();
 808         if (root_load_balance)
 809                 csa[csn++] = &top_cpuset;
 810         cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
 811                 if (cp == &top_cpuset)
 812                         continue;
 813                 /*
 814                  * Continue traversing beyond @cp iff @cp has some CPUs and
 815                  * isn't load balancing.  The former is obvious.  The
 816                  * latter: All child cpusets contain a subset of the
 817                  * parent's cpus, so just skip them, and then we call
 818                  * update_domain_attr_tree() to calc relax_domain_level of
 819                  * the corresponding sched domain.
 820                  *
 821                  * If root is load-balancing, we can skip @cp if it
 822                  * is a subset of the root's effective_cpus.
 823                  */
 824                 if (!cpumask_empty(cp->cpus_allowed) &&
 825                     !(is_sched_load_balance(cp) &&
 826                       cpumask_intersects(cp->cpus_allowed,
 827                                          housekeeping_cpumask(HK_FLAG_DOMAIN))))
 828                         continue;
 829
 830                 if (root_load_balance &&
 831                     cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
 832                         continue;
 833
 834                 if (is_sched_load_balance(cp) &&
 835                     !cpumask_empty(cp->effective_cpus))
 836                         csa[csn++] = cp;
 837
 838                 /* skip @cp's subtree if not a partition root */
 839                 if (!is_partition_root(cp))
 840                         pos_css = css_rightmost_descendant(pos_css);
 841         }
 842         rcu_read_unlock();
 843
 844         for (i = 0; i < csn; i++)
 845                 csa[i]->pn = i;
 846         ndoms = csn;
 847
 848 restart:
 849         /* Find the best partition (set of sched domains) */
 850         for (i = 0; i < csn; i++) {
 851                 struct cpuset *a = csa[i];
 852                 int apn = a->pn;
 853
 854                 for (j = 0; j < csn; j++) {
 855                         struct cpuset *b = csa[j];
 856                         int bpn = b->pn;
 857
 858                         if (apn != bpn && cpusets_overlap(a, b)) {
 859                                 for (k = 0; k < csn; k++) {
 860                                         struct cpuset *c = csa[k];
 861
 862                                         if (c->pn == bpn)
 863                                                 c->pn = apn;
 864                                 }
 865                                 ndoms--;        /* one less element */
 866                                 goto restart;
 867                         }
 868                 }
 869         }
 870
 871         /*
 872          * Now we know how many domains to create.
 873          * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
 874          */
 875         doms = alloc_sched_domains(ndoms);
 876         if (!doms)
 877                 goto done;
 878
 879         /*
 880          * The rest of the code, including the scheduler, can deal with
 881          * dattr==NULL case. No need to abort if alloc fails.
 882          */
 883         dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
 884                               GFP_KERNEL);
 885
 886         for (nslot = 0, i = 0; i < csn; i++) {
 887                 struct cpuset *a = csa[i];
 888                 struct cpumask *dp;
 889                 int apn = a->pn;
 890
 891                 if (apn < 0) {
 892                         /* Skip completed partitions */
 893                         continue;
 894                 }
 895
 896                 dp = doms[nslot];
 897
 898                 if (nslot == ndoms) {
 899                         static int warnings = 10;
 900                         if (warnings) {
 901                                 pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
 902                                         nslot, ndoms, csn, i, apn);
 903                                 warnings--;
 904                         }
 905                         continue;
 906                 }
 907
 908                 cpumask_clear(dp);
 909                 if (dattr)
 910                         *(dattr + nslot) = SD_ATTR_INIT;
 911                 for (j = i; j < csn; j++) {
 912                         struct cpuset *b = csa[j];
 913
 914                         if (apn == b->pn) {
 915                                 cpumask_or(dp, dp, b->effective_cpus);
 916                                 cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
 917                                 if (dattr)
 918                                         update_domain_attr_tree(dattr + nslot, b);
 919
 920                                 /* Done with this partition */
 921                                 b->pn = -1;
 922                         }
 923                 }
 924                 nslot++;
 925         }
 926         BUG_ON(nslot != ndoms);
 927
 928 done:
 929         kfree(csa);
 930
 931         /*
 932          * Fallback to the default domain if kmalloc() failed.
 933          * See comments in partition_sched_domains().
 934          */
 935         if (doms == NULL)
 936                 ndoms = 1;
 937
 938         *domains    = doms;
 939         *attributes = dattr;
 940         return ndoms;
 941 }
 942
 943 static void update_tasks_root_domain(struct cpuset *cs)
 944 {
 945         struct css_task_iter it;
 946         struct task_struct *task;
 947
 948         css_task_iter_start(&cs->css, 0, &it);
 949
 950         while ((task = css_task_iter_next(&it)))
 951                 dl_add_task_root_domain(task);
 952
 953         css_task_iter_end(&it);
 954 }
 955
 956 static void rebuild_root_domains(void)
 957 {
 958         struct cpuset *cs = NULL;
 959         struct cgroup_subsys_state *pos_css;
 960
 961         percpu_rwsem_assert_held(&cpuset_rwsem);
 962         lockdep_assert_cpus_held();
 963         lockdep_assert_held(&sched_domains_mutex);
 964
 965         rcu_read_lock();
 966
 967         /*
 968          * Clear default root domain DL accounting, it will be computed again
 969          * if a task belongs to it.
 970          */
 971         dl_clear_root_domain(&def_root_domain);
 972
 973         cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
 974
 975                 if (cpumask_empty(cs->effective_cpus)) {
 976                         pos_css = css_rightmost_descendant(pos_css);
 977                         continue;
 978                 }
 979
 980                 css_get(&cs->css);
 981
 982                 rcu_read_unlock();
 983
 984                 update_tasks_root_domain(cs);
 985
 986                 rcu_read_lock();
 987                 css_put(&cs->css);
 988         }
 989         rcu_read_unlock();
 990 }
 991
 992 static void
 993 partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 994                                     struct sched_domain_attr *dattr_new)
 995 {
 996         mutex_lock(&sched_domains_mutex);
 997         partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
 998         rebuild_root_domains();
 999         mutex_unlock(&sched_domains_mutex);
1000 }
1001
1002 /*
1003  * Rebuild scheduler domains.
1004  *
1005  * If the flag 'sched_load_balance' of any cpuset with non-empty
1006  * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
1007  * which has that flag enabled, or if any cpuset with a non-empty
1008  * 'cpus' is removed, then call this routine to rebuild the
1009  * scheduler's dynamic sched domains.
1010  *
1011  * Call with cpuset_rwsem held.  Takes cpus_read_lock().
1012  */
1013 static void rebuild_sched_domains_locked(void)
1014 {
1015         struct cgroup_subsys_state *pos_css;
1016         struct sched_domain_attr *attr;
1017         cpumask_var_t *doms;
1018         struct cpuset *cs;
1019         int ndoms;
1020
1021         lockdep_assert_cpus_held();
1022         percpu_rwsem_assert_held(&cpuset_rwsem);
1023
1024         /*
1025          * If we have raced with CPU hotplug, return early to avoid
1026          * passing doms with offlined cpu to partition_sched_domains().
1027          * Anyways, cpuset_hotplug_workfn() will rebuild sched domains.
1028          *
1029          * With no CPUs in any subpartitions, top_cpuset's effective CPUs
1030          * should be the same as the active CPUs, so checking only top_cpuset
1031          * is enough to detect racing CPU offlines.
1032          */
1033         if (!top_cpuset.nr_subparts_cpus &&
1034             !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
1035                 return;
1036
1037         /*
1038          * With subpartition CPUs, however, the effective CPUs of a partition
1039          * root should be only a subset of the active CPUs.  Since a CPU in any
1040          * partition root could be offlined, all must be checked.
1041          */
1042         if (top_cpuset.nr_subparts_cpus) {
1043                 rcu_read_lock();
1044                 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
1045                         if (!is_partition_root(cs)) {
1046                                 pos_css = css_rightmost_descendant(pos_css);
1047                                 continue;
1048                         }
1049                         if (!cpumask_subset(cs->effective_cpus,
1050                                             cpu_active_mask)) {
1051                                 rcu_read_unlock();
1052                                 return;
1053                         }
1054                 }
1055                 rcu_read_unlock();
1056         }
1057
1058         /* Generate domain masks and attrs */
1059         ndoms = generate_sched_domains(&doms, &attr);
1060
1061         /* Have scheduler rebuild the domains */
1062         partition_and_rebuild_sched_domains(ndoms, doms, attr);
1063 }
1064 #else /* !CONFIG_SMP */
1065 static void rebuild_sched_domains_locked(void)
1066 {
1067 }
1068 #endif /* CONFIG_SMP */
1069
1070 void rebuild_sched_domains(void)
1071 {
1072         cpus_read_lock();
1073         percpu_down_write(&cpuset_rwsem);
1074         rebuild_sched_domains_locked();
1075         percpu_up_write(&cpuset_rwsem);
1076         cpus_read_unlock();
1077 }
1078
1079 /**
1080  * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
1081  * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
1082  *
1083  * Iterate through each task of @cs updating its cpus_allowed to the
1084  * effective cpuset's.  As this function is called with cpuset_rwsem held,
1085  * cpuset membership stays stable.
1086  */
1087 static void update_tasks_cpumask(struct cpuset *cs)
1088 {
1089         struct css_task_iter it;
1090         struct task_struct *task;
1091         bool top_cs = cs == &top_cpuset;
1092
1093         css_task_iter_start(&cs->css, 0, &it);
1094         while ((task = css_task_iter_next(&it))) {
1095                 /*
1096                  * Percpu kthreads in top_cpuset are ignored
1097                  */
1098                 if (top_cs && (task->flags & PF_KTHREAD) &&
1099                     kthread_is_per_cpu(task))
1100                         continue;
1101                 set_cpus_allowed_ptr(task, cs->effective_cpus);
1102         }
1103         css_task_iter_end(&it);
1104 }
1105
1106 /**
1107  * compute_effective_cpumask - Compute the effective cpumask of the cpuset
1108  * @new_cpus: the temp variable for the new effective_cpus mask
1109  * @cs: the cpuset the need to recompute the new effective_cpus mask
1110  * @parent: the parent cpuset
1111  *
1112  * If the parent has subpartition CPUs, include them in the list of
1113  * allowable CPUs in computing the new effective_cpus mask. Since offlined
1114  * CPUs are not removed from subparts_cpus, we have to use cpu_active_mask
1115  * to mask those out.
1116  */
1117 static void compute_effective_cpumask(struct cpumask *new_cpus,
1118                                       struct cpuset *cs, struct cpuset *parent)
1119 {
1120         if (parent->nr_subparts_cpus) {
1121                 cpumask_or(new_cpus, parent->effective_cpus,
1122                            parent->subparts_cpus);
1123                 cpumask_and(new_cpus, new_cpus, cs->cpus_allowed);
1124                 cpumask_and(new_cpus, new_cpus, cpu_active_mask);
1125         } else {
1126                 cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
1127         }
1128 }
1129
1130 /*
1131  * Commands for update_parent_subparts_cpumask
1132  */
1133 enum subparts_cmd {
1134         partcmd_enable,         /* Enable partition root         */
1135         partcmd_disable,        /* Disable partition root        */
1136         partcmd_update,         /* Update parent's subparts_cpus */
1137 };
1138
1139 /**
1140  * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
1141  * @cpuset:  The cpuset that requests change in partition root state
1142  * @cmd:     Partition root state change command
1143  * @newmask: Optional new cpumask for partcmd_update
1144  * @tmp:     Temporary addmask and delmask
1145  * Return:   0, 1 or an error code
1146  *
1147  * For partcmd_enable, the cpuset is being transformed from a non-partition
1148  * root to a partition root. The cpus_allowed mask of the given cpuset will
1149  * be put into parent's subparts_cpus and taken away from parent's
1150  * effective_cpus. The function will return 0 if all the CPUs listed in
1151  * cpus_allowed can be granted or an error code will be returned.
1152  *
1153  * For partcmd_disable, the cpuset is being transofrmed from a partition
1154  * root back to a non-partition root. Any CPUs in cpus_allowed that are in
1155  * parent's subparts_cpus will be taken away from that cpumask and put back
1156  * into parent's effective_cpus. 0 should always be returned.
1157  *
1158  * For partcmd_update, if the optional newmask is specified, the cpu
1159  * list is to be changed from cpus_allowed to newmask. Otherwise,
1160  * cpus_allowed is assumed to remain the same. The cpuset should either
1161  * be a partition root or an invalid partition root. The partition root
1162  * state may change if newmask is NULL and none of the requested CPUs can
1163  * be granted by the parent. The function will return 1 if changes to
1164  * parent's subparts_cpus and effective_cpus happen or 0 otherwise.
1165  * Error code should only be returned when newmask is non-NULL.
1166  *
1167  * The partcmd_enable and partcmd_disable commands are used by
1168  * update_prstate(). The partcmd_update command is used by
1169  * update_cpumasks_hier() with newmask NULL and update_cpumask() with
1170  * newmask set.
1171  *
1172  * The checking is more strict when enabling partition root than the
1173  * other two commands.
1174  *
1175  * Because of the implicit cpu exclusive nature of a partition root,
1176  * cpumask changes that violates the cpu exclusivity rule will not be
1177  * permitted when checked by validate_change(). The validate_change()
1178  * function will also prevent any changes to the cpu list if it is not
1179  * a superset of children's cpu lists.
1180  */
1181 static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
1182                                           struct cpumask *newmask,
1183                                           struct tmpmasks *tmp)
1184 {
1185         struct cpuset *parent = parent_cs(cpuset);
1186         int adding;     /* Moving cpus from effective_cpus to subparts_cpus */
1187         int deleting;   /* Moving cpus from subparts_cpus to effective_cpus */
1188         int old_prs, new_prs;
1189         bool part_error = false;        /* Partition error? */
1190
1191         percpu_rwsem_assert_held(&cpuset_rwsem);
1192
1193         /*
1194          * The parent must be a partition root.
1195          * The new cpumask, if present, or the current cpus_allowed must
1196          * not be empty.
1197          */
1198         if (!is_partition_root(parent) ||
1199            (newmask && cpumask_empty(newmask)) ||
1200            (!newmask && cpumask_empty(cpuset->cpus_allowed)))
1201                 return -EINVAL;
1202
1203         /*
1204          * Enabling/disabling partition root is not allowed if there are
1205          * online children.
1206          */
1207         if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css))
1208                 return -EBUSY;
1209
1210         /*
1211          * Enabling partition root is not allowed if not all the CPUs
1212          * can be granted from parent's effective_cpus or at least one
1213          * CPU will be left after that.
1214          */
1215         if ((cmd == partcmd_enable) &&
1216            (!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) ||
1217              cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus)))
1218                 return -EINVAL;
1219
1220         /*
1221          * A cpumask update cannot make parent's effective_cpus become empty.
1222          */
1223         adding = deleting = false;
1224         old_prs = new_prs = cpuset->partition_root_state;
1225         if (cmd == partcmd_enable) {
1226                 cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
1227                 adding = true;
1228         } else if (cmd == partcmd_disable) {
1229                 deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
1230                                        parent->subparts_cpus);
1231         } else if (newmask) {
1232                 /*
1233                  * partcmd_update with newmask:
1234                  *
1235                  * delmask = cpus_allowed & ~newmask & parent->subparts_cpus
1236                  * addmask = newmask & parent->effective_cpus
1237                  *                   & ~parent->subparts_cpus
1238                  */
1239                 cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask);
1240                 deleting = cpumask_and(tmp->delmask, tmp->delmask,
1241                                        parent->subparts_cpus);
1242
1243                 cpumask_and(tmp->addmask, newmask, parent->effective_cpus);
1244                 adding = cpumask_andnot(tmp->addmask, tmp->addmask,
1245                                         parent->subparts_cpus);
1246                 /*
1247                  * Return error if the new effective_cpus could become empty.
1248                  */
1249                 if (adding &&
1250                     cpumask_equal(parent->effective_cpus, tmp->addmask)) {
1251                         if (!deleting)
1252                                 return -EINVAL;
1253                         /*
1254                          * As some of the CPUs in subparts_cpus might have
1255                          * been offlined, we need to compute the real delmask
1256                          * to confirm that.
1257                          */
1258                         if (!cpumask_and(tmp->addmask, tmp->delmask,
1259                                          cpu_active_mask))
1260                                 return -EINVAL;
1261                         cpumask_copy(tmp->addmask, parent->effective_cpus);
1262                 }
1263         } else {
1264                 /*
1265                  * partcmd_update w/o newmask:
1266                  *
1267                  * addmask = cpus_allowed & parent->effective_cpus
1268                  *
1269                  * Note that parent's subparts_cpus may have been
1270                  * pre-shrunk in case there is a change in the cpu list.
1271                  * So no deletion is needed.
1272                  */
1273                 adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed,
1274                                      parent->effective_cpus);
1275                 part_error = cpumask_equal(tmp->addmask,
1276                                            parent->effective_cpus);
1277         }
1278
1279         if (cmd == partcmd_update) {
1280                 int prev_prs = cpuset->partition_root_state;
1281
1282                 /*
1283                  * Check for possible transition between PRS_ENABLED
1284                  * and PRS_ERROR.
1285                  */
1286                 switch (cpuset->partition_root_state) {
1287                 case PRS_ENABLED:
1288                         if (part_error)
1289                                 new_prs = PRS_ERROR;
1290                         break;
1291                 case PRS_ERROR:
1292                         if (!part_error)
1293                                 new_prs = PRS_ENABLED;
1294                         break;
1295                 }
1296                 /*
1297                  * Set part_error if previously in invalid state.
1298                  */
1299                 part_error = (prev_prs == PRS_ERROR);
1300         }
1301
1302         if (!part_error && (new_prs == PRS_ERROR))
1303                 return 0;       /* Nothing need to be done */
1304
1305         if (new_prs == PRS_ERROR) {
1306                 /*
1307                  * Remove all its cpus from parent's subparts_cpus.
1308                  */
1309                 adding = false;
1310                 deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
1311                                        parent->subparts_cpus);
1312         }
1313
1314         if (!adding && !deleting && (new_prs == old_prs))
1315                 return 0;
1316
1317         /*
1318          * Change the parent's subparts_cpus.
1319          * Newly added CPUs will be removed from effective_cpus and
1320          * newly deleted ones will be added back to effective_cpus.
1321          */
1322         spin_lock_irq(&callback_lock);
1323         if (adding) {
1324                 cpumask_or(parent->subparts_cpus,
1325                            parent->subparts_cpus, tmp->addmask);
1326                 cpumask_andnot(parent->effective_cpus,
1327                                parent->effective_cpus, tmp->addmask);
1328         }
1329         if (deleting) {
1330                 cpumask_andnot(parent->subparts_cpus,
1331                                parent->subparts_cpus, tmp->delmask);
1332                 /*
1333                  * Some of the CPUs in subparts_cpus might have been offlined.
1334                  */
1335                 cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask);
1336                 cpumask_or(parent->effective_cpus,
1337                            parent->effective_cpus, tmp->delmask);
1338         }
1339
1340         parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
1341
1342         if (old_prs != new_prs)
1343                 cpuset->partition_root_state = new_prs;
1344
1345         spin_unlock_irq(&callback_lock);
1346         notify_partition_change(cpuset, old_prs, new_prs);
1347
1348         return cmd == partcmd_update;
1349 }
1350
1351 /*
1352  * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
1353  * @cs:  the cpuset to consider
1354  * @tmp: temp variables for calculating effective_cpus & partition setup
1355  *
1356  * When configured cpumask is changed, the effective cpumasks of this cpuset
1357  * and all its descendants need to be updated.
1358  *
1359  * On legacy hierarchy, effective_cpus will be the same with cpu_allowed.
1360  *
1361  * Called with cpuset_rwsem held
1362  */
1363 static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
1364 {
1365         struct cpuset *cp;
1366         struct cgroup_subsys_state *pos_css;
1367         bool need_rebuild_sched_domains = false;
1368         int old_prs, new_prs;
1369
1370         rcu_read_lock();
1371         cpuset_for_each_descendant_pre(cp, pos_css, cs) {
1372                 struct cpuset *parent = parent_cs(cp);
1373
1374                 compute_effective_cpumask(tmp->new_cpus, cp, parent);
1375
1376                 /*
1377                  * If it becomes empty, inherit the effective mask of the
1378                  * parent, which is guaranteed to have some CPUs.
1379                  */
1380                 if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
1381                         cpumask_copy(tmp->new_cpus, parent->effective_cpus);
1382                         if (!cp->use_parent_ecpus) {
1383                                 cp->use_parent_ecpus = true;
1384                                 parent->child_ecpus_count++;
1385                         }
1386                 } else if (cp->use_parent_ecpus) {
1387                         cp->use_parent_ecpus = false;
1388                         WARN_ON_ONCE(!parent->child_ecpus_count);
1389                         parent->child_ecpus_count--;
1390                 }
1391
1392                 /*
1393                  * Skip the whole subtree if the cpumask remains the same
1394                  * and has no partition root state.
1395                  */
1396                 if (!cp->partition_root_state &&
1397                     cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
1398                         pos_css = css_rightmost_descendant(pos_css);
1399                         continue;
1400                 }
1401
1402                 /*
1403                  * update_parent_subparts_cpumask() should have been called
1404                  * for cs already in update_cpumask(). We should also call
1405                  * update_tasks_cpumask() again for tasks in the parent
1406                  * cpuset if the parent's subparts_cpus changes.
1407                  */
1408                 old_prs = new_prs = cp->partition_root_state;
1409                 if ((cp != cs) && old_prs) {
1410                         switch (parent->partition_root_state) {
1411                         case PRS_DISABLED:
1412                                 /*
1413                                  * If parent is not a partition root or an
1414                                  * invalid partition root, clear its state
1415                                  * and its CS_CPU_EXCLUSIVE flag.
1416                                  */
1417                                 WARN_ON_ONCE(cp->partition_root_state
1418                                              != PRS_ERROR);
1419                                 new_prs = PRS_DISABLED;
1420
1421                                 /*
1422                                  * clear_bit() is an atomic operation and
1423                                  * readers aren't interested in the state
1424                                  * of CS_CPU_EXCLUSIVE anyway. So we can
1425                                  * just update the flag without holding
1426                                  * the callback_lock.
1427                                  */
1428                                 clear_bit(CS_CPU_EXCLUSIVE, &cp->flags);
1429                                 break;
1430
1431                         case PRS_ENABLED:
1432                                 if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp))
1433                                         update_tasks_cpumask(parent);
1434                                 break;
1435
1436                         case PRS_ERROR:
1437                                 /*
1438                                  * When parent is invalid, it has to be too.
1439                                  */
1440                                 new_prs = PRS_ERROR;
1441                                 break;
1442                         }
1443                 }
1444
1445                 if (!css_tryget_online(&cp->css))
1446                         continue;
1447                 rcu_read_unlock();
1448
1449                 spin_lock_irq(&callback_lock);
1450
1451                 cpumask_copy(cp->effective_cpus, tmp->new_cpus);
1452                 if (cp->nr_subparts_cpus && (new_prs != PRS_ENABLED)) {
1453                         cp->nr_subparts_cpus = 0;
1454                         cpumask_clear(cp->subparts_cpus);
1455                 } else if (cp->nr_subparts_cpus) {
1456                         /*
1457                          * Make sure that effective_cpus & subparts_cpus
1458                          * are mutually exclusive.
1459                          *
1460                          * In the unlikely event that effective_cpus
1461                          * becomes empty. we clear cp->nr_subparts_cpus and
1462                          * let its child partition roots to compete for
1463                          * CPUs again.
1464                          */
1465                         cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
1466                                        cp->subparts_cpus);
1467                         if (cpumask_empty(cp->effective_cpus)) {
1468                                 cpumask_copy(cp->effective_cpus, tmp->new_cpus);
1469                                 cpumask_clear(cp->subparts_cpus);
1470                                 cp->nr_subparts_cpus = 0;
1471                         } else if (!cpumask_subset(cp->subparts_cpus,
1472                                                    tmp->new_cpus)) {
1473                                 cpumask_andnot(cp->subparts_cpus,
1474                                         cp->subparts_cpus, tmp->new_cpus);
1475                                 cp->nr_subparts_cpus
1476                                         = cpumask_weight(cp->subparts_cpus);
1477                         }
1478                 }
1479
1480                 if (new_prs != old_prs)
1481                         cp->partition_root_state = new_prs;
1482
1483                 spin_unlock_irq(&callback_lock);
1484                 notify_partition_change(cp, old_prs, new_prs);
1485
1486                 WARN_ON(!is_in_v2_mode() &&
1487                         !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
1488
1489                 update_tasks_cpumask(cp);
1490
1491                 /*
1492                  * On legacy hierarchy, if the effective cpumask of any non-
1493                  * empty cpuset is changed, we need to rebuild sched domains.
1494                  * On default hierarchy, the cpuset needs to be a partition
1495                  * root as well.
1496                  */
1497                 if (!cpumask_empty(cp->cpus_allowed) &&
1498                     is_sched_load_balance(cp) &&
1499                    (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
1500                     is_partition_root(cp)))
1501                         need_rebuild_sched_domains = true;
1502
1503                 rcu_read_lock();
1504                 css_put(&cp->css);
1505         }
1506         rcu_read_unlock();
1507
1508         if (need_rebuild_sched_domains)
1509                 rebuild_sched_domains_locked();
1510 }
1511
1512 /**
1513  * update_sibling_cpumasks - Update siblings cpumasks
1514  * @parent:  Parent cpuset
1515  * @cs:      Current cpuset
1516  * @tmp:     Temp variables
1517  */
1518 static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
1519                                     struct tmpmasks *tmp)
1520 {
1521         struct cpuset *sibling;
1522         struct cgroup_subsys_state *pos_css;
1523
1524         percpu_rwsem_assert_held(&cpuset_rwsem);
1525
1526         /*
1527          * Check all its siblings and call update_cpumasks_hier()
1528          * if their use_parent_ecpus flag is set in order for them
1529          * to use the right effective_cpus value.
1530          *
1531          * The update_cpumasks_hier() function may sleep. So we have to
1532          * release the RCU read lock before calling it.
1533          */
1534         rcu_read_lock();
1535         cpuset_for_each_child(sibling, pos_css, parent) {
1536                 if (sibling == cs)
1537                         continue;
1538                 if (!sibling->use_parent_ecpus)
1539                         continue;
1540                 if (!css_tryget_online(&sibling->css))
1541                         continue;
1542
1543                 rcu_read_unlock();
1544                 update_cpumasks_hier(sibling, tmp);
1545                 rcu_read_lock();
1546                 css_put(&sibling->css);
1547         }
1548         rcu_read_unlock();
1549 }
1550
1551 /**
1552  * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
1553  * @cs: the cpuset to consider
1554  * @trialcs: trial cpuset
1555  * @buf: buffer of cpu numbers written to this cpuset
1556  */
1557 static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
1558                           const char *buf)
1559 {
1560         int retval;
1561         struct tmpmasks tmp;
1562
1563         /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
1564         if (cs == &top_cpuset)
1565                 return -EACCES;
1566
1567         /*
1568          * An empty cpus_allowed is ok only if the cpuset has no tasks.
1569          * Since cpulist_parse() fails on an empty mask, we special case
1570          * that parsing.  The validate_change() call ensures that cpusets
1571          * with tasks have cpus.
1572          */
1573         if (!*buf) {
1574                 cpumask_clear(trialcs->cpus_allowed);
1575         } else {
1576                 retval = cpulist_parse(buf, trialcs->cpus_allowed);
1577                 if (retval < 0)
1578                         return retval;
1579
1580                 if (!cpumask_subset(trialcs->cpus_allowed,
1581                                     top_cpuset.cpus_allowed))
1582                         return -EINVAL;
1583         }
1584
1585         /* Nothing to do if the cpus didn't change */
1586         if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
1587                 return 0;
1588
1589         retval = validate_change(cs, trialcs);
1590         if (retval < 0)
1591                 return retval;
1592
1593 #ifdef CONFIG_CPUMASK_OFFSTACK
1594         /*
1595          * Use the cpumasks in trialcs for tmpmasks when they are pointers
1596          * to allocated cpumasks.
1597          */
1598         tmp.addmask  = trialcs->subparts_cpus;
1599         tmp.delmask  = trialcs->effective_cpus;
1600         tmp.new_cpus = trialcs->cpus_allowed;
1601 #endif
1602
1603         if (cs->partition_root_state) {
1604                 /* Cpumask of a partition root cannot be empty */
1605                 if (cpumask_empty(trialcs->cpus_allowed))
1606                         return -EINVAL;
1607                 if (update_parent_subparts_cpumask(cs, partcmd_update,
1608                                         trialcs->cpus_allowed, &tmp) < 0)
1609                         return -EINVAL;
1610         }
1611
1612         spin_lock_irq(&callback_lock);
1613         cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
1614
1615         /*
1616          * Make sure that subparts_cpus is a subset of cpus_allowed.
1617          */
1618         if (cs->nr_subparts_cpus) {
1619                 cpumask_and(cs->subparts_cpus, cs->subparts_cpus, cs->cpus_allowed);
1620                 cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
1621         }
1622         spin_unlock_irq(&callback_lock);
1623
1624         update_cpumasks_hier(cs, &tmp);
1625
1626         if (cs->partition_root_state) {
1627                 struct cpuset *parent = parent_cs(cs);
1628
1629                 /*
1630                  * For partition root, update the cpumasks of sibling
1631                  * cpusets if they use parent's effective_cpus.
1632                  */
1633                 if (parent->child_ecpus_count)
1634                         update_sibling_cpumasks(parent, cs, &tmp);
1635         }
1636         return 0;
1637 }
1638
1639 /*
1640  * Migrate memory region from one set of nodes to another.  This is
1641  * performed asynchronously as it can be called from process migration path
1642  * holding locks involved in process management.  All mm migrations are
1643  * performed in the queued order and can be waited for by flushing
1644  * cpuset_migrate_mm_wq.
1645  */
1646
1647 struct cpuset_migrate_mm_work {
1648         struct work_struct      work;
1649         struct mm_struct        *mm;
1650         nodemask_t              from;
1651         nodemask_t              to;
1652 };
1653
1654 static void cpuset_migrate_mm_workfn(struct work_struct *work)
1655 {
1656         struct cpuset_migrate_mm_work *mwork =
1657                 container_of(work, struct cpuset_migrate_mm_work, work);
1658
1659         /* on a wq worker, no need to worry about %current's mems_allowed */
1660         do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
1661         mmput(mwork->mm);
1662         kfree(mwork);
1663 }
1664
1665 static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
1666                                                         const nodemask_t *to)
1667 {
1668         struct cpuset_migrate_mm_work *mwork;
1669
1670         if (nodes_equal(*from, *to)) {
1671                 mmput(mm);
1672                 return;
1673         }
1674
1675         mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
1676         if (mwork) {
1677                 mwork->mm = mm;
1678                 mwork->from = *from;
1679                 mwork->to = *to;
1680                 INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
1681                 queue_work(cpuset_migrate_mm_wq, &mwork->work);
1682         } else {
1683                 mmput(mm);
1684         }
1685 }
1686
1687 static void cpuset_post_attach(void)
1688 {
1689         flush_workqueue(cpuset_migrate_mm_wq);
1690 }
1691
1692 /*
1693  * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
1694  * @tsk: the task to change
1695  * @newmems: new nodes that the task will be set
1696  *
1697  * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed
1698  * and rebind an eventual tasks' mempolicy. If the task is allocating in
1699  * parallel, it might temporarily see an empty intersection, which results in
1700  * a seqlock check and retry before OOM or allocation failure.
1701  */
1702 static void cpuset_change_task_nodemask(struct task_struct *tsk,
1703                                         nodemask_t *newmems)
1704 {
1705         task_lock(tsk);
1706
1707         local_irq_disable();
1708         write_seqcount_begin(&tsk->mems_allowed_seq);
1709
1710         nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
1711         mpol_rebind_task(tsk, newmems);
1712         tsk->mems_allowed = *newmems;
1713
1714         write_seqcount_end(&tsk->mems_allowed_seq);
1715         local_irq_enable();
1716
1717         task_unlock(tsk);
1718 }
1719
1720 static void *cpuset_being_rebound;
1721
1722 /**
1723  * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
1724  * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
1725  *
1726  * Iterate through each task of @cs updating its mems_allowed to the
1727  * effective cpuset's.  As this function is called with cpuset_rwsem held,
1728  * cpuset membership stays stable.
1729  */
1730 static void update_tasks_nodemask(struct cpuset *cs)
1731 {
1732         static nodemask_t newmems;      /* protected by cpuset_rwsem */
1733         struct css_task_iter it;
1734         struct task_struct *task;
1735
1736         cpuset_being_rebound = cs;              /* causes mpol_dup() rebind */
1737
1738         guarantee_online_mems(cs, &newmems);
1739
1740         /*
1741          * The mpol_rebind_mm() call takes mmap_lock, which we couldn't
1742          * take while holding tasklist_lock.  Forks can happen - the
1743          * mpol_dup() cpuset_being_rebound check will catch such forks,
1744          * and rebind their vma mempolicies too.  Because we still hold
1745          * the global cpuset_rwsem, we know that no other rebind effort
1746          * will be contending for the global variable cpuset_being_rebound.
1747          * It's ok if we rebind the same mm twice; mpol_rebind_mm()
1748          * is idempotent.  Also migrate pages in each mm to new nodes.
1749          */
1750         css_task_iter_start(&cs->css, 0, &it);
1751         while ((task = css_task_iter_next(&it))) {
1752                 struct mm_struct *mm;
1753                 bool migrate;
1754
1755                 cpuset_change_task_nodemask(task, &newmems);
1756
1757                 mm = get_task_mm(task);
1758                 if (!mm)
1759                         continue;
1760
1761                 migrate = is_memory_migrate(cs);
1762
1763                 mpol_rebind_mm(mm, &cs->mems_allowed);
1764                 if (migrate)
1765                         cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
1766                 else
1767                         mmput(mm);
1768         }
1769         css_task_iter_end(&it);
1770
1771         /*
1772          * All the tasks' nodemasks have been updated, update
1773          * cs->old_mems_allowed.
1774          */
1775         cs->old_mems_allowed = newmems;
1776
1777         /* We're done rebinding vmas to this cpuset's new mems_allowed. */
1778         cpuset_being_rebound = NULL;
1779 }
1780
1781 /*
1782  * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
1783  * @cs: the cpuset to consider
1784  * @new_mems: a temp variable for calculating new effective_mems
1785  *
1786  * When configured nodemask is changed, the effective nodemasks of this cpuset
1787  * and all its descendants need to be updated.
1788  *
1789  * On legacy hierarchy, effective_mems will be the same with mems_allowed.
1790  *
1791  * Called with cpuset_rwsem held
1792  */
1793 static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
1794 {
1795         struct cpuset *cp;
1796         struct cgroup_subsys_state *pos_css;
1797
1798         rcu_read_lock();
1799         cpuset_for_each_descendant_pre(cp, pos_css, cs) {
1800                 struct cpuset *parent = parent_cs(cp);
1801
1802                 nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
1803
1804                 /*
1805                  * If it becomes empty, inherit the effective mask of the
1806                  * parent, which is guaranteed to have some MEMs.
1807                  */
1808                 if (is_in_v2_mode() && nodes_empty(*new_mems))
1809                         *new_mems = parent->effective_mems;
1810
1811                 /* Skip the whole subtree if the nodemask remains the same. */
1812                 if (nodes_equal(*new_mems, cp->effective_mems)) {
1813                         pos_css = css_rightmost_descendant(pos_css);
1814                         continue;
1815                 }
1816
1817                 if (!css_tryget_online(&cp->css))
1818                         continue;
1819                 rcu_read_unlock();
1820
1821                 spin_lock_irq(&callback_lock);
1822                 cp->effective_mems = *new_mems;
1823                 spin_unlock_irq(&callback_lock);
1824
1825                 WARN_ON(!is_in_v2_mode() &&
1826                         !nodes_equal(cp->mems_allowed, cp->effective_mems));
1827
1828                 update_tasks_nodemask(cp);
1829
1830                 rcu_read_lock();
1831                 css_put(&cp->css);
1832         }
1833         rcu_read_unlock();
1834 }
1835
1836 /*
1837  * Handle user request to change the 'mems' memory placement
1838  * of a cpuset.  Needs to validate the request, update the
1839  * cpusets mems_allowed, and for each task in the cpuset,
1840  * update mems_allowed and rebind task's mempolicy and any vma
1841  * mempolicies and if the cpuset is marked 'memory_migrate',
1842  * migrate the tasks pages to the new memory.
1843  *
1844  * Call with cpuset_rwsem held. May take callback_lock during call.
1845  * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
1846  * lock each such tasks mm->mmap_lock, scan its vma's and rebind
1847  * their mempolicies to the cpusets new mems_allowed.
1848  */
1849 static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1850                            const char *buf)
1851 {
1852         int retval;
1853
1854         /*
1855          * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
1856          * it's read-only
1857          */
1858         if (cs == &top_cpuset) {
1859                 retval = -EACCES;
1860                 goto done;
1861         }
1862
1863         /*
1864          * An empty mems_allowed is ok iff there are no tasks in the cpuset.
1865          * Since nodelist_parse() fails on an empty mask, we special case
1866          * that parsing.  The validate_change() call ensures that cpusets
1867          * with tasks have memory.
1868          */
1869         if (!*buf) {
1870                 nodes_clear(trialcs->mems_allowed);
1871         } else {
1872                 retval = nodelist_parse(buf, trialcs->mems_allowed);
1873                 if (retval < 0)
1874                         goto done;
1875
1876                 if (!nodes_subset(trialcs->mems_allowed,
1877                                   top_cpuset.mems_allowed)) {
1878                         retval = -EINVAL;
1879                         goto done;
1880                 }
1881         }
1882
1883         if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
1884                 retval = 0;             /* Too easy - nothing to do */
1885                 goto done;
1886         }
1887         retval = validate_change(cs, trialcs);
1888         if (retval < 0)
1889                 goto done;
1890
1891         spin_lock_irq(&callback_lock);
1892         cs->mems_allowed = trialcs->mems_allowed;
1893         spin_unlock_irq(&callback_lock);
1894
1895         /* use trialcs->mems_allowed as a temp variable */
1896         update_nodemasks_hier(cs, &trialcs->mems_allowed);
1897 done:
1898         return retval;
1899 }
1900
1901 bool current_cpuset_is_being_rebound(void)
1902 {
1903         bool ret;
1904
1905         rcu_read_lock();
1906         ret = task_cs(current) == cpuset_being_rebound;
1907         rcu_read_unlock();
1908
1909         return ret;
1910 }
1911
1912 static int update_relax_domain_level(struct cpuset *cs, s64 val)
1913 {
1914 #ifdef CONFIG_SMP
1915         if (val < -1 || val >= sched_domain_level_max)
1916                 return -EINVAL;
1917 #endif
1918
1919         if (val != cs->relax_domain_level) {
1920                 cs->relax_domain_level = val;
1921                 if (!cpumask_empty(cs->cpus_allowed) &&
1922                     is_sched_load_balance(cs))
1923                         rebuild_sched_domains_locked();
1924         }
1925
1926         return 0;
1927 }
1928
1929 /**
1930  * update_tasks_flags - update the spread flags of tasks in the cpuset.
1931  * @cs: the cpuset in which each task's spread flags needs to be changed
1932  *
1933  * Iterate through each task of @cs updating its spread flags.  As this
1934  * function is called with cpuset_rwsem held, cpuset membership stays
1935  * stable.
1936  */
1937 static void update_tasks_flags(struct cpuset *cs)
1938 {
1939         struct css_task_iter it;
1940         struct task_struct *task;
1941
1942         css_task_iter_start(&cs->css, 0, &it);
1943         while ((task = css_task_iter_next(&it)))
1944                 cpuset_update_task_spread_flag(cs, task);
1945         css_task_iter_end(&it);
1946 }
1947
1948 /*
1949  * update_flag - read a 0 or a 1 in a file and update associated flag
1950  * bit:         the bit to update (see cpuset_flagbits_t)
1951  * cs:          the cpuset to update
1952  * turning_on:  whether the flag is being set or cleared
1953  *
1954  * Call with cpuset_rwsem held.
1955  */
1956
1957 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1958                        int turning_on)
1959 {
1960         struct cpuset *trialcs;
1961         int balance_flag_changed;
1962         int spread_flag_changed;
1963         int err;
1964
1965         trialcs = alloc_trial_cpuset(cs);
1966         if (!trialcs)
1967                 return -ENOMEM;
1968
1969         if (turning_on)
1970                 set_bit(bit, &trialcs->flags);
1971         else
1972                 clear_bit(bit, &trialcs->flags);
1973
1974         err = validate_change(cs, trialcs);
1975         if (err < 0)
1976                 goto out;
1977
1978         balance_flag_changed = (is_sched_load_balance(cs) !=
1979                                 is_sched_load_balance(trialcs));
1980
1981         spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
1982                         || (is_spread_page(cs) != is_spread_page(trialcs)));
1983
1984         spin_lock_irq(&callback_lock);
1985         cs->flags = trialcs->flags;
1986         spin_unlock_irq(&callback_lock);
1987
1988         if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1989                 rebuild_sched_domains_locked();
1990
1991         if (spread_flag_changed)
1992                 update_tasks_flags(cs);
1993 out:
1994         free_cpuset(trialcs);
1995         return err;
1996 }
1997
1998 /*
1999  * update_prstate - update partititon_root_state
2000  * cs: the cpuset to update
2001  * new_prs: new partition root state
2002  *
2003  * Call with cpuset_rwsem held.
2004  */
2005 static int update_prstate(struct cpuset *cs, int new_prs)
2006 {
2007         int err, old_prs = cs->partition_root_state;
2008         struct cpuset *parent = parent_cs(cs);
2009         struct tmpmasks tmpmask;
2010
2011         if (old_prs == new_prs)
2012                 return 0;
2013
2014         /*
2015          * Cannot force a partial or invalid partition root to a full
2016          * partition root.
2017          */
2018         if (new_prs && (old_prs == PRS_ERROR))
2019                 return -EINVAL;
2020
2021         if (alloc_cpumasks(NULL, &tmpmask))
2022                 return -ENOMEM;
2023
2024         err = -EINVAL;
2025         if (!old_prs) {
2026                 /*
2027                  * Turning on partition root requires setting the
2028                  * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
2029                  * cannot be NULL.
2030                  */
2031                 if (cpumask_empty(cs->cpus_allowed))
2032                         goto out;
2033
2034                 err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
2035                 if (err)
2036                         goto out;
2037
2038                 err = update_parent_subparts_cpumask(cs, partcmd_enable,
2039                                                      NULL, &tmpmask);
2040                 if (err) {
2041                         update_flag(CS_CPU_EXCLUSIVE, cs, 0);
2042                         goto out;
2043                 }
2044         } else {
2045                 /*
2046                  * Turning off partition root will clear the
2047                  * CS_CPU_EXCLUSIVE bit.
2048                  */
2049                 if (old_prs == PRS_ERROR) {
2050                         update_flag(CS_CPU_EXCLUSIVE, cs, 0);
2051                         err = 0;
2052                         goto out;
2053                 }
2054
2055                 err = update_parent_subparts_cpumask(cs, partcmd_disable,
2056                                                      NULL, &tmpmask);
2057                 if (err)
2058                         goto out;
2059
2060                 /* Turning off CS_CPU_EXCLUSIVE will not return error */
2061                 update_flag(CS_CPU_EXCLUSIVE, cs, 0);
2062         }
2063
2064         update_tasks_cpumask(parent);
2065
2066         if (parent->child_ecpus_count)
2067                 update_sibling_cpumasks(parent, cs, &tmpmask);
2068
2069         rebuild_sched_domains_locked();
2070 out:
2071         if (!err) {
2072                 spin_lock_irq(&callback_lock);
2073                 cs->partition_root_state = new_prs;
2074                 spin_unlock_irq(&callback_lock);
2075                 notify_partition_change(cs, old_prs, new_prs);
2076         }
2077
2078         free_cpumasks(NULL, &tmpmask);
2079         return err;
2080 }
2081
2082 /*
2083  * Frequency meter - How fast is some event occurring?
2084  *
2085  * These routines manage a digitally filtered, constant time based,
2086  * event frequency meter.  There are four routines:
2087  *   fmeter_init() - initialize a frequency meter.
2088  *   fmeter_markevent() - called each time the event happens.
2089  *   fmeter_getrate() - returns the recent rate of such events.
2090  *   fmeter_update() - internal routine used to update fmeter.
2091  *
2092  * A common data structure is passed to each of these routines,
2093  * which is used to keep track of the state required to manage the
2094  * frequency meter and its digital filter.
2095  *
2096  * The filter works on the number of events marked per unit time.
2097  * The filter is single-pole low-pass recursive (IIR).  The time unit
2098  * is 1 second.  Arithmetic is done using 32-bit integers scaled to
2099  * simulate 3 decimal digits of precision (multiplied by 1000).
2100  *
2101  * With an FM_COEF of 933, and a time base of 1 second, the filter
2102  * has a half-life of 10 seconds, meaning that if the events quit
2103  * happening, then the rate returned from the fmeter_getrate()
2104  * will be cut in half each 10 seconds, until it converges to zero.
2105  *
2106  * It is not worth doing a real infinitely recursive filter.  If more
2107  * than FM_MAXTICKS ticks have elapsed since the last filter event,
2108  * just compute FM_MAXTICKS ticks worth, by which point the level
2109  * will be stable.
2110  *
2111  * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
2112  * arithmetic overflow in the fmeter_update() routine.
2113  *
2114  * Given the simple 32 bit integer arithmetic used, this meter works
2115  * best for reporting rates between one per millisecond (msec) and
2116  * one per 32 (approx) seconds.  At constant rates faster than one
2117  * per msec it maxes out at values just under 1,000,000.  At constant
2118  * rates between one per msec, and one per second it will stabilize
2119  * to a value N*1000, where N is the rate of events per second.
2120  * At constant rates between one per second and one per 32 seconds,
2121  * it will be choppy, moving up on the seconds that have an event,
2122  * and then decaying until the next event.  At rates slower than
2123  * about one in 32 seconds, it decays all the way back to zero between
2124  * each event.
2125  */
2126
2127 #define FM_COEF 933             /* coefficient for half-life of 10 secs */
2128 #define FM_MAXTICKS ((u32)99)   /* useless computing more ticks than this */
2129 #define FM_MAXCNT 1000000       /* limit cnt to avoid overflow */
2130 #define FM_SCALE 1000           /* faux fixed point scale */
2131
2132 /* Initialize a frequency meter */
2133 static void fmeter_init(struct fmeter *fmp)
2134 {
2135         fmp->cnt = 0;
2136         fmp->val = 0;
2137         fmp->time = 0;
2138         spin_lock_init(&fmp->lock);
2139 }
2140
2141 /* Internal meter update - process cnt events and update value */
2142 static void fmeter_update(struct fmeter *fmp)
2143 {
2144         time64_t now;
2145         u32 ticks;
2146
2147         now = ktime_get_seconds();
2148         ticks = now - fmp->time;
2149
2150         if (ticks == 0)
2151                 return;
2152
2153         ticks = min(FM_MAXTICKS, ticks);
2154         while (ticks-- > 0)
2155                 fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
2156         fmp->time = now;
2157
2158         fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
2159         fmp->cnt = 0;
2160 }
2161
2162 /* Process any previous ticks, then bump cnt by one (times scale). */
2163 static void fmeter_markevent(struct fmeter *fmp)
2164 {
2165         spin_lock(&fmp->lock);
2166         fmeter_update(fmp);
2167         fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
2168         spin_unlock(&fmp->lock);
2169 }
2170
2171 /* Process any previous ticks, then return current value. */
2172 static int fmeter_getrate(struct fmeter *fmp)
2173 {
2174         int val;
2175
2176         spin_lock(&fmp->lock);
2177         fmeter_update(fmp);
2178         val = fmp->val;
2179         spin_unlock(&fmp->lock);
2180         return val;
2181 }
2182
2183 static struct cpuset *cpuset_attach_old_cs;
2184
2185 /* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */
2186 static int cpuset_can_attach(struct cgroup_taskset *tset)
2187 {
2188         struct cgroup_subsys_state *css;
2189         struct cpuset *cs;
2190         struct task_struct *task;
2191         int ret;
2192
2193         /* used later by cpuset_attach() */
2194         cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
2195         cs = css_cs(css);
2196
2197         percpu_down_write(&cpuset_rwsem);
2198
2199         /* allow moving tasks into an empty cpuset if on default hierarchy */
2200         ret = -ENOSPC;
2201         if (!is_in_v2_mode() &&
2202             (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
2203                 goto out_unlock;
2204
2205         cgroup_taskset_for_each(task, css, tset) {
2206                 ret = task_can_attach(task, cs->effective_cpus);
2207                 if (ret)
2208                         goto out_unlock;
2209                 ret = security_task_setscheduler(task);
2210                 if (ret)
2211                         goto out_unlock;
2212         }
2213
2214         /*
2215          * Mark attach is in progress.  This makes validate_change() fail
2216          * changes which zero cpus/mems_allowed.
2217          */
2218         cs->attach_in_progress++;
2219         ret = 0;
2220 out_unlock:
2221         percpu_up_write(&cpuset_rwsem);
2222         return ret;
2223 }
2224
2225 static void cpuset_cancel_attach(struct cgroup_taskset *tset)
2226 {
2227         struct cgroup_subsys_state *css;
2228
2229         cgroup_taskset_first(tset, &css);
2230
2231         percpu_down_write(&cpuset_rwsem);
2232         css_cs(css)->attach_in_progress--;
2233         percpu_up_write(&cpuset_rwsem);
2234 }
2235
2236 /*
2237  * Protected by cpuset_rwsem.  cpus_attach is used only by cpuset_attach()
2238  * but we can't allocate it dynamically there.  Define it global and
2239  * allocate from cpuset_init().
2240  */
2241 static cpumask_var_t cpus_attach;
2242
2243 static void cpuset_attach(struct cgroup_taskset *tset)
2244 {
2245         /* static buf protected by cpuset_rwsem */
2246         static nodemask_t cpuset_attach_nodemask_to;
2247         struct task_struct *task;
2248         struct task_struct *leader;
2249         struct cgroup_subsys_state *css;
2250         struct cpuset *cs;
2251         struct cpuset *oldcs = cpuset_attach_old_cs;
2252
2253         cgroup_taskset_first(tset, &css);
2254         cs = css_cs(css);
2255
2256         lockdep_assert_cpus_held();     /* see cgroup_attach_lock() */
2257         percpu_down_write(&cpuset_rwsem);
2258
2259         guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
2260
2261         cgroup_taskset_for_each(task, css, tset) {
2262                 if (cs != &top_cpuset)
2263                         guarantee_online_cpus(task, cpus_attach);
2264                 else
2265                         cpumask_copy(cpus_attach, task_cpu_possible_mask(task));
2266                 /*
2267                  * can_attach beforehand should guarantee that this doesn't
2268                  * fail.  TODO: have a better way to handle failure here
2269                  */
2270                 WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
2271
2272                 cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
2273                 cpuset_update_task_spread_flag(cs, task);
2274         }
2275
2276         /*
2277          * Change mm for all threadgroup leaders. This is expensive and may
2278          * sleep and should be moved outside migration path proper.
2279          */
2280         cpuset_attach_nodemask_to = cs->effective_mems;
2281         cgroup_taskset_for_each_leader(leader, css, tset) {
2282                 struct mm_struct *mm = get_task_mm(leader);
2283
2284                 if (mm) {
2285                         mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
2286
2287                         /*
2288                          * old_mems_allowed is the same with mems_allowed
2289                          * here, except if this task is being moved
2290                          * automatically due to hotplug.  In that case
2291                          * @mems_allowed has been updated and is empty, so
2292                          * @old_mems_allowed is the right nodesets that we
2293                          * migrate mm from.
2294                          */
2295                         if (is_memory_migrate(cs))
2296                                 cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
2297                                                   &cpuset_attach_nodemask_to);
2298                         else
2299                                 mmput(mm);
2300                 }
2301         }
2302
2303         cs->old_mems_allowed = cpuset_attach_nodemask_to;
2304
2305         cs->attach_in_progress--;
2306         if (!cs->attach_in_progress)
2307                 wake_up(&cpuset_attach_wq);
2308
2309         percpu_up_write(&cpuset_rwsem);
2310 }
2311
2312 /* The various types of files and directories in a cpuset file system */
2313
2314 typedef enum {
2315         FILE_MEMORY_MIGRATE,
2316         FILE_CPULIST,
2317         FILE_MEMLIST,
2318         FILE_EFFECTIVE_CPULIST,
2319         FILE_EFFECTIVE_MEMLIST,
2320         FILE_SUBPARTS_CPULIST,
2321         FILE_CPU_EXCLUSIVE,
2322         FILE_MEM_EXCLUSIVE,
2323         FILE_MEM_HARDWALL,
2324         FILE_SCHED_LOAD_BALANCE,
2325         FILE_PARTITION_ROOT,
2326         FILE_SCHED_RELAX_DOMAIN_LEVEL,
2327         FILE_MEMORY_PRESSURE_ENABLED,
2328         FILE_MEMORY_PRESSURE,
2329         FILE_SPREAD_PAGE,
2330         FILE_SPREAD_SLAB,
2331 } cpuset_filetype_t;
2332
2333 static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
2334                             u64 val)
2335 {
2336         struct cpuset *cs = css_cs(css);
2337         cpuset_filetype_t type = cft->private;
2338         int retval = 0;
2339
2340         cpus_read_lock();
2341         percpu_down_write(&cpuset_rwsem);
2342         if (!is_cpuset_online(cs)) {
2343                 retval = -ENODEV;
2344                 goto out_unlock;
2345         }
2346
2347         switch (type) {
2348         case FILE_CPU_EXCLUSIVE:
2349                 retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
2350                 break;
2351         case FILE_MEM_EXCLUSIVE:
2352                 retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
2353                 break;
2354         case FILE_MEM_HARDWALL:
2355                 retval = update_flag(CS_MEM_HARDWALL, cs, val);
2356                 break;
2357         case FILE_SCHED_LOAD_BALANCE:
2358                 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
2359                 break;
2360         case FILE_MEMORY_MIGRATE:
2361                 retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
2362                 break;
2363         case FILE_MEMORY_PRESSURE_ENABLED:
2364                 cpuset_memory_pressure_enabled = !!val;
2365                 break;
2366         case FILE_SPREAD_PAGE:
2367                 retval = update_flag(CS_SPREAD_PAGE, cs, val);
2368                 break;
2369         case FILE_SPREAD_SLAB:
2370                 retval = update_flag(CS_SPREAD_SLAB, cs, val);
2371                 break;
2372         default:
2373                 retval = -EINVAL;
2374                 break;
2375         }
2376 out_unlock:
2377         percpu_up_write(&cpuset_rwsem);
2378         cpus_read_unlock();
2379         return retval;
2380 }
2381
2382 static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
2383                             s64 val)
2384 {
2385         struct cpuset *cs = css_cs(css);
2386         cpuset_filetype_t type = cft->private;
2387         int retval = -ENODEV;
2388
2389         cpus_read_lock();
2390         percpu_down_write(&cpuset_rwsem);
2391         if (!is_cpuset_online(cs))
2392                 goto out_unlock;
2393
2394         switch (type) {
2395         case FILE_SCHED_RELAX_DOMAIN_LEVEL:
2396                 retval = update_relax_domain_level(cs, val);
2397                 break;
2398         default:
2399                 retval = -EINVAL;
2400                 break;
2401         }
2402 out_unlock:
2403         percpu_up_write(&cpuset_rwsem);
2404         cpus_read_unlock();
2405         return retval;
2406 }
2407
2408 /*
2409  * Common handling for a write to a "cpus" or "mems" file.
2410  */
2411 static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
2412                                     char *buf, size_t nbytes, loff_t off)
2413 {
2414         struct cpuset *cs = css_cs(of_css(of));
2415         struct cpuset *trialcs;
2416         int retval = -ENODEV;
2417
2418         buf = strstrip(buf);
2419
2420         /*
2421          * CPU or memory hotunplug may leave @cs w/o any execution
2422          * resources, in which case the hotplug code asynchronously updates
2423          * configuration and transfers all tasks to the nearest ancestor
2424          * which can execute.
2425          *
2426          * As writes to "cpus" or "mems" may restore @cs's execution
2427          * resources, wait for the previously scheduled operations before
2428          * proceeding, so that we don't end up keep removing tasks added
2429          * after execution capability is restored.
2430          *
2431          * cpuset_hotplug_work calls back into cgroup core via
2432          * cgroup_transfer_tasks() and waiting for it from a cgroupfs
2433          * operation like this one can lead to a deadlock through kernfs
2434          * active_ref protection.  Let's break the protection.  Losing the
2435          * protection is okay as we check whether @cs is online after
2436          * grabbing cpuset_rwsem anyway.  This only happens on the legacy
2437          * hierarchies.
2438          */
2439         css_get(&cs->css);
2440         kernfs_break_active_protection(of->kn);
2441         flush_work(&cpuset_hotplug_work);
2442
2443         cpus_read_lock();
2444         percpu_down_write(&cpuset_rwsem);
2445         if (!is_cpuset_online(cs))
2446                 goto out_unlock;
2447
2448         trialcs = alloc_trial_cpuset(cs);
2449         if (!trialcs) {
2450                 retval = -ENOMEM;
2451                 goto out_unlock;
2452         }
2453
2454         switch (of_cft(of)->private) {
2455         case FILE_CPULIST:
2456                 retval = update_cpumask(cs, trialcs, buf);
2457                 break;
2458         case FILE_MEMLIST:
2459                 retval = update_nodemask(cs, trialcs, buf);
2460                 break;
2461         default:
2462                 retval = -EINVAL;
2463                 break;
2464         }
2465
2466         free_cpuset(trialcs);
2467 out_unlock:
2468         percpu_up_write(&cpuset_rwsem);
2469         cpus_read_unlock();
2470         kernfs_unbreak_active_protection(of->kn);
2471         css_put(&cs->css);
2472         flush_workqueue(cpuset_migrate_mm_wq);
2473         return retval ?: nbytes;
2474 }
2475
2476 /*
2477  * These ascii lists should be read in a single call, by using a user
2478  * buffer large enough to hold the entire map.  If read in smaller
2479  * chunks, there is no guarantee of atomicity.  Since the display format
2480  * used, list of ranges of sequential numbers, is variable length,
2481  * and since these maps can change value dynamically, one could read
2482  * gibberish by doing partial reads while a list was changing.
2483  */
2484 static int cpuset_common_seq_show(struct seq_file *sf, void *v)
2485 {
2486         struct cpuset *cs = css_cs(seq_css(sf));
2487         cpuset_filetype_t type = seq_cft(sf)->private;
2488         int ret = 0;
2489
2490         spin_lock_irq(&callback_lock);
2491
2492         switch (type) {
2493         case FILE_CPULIST:
2494                 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
2495                 break;
2496         case FILE_MEMLIST:
2497                 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
2498                 break;
2499         case FILE_EFFECTIVE_CPULIST:
2500                 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
2501                 break;
2502         case FILE_EFFECTIVE_MEMLIST:
2503                 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
2504                 break;
2505         case FILE_SUBPARTS_CPULIST:
2506                 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus));
2507                 break;
2508         default:
2509                 ret = -EINVAL;
2510         }
2511
2512         spin_unlock_irq(&callback_lock);
2513         return ret;
2514 }
2515
2516 static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
2517 {
2518         struct cpuset *cs = css_cs(css);
2519         cpuset_filetype_t type = cft->private;
2520         switch (type) {
2521         case FILE_CPU_EXCLUSIVE:
2522                 return is_cpu_exclusive(cs);
2523         case FILE_MEM_EXCLUSIVE:
2524                 return is_mem_exclusive(cs);
2525         case FILE_MEM_HARDWALL:
2526                 return is_mem_hardwall(cs);
2527         case FILE_SCHED_LOAD_BALANCE:
2528                 return is_sched_load_balance(cs);
2529         case FILE_MEMORY_MIGRATE:
2530                 return is_memory_migrate(cs);
2531         case FILE_MEMORY_PRESSURE_ENABLED:
2532                 return cpuset_memory_pressure_enabled;
2533         case FILE_MEMORY_PRESSURE:
2534                 return fmeter_getrate(&cs->fmeter);
2535         case FILE_SPREAD_PAGE:
2536                 return is_spread_page(cs);
2537         case FILE_SPREAD_SLAB:
2538                 return is_spread_slab(cs);
2539         default:
2540                 BUG();
2541         }
2542
2543         /* Unreachable but makes gcc happy */
2544         return 0;
2545 }
2546
2547 static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
2548 {
2549         struct cpuset *cs = css_cs(css);
2550         cpuset_filetype_t type = cft->private;
2551         switch (type) {
2552         case FILE_SCHED_RELAX_DOMAIN_LEVEL:
2553                 return cs->relax_domain_level;
2554         default:
2555                 BUG();
2556         }
2557
2558         /* Unreachable but makes gcc happy */
2559         return 0;
2560 }
2561
2562 static int sched_partition_show(struct seq_file *seq, void *v)
2563 {
2564         struct cpuset *cs = css_cs(seq_css(seq));
2565
2566         switch (cs->partition_root_state) {
2567         case PRS_ENABLED:
2568                 seq_puts(seq, "root\n");
2569                 break;
2570         case PRS_DISABLED:
2571                 seq_puts(seq, "member\n");
2572                 break;
2573         case PRS_ERROR:
2574                 seq_puts(seq, "root invalid\n");
2575                 break;
2576         }
2577         return 0;
2578 }
2579
2580 static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
2581                                      size_t nbytes, loff_t off)
2582 {
2583         struct cpuset *cs = css_cs(of_css(of));
2584         int val;
2585         int retval = -ENODEV;
2586
2587         buf = strstrip(buf);
2588
2589         /*
2590          * Convert "root" to ENABLED, and convert "member" to DISABLED.
2591          */
2592         if (!strcmp(buf, "root"))
2593                 val = PRS_ENABLED;
2594         else if (!strcmp(buf, "member"))
2595                 val = PRS_DISABLED;
2596         else
2597                 return -EINVAL;
2598
2599         css_get(&cs->css);
2600         cpus_read_lock();
2601         percpu_down_write(&cpuset_rwsem);
2602         if (!is_cpuset_online(cs))
2603                 goto out_unlock;
2604
2605         retval = update_prstate(cs, val);
2606 out_unlock:
2607         percpu_up_write(&cpuset_rwsem);
2608         cpus_read_unlock();
2609         css_put(&cs->css);
2610         return retval ?: nbytes;
2611 }
2612
2613 /*
2614  * for the common functions, 'private' gives the type of file
2615  */
2616
2617 static struct cftype legacy_files[] = {
2618         {
2619                 .name = "cpus",
2620                 .seq_show = cpuset_common_seq_show,
2621                 .write = cpuset_write_resmask,
2622                 .max_write_len = (100U + 6 * NR_CPUS),
2623                 .private = FILE_CPULIST,
2624         },
2625
2626         {
2627                 .name = "mems",
2628                 .seq_show = cpuset_common_seq_show,
2629                 .write = cpuset_write_resmask,
2630                 .max_write_len = (100U + 6 * MAX_NUMNODES),
2631                 .private = FILE_MEMLIST,
2632         },
2633
2634         {
2635                 .name = "effective_cpus",
2636                 .seq_show = cpuset_common_seq_show,
2637                 .private = FILE_EFFECTIVE_CPULIST,
2638         },
2639
2640         {
2641                 .name = "effective_mems",
2642                 .seq_show = cpuset_common_seq_show,
2643                 .private = FILE_EFFECTIVE_MEMLIST,
2644         },
2645
2646         {
2647                 .name = "cpu_exclusive",
2648                 .read_u64 = cpuset_read_u64,
2649                 .write_u64 = cpuset_write_u64,
2650                 .private = FILE_CPU_EXCLUSIVE,
2651         },
2652
2653         {
2654                 .name = "mem_exclusive",
2655                 .read_u64 = cpuset_read_u64,
2656                 .write_u64 = cpuset_write_u64,
2657                 .private = FILE_MEM_EXCLUSIVE,
2658         },
2659
2660         {
2661                 .name = "mem_hardwall",
2662                 .read_u64 = cpuset_read_u64,
2663                 .write_u64 = cpuset_write_u64,
2664                 .private = FILE_MEM_HARDWALL,
2665         },
2666
2667         {
2668                 .name = "sched_load_balance",
2669                 .read_u64 = cpuset_read_u64,
2670                 .write_u64 = cpuset_write_u64,
2671                 .private = FILE_SCHED_LOAD_BALANCE,
2672         },
2673
2674         {
2675                 .name = "sched_relax_domain_level",
2676                 .read_s64 = cpuset_read_s64,
2677                 .write_s64 = cpuset_write_s64,
2678                 .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
2679         },
2680
2681         {
2682                 .name = "memory_migrate",
2683                 .read_u64 = cpuset_read_u64,
2684                 .write_u64 = cpuset_write_u64,
2685                 .private = FILE_MEMORY_MIGRATE,
2686         },
2687
2688         {
2689                 .name = "memory_pressure",
2690                 .read_u64 = cpuset_read_u64,
2691                 .private = FILE_MEMORY_PRESSURE,
2692         },
2693
2694         {
2695                 .name = "memory_spread_page",
2696                 .read_u64 = cpuset_read_u64,
2697                 .write_u64 = cpuset_write_u64,
2698                 .private = FILE_SPREAD_PAGE,
2699         },
2700
2701         {
2702                 .name = "memory_spread_slab",
2703                 .read_u64 = cpuset_read_u64,
2704                 .write_u64 = cpuset_write_u64,
2705                 .private = FILE_SPREAD_SLAB,
2706         },
2707
2708         {
2709                 .name = "memory_pressure_enabled",
2710                 .flags = CFTYPE_ONLY_ON_ROOT,
2711                 .read_u64 = cpuset_read_u64,
2712                 .write_u64 = cpuset_write_u64,
2713                 .private = FILE_MEMORY_PRESSURE_ENABLED,
2714         },
2715
2716         { }     /* terminate */
2717 };
2718
2719 /*
2720  * This is currently a minimal set for the default hierarchy. It can be
2721  * expanded later on by migrating more features and control files from v1.
2722  */
2723 static struct cftype dfl_files[] = {
2724         {
2725                 .name = "cpus",
2726                 .seq_show = cpuset_common_seq_show,
2727                 .write = cpuset_write_resmask,
2728                 .max_write_len = (100U + 6 * NR_CPUS),
2729                 .private = FILE_CPULIST,
2730                 .flags = CFTYPE_NOT_ON_ROOT,
2731         },
2732
2733         {
2734                 .name = "mems",
2735                 .seq_show = cpuset_common_seq_show,
2736                 .write = cpuset_write_resmask,
2737                 .max_write_len = (100U + 6 * MAX_NUMNODES),
2738                 .private = FILE_MEMLIST,
2739                 .flags = CFTYPE_NOT_ON_ROOT,
2740         },
2741
2742         {
2743                 .name = "cpus.effective",
2744                 .seq_show = cpuset_common_seq_show,
2745                 .private = FILE_EFFECTIVE_CPULIST,
2746         },
2747
2748         {
2749                 .name = "mems.effective",
2750                 .seq_show = cpuset_common_seq_show,
2751                 .private = FILE_EFFECTIVE_MEMLIST,
2752         },
2753
2754         {
2755                 .name = "cpus.partition",
2756                 .seq_show = sched_partition_show,
2757                 .write = sched_partition_write,
2758                 .private = FILE_PARTITION_ROOT,
2759                 .flags = CFTYPE_NOT_ON_ROOT,
2760                 .file_offset = offsetof(struct cpuset, partition_file),
2761         },
2762
2763         {
2764                 .name = "cpus.subpartitions",
2765                 .seq_show = cpuset_common_seq_show,
2766                 .private = FILE_SUBPARTS_CPULIST,
2767                 .flags = CFTYPE_DEBUG,
2768         },
2769
2770         { }     /* terminate */
2771 };
2772
2773
2774 /*
2775  *      cpuset_css_alloc - allocate a cpuset css
2776  *      cgrp:   control group that the new cpuset will be part of
2777  */
2778
2779 static struct cgroup_subsys_state *
2780 cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
2781 {
2782         struct cpuset *cs;
2783
2784         if (!parent_css)
2785                 return &top_cpuset.css;
2786
2787         cs = kzalloc(sizeof(*cs), GFP_KERNEL);
2788         if (!cs)
2789                 return ERR_PTR(-ENOMEM);
2790
2791         if (alloc_cpumasks(cs, NULL)) {
2792                 kfree(cs);
2793                 return ERR_PTR(-ENOMEM);
2794         }
2795
2796         __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
2797         nodes_clear(cs->mems_allowed);
2798         nodes_clear(cs->effective_mems);
2799         fmeter_init(&cs->fmeter);
2800         cs->relax_domain_level = -1;
2801
2802         /* Set CS_MEMORY_MIGRATE for default hierarchy */
2803         if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
2804                 __set_bit(CS_MEMORY_MIGRATE, &cs->flags);
2805
2806         return &cs->css;
2807 }
2808
2809 static int cpuset_css_online(struct cgroup_subsys_state *css)
2810 {
2811         struct cpuset *cs = css_cs(css);
2812         struct cpuset *parent = parent_cs(cs);
2813         struct cpuset *tmp_cs;
2814         struct cgroup_subsys_state *pos_css;
2815
2816         if (!parent)
2817                 return 0;
2818
2819         cpus_read_lock();
2820         percpu_down_write(&cpuset_rwsem);
2821
2822         set_bit(CS_ONLINE, &cs->flags);
2823         if (is_spread_page(parent))
2824                 set_bit(CS_SPREAD_PAGE, &cs->flags);
2825         if (is_spread_slab(parent))
2826                 set_bit(CS_SPREAD_SLAB, &cs->flags);
2827
2828         cpuset_inc();
2829
2830         spin_lock_irq(&callback_lock);
2831         if (is_in_v2_mode()) {
2832                 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
2833                 cs->effective_mems = parent->effective_mems;
2834                 cs->use_parent_ecpus = true;
2835                 parent->child_ecpus_count++;
2836         }
2837         spin_unlock_irq(&callback_lock);
2838
2839         if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
2840                 goto out_unlock;
2841
2842         /*
2843          * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
2844          * set.  This flag handling is implemented in cgroup core for
2845          * histrical reasons - the flag may be specified during mount.
2846          *
2847          * Currently, if any sibling cpusets have exclusive cpus or mem, we
2848          * refuse to clone the configuration - thereby refusing the task to
2849          * be entered, and as a result refusing the sys_unshare() or
2850          * clone() which initiated it.  If this becomes a problem for some
2851          * users who wish to allow that scenario, then this could be
2852          * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
2853          * (and likewise for mems) to the new cgroup.
2854          */
2855         rcu_read_lock();
2856         cpuset_for_each_child(tmp_cs, pos_css, parent) {
2857                 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
2858                         rcu_read_unlock();
2859                         goto out_unlock;
2860                 }
2861         }
2862         rcu_read_unlock();
2863
2864         spin_lock_irq(&callback_lock);
2865         cs->mems_allowed = parent->mems_allowed;
2866         cs->effective_mems = parent->mems_allowed;
2867         cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
2868         cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
2869         spin_unlock_irq(&callback_lock);
2870 out_unlock:
2871         percpu_up_write(&cpuset_rwsem);
2872         cpus_read_unlock();
2873         return 0;
2874 }
2875
2876 /*
2877  * If the cpuset being removed has its flag 'sched_load_balance'
2878  * enabled, then simulate turning sched_load_balance off, which
2879  * will call rebuild_sched_domains_locked(). That is not needed
2880  * in the default hierarchy where only changes in partition
2881  * will cause repartitioning.
2882  *
2883  * If the cpuset has the 'sched.partition' flag enabled, simulate
2884  * turning 'sched.partition" off.
2885  */
2886
2887 static void cpuset_css_offline(struct cgroup_subsys_state *css)
2888 {
2889         struct cpuset *cs = css_cs(css);
2890
2891         cpus_read_lock();
2892         percpu_down_write(&cpuset_rwsem);
2893
2894         if (is_partition_root(cs))
2895                 update_prstate(cs, 0);
2896
2897         if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
2898             is_sched_load_balance(cs))
2899                 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
2900
2901         if (cs->use_parent_ecpus) {
2902                 struct cpuset *parent = parent_cs(cs);
2903
2904                 cs->use_parent_ecpus = false;
2905                 parent->child_ecpus_count--;
2906         }
2907
2908         cpuset_dec();
2909         clear_bit(CS_ONLINE, &cs->flags);
2910
2911         percpu_up_write(&cpuset_rwsem);
2912         cpus_read_unlock();
2913 }
2914
2915 static void cpuset_css_free(struct cgroup_subsys_state *css)
2916 {
2917         struct cpuset *cs = css_cs(css);
2918
2919         free_cpuset(cs);
2920 }
2921
2922 static void cpuset_bind(struct cgroup_subsys_state *root_css)
2923 {
2924         percpu_down_write(&cpuset_rwsem);
2925         spin_lock_irq(&callback_lock);
2926
2927         if (is_in_v2_mode()) {
2928                 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
2929                 top_cpuset.mems_allowed = node_possible_map;
2930         } else {
2931                 cpumask_copy(top_cpuset.cpus_allowed,
2932                              top_cpuset.effective_cpus);
2933                 top_cpuset.mems_allowed = top_cpuset.effective_mems;
2934         }
2935
2936         spin_unlock_irq(&callback_lock);
2937         percpu_up_write(&cpuset_rwsem);
2938 }
2939
2940 /*
2941  * Make sure the new task conform to the current state of its parent,
2942  * which could have been changed by cpuset just after it inherits the
2943  * state from the parent and before it sits on the cgroup's task list.
2944  */
2945 static void cpuset_fork(struct task_struct *task)
2946 {
2947         if (task_css_is_root(task, cpuset_cgrp_id))
2948                 return;
2949
2950         set_cpus_allowed_ptr(task, current->cpus_ptr);
2951         task->mems_allowed = current->mems_allowed;
2952 }
2953
2954 struct cgroup_subsys cpuset_cgrp_subsys = {
2955         .css_alloc      = cpuset_css_alloc,
2956         .css_online     = cpuset_css_online,
2957         .css_offline    = cpuset_css_offline,
2958         .css_free       = cpuset_css_free,
2959         .can_attach     = cpuset_can_attach,
2960         .cancel_attach  = cpuset_cancel_attach,
2961         .attach         = cpuset_attach,
2962         .post_attach    = cpuset_post_attach,
2963         .bind           = cpuset_bind,
2964         .fork           = cpuset_fork,
2965         .legacy_cftypes = legacy_files,
2966         .dfl_cftypes    = dfl_files,
2967         .early_init     = true,
2968         .threaded       = true,
2969 };
2970
2971 /**
2972  * cpuset_init - initialize cpusets at system boot
2973  *
2974  * Description: Initialize top_cpuset
2975  **/
2976
2977 int __init cpuset_init(void)
2978 {
2979         BUG_ON(percpu_init_rwsem(&cpuset_rwsem));
2980
2981         BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
2982         BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
2983         BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
2984
2985         cpumask_setall(top_cpuset.cpus_allowed);
2986         nodes_setall(top_cpuset.mems_allowed);
2987         cpumask_setall(top_cpuset.effective_cpus);
2988         nodes_setall(top_cpuset.effective_mems);
2989
2990         fmeter_init(&top_cpuset.fmeter);
2991         set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
2992         top_cpuset.relax_domain_level = -1;
2993
2994         BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
2995
2996         return 0;
2997 }
2998
2999 /*
3000  * If CPU and/or memory hotplug handlers, below, unplug any CPUs
3001  * or memory nodes, we need to walk over the cpuset hierarchy,
3002  * removing that CPU or node from all cpusets.  If this removes the
3003  * last CPU or node from a cpuset, then move the tasks in the empty
3004  * cpuset to its next-highest non-empty parent.
3005  */
3006 static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
3007 {
3008         struct cpuset *parent;
3009
3010         /*
3011          * Find its next-highest non-empty parent, (top cpuset
3012          * has online cpus, so can't be empty).
3013          */
3014         parent = parent_cs(cs);
3015         while (cpumask_empty(parent->cpus_allowed) ||
3016                         nodes_empty(parent->mems_allowed))
3017                 parent = parent_cs(parent);
3018
3019         if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
3020                 pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
3021                 pr_cont_cgroup_name(cs->css.cgroup);
3022                 pr_cont("\n");
3023         }
3024 }
3025
3026 static void
3027 hotplug_update_tasks_legacy(struct cpuset *cs,
3028                             struct cpumask *new_cpus, nodemask_t *new_mems,
3029                             bool cpus_updated, bool mems_updated)
3030 {
3031         bool is_empty;
3032
3033         spin_lock_irq(&callback_lock);
3034         cpumask_copy(cs->cpus_allowed, new_cpus);
3035         cpumask_copy(cs->effective_cpus, new_cpus);
3036         cs->mems_allowed = *new_mems;
3037         cs->effective_mems = *new_mems;
3038         spin_unlock_irq(&callback_lock);
3039
3040         /*
3041          * Don't call update_tasks_cpumask() if the cpuset becomes empty,
3042          * as the tasks will be migratecd to an ancestor.
3043          */
3044         if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
3045                 update_tasks_cpumask(cs);
3046         if (mems_updated && !nodes_empty(cs->mems_allowed))
3047                 update_tasks_nodemask(cs);
3048
3049         is_empty = cpumask_empty(cs->cpus_allowed) ||
3050                    nodes_empty(cs->mems_allowed);
3051
3052         percpu_up_write(&cpuset_rwsem);
3053
3054         /*
3055          * Move tasks to the nearest ancestor with execution resources,
3056          * This is full cgroup operation which will also call back into
3057          * cpuset. Should be done outside any lock.
3058          */
3059         if (is_empty)
3060                 remove_tasks_in_empty_cpuset(cs);
3061
3062         percpu_down_write(&cpuset_rwsem);
3063 }
3064
3065 static void
3066 hotplug_update_tasks(struct cpuset *cs,
3067                      struct cpumask *new_cpus, nodemask_t *new_mems,
3068                      bool cpus_updated, bool mems_updated)
3069 {
3070         if (cpumask_empty(new_cpus))
3071                 cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
3072         if (nodes_empty(*new_mems))
3073                 *new_mems = parent_cs(cs)->effective_mems;
3074
3075         spin_lock_irq(&callback_lock);
3076         cpumask_copy(cs->effective_cpus, new_cpus);
3077         cs->effective_mems = *new_mems;
3078         spin_unlock_irq(&callback_lock);
3079
3080         if (cpus_updated)
3081                 update_tasks_cpumask(cs);
3082         if (mems_updated)
3083                 update_tasks_nodemask(cs);
3084 }
3085
3086 static bool force_rebuild;
3087
3088 void cpuset_force_rebuild(void)
3089 {
3090         force_rebuild = true;
3091 }
3092
3093 /**
3094  * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
3095  * @cs: cpuset in interest
3096  * @tmp: the tmpmasks structure pointer
3097  *
3098  * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
3099  * offline, update @cs accordingly.  If @cs ends up with no CPU or memory,
3100  * all its tasks are moved to the nearest ancestor with both resources.
3101  */
3102 static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
3103 {
3104         static cpumask_t new_cpus;
3105         static nodemask_t new_mems;
3106         bool cpus_updated;
3107         bool mems_updated;
3108         struct cpuset *parent;
3109 retry:
3110         wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
3111
3112         percpu_down_write(&cpuset_rwsem);
3113
3114         /*
3115          * We have raced with task attaching. We wait until attaching
3116          * is finished, so we won't attach a task to an empty cpuset.
3117          */
3118         if (cs->attach_in_progress) {
3119                 percpu_up_write(&cpuset_rwsem);
3120                 goto retry;
3121         }
3122
3123         parent = parent_cs(cs);
3124         compute_effective_cpumask(&new_cpus, cs, parent);
3125         nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
3126
3127         if (cs->nr_subparts_cpus)
3128                 /*
3129                  * Make sure that CPUs allocated to child partitions
3130                  * do not show up in effective_cpus.
3131                  */
3132                 cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus);
3133
3134         if (!tmp || !cs->partition_root_state)
3135                 goto update_tasks;
3136
3137         /*
3138          * In the unlikely event that a partition root has empty
3139          * effective_cpus or its parent becomes erroneous, we have to
3140          * transition it to the erroneous state.
3141          */
3142         if (is_partition_root(cs) && (cpumask_empty(&new_cpus) ||
3143            (parent->partition_root_state == PRS_ERROR))) {
3144                 if (cs->nr_subparts_cpus) {
3145                         spin_lock_irq(&callback_lock);
3146                         cs->nr_subparts_cpus = 0;
3147                         cpumask_clear(cs->subparts_cpus);
3148                         spin_unlock_irq(&callback_lock);
3149                         compute_effective_cpumask(&new_cpus, cs, parent);
3150                 }
3151
3152                 /*
3153                  * If the effective_cpus is empty because the child
3154                  * partitions take away all the CPUs, we can keep
3155                  * the current partition and let the child partitions
3156                  * fight for available CPUs.
3157                  */
3158                 if ((parent->partition_root_state == PRS_ERROR) ||
3159                      cpumask_empty(&new_cpus)) {
3160                         int old_prs;
3161
3162                         update_parent_subparts_cpumask(cs, partcmd_disable,
3163                                                        NULL, tmp);
3164                         old_prs = cs->partition_root_state;
3165                         if (old_prs != PRS_ERROR) {
3166                                 spin_lock_irq(&callback_lock);
3167                                 cs->partition_root_state = PRS_ERROR;
3168                                 spin_unlock_irq(&callback_lock);
3169                                 notify_partition_change(cs, old_prs, PRS_ERROR);
3170                         }
3171                 }
3172                 cpuset_force_rebuild();
3173         }
3174
3175         /*
3176          * On the other hand, an erroneous partition root may be transitioned
3177          * back to a regular one or a partition root with no CPU allocated
3178          * from the parent may change to erroneous.
3179          */
3180         if (is_partition_root(parent) &&
3181            ((cs->partition_root_state == PRS_ERROR) ||
3182             !cpumask_intersects(&new_cpus, parent->subparts_cpus)) &&
3183              update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
3184                 cpuset_force_rebuild();
3185
3186 update_tasks:
3187         cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
3188         mems_updated = !nodes_equal(new_mems, cs->effective_mems);
3189
3190         if (is_in_v2_mode())
3191                 hotplug_update_tasks(cs, &new_cpus, &new_mems,
3192                                      cpus_updated, mems_updated);
3193         else
3194                 hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
3195                                             cpus_updated, mems_updated);
3196
3197         percpu_up_write(&cpuset_rwsem);
3198 }
3199
3200 /**
3201  * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
3202  *
3203  * This function is called after either CPU or memory configuration has
3204  * changed and updates cpuset accordingly.  The top_cpuset is always
3205  * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
3206  * order to make cpusets transparent (of no affect) on systems that are
3207  * actively using CPU hotplug but making no active use of cpusets.
3208  *
3209  * Non-root cpusets are only affected by offlining.  If any CPUs or memory
3210  * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
3211  * all descendants.
3212  *
3213  * Note that CPU offlining during suspend is ignored.  We don't modify
3214  * cpusets across suspend/resume cycles at all.
3215  */
3216 static void cpuset_hotplug_workfn(struct work_struct *work)
3217 {
3218         static cpumask_t new_cpus;
3219         static nodemask_t new_mems;
3220         bool cpus_updated, mems_updated;
3221         bool on_dfl = is_in_v2_mode();
3222         struct tmpmasks tmp, *ptmp = NULL;
3223
3224         if (on_dfl && !alloc_cpumasks(NULL, &tmp))
3225                 ptmp = &tmp;
3226
3227         percpu_down_write(&cpuset_rwsem);
3228
3229         /* fetch the available cpus/mems and find out which changed how */
3230         cpumask_copy(&new_cpus, cpu_active_mask);
3231         new_mems = node_states[N_MEMORY];
3232
3233         /*
3234          * If subparts_cpus is populated, it is likely that the check below
3235          * will produce a false positive on cpus_updated when the cpu list
3236          * isn't changed. It is extra work, but it is better to be safe.
3237          */
3238         cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
3239         mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
3240
3241         /*
3242          * In the rare case that hotplug removes all the cpus in subparts_cpus,
3243          * we assumed that cpus are updated.
3244          */
3245         if (!cpus_updated && top_cpuset.nr_subparts_cpus)
3246                 cpus_updated = true;
3247
3248         /* synchronize cpus_allowed to cpu_active_mask */
3249         if (cpus_updated) {
3250                 spin_lock_irq(&callback_lock);
3251                 if (!on_dfl)
3252                         cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
3253                 /*
3254                  * Make sure that CPUs allocated to child partitions
3255                  * do not show up in effective_cpus. If no CPU is left,
3256                  * we clear the subparts_cpus & let the child partitions
3257                  * fight for the CPUs again.
3258                  */
3259                 if (top_cpuset.nr_subparts_cpus) {
3260                         if (cpumask_subset(&new_cpus,
3261                                            top_cpuset.subparts_cpus)) {
3262                                 top_cpuset.nr_subparts_cpus = 0;
3263                                 cpumask_clear(top_cpuset.subparts_cpus);
3264                         } else {
3265                                 cpumask_andnot(&new_cpus, &new_cpus,
3266                                                top_cpuset.subparts_cpus);
3267                         }
3268                 }
3269                 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
3270                 spin_unlock_irq(&callback_lock);
3271                 /* we don't mess with cpumasks of tasks in top_cpuset */
3272         }
3273
3274         /* synchronize mems_allowed to N_MEMORY */
3275         if (mems_updated) {
3276                 spin_lock_irq(&callback_lock);
3277                 if (!on_dfl)
3278                         top_cpuset.mems_allowed = new_mems;
3279                 top_cpuset.effective_mems = new_mems;
3280                 spin_unlock_irq(&callback_lock);
3281                 update_tasks_nodemask(&top_cpuset);
3282         }
3283
3284         percpu_up_write(&cpuset_rwsem);
3285
3286         /* if cpus or mems changed, we need to propagate to descendants */
3287         if (cpus_updated || mems_updated) {
3288                 struct cpuset *cs;
3289                 struct cgroup_subsys_state *pos_css;
3290
3291                 rcu_read_lock();
3292                 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
3293                         if (cs == &top_cpuset || !css_tryget_online(&cs->css))
3294                                 continue;
3295                         rcu_read_unlock();
3296
3297                         cpuset_hotplug_update_tasks(cs, ptmp);
3298
3299                         rcu_read_lock();
3300                         css_put(&cs->css);
3301                 }
3302                 rcu_read_unlock();
3303         }
3304
3305         /* rebuild sched domains if cpus_allowed has changed */
3306         if (cpus_updated || force_rebuild) {
3307                 force_rebuild = false;
3308                 rebuild_sched_domains();
3309         }
3310
3311         free_cpumasks(NULL, ptmp);
3312 }
3313
3314 void cpuset_update_active_cpus(void)
3315 {
3316         /*
3317          * We're inside cpu hotplug critical region which usually nests
3318          * inside cgroup synchronization.  Bounce actual hotplug processing
3319          * to a work item to avoid reverse locking order.
3320          */
3321         schedule_work(&cpuset_hotplug_work);
3322 }
3323
3324 void cpuset_wait_for_hotplug(void)
3325 {
3326         flush_work(&cpuset_hotplug_work);
3327 }
3328
3329 /*
3330  * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
3331  * Call this routine anytime after node_states[N_MEMORY] changes.
3332  * See cpuset_update_active_cpus() for CPU hotplug handling.
3333  */
3334 static int cpuset_track_online_nodes(struct notifier_block *self,
3335                                 unsigned long action, void *arg)
3336 {
3337         schedule_work(&cpuset_hotplug_work);
3338         return NOTIFY_OK;
3339 }
3340
3341 static struct notifier_block cpuset_track_online_nodes_nb = {
3342         .notifier_call = cpuset_track_online_nodes,
3343         .priority = 10,         /* ??! */
3344 };
3345
3346 /**
3347  * cpuset_init_smp - initialize cpus_allowed
3348  *
3349  * Description: Finish top cpuset after cpu, node maps are initialized
3350  */
3351 void __init cpuset_init_smp(void)
3352 {
3353         /*
3354          * cpus_allowd/mems_allowed set to v2 values in the initial
3355          * cpuset_bind() call will be reset to v1 values in another
3356          * cpuset_bind() call when v1 cpuset is mounted.
3357          */
3358         top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
3359
3360         cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
3361         top_cpuset.effective_mems = node_states[N_MEMORY];
3362
3363         register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
3364
3365         cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
3366         BUG_ON(!cpuset_migrate_mm_wq);
3367 }
3368
3369 /**
3370  * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
3371  * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
3372  * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
3373  *
3374  * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
3375  * attached to the specified @tsk.  Guaranteed to return some non-empty
3376  * subset of cpu_online_mask, even if this means going outside the
3377  * tasks cpuset.
3378  **/
3379
3380 void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
3381 {
3382         unsigned long flags;
3383
3384         spin_lock_irqsave(&callback_lock, flags);
3385         guarantee_online_cpus(tsk, pmask);
3386         spin_unlock_irqrestore(&callback_lock, flags);
3387 }
3388
3389 /**
3390  * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
3391  * @tsk: pointer to task_struct with which the scheduler is struggling
3392  *
3393  * Description: In the case that the scheduler cannot find an allowed cpu in
3394  * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy
3395  * mode however, this value is the same as task_cs(tsk)->effective_cpus,
3396  * which will not contain a sane cpumask during cases such as cpu hotplugging.
3397  * This is the absolute last resort for the scheduler and it is only used if
3398  * _every_ other avenue has been traveled.
3399  *
3400  * Returns true if the affinity of @tsk was changed, false otherwise.
3401  **/
3402
3403 bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
3404 {
3405         const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
3406         const struct cpumask *cs_mask;
3407         bool changed = false;
3408
3409         rcu_read_lock();
3410         cs_mask = task_cs(tsk)->cpus_allowed;
3411         if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) {
3412                 do_set_cpus_allowed(tsk, cs_mask);
3413                 changed = true;
3414         }
3415         rcu_read_unlock();
3416
3417         /*
3418          * We own tsk->cpus_allowed, nobody can change it under us.
3419          *
3420          * But we used cs && cs->cpus_allowed lockless and thus can
3421          * race with cgroup_attach_task() or update_cpumask() and get
3422          * the wrong tsk->cpus_allowed. However, both cases imply the
3423          * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
3424          * which takes task_rq_lock().
3425          *
3426          * If we are called after it dropped the lock we must see all
3427          * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
3428          * set any mask even if it is not right from task_cs() pov,
3429          * the pending set_cpus_allowed_ptr() will fix things.
3430          *
3431          * select_fallback_rq() will fix things ups and set cpu_possible_mask
3432          * if required.
3433          */
3434         return changed;
3435 }
3436
3437 void __init cpuset_init_current_mems_allowed(void)
3438 {
3439         nodes_setall(current->mems_allowed);
3440 }
3441
3442 /**
3443  * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
3444  * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
3445  *
3446  * Description: Returns the nodemask_t mems_allowed of the cpuset
3447  * attached to the specified @tsk.  Guaranteed to return some non-empty
3448  * subset of node_states[N_MEMORY], even if this means going outside the
3449  * tasks cpuset.
3450  **/
3451
3452 nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
3453 {
3454         nodemask_t mask;
3455         unsigned long flags;
3456
3457         spin_lock_irqsave(&callback_lock, flags);
3458         rcu_read_lock();
3459         guarantee_online_mems(task_cs(tsk), &mask);
3460         rcu_read_unlock();
3461         spin_unlock_irqrestore(&callback_lock, flags);
3462
3463         return mask;
3464 }
3465
3466 /**
3467  * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
3468  * @nodemask: the nodemask to be checked
3469  *
3470  * Are any of the nodes in the nodemask allowed in current->mems_allowed?
3471  */
3472 int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
3473 {
3474         return nodes_intersects(*nodemask, current->mems_allowed);
3475 }
3476
3477 /*
3478  * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
3479  * mem_hardwall ancestor to the specified cpuset.  Call holding
3480  * callback_lock.  If no ancestor is mem_exclusive or mem_hardwall
3481  * (an unusual configuration), then returns the root cpuset.
3482  */
3483 static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
3484 {
3485         while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
3486                 cs = parent_cs(cs);
3487         return cs;
3488 }
3489
3490 /**
3491  * cpuset_node_allowed - Can we allocate on a memory node?
3492  * @node: is this an allowed node?
3493  * @gfp_mask: memory allocation flags
3494  *
3495  * If we're in interrupt, yes, we can always allocate.  If @node is set in
3496  * current's mems_allowed, yes.  If it's not a __GFP_HARDWALL request and this
3497  * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
3498  * yes.  If current has access to memory reserves as an oom victim, yes.
3499  * Otherwise, no.
3500  *
3501  * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
3502  * and do not allow allocations outside the current tasks cpuset
3503  * unless the task has been OOM killed.
3504  * GFP_KERNEL allocations are not so marked, so can escape to the
3505  * nearest enclosing hardwalled ancestor cpuset.
3506  *
3507  * Scanning up parent cpusets requires callback_lock.  The
3508  * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
3509  * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
3510  * current tasks mems_allowed came up empty on the first pass over
3511  * the zonelist.  So only GFP_KERNEL allocations, if all nodes in the
3512  * cpuset are short of memory, might require taking the callback_lock.
3513  *
3514  * The first call here from mm/page_alloc:get_page_from_freelist()
3515  * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
3516  * so no allocation on a node outside the cpuset is allowed (unless
3517  * in interrupt, of course).
3518  *
3519  * The second pass through get_page_from_freelist() doesn't even call
3520  * here for GFP_ATOMIC calls.  For those calls, the __alloc_pages()
3521  * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
3522  * in alloc_flags.  That logic and the checks below have the combined
3523  * affect that:
3524  *      in_interrupt - any node ok (current task context irrelevant)
3525  *      GFP_ATOMIC   - any node ok
3526  *      tsk_is_oom_victim   - any node ok
3527  *      GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
3528  *      GFP_USER     - only nodes in current tasks mems allowed ok.
3529  */
3530 bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
3531 {
3532         struct cpuset *cs;              /* current cpuset ancestors */
3533         int allowed;                    /* is allocation in zone z allowed? */
3534         unsigned long flags;
3535
3536         if (in_interrupt())
3537                 return true;
3538         if (node_isset(node, current->mems_allowed))
3539                 return true;
3540         /*
3541          * Allow tasks that have access to memory reserves because they have
3542          * been OOM killed to get memory anywhere.
3543          */
3544         if (unlikely(tsk_is_oom_victim(current)))
3545                 return true;
3546         if (gfp_mask & __GFP_HARDWALL)  /* If hardwall request, stop here */
3547                 return false;
3548
3549         if (current->flags & PF_EXITING) /* Let dying task have memory */
3550                 return true;
3551
3552         /* Not hardwall and node outside mems_allowed: scan up cpusets */
3553         spin_lock_irqsave(&callback_lock, flags);
3554
3555         rcu_read_lock();
3556         cs = nearest_hardwall_ancestor(task_cs(current));
3557         allowed = node_isset(node, cs->mems_allowed);
3558         rcu_read_unlock();
3559
3560         spin_unlock_irqrestore(&callback_lock, flags);
3561         return allowed;
3562 }
3563
3564 /**
3565  * cpuset_mem_spread_node() - On which node to begin search for a file page
3566  * cpuset_slab_spread_node() - On which node to begin search for a slab page
3567  *
3568  * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
3569  * tasks in a cpuset with is_spread_page or is_spread_slab set),
3570  * and if the memory allocation used cpuset_mem_spread_node()
3571  * to determine on which node to start looking, as it will for
3572  * certain page cache or slab cache pages such as used for file
3573  * system buffers and inode caches, then instead of starting on the
3574  * local node to look for a free page, rather spread the starting
3575  * node around the tasks mems_allowed nodes.
3576  *
3577  * We don't have to worry about the returned node being offline
3578  * because "it can't happen", and even if it did, it would be ok.
3579  *
3580  * The routines calling guarantee_online_mems() are careful to
3581  * only set nodes in task->mems_allowed that are online.  So it
3582  * should not be possible for the following code to return an
3583  * offline node.  But if it did, that would be ok, as this routine
3584  * is not returning the node where the allocation must be, only
3585  * the node where the search should start.  The zonelist passed to
3586  * __alloc_pages() will include all nodes.  If the slab allocator
3587  * is passed an offline node, it will fall back to the local node.
3588  * See kmem_cache_alloc_node().
3589  */
3590
3591 static int cpuset_spread_node(int *rotor)
3592 {
3593         return *rotor = next_node_in(*rotor, current->mems_allowed);
3594 }
3595
3596 int cpuset_mem_spread_node(void)
3597 {
3598         if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
3599                 current->cpuset_mem_spread_rotor =
3600                         node_random(&current->mems_allowed);
3601
3602         return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
3603 }
3604
3605 int cpuset_slab_spread_node(void)
3606 {
3607         if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
3608                 current->cpuset_slab_spread_rotor =
3609                         node_random(&current->mems_allowed);
3610
3611         return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
3612 }
3613
3614 EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
3615
3616 /**
3617  * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
3618  * @tsk1: pointer to task_struct of some task.
3619  * @tsk2: pointer to task_struct of some other task.
3620  *
3621  * Description: Return true if @tsk1's mems_allowed intersects the
3622  * mems_allowed of @tsk2.  Used by the OOM killer to determine if
3623  * one of the task's memory usage might impact the memory available
3624  * to the other.
3625  **/
3626
3627 int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
3628                                    const struct task_struct *tsk2)
3629 {
3630         return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
3631 }
3632
3633 /**
3634  * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
3635  *
3636  * Description: Prints current's name, cpuset name, and cached copy of its
3637  * mems_allowed to the kernel log.
3638  */
3639 void cpuset_print_current_mems_allowed(void)
3640 {
3641         struct cgroup *cgrp;
3642
3643         rcu_read_lock();
3644
3645         cgrp = task_cs(current)->css.cgroup;
3646         pr_cont(",cpuset=");
3647         pr_cont_cgroup_name(cgrp);
3648         pr_cont(",mems_allowed=%*pbl",
3649                 nodemask_pr_args(&current->mems_allowed));
3650
3651         rcu_read_unlock();
3652 }
3653
3654 /*
3655  * Collection of memory_pressure is suppressed unless
3656  * this flag is enabled by writing "1" to the special
3657  * cpuset file 'memory_pressure_enabled' in the root cpuset.
3658  */
3659
3660 int cpuset_memory_pressure_enabled __read_mostly;
3661
3662 /**
3663  * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
3664  *
3665  * Keep a running average of the rate of synchronous (direct)
3666  * page reclaim efforts initiated by tasks in each cpuset.
3667  *
3668  * This represents the rate at which some task in the cpuset
3669  * ran low on memory on all nodes it was allowed to use, and
3670  * had to enter the kernels page reclaim code in an effort to
3671  * create more free memory by tossing clean pages or swapping
3672  * or writing dirty pages.
3673  *
3674  * Display to user space in the per-cpuset read-only file
3675  * "memory_pressure".  Value displayed is an integer
3676  * representing the recent rate of entry into the synchronous
3677  * (direct) page reclaim by any task attached to the cpuset.
3678  **/
3679
3680 void __cpuset_memory_pressure_bump(void)
3681 {
3682         rcu_read_lock();
3683         fmeter_markevent(&task_cs(current)->fmeter);
3684         rcu_read_unlock();
3685 }
3686
3687 #ifdef CONFIG_PROC_PID_CPUSET
3688 /*
3689  * proc_cpuset_show()
3690  *  - Print tasks cpuset path into seq_file.
3691  *  - Used for /proc/<pid>/cpuset.
3692  *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
3693  *    doesn't really matter if tsk->cpuset changes after we read it,
3694  *    and we take cpuset_rwsem, keeping cpuset_attach() from changing it
3695  *    anyway.
3696  */
3697 int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
3698                      struct pid *pid, struct task_struct *tsk)
3699 {
3700         char *buf;
3701         struct cgroup_subsys_state *css;
3702         int retval;
3703
3704         retval = -ENOMEM;
3705         buf = kmalloc(PATH_MAX, GFP_KERNEL);
3706         if (!buf)
3707                 goto out;
3708
3709         css = task_get_css(tsk, cpuset_cgrp_id);
3710         retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
3711                                 current->nsproxy->cgroup_ns);
3712         css_put(css);
3713         if (retval >= PATH_MAX)
3714                 retval = -ENAMETOOLONG;
3715         if (retval < 0)
3716                 goto out_free;
3717         seq_puts(m, buf);
3718         seq_putc(m, '\n');
3719         retval = 0;
3720 out_free:
3721         kfree(buf);
3722 out:
3723         return retval;
3724 }
3725 #endif /* CONFIG_PROC_PID_CPUSET */
3726
3727 /* Display task mems_allowed in /proc/<pid>/status file. */
3728 void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
3729 {
3730         seq_printf(m, "Mems_allowed:\t%*pb\n",
3731                    nodemask_pr_args(&task->mems_allowed));
3732         seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
3733                    nodemask_pr_args(&task->mems_allowed));
3734 }