mm/mempolicy.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Simple NUMA memory policy for the Linux kernel.
   4  *
   5  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   6  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case NUMA_NO_NODE here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * preferred many Try a set of nodes first before normal fallback. This is
  35  *                similar to preferred without the special case.
  36  *
  37  * default        Allocate on the local node first, or when on a VMA
  38  *                use the process policy. This is what Linux always did
  39  *                in a NUMA aware kernel and still does by, ahem, default.
  40  *
  41  * The process policy is applied for most non interrupt memory allocations
  42  * in that process' context. Interrupts ignore the policies and always
  43  * try to allocate on the local CPU. The VMA policy is only applied for memory
  44  * allocations for a VMA in the VM.
  45  *
  46  * Currently there are a few corner cases in swapping where the policy
  47  * is not applied, but the majority should be handled. When process policy
  48  * is used it is not remembered over swap outs/swap ins.
  49  *
  50  * Only the highest zone in the zone hierarchy gets policied. Allocations
  51  * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocations don't get policied.
  53  * Same with GFP_DMA allocations.
  54  *
  55  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  56  * all users and remembered even when nobody has memory mapped.
  57  */
  58
  59 /* Notebook:
  60    fix mmap readahead to honour policy and enable policy for any page cache
  61    object
  62    statistics for bigpages
  63    global policy for page cache? currently it uses process policy. Requires
  64    first item above.
  65    handle mremap for shared memory (currently ignored for the policy)
  66    grows down?
  67    make bind policy root only? It can trigger oom much faster and the
  68    kernel is not always grateful with that.
  69 */
  70
  71 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  72
  73 #include <linux/mempolicy.h>
  74 #include <linux/pagewalk.h>
  75 #include <linux/highmem.h>
  76 #include <linux/hugetlb.h>
  77 #include <linux/kernel.h>
  78 #include <linux/sched.h>
  79 #include <linux/sched/mm.h>
  80 #include <linux/sched/numa_balancing.h>
  81 #include <linux/sched/task.h>
  82 #include <linux/nodemask.h>
  83 #include <linux/cpuset.h>
  84 #include <linux/slab.h>
  85 #include <linux/string.h>
  86 #include <linux/export.h>
  87 #include <linux/nsproxy.h>
  88 #include <linux/interrupt.h>
  89 #include <linux/init.h>
  90 #include <linux/compat.h>
  91 #include <linux/ptrace.h>
  92 #include <linux/swap.h>
  93 #include <linux/seq_file.h>
  94 #include <linux/proc_fs.h>
  95 #include <linux/migrate.h>
  96 #include <linux/ksm.h>
  97 #include <linux/rmap.h>
  98 #include <linux/security.h>
  99 #include <linux/syscalls.h>
 100 #include <linux/ctype.h>
 101 #include <linux/mm_inline.h>
 102 #include <linux/mmu_notifier.h>
 103 #include <linux/printk.h>
 104 #include <linux/swapops.h>
 105
 106 #include <asm/tlbflush.h>
 107 #include <asm/tlb.h>
 108 #include <linux/uaccess.h>
 109
 110 #include "internal.h"
 111
/*
 * Internal flags, expressed as shifts of MPOL_MF_INTERNAL.  They travel in
 * the same flags word as the MPOL_MF_* flags and are tested by
 * queue_pages_test_walk() and queue_folio_required() below.
 */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
 115
/* Slab cache that struct mempolicy objects are allocated from and freed to. */
static struct kmem_cache *policy_cache;
/* Second slab cache; its users are not in this part of the file. */
static struct kmem_cache *sn_cache;
 118
/*
 * Highest zone. A specific allocation for a zone below that is not
 * policied.
 */
enum zone_type policy_zone = 0;
 122
/*
 * run-time system-wide default policy => local allocation
 *
 * get_task_policy() returns a pointer to this object when the task has no
 * policy of its own and no usable per-node policy exists.
 */
static struct mempolicy default_policy = {
        .refcnt = ATOMIC_INIT(1), /* never free it */
        .mode = MPOL_LOCAL,
};
 130
/*
 * Per-node policies, indexed by node id in get_task_policy().  An entry
 * whose mode is still zero is treated there as not yet initialised.
 */
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 132
 133 /**
 134  * numa_map_to_online_node - Find closest online node
 135  * @node: Node id to start the search
 136  *
 137  * Lookup the next closest node by distance if @nid is not online.
 138  *
 139  * Return: this @node if it is online, otherwise the closest node by distance
 140  */
 141 int numa_map_to_online_node(int node)
 142 {
 143         int min_dist = INT_MAX, dist, n, min_node;
 144
 145         if (node == NUMA_NO_NODE || node_online(node))
 146                 return node;
 147
 148         min_node = node;
 149         for_each_online_node(n) {
 150                 dist = node_distance(node, n);
 151                 if (dist < min_dist) {
 152                         min_dist = dist;
 153                         min_node = n;
 154                 }
 155         }
 156
 157         return min_node;
 158 }
 159 EXPORT_SYMBOL_GPL(numa_map_to_online_node);
 160
 161 struct mempolicy *get_task_policy(struct task_struct *p)
 162 {
 163         struct mempolicy *pol = p->mempolicy;
 164         int node;
 165
 166         if (pol)
 167                 return pol;
 168
 169         node = numa_node_id();
 170         if (node != NUMA_NO_NODE) {
 171                 pol = &preferred_node_policy[node];
 172                 /* preferred_node_policy is not initialised early in boot */
 173                 if (pol->mode)
 174                         return pol;
 175         }
 176
 177         return &default_policy;
 178 }
 179
/*
 * Per-mode operations, indexed by policy mode.  This is the forward
 * declaration; the table is filled in further down in this file.
 */
static const struct mempolicy_operations {
        /* Called by mpol_set_nodemask() with the computed nodemask; returns 0 or -errno. */
        int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
        /* Called by mpol_rebind_policy() with the new nodemask. */
        void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];
 184
 185 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 186 {
 187         return pol->flags & MPOL_MODE_FLAGS;
 188 }
 189
 190 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 191                                    const nodemask_t *rel)
 192 {
 193         nodemask_t tmp;
 194         nodes_fold(tmp, *orig, nodes_weight(*rel));
 195         nodes_onto(*ret, tmp, *rel);
 196 }
 197
 198 static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 199 {
 200         if (nodes_empty(*nodes))
 201                 return -EINVAL;
 202         pol->nodes = *nodes;
 203         return 0;
 204 }
 205
 206 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 207 {
 208         if (nodes_empty(*nodes))
 209                 return -EINVAL;
 210
 211         nodes_clear(pol->nodes);
 212         node_set(first_node(*nodes), pol->nodes);
 213         return 0;
 214 }
 215
 216 /*
 217  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 218  * any, for the new policy.  mpol_new() has already validated the nodes
 219  * parameter with respect to the policy mode and flags.
 220  *
 221  * Must be called holding task's alloc_lock to protect task's mems_allowed
 222  * and mempolicy.  May also be called holding the mmap_lock for write.
 223  */
 224 static int mpol_set_nodemask(struct mempolicy *pol,
 225                      const nodemask_t *nodes, struct nodemask_scratch *nsc)
 226 {
 227         int ret;
 228
 229         /*
 230          * Default (pol==NULL) resp. local memory policies are not a
 231          * subject of any remapping. They also do not need any special
 232          * constructor.
 233          */
 234         if (!pol || pol->mode == MPOL_LOCAL)
 235                 return 0;
 236
 237         /* Check N_MEMORY */
 238         nodes_and(nsc->mask1,
 239                   cpuset_current_mems_allowed, node_states[N_MEMORY]);
 240
 241         VM_BUG_ON(!nodes);
 242
 243         if (pol->flags & MPOL_F_RELATIVE_NODES)
 244                 mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
 245         else
 246                 nodes_and(nsc->mask2, *nodes, nsc->mask1);
 247
 248         if (mpol_store_user_nodemask(pol))
 249                 pol->w.user_nodemask = *nodes;
 250         else
 251                 pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
 252
 253         ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 254         return ret;
 255 }
 256
 257 /*
 258  * This function just creates a new policy, does some check and simple
 259  * initialization. You must invoke mpol_set_nodemask() to set nodes.
 260  */
 261 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 262                                   nodemask_t *nodes)
 263 {
 264         struct mempolicy *policy;
 265
 266         pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 267                  mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
 268
 269         if (mode == MPOL_DEFAULT) {
 270                 if (nodes && !nodes_empty(*nodes))
 271                         return ERR_PTR(-EINVAL);
 272                 return NULL;
 273         }
 274         VM_BUG_ON(!nodes);
 275
 276         /*
 277          * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 278          * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 279          * All other modes require a valid pointer to a non-empty nodemask.
 280          */
 281         if (mode == MPOL_PREFERRED) {
 282                 if (nodes_empty(*nodes)) {
 283                         if (((flags & MPOL_F_STATIC_NODES) ||
 284                              (flags & MPOL_F_RELATIVE_NODES)))
 285                                 return ERR_PTR(-EINVAL);
 286
 287                         mode = MPOL_LOCAL;
 288                 }
 289         } else if (mode == MPOL_LOCAL) {
 290                 if (!nodes_empty(*nodes) ||
 291                     (flags & MPOL_F_STATIC_NODES) ||
 292                     (flags & MPOL_F_RELATIVE_NODES))
 293                         return ERR_PTR(-EINVAL);
 294         } else if (nodes_empty(*nodes))
 295                 return ERR_PTR(-EINVAL);
 296         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 297         if (!policy)
 298                 return ERR_PTR(-ENOMEM);
 299         atomic_set(&policy->refcnt, 1);
 300         policy->mode = mode;
 301         policy->flags = flags;
 302         policy->home_node = NUMA_NO_NODE;
 303
 304         return policy;
 305 }
 306
 307 /* Slow path of a mpol destructor. */
 308 void __mpol_put(struct mempolicy *p)
 309 {
 310         if (!atomic_dec_and_test(&p->refcnt))
 311                 return;
 312         kmem_cache_free(policy_cache, p);
 313 }
 314
/*
 * ->rebind() handler that deliberately does nothing; the mpol_ops table in
 * this file installs it for MPOL_DEFAULT and MPOL_LOCAL.
 */
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}
 318
 319 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 320 {
 321         nodemask_t tmp;
 322
 323         if (pol->flags & MPOL_F_STATIC_NODES)
 324                 nodes_and(tmp, pol->w.user_nodemask, *nodes);
 325         else if (pol->flags & MPOL_F_RELATIVE_NODES)
 326                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 327         else {
 328                 nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
 329                                                                 *nodes);
 330                 pol->w.cpuset_mems_allowed = *nodes;
 331         }
 332
 333         if (nodes_empty(tmp))
 334                 tmp = *nodes;
 335
 336         pol->nodes = tmp;
 337 }
 338
 339 static void mpol_rebind_preferred(struct mempolicy *pol,
 340                                                 const nodemask_t *nodes)
 341 {
 342         pol->w.cpuset_mems_allowed = *nodes;
 343 }
 344
 345 /*
 346  * mpol_rebind_policy - Migrate a policy to a different set of nodes
 347  *
 348  * Per-vma policies are protected by mmap_lock. Allocations using per-task
 349  * policies are protected by task->mems_allowed_seq to prevent a premature
 350  * OOM/allocation failure due to parallel nodemask modification.
 351  */
 352 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
 353 {
 354         if (!pol || pol->mode == MPOL_LOCAL)
 355                 return;
 356         if (!mpol_store_user_nodemask(pol) &&
 357             nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 358                 return;
 359
 360         mpol_ops[pol->mode].rebind(pol, newmask);
 361 }
 362
 363 /*
 364  * Wrapper for mpol_rebind_policy() that just requires task
 365  * pointer, and updates task mempolicy.
 366  *
 367  * Called with task's alloc_lock held.
 368  */
 369
 370 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 371 {
 372         mpol_rebind_policy(tsk->mempolicy, new);
 373 }
 374
 375 /*
 376  * Rebind each vma in mm to new nodemask.
 377  *
 378  * Call holding a reference to mm.  Takes mm->mmap_lock during call.
 379  */
 380
 381 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 382 {
 383         struct vm_area_struct *vma;
 384         VMA_ITERATOR(vmi, mm, 0);
 385
 386         mmap_write_lock(mm);
 387         for_each_vma(vmi, vma)
 388                 mpol_rebind_policy(vma->vm_policy, new);
 389         mmap_write_unlock(mm);
 390 }
 391
 392 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 393         [MPOL_DEFAULT] = {
 394                 .rebind = mpol_rebind_default,
 395         },
 396         [MPOL_INTERLEAVE] = {
 397                 .create = mpol_new_nodemask,
 398                 .rebind = mpol_rebind_nodemask,
 399         },
 400         [MPOL_PREFERRED] = {
 401                 .create = mpol_new_preferred,
 402                 .rebind = mpol_rebind_preferred,
 403         },
 404         [MPOL_BIND] = {
 405                 .create = mpol_new_nodemask,
 406                 .rebind = mpol_rebind_nodemask,
 407         },
 408         [MPOL_LOCAL] = {
 409                 .rebind = mpol_rebind_default,
 410         },
 411         [MPOL_PREFERRED_MANY] = {
 412                 .create = mpol_new_nodemask,
 413                 .rebind = mpol_rebind_preferred,
 414         },
 415 };
 416
/*
 * Forward declaration; the definition is not in this part of the file.
 * Callers below treat a non-zero return as "folio could not be queued".
 */
static int migrate_folio_add(struct folio *folio, struct list_head *foliolist,
                                unsigned long flags);
 419
/* State shared by the queue_pages_walk_ops callbacks via walk->private. */
struct queue_pages {
        /* list that folios selected for migration are added to */
        struct list_head *pagelist;
        /* MPOL_MF_* flags, including the internal ones defined in this file */
        unsigned long flags;
        /* nodemask a folio's node id is tested against */
        nodemask_t *nmask;
        /* start of the requested range, used for the hole checks */
        unsigned long start;
        /* end of the requested range, used for the hole checks */
        unsigned long end;
        /* first vma seen by queue_pages_test_walk(); NULL until then */
        struct vm_area_struct *first;
};
 428
 429 /*
 430  * Check if the folio's nid is in qp->nmask.
 431  *
 432  * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
 433  * in the invert of qp->nmask.
 434  */
 435 static inline bool queue_folio_required(struct folio *folio,
 436                                         struct queue_pages *qp)
 437 {
 438         int nid = folio_nid(folio);
 439         unsigned long flags = qp->flags;
 440
 441         return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
 442 }
 443
 444 /*
 445  * queue_folios_pmd() has three possible return values:
 446  * 0 - folios are placed on the right node or queued successfully, or
 447  *     special page is met, i.e. huge zero page.
 448  * 1 - there is unmovable folio, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 449  *     specified.
 450  * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
 451  *        existing folio was already on a node that does not follow the
 452  *        policy.
 453  */
 454 static int queue_folios_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
 455                                 unsigned long end, struct mm_walk *walk)
 456         __releases(ptl)
 457 {
 458         int ret = 0;
 459         struct folio *folio;
 460         struct queue_pages *qp = walk->private;
 461         unsigned long flags;
 462
 463         if (unlikely(is_pmd_migration_entry(*pmd))) {
 464                 ret = -EIO;
 465                 goto unlock;
 466         }
 467         folio = pfn_folio(pmd_pfn(*pmd));
 468         if (is_huge_zero_page(&folio->page)) {
 469                 walk->action = ACTION_CONTINUE;
 470                 goto unlock;
 471         }
 472         if (!queue_folio_required(folio, qp))
 473                 goto unlock;
 474
 475         flags = qp->flags;
 476         /* go to folio migration */
 477         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 478                 if (!vma_migratable(walk->vma) ||
 479                     migrate_folio_add(folio, qp->pagelist, flags)) {
 480                         ret = 1;
 481                         goto unlock;
 482                 }
 483         } else
 484                 ret = -EIO;
 485 unlock:
 486         spin_unlock(ptl);
 487         return ret;
 488 }
 489
 490 /*
 491  * Scan through pages checking if pages follow certain conditions,
 492  * and move them to the pagelist if they do.
 493  *
 494  * queue_folios_pte_range() has three possible return values:
 495  * 0 - folios are placed on the right node or queued successfully, or
 496  *     special page is met, i.e. zero page.
 497  * 1 - there is unmovable folio, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 498  *     specified.
 499  * -EIO - only MPOL_MF_STRICT was specified and an existing folio was already
 500  *        on a node that does not follow the policy.
 501  */
 502 static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
 503                         unsigned long end, struct mm_walk *walk)
 504 {
 505         struct vm_area_struct *vma = walk->vma;
 506         struct folio *folio;
 507         struct queue_pages *qp = walk->private;
 508         unsigned long flags = qp->flags;
 509         bool has_unmovable = false;
 510         pte_t *pte, *mapped_pte;
 511         spinlock_t *ptl;
 512
 513         ptl = pmd_trans_huge_lock(pmd, vma);
 514         if (ptl)
 515                 return queue_folios_pmd(pmd, ptl, addr, end, walk);
 516
 517         if (pmd_trans_unstable(pmd))
 518                 return 0;
 519
 520         mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
 521         for (; addr != end; pte++, addr += PAGE_SIZE) {
 522                 if (!pte_present(*pte))
 523                         continue;
 524                 folio = vm_normal_folio(vma, addr, *pte);
 525                 if (!folio || folio_is_zone_device(folio))
 526                         continue;
 527                 /*
 528                  * vm_normal_folio() filters out zero pages, but there might
 529                  * still be reserved folios to skip, perhaps in a VDSO.
 530                  */
 531                 if (folio_test_reserved(folio))
 532                         continue;
 533                 if (!queue_folio_required(folio, qp))
 534                         continue;
 535                 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 536                         /* MPOL_MF_STRICT must be specified if we get here */
 537                         if (!vma_migratable(vma)) {
 538                                 has_unmovable = true;
 539                                 break;
 540                         }
 541
 542                         /*
 543                          * Do not abort immediately since there may be
 544                          * temporary off LRU pages in the range.  Still
 545                          * need migrate other LRU pages.
 546                          */
 547                         if (migrate_folio_add(folio, qp->pagelist, flags))
 548                                 has_unmovable = true;
 549                 } else
 550                         break;
 551         }
 552         pte_unmap_unlock(mapped_pte, ptl);
 553         cond_resched();
 554
 555         if (has_unmovable)
 556                 return 1;
 557
 558         return addr != end ? -EIO : 0;
 559 }
 560
 561 static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
 562                                unsigned long addr, unsigned long end,
 563                                struct mm_walk *walk)
 564 {
 565         int ret = 0;
 566 #ifdef CONFIG_HUGETLB_PAGE
 567         struct queue_pages *qp = walk->private;
 568         unsigned long flags = (qp->flags & MPOL_MF_VALID);
 569         struct folio *folio;
 570         spinlock_t *ptl;
 571         pte_t entry;
 572
 573         ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
 574         entry = huge_ptep_get(pte);
 575         if (!pte_present(entry))
 576                 goto unlock;
 577         folio = pfn_folio(pte_pfn(entry));
 578         if (!queue_folio_required(folio, qp))
 579                 goto unlock;
 580
 581         if (flags == MPOL_MF_STRICT) {
 582                 /*
 583                  * STRICT alone means only detecting misplaced folio and no
 584                  * need to further check other vma.
 585                  */
 586                 ret = -EIO;
 587                 goto unlock;
 588         }
 589
 590         if (!vma_migratable(walk->vma)) {
 591                 /*
 592                  * Must be STRICT with MOVE*, otherwise .test_walk() have
 593                  * stopped walking current vma.
 594                  * Detecting misplaced folio but allow migrating folios which
 595                  * have been queued.
 596                  */
 597                 ret = 1;
 598                 goto unlock;
 599         }
 600
 601         /*
 602          * With MPOL_MF_MOVE, we try to migrate only unshared folios. If it
 603          * is shared it is likely not worth migrating.
 604          *
 605          * To check if the folio is shared, ideally we want to make sure
 606          * every page is mapped to the same process. Doing that is very
 607          * expensive, so check the estimated mapcount of the folio instead.
 608          */
 609         if (flags & (MPOL_MF_MOVE_ALL) ||
 610             (flags & MPOL_MF_MOVE && folio_estimated_sharers(folio) == 1 &&
 611              !hugetlb_pmd_shared(pte))) {
 612                 if (!isolate_hugetlb(folio, qp->pagelist) &&
 613                         (flags & MPOL_MF_STRICT))
 614                         /*
 615                          * Failed to isolate folio but allow migrating pages
 616                          * which have been queued.
 617                          */
 618                         ret = 1;
 619         }
 620 unlock:
 621         spin_unlock(ptl);
 622 #else
 623         BUG();
 624 #endif
 625         return ret;
 626 }
 627
 628 #ifdef CONFIG_NUMA_BALANCING
 629 /*
 630  * This is used to mark a range of virtual addresses to be inaccessible.
 631  * These are later cleared by a NUMA hinting fault. Depending on these
 632  * faults, pages may be migrated for better NUMA placement.
 633  *
 634  * This is assuming that NUMA faults are handled using PROT_NONE. If
 635  * an architecture makes a different choice, it will need further
 636  * changes to the core.
 637  */
 638 unsigned long change_prot_numa(struct vm_area_struct *vma,
 639                         unsigned long addr, unsigned long end)
 640 {
 641         struct mmu_gather tlb;
 642         long nr_updated;
 643
 644         tlb_gather_mmu(&tlb, vma->vm_mm);
 645
 646         nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
 647         if (nr_updated > 0)
 648                 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 649
 650         tlb_finish_mmu(&tlb);
 651
 652         return nr_updated;
 653 }
 654 #else
 655 static unsigned long change_prot_numa(struct vm_area_struct *vma,
 656                         unsigned long addr, unsigned long end)
 657 {
 658         return 0;
 659 }
 660 #endif /* CONFIG_NUMA_BALANCING */
 661
 662 static int queue_pages_test_walk(unsigned long start, unsigned long end,
 663                                 struct mm_walk *walk)
 664 {
 665         struct vm_area_struct *next, *vma = walk->vma;
 666         struct queue_pages *qp = walk->private;
 667         unsigned long endvma = vma->vm_end;
 668         unsigned long flags = qp->flags;
 669
 670         /* range check first */
 671         VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
 672
 673         if (!qp->first) {
 674                 qp->first = vma;
 675                 if (!(flags & MPOL_MF_DISCONTIG_OK) &&
 676                         (qp->start < vma->vm_start))
 677                         /* hole at head side of range */
 678                         return -EFAULT;
 679         }
 680         next = find_vma(vma->vm_mm, vma->vm_end);
 681         if (!(flags & MPOL_MF_DISCONTIG_OK) &&
 682                 ((vma->vm_end < qp->end) &&
 683                 (!next || vma->vm_end < next->vm_start)))
 684                 /* hole at middle or tail of range */
 685                 return -EFAULT;
 686
 687         /*
 688          * Need check MPOL_MF_STRICT to return -EIO if possible
 689          * regardless of vma_migratable
 690          */
 691         if (!vma_migratable(vma) &&
 692             !(flags & MPOL_MF_STRICT))
 693                 return 1;
 694
 695         if (endvma > end)
 696                 endvma = end;
 697
 698         if (flags & MPOL_MF_LAZY) {
 699                 /* Similar to task_numa_work, skip inaccessible VMAs */
 700                 if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
 701                         !(vma->vm_flags & VM_MIXEDMAP))
 702                         change_prot_numa(vma, start, endvma);
 703                 return 1;
 704         }
 705
 706         /* queue pages from current vma */
 707         if (flags & MPOL_MF_VALID)
 708                 return 0;
 709         return 1;
 710 }
 711
 712 static const struct mm_walk_ops queue_pages_walk_ops = {
 713         .hugetlb_entry          = queue_folios_hugetlb,
 714         .pmd_entry              = queue_folios_pte_range,
 715         .test_walk              = queue_pages_test_walk,
 716 };
 717
 718 /*
 719  * Walk through page tables and collect pages to be migrated.
 720  *
 721  * If pages found in a given range are on a set of nodes (determined by
 722  * @nodes and @flags,) it's isolated and queued to the pagelist which is
 723  * passed via @private.
 724  *
 725  * queue_pages_range() has three possible return values:
 726  * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
 727  *     specified.
 728  * 0 - queue pages successfully or no misplaced page.
 729  * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
 730  *         memory range specified by nodemask and maxnode points outside
 731  *         your accessible address space (-EFAULT)
 732  */
 733 static int
 734 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 735                 nodemask_t *nodes, unsigned long flags,
 736                 struct list_head *pagelist)
 737 {
 738         int err;
 739         struct queue_pages qp = {
 740                 .pagelist = pagelist,
 741                 .flags = flags,
 742                 .nmask = nodes,
 743                 .start = start,
 744                 .end = end,
 745                 .first = NULL,
 746         };
 747
 748         err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
 749
 750         if (!qp.first)
 751                 /* whole range in hole */
 752                 err = -EFAULT;
 753
 754         return err;
 755 }
 756
 757 /*
 758  * Apply policy to a single VMA
 759  * This must be called with the mmap_lock held for writing.
 760  */
 761 static int vma_replace_policy(struct vm_area_struct *vma,
 762                                                 struct mempolicy *pol)
 763 {
 764         int err;
 765         struct mempolicy *old;
 766         struct mempolicy *new;
 767
 768         pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 769                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 770                  vma->vm_ops, vma->vm_file,
 771                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 772
 773         new = mpol_dup(pol);
 774         if (IS_ERR(new))
 775                 return PTR_ERR(new);
 776
 777         if (vma->vm_ops && vma->vm_ops->set_policy) {
 778                 err = vma->vm_ops->set_policy(vma, new);
 779                 if (err)
 780                         goto err_out;
 781         }
 782
 783         old = vma->vm_policy;
 784         vma->vm_policy = new; /* protected by mmap_lock */
 785         mpol_put(old);
 786
 787         return 0;
 788  err_out:
 789         mpol_put(new);
 790         return err;
 791 }
 792
 793 /* Split or merge the VMA (if required) and apply the new policy */
 794 static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
 795                 struct vm_area_struct **prev, unsigned long start,
 796                 unsigned long end, struct mempolicy *new_pol)
 797 {
 798         struct vm_area_struct *merged;
 799         unsigned long vmstart, vmend;
 800         pgoff_t pgoff;
 801         int err;
 802
 803         vmend = min(end, vma->vm_end);
 804         if (start > vma->vm_start) {
 805                 *prev = vma;
 806                 vmstart = start;
 807         } else {
 808                 vmstart = vma->vm_start;
 809         }
 810
 811         if (mpol_equal(vma_policy(vma), new_pol))
 812                 return 0;
 813
 814         pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT);
 815         merged = vma_merge(vmi, vma->vm_mm, *prev, vmstart, vmend, vma->vm_flags,
 816                          vma->anon_vma, vma->vm_file, pgoff, new_pol,
 817                          vma->vm_userfaultfd_ctx, anon_vma_name(vma));
 818         if (merged) {
 819                 *prev = merged;
 820                 return vma_replace_policy(merged, new_pol);
 821         }
 822
 823         if (vma->vm_start != vmstart) {
 824                 err = split_vma(vmi, vma, vmstart, 1);
 825                 if (err)
 826                         return err;
 827         }
 828
 829         if (vma->vm_end != vmend) {
 830                 err = split_vma(vmi, vma, vmend, 0);
 831                 if (err)
 832                         return err;
 833         }
 834
 835         *prev = vma;
 836         return vma_replace_policy(vma, new_pol);
 837 }
 838
 839 /* Set the process memory policy */
 840 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 841                              nodemask_t *nodes)
 842 {
 843         struct mempolicy *new, *old;
 844         NODEMASK_SCRATCH(scratch);
 845         int ret;
 846
 847         if (!scratch)
 848                 return -ENOMEM;
 849
 850         new = mpol_new(mode, flags, nodes);
 851         if (IS_ERR(new)) {
 852                 ret = PTR_ERR(new);
 853                 goto out;
 854         }
 855
 856         task_lock(current);
 857         ret = mpol_set_nodemask(new, nodes, scratch);
 858         if (ret) {
 859                 task_unlock(current);
 860                 mpol_put(new);
 861                 goto out;
 862         }
 863
 864         old = current->mempolicy;
 865         current->mempolicy = new;
 866         if (new && new->mode == MPOL_INTERLEAVE)
 867                 current->il_prev = MAX_NUMNODES-1;
 868         task_unlock(current);
 869         mpol_put(old);
 870         ret = 0;
 871 out:
 872         NODEMASK_SCRATCH_FREE(scratch);
 873         return ret;
 874 }
 875
 876 /*
 877  * Return nodemask for policy for get_mempolicy() query
 878  *
 879  * Called with task's alloc_lock held
 880  */
 881 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 882 {
 883         nodes_clear(*nodes);
 884         if (p == &default_policy)
 885                 return;
 886
 887         switch (p->mode) {
 888         case MPOL_BIND:
 889         case MPOL_INTERLEAVE:
 890         case MPOL_PREFERRED:
 891         case MPOL_PREFERRED_MANY:
 892                 *nodes = p->nodes;
 893                 break;
 894         case MPOL_LOCAL:
 895                 /* return empty node mask for local allocation */
 896                 break;
 897         default:
 898                 BUG();
 899         }
 900 }
 901
 902 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 903 {
 904         struct page *p = NULL;
 905         int ret;
 906
 907         ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
 908         if (ret > 0) {
 909                 ret = page_to_nid(p);
 910                 put_page(p);
 911         }
 912         return ret;
 913 }
 914
 915 /* Retrieve NUMA policy */
 916 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 917                              unsigned long addr, unsigned long flags)
 918 {
 919         int err;
 920         struct mm_struct *mm = current->mm;
 921         struct vm_area_struct *vma = NULL;
 922         struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
 923
 924         if (flags &
 925                 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 926                 return -EINVAL;
 927
 928         if (flags & MPOL_F_MEMS_ALLOWED) {
 929                 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 930                         return -EINVAL;
 931                 *policy = 0;    /* just so it's initialized */
 932                 task_lock(current);
 933                 *nmask  = cpuset_current_mems_allowed;
 934                 task_unlock(current);
 935                 return 0;
 936         }
 937
 938         if (flags & MPOL_F_ADDR) {
 939                 /*
 940                  * Do NOT fall back to task policy if the
 941                  * vma/shared policy at addr is NULL.  We
 942                  * want to return MPOL_DEFAULT in this case.
 943                  */
 944                 mmap_read_lock(mm);
 945                 vma = vma_lookup(mm, addr);
 946                 if (!vma) {
 947                         mmap_read_unlock(mm);
 948                         return -EFAULT;
 949                 }
 950                 if (vma->vm_ops && vma->vm_ops->get_policy)
 951                         pol = vma->vm_ops->get_policy(vma, addr);
 952                 else
 953                         pol = vma->vm_policy;
 954         } else if (addr)
 955                 return -EINVAL;
 956
 957         if (!pol)
 958                 pol = &default_policy;  /* indicates default behavior */
 959
 960         if (flags & MPOL_F_NODE) {
 961                 if (flags & MPOL_F_ADDR) {
 962                         /*
 963                          * Take a refcount on the mpol, because we are about to
 964                          * drop the mmap_lock, after which only "pol" remains
 965                          * valid, "vma" is stale.
 966                          */
 967                         pol_refcount = pol;
 968                         vma = NULL;
 969                         mpol_get(pol);
 970                         mmap_read_unlock(mm);
 971                         err = lookup_node(mm, addr);
 972                         if (err < 0)
 973                                 goto out;
 974                         *policy = err;
 975                 } else if (pol == current->mempolicy &&
 976                                 pol->mode == MPOL_INTERLEAVE) {
 977                         *policy = next_node_in(current->il_prev, pol->nodes);
 978                 } else {
 979                         err = -EINVAL;
 980                         goto out;
 981                 }
 982         } else {
 983                 *policy = pol == &default_policy ? MPOL_DEFAULT :
 984                                                 pol->mode;
 985                 /*
 986                  * Internal mempolicy flags must be masked off before exposing
 987                  * the policy to userspace.
 988                  */
 989                 *policy |= (pol->flags & MPOL_MODE_FLAGS);
 990         }
 991
 992         err = 0;
 993         if (nmask) {
 994                 if (mpol_store_user_nodemask(pol)) {
 995                         *nmask = pol->w.user_nodemask;
 996                 } else {
 997                         task_lock(current);
 998                         get_policy_nodemask(pol, nmask);
 999                         task_unlock(current);
1000                 }
1001         }
1002
1003  out:
1004         mpol_cond_put(pol);
1005         if (vma)
1006                 mmap_read_unlock(mm);
1007         if (pol_refcount)
1008                 mpol_put(pol_refcount);
1009         return err;
1010 }
1011
1012 #ifdef CONFIG_MIGRATION
1013 static int migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1014                                 unsigned long flags)
1015 {
1016         /*
1017          * We try to migrate only unshared folios. If it is shared it
1018          * is likely not worth migrating.
1019          *
1020          * To check if the folio is shared, ideally we want to make sure
1021          * every page is mapped to the same process. Doing that is very
1022          * expensive, so check the estimated mapcount of the folio instead.
1023          */
1024         if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) {
1025                 if (folio_isolate_lru(folio)) {
1026                         list_add_tail(&folio->lru, foliolist);
1027                         node_stat_mod_folio(folio,
1028                                 NR_ISOLATED_ANON + folio_is_file_lru(folio),
1029                                 folio_nr_pages(folio));
1030                 } else if (flags & MPOL_MF_STRICT) {
1031                         /*
1032                          * Non-movable folio may reach here.  And, there may be
1033                          * temporary off LRU folios or non-LRU movable folios.
1034                          * Treat them as unmovable folios since they can't be
1035                          * isolated, so they can't be moved at the moment.  It
1036                          * should return -EIO for this case too.
1037                          */
1038                         return -EIO;
1039                 }
1040         }
1041
1042         return 0;
1043 }
1044
1045 /*
1046  * Migrate pages from one node to a target node.
1047  * Returns error or the number of pages not migrated.
1048  */
1049 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1050                            int flags)
1051 {
1052         nodemask_t nmask;
1053         struct vm_area_struct *vma;
1054         LIST_HEAD(pagelist);
1055         int err = 0;
1056         struct migration_target_control mtc = {
1057                 .nid = dest,
1058                 .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1059         };
1060
1061         nodes_clear(nmask);
1062         node_set(source, nmask);
1063
1064         /*
1065          * This does not "check" the range but isolates all pages that
1066          * need migration.  Between passing in the full user address
1067          * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1068          */
1069         vma = find_vma(mm, 0);
1070         VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1071         queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
1072                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1073
1074         if (!list_empty(&pagelist)) {
1075                 err = migrate_pages(&pagelist, alloc_migration_target, NULL,
1076                                 (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
1077                 if (err)
1078                         putback_movable_pages(&pagelist);
1079         }
1080
1081         return err;
1082 }
1083
1084 /*
1085  * Move pages between the two nodesets so as to preserve the physical
1086  * layout as much as possible.
1087  *
1088  * Returns the number of page that could not be moved.
1089  */
1090 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1091                      const nodemask_t *to, int flags)
1092 {
1093         int busy = 0;
1094         int err = 0;
1095         nodemask_t tmp;
1096
1097         lru_cache_disable();
1098
1099         mmap_read_lock(mm);
1100
1101         /*
1102          * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1103          * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1104          * bit in 'tmp', and return that <source, dest> pair for migration.
1105          * The pair of nodemasks 'to' and 'from' define the map.
1106          *
1107          * If no pair of bits is found that way, fallback to picking some
1108          * pair of 'source' and 'dest' bits that are not the same.  If the
1109          * 'source' and 'dest' bits are the same, this represents a node
1110          * that will be migrating to itself, so no pages need move.
1111          *
1112          * If no bits are left in 'tmp', or if all remaining bits left
1113          * in 'tmp' correspond to the same bit in 'to', return false
1114          * (nothing left to migrate).
1115          *
1116          * This lets us pick a pair of nodes to migrate between, such that
1117          * if possible the dest node is not already occupied by some other
1118          * source node, minimizing the risk of overloading the memory on a
1119          * node that would happen if we migrated incoming memory to a node
1120          * before migrating outgoing memory source that same node.
1121          *
1122          * A single scan of tmp is sufficient.  As we go, we remember the
1123          * most recent <s, d> pair that moved (s != d).  If we find a pair
1124          * that not only moved, but what's better, moved to an empty slot
1125          * (d is not set in tmp), then we break out then, with that pair.
1126          * Otherwise when we finish scanning from_tmp, we at least have the
1127          * most recent <s, d> pair that moved.  If we get all the way through
1128          * the scan of tmp without finding any node that moved, much less
1129          * moved to an empty node, then there is nothing left worth migrating.
1130          */
1131
1132         tmp = *from;
1133         while (!nodes_empty(tmp)) {
1134                 int s, d;
1135                 int source = NUMA_NO_NODE;
1136                 int dest = 0;
1137
1138                 for_each_node_mask(s, tmp) {
1139
1140                         /*
1141                          * do_migrate_pages() tries to maintain the relative
1142                          * node relationship of the pages established between
1143                          * threads and memory areas.
1144                          *
1145                          * However if the number of source nodes is not equal to
1146                          * the number of destination nodes we can not preserve
1147                          * this node relative relationship.  In that case, skip
1148                          * copying memory from a node that is in the destination
1149                          * mask.
1150                          *
1151                          * Example: [2,3,4] -> [3,4,5] moves everything.
1152                          *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1153                          */
1154
1155                         if ((nodes_weight(*from) != nodes_weight(*to)) &&
1156                                                 (node_isset(s, *to)))
1157                                 continue;
1158
1159                         d = node_remap(s, *from, *to);
1160                         if (s == d)
1161                                 continue;
1162
1163                         source = s;     /* Node moved. Memorize */
1164                         dest = d;
1165
1166                         /* dest not in remaining from nodes? */
1167                         if (!node_isset(dest, tmp))
1168                                 break;
1169                 }
1170                 if (source == NUMA_NO_NODE)
1171                         break;
1172
1173                 node_clear(source, tmp);
1174                 err = migrate_to_node(mm, source, dest, flags);
1175                 if (err > 0)
1176                         busy += err;
1177                 if (err < 0)
1178                         break;
1179         }
1180         mmap_read_unlock(mm);
1181
1182         lru_cache_enable();
1183         if (err < 0)
1184                 return err;
1185         return busy;
1186
1187 }
1188
1189 /*
1190  * Allocate a new page for page migration based on vma policy.
1191  * Start by assuming the page is mapped by the same vma as contains @start.
1192  * Search forward from there, if not.  N.B., this assumes that the
1193  * list of pages handed to migrate_pages()--which is how we get here--
1194  * is in virtual address order.
1195  */
1196 static struct page *new_page(struct page *page, unsigned long start)
1197 {
1198         struct folio *dst, *src = page_folio(page);
1199         struct vm_area_struct *vma;
1200         unsigned long address;
1201         VMA_ITERATOR(vmi, current->mm, start);
1202         gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL;
1203
1204         for_each_vma(vmi, vma) {
1205                 address = page_address_in_vma(page, vma);
1206                 if (address != -EFAULT)
1207                         break;
1208         }
1209
1210         if (folio_test_hugetlb(src)) {
1211                 dst = alloc_hugetlb_folio_vma(folio_hstate(src),
1212                                 vma, address);
1213                 return &dst->page;
1214         }
1215
1216         if (folio_test_large(src))
1217                 gfp = GFP_TRANSHUGE;
1218
1219         /*
1220          * if !vma, vma_alloc_folio() will use task or system default policy
1221          */
1222         dst = vma_alloc_folio(gfp, folio_order(src), vma, address,
1223                         folio_test_large(src));
1224         return &dst->page;
1225 }
1226 #else
1227
/*
 * Variant built when CONFIG_MIGRATION is not set: no folio is ever queued
 * for migration and the call always fails with -EIO.
 */
static int migrate_folio_add(struct folio *folio, struct list_head *foliolist,
				unsigned long flags)
{
	return -EIO;
}
1233
/*
 * Variant built when CONFIG_MIGRATION is not set: always returns -ENOSYS
 * without touching @mm.
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	return -ENOSYS;
}
1239
/*
 * Variant built when CONFIG_MIGRATION is not set: never allocates a
 * destination page and always returns NULL.
 */
static struct page *new_page(struct page *page, unsigned long start)
{
	return NULL;
}
1244 #endif
1245
1246 static long do_mbind(unsigned long start, unsigned long len,
1247                      unsigned short mode, unsigned short mode_flags,
1248                      nodemask_t *nmask, unsigned long flags)
1249 {
1250         struct mm_struct *mm = current->mm;
1251         struct vm_area_struct *vma, *prev;
1252         struct vma_iterator vmi;
1253         struct mempolicy *new;
1254         unsigned long end;
1255         int err;
1256         int ret;
1257         LIST_HEAD(pagelist);
1258
1259         if (flags & ~(unsigned long)MPOL_MF_VALID)
1260                 return -EINVAL;
1261         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1262                 return -EPERM;
1263
1264         if (start & ~PAGE_MASK)
1265                 return -EINVAL;
1266
1267         if (mode == MPOL_DEFAULT)
1268                 flags &= ~MPOL_MF_STRICT;
1269
1270         len = PAGE_ALIGN(len);
1271         end = start + len;
1272
1273         if (end < start)
1274                 return -EINVAL;
1275         if (end == start)
1276                 return 0;
1277
1278         new = mpol_new(mode, mode_flags, nmask);
1279         if (IS_ERR(new))
1280                 return PTR_ERR(new);
1281
1282         if (flags & MPOL_MF_LAZY)
1283                 new->flags |= MPOL_F_MOF;
1284
1285         /*
1286          * If we are using the default policy then operation
1287          * on discontinuous address spaces is okay after all
1288          */
1289         if (!new)
1290                 flags |= MPOL_MF_DISCONTIG_OK;
1291
1292         pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1293                  start, start + len, mode, mode_flags,
1294                  nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1295
1296         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1297
1298                 lru_cache_disable();
1299         }
1300         {
1301                 NODEMASK_SCRATCH(scratch);
1302                 if (scratch) {
1303                         mmap_write_lock(mm);
1304                         err = mpol_set_nodemask(new, nmask, scratch);
1305                         if (err)
1306                                 mmap_write_unlock(mm);
1307                 } else
1308                         err = -ENOMEM;
1309                 NODEMASK_SCRATCH_FREE(scratch);
1310         }
1311         if (err)
1312                 goto mpol_out;
1313
1314         ret = queue_pages_range(mm, start, end, nmask,
1315                           flags | MPOL_MF_INVERT, &pagelist);
1316
1317         if (ret < 0) {
1318                 err = ret;
1319                 goto up_out;
1320         }
1321
1322         vma_iter_init(&vmi, mm, start);
1323         prev = vma_prev(&vmi);
1324         for_each_vma_range(vmi, vma, end) {
1325                 err = mbind_range(&vmi, vma, &prev, start, end, new);
1326                 if (err)
1327                         break;
1328         }
1329
1330         if (!err) {
1331                 int nr_failed = 0;
1332
1333                 if (!list_empty(&pagelist)) {
1334                         WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1335                         nr_failed = migrate_pages(&pagelist, new_page, NULL,
1336                                 start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL);
1337                         if (nr_failed)
1338                                 putback_movable_pages(&pagelist);
1339                 }
1340
1341                 if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
1342                         err = -EIO;
1343         } else {
1344 up_out:
1345                 if (!list_empty(&pagelist))
1346                         putback_movable_pages(&pagelist);
1347         }
1348
1349         mmap_write_unlock(mm);
1350 mpol_out:
1351         mpol_put(new);
1352         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1353                 lru_cache_enable();
1354         return err;
1355 }
1356
1357 /*
1358  * User space interface with variable sized bitmaps for nodelists.
1359  */
1360 static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
1361                       unsigned long maxnode)
1362 {
1363         unsigned long nlongs = BITS_TO_LONGS(maxnode);
1364         int ret;
1365
1366         if (in_compat_syscall())
1367                 ret = compat_get_bitmap(mask,
1368                                         (const compat_ulong_t __user *)nmask,
1369                                         maxnode);
1370         else
1371                 ret = copy_from_user(mask, nmask,
1372                                      nlongs * sizeof(unsigned long));
1373
1374         if (ret)
1375                 return -EFAULT;
1376
1377         if (maxnode % BITS_PER_LONG)
1378                 mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;
1379
1380         return 0;
1381 }
1382
1383 /* Copy a node mask from user space. */
1384 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1385                      unsigned long maxnode)
1386 {
1387         --maxnode;
1388         nodes_clear(*nodes);
1389         if (maxnode == 0 || !nmask)
1390                 return 0;
1391         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1392                 return -EINVAL;
1393
1394         /*
1395          * When the user specified more nodes than supported just check
1396          * if the non supported part is all zero, one word at a time,
1397          * starting at the end.
1398          */
1399         while (maxnode > MAX_NUMNODES) {
1400                 unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
1401                 unsigned long t;
1402
1403                 if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
1404                         return -EFAULT;
1405
1406                 if (maxnode - bits >= MAX_NUMNODES) {
1407                         maxnode -= bits;
1408                 } else {
1409                         maxnode = MAX_NUMNODES;
1410                         t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1411                 }
1412                 if (t)
1413                         return -EINVAL;
1414         }
1415
1416         return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
1417 }
1418
1419 /* Copy a kernel node mask to user space */
1420 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1421                               nodemask_t *nodes)
1422 {
1423         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1424         unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1425         bool compat = in_compat_syscall();
1426
1427         if (compat)
1428                 nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
1429
1430         if (copy > nbytes) {
1431                 if (copy > PAGE_SIZE)
1432                         return -EINVAL;
1433                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1434                         return -EFAULT;
1435                 copy = nbytes;
1436                 maxnode = nr_node_ids;
1437         }
1438
1439         if (compat)
1440                 return compat_put_bitmap((compat_ulong_t __user *)mask,
1441                                          nodes_addr(*nodes), maxnode);
1442
1443         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1444 }
1445
1446 /* Basic parameter sanity check used by both mbind() and set_mempolicy() */
1447 static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
1448 {
1449         *flags = *mode & MPOL_MODE_FLAGS;
1450         *mode &= ~MPOL_MODE_FLAGS;
1451
1452         if ((unsigned int)(*mode) >=  MPOL_MAX)
1453                 return -EINVAL;
1454         if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
1455                 return -EINVAL;
1456         if (*flags & MPOL_F_NUMA_BALANCING) {
1457                 if (*mode != MPOL_BIND)
1458                         return -EINVAL;
1459                 *flags |= (MPOL_F_MOF | MPOL_F_MORON);
1460         }
1461         return 0;
1462 }
1463
1464 static long kernel_mbind(unsigned long start, unsigned long len,
1465                          unsigned long mode, const unsigned long __user *nmask,
1466                          unsigned long maxnode, unsigned int flags)
1467 {
1468         unsigned short mode_flags;
1469         nodemask_t nodes;
1470         int lmode = mode;
1471         int err;
1472
1473         start = untagged_addr(start);
1474         err = sanitize_mpol_flags(&lmode, &mode_flags);
1475         if (err)
1476                 return err;
1477
1478         err = get_nodes(&nodes, nmask, maxnode);
1479         if (err)
1480                 return err;
1481
1482         return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
1483 }
1484
1485 SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
1486                 unsigned long, home_node, unsigned long, flags)
1487 {
1488         struct mm_struct *mm = current->mm;
1489         struct vm_area_struct *vma, *prev;
1490         struct mempolicy *new, *old;
1491         unsigned long end;
1492         int err = -ENOENT;
1493         VMA_ITERATOR(vmi, mm, start);
1494
1495         start = untagged_addr(start);
1496         if (start & ~PAGE_MASK)
1497                 return -EINVAL;
1498         /*
1499          * flags is used for future extension if any.
1500          */
1501         if (flags != 0)
1502                 return -EINVAL;
1503
1504         /*
1505          * Check home_node is online to avoid accessing uninitialized
1506          * NODE_DATA.
1507          */
1508         if (home_node >= MAX_NUMNODES || !node_online(home_node))
1509                 return -EINVAL;
1510
1511         len = PAGE_ALIGN(len);
1512         end = start + len;
1513
1514         if (end < start)
1515                 return -EINVAL;
1516         if (end == start)
1517                 return 0;
1518         mmap_write_lock(mm);
1519         prev = vma_prev(&vmi);
1520         for_each_vma_range(vmi, vma, end) {
1521                 /*
1522                  * If any vma in the range got policy other than MPOL_BIND
1523                  * or MPOL_PREFERRED_MANY we return error. We don't reset
1524                  * the home node for vmas we already updated before.
1525                  */
1526                 old = vma_policy(vma);
1527                 if (!old)
1528                         continue;
1529                 if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
1530                         err = -EOPNOTSUPP;
1531                         break;
1532                 }
1533                 new = mpol_dup(old);
1534                 if (IS_ERR(new)) {
1535                         err = PTR_ERR(new);
1536                         break;
1537                 }
1538
1539                 new->home_node = home_node;
1540                 err = mbind_range(&vmi, vma, &prev, start, end, new);
1541                 mpol_put(new);
1542                 if (err)
1543                         break;
1544         }
1545         mmap_write_unlock(mm);
1546         return err;
1547 }
1548
/* mbind() system call entry point: forwards all arguments to kernel_mbind(). */
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
		unsigned long, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode, unsigned int, flags)
{
	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
}
1555
1556 /* Set the process memory policy */
1557 static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1558                                  unsigned long maxnode)
1559 {
1560         unsigned short mode_flags;
1561         nodemask_t nodes;
1562         int lmode = mode;
1563         int err;
1564
1565         err = sanitize_mpol_flags(&lmode, &mode_flags);
1566         if (err)
1567                 return err;
1568
1569         err = get_nodes(&nodes, nmask, maxnode);
1570         if (err)
1571                 return err;
1572
1573         return do_set_mempolicy(lmode, mode_flags, &nodes);
1574 }
1575
/*
 * set_mempolicy() system call entry point: forwards all arguments to
 * kernel_set_mempolicy().
 */
SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode)
{
	return kernel_set_mempolicy(mode, nmask, maxnode);
}
1581
1582 static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1583                                 const unsigned long __user *old_nodes,
1584                                 const unsigned long __user *new_nodes)
1585 {
1586         struct mm_struct *mm = NULL;
1587         struct task_struct *task;
1588         nodemask_t task_nodes;
1589         int err;
1590         nodemask_t *old;
1591         nodemask_t *new;
1592         NODEMASK_SCRATCH(scratch);
1593
1594         if (!scratch)
1595                 return -ENOMEM;
1596
1597         old = &scratch->mask1;
1598         new = &scratch->mask2;
1599
1600         err = get_nodes(old, old_nodes, maxnode);
1601         if (err)
1602                 goto out;
1603
1604         err = get_nodes(new, new_nodes, maxnode);
1605         if (err)
1606                 goto out;
1607
1608         /* Find the mm_struct */
1609         rcu_read_lock();
1610         task = pid ? find_task_by_vpid(pid) : current;
1611         if (!task) {
1612                 rcu_read_unlock();
1613                 err = -ESRCH;
1614                 goto out;
1615         }
1616         get_task_struct(task);
1617
1618         err = -EINVAL;
1619
1620         /*
1621          * Check if this process has the right to modify the specified process.
1622          * Use the regular "ptrace_may_access()" checks.
1623          */
1624         if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1625                 rcu_read_unlock();
1626                 err = -EPERM;
1627                 goto out_put;
1628         }
1629         rcu_read_unlock();
1630
1631         task_nodes = cpuset_mems_allowed(task);
1632         /* Is the user allowed to access the target nodes? */
1633         if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1634                 err = -EPERM;
1635                 goto out_put;
1636         }
1637
1638         task_nodes = cpuset_mems_allowed(current);
1639         nodes_and(*new, *new, task_nodes);
1640         if (nodes_empty(*new))
1641                 goto out_put;
1642
1643         err = security_task_movememory(task);
1644         if (err)
1645                 goto out_put;
1646
1647         mm = get_task_mm(task);
1648         put_task_struct(task);
1649
1650         if (!mm) {
1651                 err = -EINVAL;
1652                 goto out;
1653         }
1654
1655         err = do_migrate_pages(mm, old, new,
1656                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1657
1658         mmput(mm);
1659 out:
1660         NODEMASK_SCRATCH_FREE(scratch);
1661
1662         return err;
1663
1664 out_put:
1665         put_task_struct(task);
1666         goto out;
1667
1668 }
1669
/*
 * migrate_pages() system call entry point: forwards all arguments to
 * kernel_migrate_pages().
 */
SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
		const unsigned long __user *, old_nodes,
		const unsigned long __user *, new_nodes)
{
	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
}
1676
1677
1678 /* Retrieve NUMA policy */
1679 static int kernel_get_mempolicy(int __user *policy,
1680                                 unsigned long __user *nmask,
1681                                 unsigned long maxnode,
1682                                 unsigned long addr,
1683                                 unsigned long flags)
1684 {
1685         int err;
1686         int pval;
1687         nodemask_t nodes;
1688
1689         if (nmask != NULL && maxnode < nr_node_ids)
1690                 return -EINVAL;
1691
1692         addr = untagged_addr(addr);
1693
1694         err = do_get_mempolicy(&pval, &nodes, addr, flags);
1695
1696         if (err)
1697                 return err;
1698
1699         if (policy && put_user(pval, policy))
1700                 return -EFAULT;
1701
1702         if (nmask)
1703                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1704
1705         return err;
1706 }
1707
/*
 * get_mempolicy system call entry point.
 *
 * Hands the user supplied arguments, unmodified, to kernel_get_mempolicy()
 * and returns whatever that helper returns.
 */
SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
                unsigned long __user *, nmask, unsigned long, maxnode,
                unsigned long, addr, unsigned long, flags)
{
        return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
}
1714
1715 bool vma_migratable(struct vm_area_struct *vma)
1716 {
1717         if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1718                 return false;
1719
1720         /*
1721          * DAX device mappings require predictable access latency, so avoid
1722          * incurring periodic faults.
1723          */
1724         if (vma_is_dax(vma))
1725                 return false;
1726
1727         if (is_vm_hugetlb_page(vma) &&
1728                 !hugepage_migration_supported(hstate_vma(vma)))
1729                 return false;
1730
1731         /*
1732          * Migration allocates pages in the highest zone. If we cannot
1733          * do so then migration (at least from node to node) is not
1734          * possible.
1735          */
1736         if (vma->vm_file &&
1737                 gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
1738                         < policy_zone)
1739                 return false;
1740         return true;
1741 }
1742
1743 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1744                                                 unsigned long addr)
1745 {
1746         struct mempolicy *pol = NULL;
1747
1748         if (vma) {
1749                 if (vma->vm_ops && vma->vm_ops->get_policy) {
1750                         pol = vma->vm_ops->get_policy(vma, addr);
1751                 } else if (vma->vm_policy) {
1752                         pol = vma->vm_policy;
1753
1754                         /*
1755                          * shmem_alloc_page() passes MPOL_F_SHARED policy with
1756                          * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1757                          * count on these policies which will be dropped by
1758                          * mpol_cond_put() later
1759                          */
1760                         if (mpol_needs_cond_ref(pol))
1761                                 mpol_get(pol);
1762                 }
1763         }
1764
1765         return pol;
1766 }
1767
1768 /*
1769  * get_vma_policy(@vma, @addr)
1770  * @vma: virtual memory area whose policy is sought
1771  * @addr: address in @vma for shared policy lookup
1772  *
1773  * Returns effective policy for a VMA at specified address.
1774  * Falls back to current->mempolicy or system default policy, as necessary.
1775  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1776  * count--added by the get_policy() vm_op, as appropriate--to protect against
1777  * freeing by another task.  It is the caller's responsibility to free the
1778  * extra reference for shared policies.
1779  */
1780 static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1781                                                 unsigned long addr)
1782 {
1783         struct mempolicy *pol = __get_vma_policy(vma, addr);
1784
1785         if (!pol)
1786                 pol = get_task_policy(current);
1787
1788         return pol;
1789 }
1790
1791 bool vma_policy_mof(struct vm_area_struct *vma)
1792 {
1793         struct mempolicy *pol;
1794
1795         if (vma->vm_ops && vma->vm_ops->get_policy) {
1796                 bool ret = false;
1797
1798                 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1799                 if (pol && (pol->flags & MPOL_F_MOF))
1800                         ret = true;
1801                 mpol_cond_put(pol);
1802
1803                 return ret;
1804         }
1805
1806         pol = vma->vm_policy;
1807         if (!pol)
1808                 pol = get_task_policy(current);
1809
1810         return pol->flags & MPOL_F_MOF;
1811 }
1812
1813 bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1814 {
1815         enum zone_type dynamic_policy_zone = policy_zone;
1816
1817         BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1818
1819         /*
1820          * if policy->nodes has movable memory only,
1821          * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1822          *
1823          * policy->nodes is intersect with node_states[N_MEMORY].
1824          * so if the following test fails, it implies
1825          * policy->nodes has movable memory only.
1826          */
1827         if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
1828                 dynamic_policy_zone = ZONE_MOVABLE;
1829
1830         return zone >= dynamic_policy_zone;
1831 }
1832
1833 /*
1834  * Return a nodemask representing a mempolicy for filtering nodes for
1835  * page allocation
1836  */
1837 nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1838 {
1839         int mode = policy->mode;
1840
1841         /* Lower zones don't get a nodemask applied for MPOL_BIND */
1842         if (unlikely(mode == MPOL_BIND) &&
1843                 apply_policy_zone(policy, gfp_zone(gfp)) &&
1844                 cpuset_nodemask_valid_mems_allowed(&policy->nodes))
1845                 return &policy->nodes;
1846
1847         if (mode == MPOL_PREFERRED_MANY)
1848                 return &policy->nodes;
1849
1850         return NULL;
1851 }
1852
1853 /*
1854  * Return the  preferred node id for 'prefer' mempolicy, and return
1855  * the given id for all other policies.
1856  *
1857  * policy_node() is always coupled with policy_nodemask(), which
1858  * secures the nodemask limit for 'bind' and 'prefer-many' policy.
1859  */
1860 static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
1861 {
1862         if (policy->mode == MPOL_PREFERRED) {
1863                 nd = first_node(policy->nodes);
1864         } else {
1865                 /*
1866                  * __GFP_THISNODE shouldn't even be used with the bind policy
1867                  * because we might easily break the expectation to stay on the
1868                  * requested node and not break the policy.
1869                  */
1870                 WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1871         }
1872
1873         if ((policy->mode == MPOL_BIND ||
1874              policy->mode == MPOL_PREFERRED_MANY) &&
1875             policy->home_node != NUMA_NO_NODE)
1876                 return policy->home_node;
1877
1878         return nd;
1879 }
1880
1881 /* Do dynamic interleaving for a process */
1882 static unsigned interleave_nodes(struct mempolicy *policy)
1883 {
1884         unsigned next;
1885         struct task_struct *me = current;
1886
1887         next = next_node_in(me->il_prev, policy->nodes);
1888         if (next < MAX_NUMNODES)
1889                 me->il_prev = next;
1890         return next;
1891 }
1892
1893 /*
1894  * Depending on the memory policy provide a node from which to allocate the
1895  * next slab entry.
1896  */
1897 unsigned int mempolicy_slab_node(void)
1898 {
1899         struct mempolicy *policy;
1900         int node = numa_mem_id();
1901
1902         if (!in_task())
1903                 return node;
1904
1905         policy = current->mempolicy;
1906         if (!policy)
1907                 return node;
1908
1909         switch (policy->mode) {
1910         case MPOL_PREFERRED:
1911                 return first_node(policy->nodes);
1912
1913         case MPOL_INTERLEAVE:
1914                 return interleave_nodes(policy);
1915
1916         case MPOL_BIND:
1917         case MPOL_PREFERRED_MANY:
1918         {
1919                 struct zoneref *z;
1920
1921                 /*
1922                  * Follow bind policy behavior and start allocation at the
1923                  * first node.
1924                  */
1925                 struct zonelist *zonelist;
1926                 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1927                 zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1928                 z = first_zones_zonelist(zonelist, highest_zoneidx,
1929                                                         &policy->nodes);
1930                 return z->zone ? zone_to_nid(z->zone) : node;
1931         }
1932         case MPOL_LOCAL:
1933                 return node;
1934
1935         default:
1936                 BUG();
1937         }
1938 }
1939
1940 /*
1941  * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1942  * node in pol->nodes (starting from n=0), wrapping around if n exceeds the
1943  * number of present nodes.
1944  */
1945 static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1946 {
1947         nodemask_t nodemask = pol->nodes;
1948         unsigned int target, nnodes;
1949         int i;
1950         int nid;
1951         /*
1952          * The barrier will stabilize the nodemask in a register or on
1953          * the stack so that it will stop changing under the code.
1954          *
1955          * Between first_node() and next_node(), pol->nodes could be changed
1956          * by other threads. So we put pol->nodes in a local stack.
1957          */
1958         barrier();
1959
1960         nnodes = nodes_weight(nodemask);
1961         if (!nnodes)
1962                 return numa_node_id();
1963         target = (unsigned int)n % nnodes;
1964         nid = first_node(nodemask);
1965         for (i = 0; i < target; i++)
1966                 nid = next_node(nid, nodemask);
1967         return nid;
1968 }
1969
1970 /* Determine a node number for interleave */
1971 static inline unsigned interleave_nid(struct mempolicy *pol,
1972                  struct vm_area_struct *vma, unsigned long addr, int shift)
1973 {
1974         if (vma) {
1975                 unsigned long off;
1976
1977                 /*
1978                  * for small pages, there is no difference between
1979                  * shift and PAGE_SHIFT, so the bit-shift is safe.
1980                  * for huge pages, since vm_pgoff is in units of small
1981                  * pages, we need to shift off the always 0 bits to get
1982                  * a useful offset.
1983                  */
1984                 BUG_ON(shift < PAGE_SHIFT);
1985                 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1986                 off += (addr - vma->vm_start) >> shift;
1987                 return offset_il_node(pol, off);
1988         } else
1989                 return interleave_nodes(pol);
1990 }
1991
1992 #ifdef CONFIG_HUGETLBFS
1993 /*
1994  * huge_node(@vma, @addr, @gfp_flags, @mpol)
1995  * @vma: virtual memory area whose policy is sought
1996  * @addr: address in @vma for shared policy lookup and interleave policy
1997  * @gfp_flags: for requested zone
1998  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1999  * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
2000  *
2001  * Returns a nid suitable for a huge page allocation and a pointer
2002  * to the struct mempolicy for conditional unref after allocation.
2003  * If the effective policy is 'bind' or 'prefer-many', returns a pointer
2004  * to the mempolicy's @nodemask for filtering the zonelist.
2005  *
2006  * Must be protected by read_mems_allowed_begin()
2007  */
2008 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
2009                                 struct mempolicy **mpol, nodemask_t **nodemask)
2010 {
2011         int nid;
2012         int mode;
2013
2014         *mpol = get_vma_policy(vma, addr);
2015         *nodemask = NULL;
2016         mode = (*mpol)->mode;
2017
2018         if (unlikely(mode == MPOL_INTERLEAVE)) {
2019                 nid = interleave_nid(*mpol, vma, addr,
2020                                         huge_page_shift(hstate_vma(vma)));
2021         } else {
2022                 nid = policy_node(gfp_flags, *mpol, numa_node_id());
2023                 if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY)
2024                         *nodemask = &(*mpol)->nodes;
2025         }
2026         return nid;
2027 }
2028
2029 /*
2030  * init_nodemask_of_mempolicy
2031  *
2032  * If the current task's mempolicy is "default" [NULL], return 'false'
2033  * to indicate default policy.  Otherwise, extract the policy nodemask
2034  * for 'bind' or 'interleave' policy into the argument nodemask, or
2035  * initialize the argument nodemask to contain the single node for
2036  * 'preferred' or 'local' policy and return 'true' to indicate presence
2037  * of non-default mempolicy.
2038  *
2039  * We don't bother with reference counting the mempolicy [mpol_get/put]
2040  * because the current task is examining it's own mempolicy and a task's
2041  * mempolicy is only ever changed by the task itself.
2042  *
2043  * N.B., it is the caller's responsibility to free a returned nodemask.
2044  */
2045 bool init_nodemask_of_mempolicy(nodemask_t *mask)
2046 {
2047         struct mempolicy *mempolicy;
2048
2049         if (!(mask && current->mempolicy))
2050                 return false;
2051
2052         task_lock(current);
2053         mempolicy = current->mempolicy;
2054         switch (mempolicy->mode) {
2055         case MPOL_PREFERRED:
2056         case MPOL_PREFERRED_MANY:
2057         case MPOL_BIND:
2058         case MPOL_INTERLEAVE:
2059                 *mask = mempolicy->nodes;
2060                 break;
2061
2062         case MPOL_LOCAL:
2063                 init_nodemask_of_node(mask, numa_node_id());
2064                 break;
2065
2066         default:
2067                 BUG();
2068         }
2069         task_unlock(current);
2070
2071         return true;
2072 }
2073 #endif
2074
2075 /*
2076  * mempolicy_in_oom_domain
2077  *
2078  * If tsk's mempolicy is "bind", check for intersection between mask and
2079  * the policy nodemask. Otherwise, return true for all other policies
2080  * including "interleave", as a tsk with "interleave" policy may have
2081  * memory allocated from all nodes in system.
2082  *
2083  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2084  */
2085 bool mempolicy_in_oom_domain(struct task_struct *tsk,
2086                                         const nodemask_t *mask)
2087 {
2088         struct mempolicy *mempolicy;
2089         bool ret = true;
2090
2091         if (!mask)
2092                 return ret;
2093
2094         task_lock(tsk);
2095         mempolicy = tsk->mempolicy;
2096         if (mempolicy && mempolicy->mode == MPOL_BIND)
2097                 ret = nodes_intersects(mempolicy->nodes, *mask);
2098         task_unlock(tsk);
2099
2100         return ret;
2101 }
2102
2103 /* Allocate a page in interleaved policy.
2104    Own path because it needs to do special accounting. */
2105 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2106                                         unsigned nid)
2107 {
2108         struct page *page;
2109
2110         page = __alloc_pages(gfp, order, nid, NULL);
2111         /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
2112         if (!static_branch_likely(&vm_numa_stat_key))
2113                 return page;
2114         if (page && page_to_nid(page) == nid) {
2115                 preempt_disable();
2116                 __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
2117                 preempt_enable();
2118         }
2119         return page;
2120 }
2121
2122 static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
2123                                                 int nid, struct mempolicy *pol)
2124 {
2125         struct page *page;
2126         gfp_t preferred_gfp;
2127
2128         /*
2129          * This is a two pass approach. The first pass will only try the
2130          * preferred nodes but skip the direct reclaim and allow the
2131          * allocation to fail, while the second pass will try all the
2132          * nodes in system.
2133          */
2134         preferred_gfp = gfp | __GFP_NOWARN;
2135         preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2136         page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes);
2137         if (!page)
2138                 page = __alloc_pages(gfp, order, nid, NULL);
2139
2140         return page;
2141 }
2142
2143 /**
2144  * vma_alloc_folio - Allocate a folio for a VMA.
2145  * @gfp: GFP flags.
2146  * @order: Order of the folio.
2147  * @vma: Pointer to VMA or NULL if not available.
2148  * @addr: Virtual address of the allocation.  Must be inside @vma.
2149  * @hugepage: For hugepages try only the preferred node if possible.
2150  *
2151  * Allocate a folio for a specific address in @vma, using the appropriate
2152  * NUMA policy.  When @vma is not NULL the caller must hold the mmap_lock
2153  * of the mm_struct of the VMA to prevent it from going away.  Should be
2154  * used for all allocations for folios that will be mapped into user space.
2155  *
2156  * Return: The folio on success or NULL if allocation fails.
2157  */
2158 struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
2159                 unsigned long addr, bool hugepage)
2160 {
2161         struct mempolicy *pol;
2162         int node = numa_node_id();
2163         struct folio *folio;
2164         int preferred_nid;
2165         nodemask_t *nmask;
2166
2167         pol = get_vma_policy(vma, addr);
2168
2169         if (pol->mode == MPOL_INTERLEAVE) {
2170                 struct page *page;
2171                 unsigned nid;
2172
2173                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2174                 mpol_cond_put(pol);
2175                 gfp |= __GFP_COMP;
2176                 page = alloc_page_interleave(gfp, order, nid);
2177                 if (page && order > 1)
2178                         prep_transhuge_page(page);
2179                 folio = (struct folio *)page;
2180                 goto out;
2181         }
2182
2183         if (pol->mode == MPOL_PREFERRED_MANY) {
2184                 struct page *page;
2185
2186                 node = policy_node(gfp, pol, node);
2187                 gfp |= __GFP_COMP;
2188                 page = alloc_pages_preferred_many(gfp, order, node, pol);
2189                 mpol_cond_put(pol);
2190                 if (page && order > 1)
2191                         prep_transhuge_page(page);
2192                 folio = (struct folio *)page;
2193                 goto out;
2194         }
2195
2196         if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2197                 int hpage_node = node;
2198
2199                 /*
2200                  * For hugepage allocation and non-interleave policy which
2201                  * allows the current node (or other explicitly preferred
2202                  * node) we only try to allocate from the current/preferred
2203                  * node and don't fall back to other nodes, as the cost of
2204                  * remote accesses would likely offset THP benefits.
2205                  *
2206                  * If the policy is interleave or does not allow the current
2207                  * node in its nodemask, we allocate the standard way.
2208                  */
2209                 if (pol->mode == MPOL_PREFERRED)
2210                         hpage_node = first_node(pol->nodes);
2211
2212                 nmask = policy_nodemask(gfp, pol);
2213                 if (!nmask || node_isset(hpage_node, *nmask)) {
2214                         mpol_cond_put(pol);
2215                         /*
2216                          * First, try to allocate THP only on local node, but
2217                          * don't reclaim unnecessarily, just compact.
2218                          */
2219                         folio = __folio_alloc_node(gfp | __GFP_THISNODE |
2220                                         __GFP_NORETRY, order, hpage_node);
2221
2222                         /*
2223                          * If hugepage allocations are configured to always
2224                          * synchronous compact or the vma has been madvised
2225                          * to prefer hugepage backing, retry allowing remote
2226                          * memory with both reclaim and compact as well.
2227                          */
2228                         if (!folio && (gfp & __GFP_DIRECT_RECLAIM))
2229                                 folio = __folio_alloc(gfp, order, hpage_node,
2230                                                       nmask);
2231
2232                         goto out;
2233                 }
2234         }
2235
2236         nmask = policy_nodemask(gfp, pol);
2237         preferred_nid = policy_node(gfp, pol, node);
2238         folio = __folio_alloc(gfp, order, preferred_nid, nmask);
2239         mpol_cond_put(pol);
2240 out:
2241         return folio;
2242 }
2243 EXPORT_SYMBOL(vma_alloc_folio);
2244
2245 /**
2246  * alloc_pages - Allocate pages.
2247  * @gfp: GFP flags.
2248  * @order: Power of two of number of pages to allocate.
2249  *
2250  * Allocate 1 << @order contiguous pages.  The physical address of the
2251  * first page is naturally aligned (eg an order-3 allocation will be aligned
2252  * to a multiple of 8 * PAGE_SIZE bytes).  The NUMA policy of the current
2253  * process is honoured when in process context.
2254  *
2255  * Context: Can be called from any context, providing the appropriate GFP
2256  * flags are used.
2257  * Return: The page on success or NULL if allocation fails.
2258  */
2259 struct page *alloc_pages(gfp_t gfp, unsigned order)
2260 {
2261         struct mempolicy *pol = &default_policy;
2262         struct page *page;
2263
2264         if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2265                 pol = get_task_policy(current);
2266
2267         /*
2268          * No reference counting needed for current->mempolicy
2269          * nor system default_policy
2270          */
2271         if (pol->mode == MPOL_INTERLEAVE)
2272                 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2273         else if (pol->mode == MPOL_PREFERRED_MANY)
2274                 page = alloc_pages_preferred_many(gfp, order,
2275                                   policy_node(gfp, pol, numa_node_id()), pol);
2276         else
2277                 page = __alloc_pages(gfp, order,
2278                                 policy_node(gfp, pol, numa_node_id()),
2279                                 policy_nodemask(gfp, pol));
2280
2281         return page;
2282 }
2283 EXPORT_SYMBOL(alloc_pages);
2284
2285 struct folio *folio_alloc(gfp_t gfp, unsigned order)
2286 {
2287         struct page *page = alloc_pages(gfp | __GFP_COMP, order);
2288
2289         if (page && order > 1)
2290                 prep_transhuge_page(page);
2291         return (struct folio *)page;
2292 }
2293 EXPORT_SYMBOL(folio_alloc);
2294
2295 static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
2296                 struct mempolicy *pol, unsigned long nr_pages,
2297                 struct page **page_array)
2298 {
2299         int nodes;
2300         unsigned long nr_pages_per_node;
2301         int delta;
2302         int i;
2303         unsigned long nr_allocated;
2304         unsigned long total_allocated = 0;
2305
2306         nodes = nodes_weight(pol->nodes);
2307         nr_pages_per_node = nr_pages / nodes;
2308         delta = nr_pages - nodes * nr_pages_per_node;
2309
2310         for (i = 0; i < nodes; i++) {
2311                 if (delta) {
2312                         nr_allocated = __alloc_pages_bulk(gfp,
2313                                         interleave_nodes(pol), NULL,
2314                                         nr_pages_per_node + 1, NULL,
2315                                         page_array);
2316                         delta--;
2317                 } else {
2318                         nr_allocated = __alloc_pages_bulk(gfp,
2319                                         interleave_nodes(pol), NULL,
2320                                         nr_pages_per_node, NULL, page_array);
2321                 }
2322
2323                 page_array += nr_allocated;
2324                 total_allocated += nr_allocated;
2325         }
2326
2327         return total_allocated;
2328 }
2329
2330 static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
2331                 struct mempolicy *pol, unsigned long nr_pages,
2332                 struct page **page_array)
2333 {
2334         gfp_t preferred_gfp;
2335         unsigned long nr_allocated = 0;
2336
2337         preferred_gfp = gfp | __GFP_NOWARN;
2338         preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2339
2340         nr_allocated  = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes,
2341                                            nr_pages, NULL, page_array);
2342
2343         if (nr_allocated < nr_pages)
2344                 nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL,
2345                                 nr_pages - nr_allocated, NULL,
2346                                 page_array + nr_allocated);
2347         return nr_allocated;
2348 }
2349
2350 /* alloc pages bulk and mempolicy should be considered at the
2351  * same time in some situation such as vmalloc.
2352  *
2353  * It can accelerate memory allocation especially interleaving
2354  * allocate memory.
2355  */
2356 unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
2357                 unsigned long nr_pages, struct page **page_array)
2358 {
2359         struct mempolicy *pol = &default_policy;
2360
2361         if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2362                 pol = get_task_policy(current);
2363
2364         if (pol->mode == MPOL_INTERLEAVE)
2365                 return alloc_pages_bulk_array_interleave(gfp, pol,
2366                                                          nr_pages, page_array);
2367
2368         if (pol->mode == MPOL_PREFERRED_MANY)
2369                 return alloc_pages_bulk_array_preferred_many(gfp,
2370                                 numa_node_id(), pol, nr_pages, page_array);
2371
2372         return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()),
2373                                   policy_nodemask(gfp, pol), nr_pages, NULL,
2374                                   page_array);
2375 }
2376
2377 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2378 {
2379         struct mempolicy *pol = mpol_dup(vma_policy(src));
2380
2381         if (IS_ERR(pol))
2382                 return PTR_ERR(pol);
2383         dst->vm_policy = pol;
2384         return 0;
2385 }
2386
/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after its cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 *
 * current's mempolicy may be rebound by another task (the task that changes
 * the cpuset's mems), so we need not do the rebind work for the current task.
 */
2397
2398 /* Slow path of a mempolicy duplicate */
2399 struct mempolicy *__mpol_dup(struct mempolicy *old)
2400 {
2401         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2402
2403         if (!new)
2404                 return ERR_PTR(-ENOMEM);
2405
2406         /* task's mempolicy is protected by alloc_lock */
2407         if (old == current->mempolicy) {
2408                 task_lock(current);
2409                 *new = *old;
2410                 task_unlock(current);
2411         } else
2412                 *new = *old;
2413
2414         if (current_cpuset_is_being_rebound()) {
2415                 nodemask_t mems = cpuset_mems_allowed(current);
2416                 mpol_rebind_policy(new, &mems);
2417         }
2418         atomic_set(&new->refcnt, 1);
2419         return new;
2420 }
2421
2422 /* Slow path of a mempolicy comparison */
2423 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2424 {
2425         if (!a || !b)
2426                 return false;
2427         if (a->mode != b->mode)
2428                 return false;
2429         if (a->flags != b->flags)
2430                 return false;
2431         if (a->home_node != b->home_node)
2432                 return false;
2433         if (mpol_store_user_nodemask(a))
2434                 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2435                         return false;
2436
2437         switch (a->mode) {
2438         case MPOL_BIND:
2439         case MPOL_INTERLEAVE:
2440         case MPOL_PREFERRED:
2441         case MPOL_PREFERRED_MANY:
2442                 return !!nodes_equal(a->nodes, b->nodes);
2443         case MPOL_LOCAL:
2444                 return true;
2445         default:
2446                 BUG();
2447                 return false;
2448         }
2449 }
2450
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a red-black tree linked from the inode.
 * They are protected by the sp->lock rwlock, which should be held
 * for any accesses to the tree.
 */
2459
2460 /*
2461  * lookup first element intersecting start-end.  Caller holds sp->lock for
2462  * reading or for writing
2463  */
2464 static struct sp_node *
2465 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2466 {
2467         struct rb_node *n = sp->root.rb_node;
2468
2469         while (n) {
2470                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2471
2472                 if (start >= p->end)
2473                         n = n->rb_right;
2474                 else if (end <= p->start)
2475                         n = n->rb_left;
2476                 else
2477                         break;
2478         }
2479         if (!n)
2480                 return NULL;
2481         for (;;) {
2482                 struct sp_node *w = NULL;
2483                 struct rb_node *prev = rb_prev(n);
2484                 if (!prev)
2485                         break;
2486                 w = rb_entry(prev, struct sp_node, nd);
2487                 if (w->end <= start)
2488                         break;
2489                 n = prev;
2490         }
2491         return rb_entry(n, struct sp_node, nd);
2492 }
2493
2494 /*
2495  * Insert a new shared policy into the list.  Caller holds sp->lock for
2496  * writing.
2497  */
2498 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2499 {
2500         struct rb_node **p = &sp->root.rb_node;
2501         struct rb_node *parent = NULL;
2502         struct sp_node *nd;
2503
2504         while (*p) {
2505                 parent = *p;
2506                 nd = rb_entry(parent, struct sp_node, nd);
2507                 if (new->start < nd->start)
2508                         p = &(*p)->rb_left;
2509                 else if (new->end > nd->end)
2510                         p = &(*p)->rb_right;
2511                 else
2512                         BUG();
2513         }
2514         rb_link_node(&new->nd, parent, p);
2515         rb_insert_color(&new->nd, &sp->root);
2516         pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2517                  new->policy ? new->policy->mode : 0);
2518 }
2519
2520 /* Find shared policy intersecting idx */
2521 struct mempolicy *
2522 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2523 {
2524         struct mempolicy *pol = NULL;
2525         struct sp_node *sn;
2526
2527         if (!sp->root.rb_node)
2528                 return NULL;
2529         read_lock(&sp->lock);
2530         sn = sp_lookup(sp, idx, idx+1);
2531         if (sn) {
2532                 mpol_get(sn->policy);
2533                 pol = sn->policy;
2534         }
2535         read_unlock(&sp->lock);
2536         return pol;
2537 }
2538
2539 static void sp_free(struct sp_node *n)
2540 {
2541         mpol_put(n->policy);
2542         kmem_cache_free(sn_cache, n);
2543 }
2544
/**
 * mpol_misplaced - check whether current page node is valid in policy
 *
 * @page: page to be checked
 * @vma: vm area where page mapped
 * @addr: virtual address where page mapped
 *
 * Lookup current policy node id for vma,addr and "compare to" page's
 * node id.  Policy determination "mimics" alloc_page_vma().
 * Called from fault path where we know the vma and faulting address.
 *
 * Policies without MPOL_F_MOF are never reported as misplaced.  For
 * policies with MPOL_F_MORON the target node is the node of the CPU
 * running this code, subject to should_numa_migrate_memory().
 *
 * Return: NUMA_NO_NODE if the page is in a node that is valid for this
 * policy, or a suitable node ID to allocate a replacement page from.
 */
int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
{
        struct mempolicy *pol;
        struct zoneref *z;
        int curnid = page_to_nid(page);         /* node the page is on now */
        unsigned long pgoff;
        int thiscpu = raw_smp_processor_id();
        int thisnid = cpu_to_node(thiscpu);     /* node of the executing CPU */
        int polnid = NUMA_NO_NODE;              /* node the policy asks for */
        int ret = NUMA_NO_NODE;

        pol = get_vma_policy(vma, addr);
        /* Only policies flagged MPOL_F_MOF take part in this check. */
        if (!(pol->flags & MPOL_F_MOF))
                goto out;

        switch (pol->mode) {
        case MPOL_INTERLEAVE:
                /* Page offset: vma's starting offset plus pages into the vma */
                pgoff = vma->vm_pgoff;
                pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
                polnid = offset_il_node(pol, pgoff);
                break;

        case MPOL_PREFERRED:
                /* Any node in the policy mask is acceptable as-is. */
                if (node_isset(curnid, pol->nodes))
                        goto out;
                polnid = first_node(pol->nodes);
                break;

        case MPOL_LOCAL:
                polnid = numa_node_id();
                break;

        case MPOL_BIND:
                /* Optimize placement among multiple nodes via NUMA balancing */
                if (pol->flags & MPOL_F_MORON) {
                        /*
                         * Local node allowed by the mask: leave the switch
                         * and let the MPOL_F_MORON handling below decide.
                         * Otherwise the page is reported as not misplaced.
                         */
                        if (node_isset(thisnid, pol->nodes))
                                break;
                        goto out;
                }
                fallthrough;

        case MPOL_PREFERRED_MANY:
                /*
                 * use current page if in policy nodemask,
                 * else select nearest allowed node, if any.
                 * If no allowed nodes, use current [!misplaced].
                 */
                if (node_isset(curnid, pol->nodes))
                        goto out;
                z = first_zones_zonelist(
                                node_zonelist(numa_node_id(), GFP_HIGHUSER),
                                gfp_zone(GFP_HIGHUSER),
                                &pol->nodes);
                polnid = zone_to_nid(z->zone);
                break;

        default:
                BUG();
        }

        /* Migrate the page towards the node whose CPU is referencing it */
        if (pol->flags & MPOL_F_MORON) {
                /* Overrides whatever node the switch above selected. */
                polnid = thisnid;

                if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
                        goto out;
        }

        /* Report a target node only if it differs from the current one. */
        if (curnid != polnid)
                ret = polnid;
out:
        mpol_cond_put(pol);

        return ret;
}
2634
2635 /*
2636  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2637  * dropped after task->mempolicy is set to NULL so that any allocation done as
2638  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2639  * policy.
2640  */
2641 void mpol_put_task_policy(struct task_struct *task)
2642 {
2643         struct mempolicy *pol;
2644
2645         task_lock(task);
2646         pol = task->mempolicy;
2647         task->mempolicy = NULL;
2648         task_unlock(task);
2649         mpol_put(pol);
2650 }
2651
2652 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2653 {
2654         pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2655         rb_erase(&n->nd, &sp->root);
2656         sp_free(n);
2657 }
2658
2659 static void sp_node_init(struct sp_node *node, unsigned long start,
2660                         unsigned long end, struct mempolicy *pol)
2661 {
2662         node->start = start;
2663         node->end = end;
2664         node->policy = pol;
2665 }
2666
2667 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2668                                 struct mempolicy *pol)
2669 {
2670         struct sp_node *n;
2671         struct mempolicy *newpol;
2672
2673         n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2674         if (!n)
2675                 return NULL;
2676
2677         newpol = mpol_dup(pol);
2678         if (IS_ERR(newpol)) {
2679                 kmem_cache_free(sn_cache, n);
2680                 return NULL;
2681         }
2682         newpol->flags |= MPOL_F_SHARED;
2683         sp_node_init(n, start, end, newpol);
2684
2685         return n;
2686 }
2687
/*
 * Replace a policy range.
 *
 * Clears [@start, @end) of existing nodes in @sp and then, if @new is
 * non-NULL, inserts @new.  Existing nodes overlapping the range are
 * handled as follows:
 *   - fully inside the range:      deleted;
 *   - overlapping only its end:    start moved up to @end;
 *   - overlapping only its start:  end moved down to @start;
 *   - spanning the whole range:    split in two, which needs one extra
 *                                  sp_node and one extra mempolicy.
 *
 * The extra objects for the split case are allocated with GFP_KERNEL at
 * alloc_new, with sp->lock dropped, after which the scan starts over.
 *
 * Returns 0 on success or -ENOMEM if that allocation fails.  @new is not
 * freed here on failure.
 */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
                                 unsigned long end, struct sp_node *new)
{
        struct sp_node *n;
        struct sp_node *n_new = NULL;           /* spare node for a split */
        struct mempolicy *mpol_new = NULL;      /* spare policy for a split */
        int ret = 0;

restart:
        write_lock(&sp->lock);
        n = sp_lookup(sp, start, end);
        /* Take care of old policies in the same range. */
        while (n && n->start < end) {
                /* Fetch the successor first: n may be deleted below. */
                struct rb_node *next = rb_next(&n->nd);
                if (n->start >= start) {
                        if (n->end <= end)
                                sp_delete(sp, n);
                        else
                                n->start = end;
                } else {
                        /* Old policy spanning whole new range. */
                        if (n->end > end) {
                                /* No spare objects yet: allocate and retry. */
                                if (!n_new)
                                        goto alloc_new;

                                /*
                                 * Tail piece [end, n->end) gets a struct copy
                                 * of the old policy with its own refcount;
                                 * the old node keeps the head, ending at start.
                                 */
                                *mpol_new = *n->policy;
                                atomic_set(&mpol_new->refcnt, 1);
                                sp_node_init(n_new, end, n->end, mpol_new);
                                n->end = start;
                                sp_insert(sp, n_new);
                                /* Now owned by the tree; don't free below. */
                                n_new = NULL;
                                mpol_new = NULL;
                                break;
                        } else
                                n->end = start;
                }
                if (!next)
                        break;
                n = rb_entry(next, struct sp_node, nd);
        }
        if (new)
                sp_insert(sp, new);
        write_unlock(&sp->lock);
        ret = 0;

err_out:
        /* Release spare objects that were allocated but not consumed. */
        if (mpol_new)
                mpol_put(mpol_new);
        if (n_new)
                kmem_cache_free(sn_cache, n_new);

        return ret;

alloc_new:
        /* Allocate outside the lock, then redo the scan from the top. */
        write_unlock(&sp->lock);
        ret = -ENOMEM;
        n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
        if (!n_new)
                goto err_out;
        mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
        if (!mpol_new)
                goto err_out;
        /* Give the spare a valid refcount so mpol_put() at err_out works. */
        atomic_set(&mpol_new->refcnt, 1);
        goto restart;
}
2754
2755 /**
2756  * mpol_shared_policy_init - initialize shared policy for inode
2757  * @sp: pointer to inode shared policy
2758  * @mpol:  struct mempolicy to install
2759  *
2760  * Install non-NULL @mpol in inode's shared policy rb-tree.
2761  * On entry, the current task has a reference on a non-NULL @mpol.
2762  * This must be released on exit.
2763  * This is called at get_inode() calls and we can use GFP_KERNEL.
2764  */
2765 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2766 {
2767         int ret;
2768
2769         sp->root = RB_ROOT;             /* empty tree == default mempolicy */
2770         rwlock_init(&sp->lock);
2771
2772         if (mpol) {
2773                 struct vm_area_struct pvma;
2774                 struct mempolicy *new;
2775                 NODEMASK_SCRATCH(scratch);
2776
2777                 if (!scratch)
2778                         goto put_mpol;
2779                 /* contextualize the tmpfs mount point mempolicy */
2780                 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2781                 if (IS_ERR(new))
2782                         goto free_scratch; /* no valid nodemask intersection */
2783
2784                 task_lock(current);
2785                 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2786                 task_unlock(current);
2787                 if (ret)
2788                         goto put_new;
2789
2790                 /* Create pseudo-vma that contains just the policy */
2791                 vma_init(&pvma, NULL);
2792                 pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
2793                 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2794
2795 put_new:
2796                 mpol_put(new);                  /* drop initial ref */
2797 free_scratch:
2798                 NODEMASK_SCRATCH_FREE(scratch);
2799 put_mpol:
2800                 mpol_put(mpol); /* drop our incoming ref on sb mpol */
2801         }
2802 }
2803
2804 int mpol_set_shared_policy(struct shared_policy *info,
2805                         struct vm_area_struct *vma, struct mempolicy *npol)
2806 {
2807         int err;
2808         struct sp_node *new = NULL;
2809         unsigned long sz = vma_pages(vma);
2810
2811         pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2812                  vma->vm_pgoff,
2813                  sz, npol ? npol->mode : -1,
2814                  npol ? npol->flags : -1,
2815                  npol ? nodes_addr(npol->nodes)[0] : NUMA_NO_NODE);
2816
2817         if (npol) {
2818                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2819                 if (!new)
2820                         return -ENOMEM;
2821         }
2822         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2823         if (err && new)
2824                 sp_free(new);
2825         return err;
2826 }
2827
2828 /* Free a backing policy store on inode delete. */
2829 void mpol_free_shared_policy(struct shared_policy *p)
2830 {
2831         struct sp_node *n;
2832         struct rb_node *next;
2833
2834         if (!p->root.rb_node)
2835                 return;
2836         write_lock(&p->lock);
2837         next = rb_first(&p->root);
2838         while (next) {
2839                 n = rb_entry(next, struct sp_node, nd);
2840                 next = rb_next(&n->nd);
2841                 sp_delete(p, n);
2842         }
2843         write_unlock(&p->lock);
2844 }
2845
2846 #ifdef CONFIG_NUMA_BALANCING
2847 static int __initdata numabalancing_override;
2848
2849 static void __init check_numabalancing_enable(void)
2850 {
2851         bool numabalancing_default = false;
2852
2853         if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2854                 numabalancing_default = true;
2855
2856         /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2857         if (numabalancing_override)
2858                 set_numabalancing_state(numabalancing_override == 1);
2859
2860         if (num_online_nodes() > 1 && !numabalancing_override) {
2861                 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2862                         numabalancing_default ? "Enabling" : "Disabling");
2863                 set_numabalancing_state(numabalancing_default);
2864         }
2865 }
2866
2867 static int __init setup_numabalancing(char *str)
2868 {
2869         int ret = 0;
2870         if (!str)
2871                 goto out;
2872
2873         if (!strcmp(str, "enable")) {
2874                 numabalancing_override = 1;
2875                 ret = 1;
2876         } else if (!strcmp(str, "disable")) {
2877                 numabalancing_override = -1;
2878                 ret = 1;
2879         }
2880 out:
2881         if (!ret)
2882                 pr_warn("Unable to parse numa_balancing=\n");
2883
2884         return ret;
2885 }
2886 __setup("numa_balancing=", setup_numabalancing);
2887 #else
/* Built without CONFIG_NUMA_BALANCING: there is nothing to configure. */
static inline void __init check_numabalancing_enable(void)
{
}
2891 #endif /* CONFIG_NUMA_BALANCING */
2892
2893 /* assumes fs == KERNEL_DS */
2894 void __init numa_policy_init(void)
2895 {
2896         nodemask_t interleave_nodes;
2897         unsigned long largest = 0;
2898         int nid, prefer = 0;
2899
2900         policy_cache = kmem_cache_create("numa_policy",
2901                                          sizeof(struct mempolicy),
2902                                          0, SLAB_PANIC, NULL);
2903
2904         sn_cache = kmem_cache_create("shared_policy_node",
2905                                      sizeof(struct sp_node),
2906                                      0, SLAB_PANIC, NULL);
2907
2908         for_each_node(nid) {
2909                 preferred_node_policy[nid] = (struct mempolicy) {
2910                         .refcnt = ATOMIC_INIT(1),
2911                         .mode = MPOL_PREFERRED,
2912                         .flags = MPOL_F_MOF | MPOL_F_MORON,
2913                         .nodes = nodemask_of_node(nid),
2914                 };
2915         }
2916
2917         /*
2918          * Set interleaving policy for system init. Interleaving is only
2919          * enabled across suitably sized nodes (default is >= 16MB), or
2920          * fall back to the largest node if they're all smaller.
2921          */
2922         nodes_clear(interleave_nodes);
2923         for_each_node_state(nid, N_MEMORY) {
2924                 unsigned long total_pages = node_present_pages(nid);
2925
2926                 /* Preserve the largest node */
2927                 if (largest < total_pages) {
2928                         largest = total_pages;
2929                         prefer = nid;
2930                 }
2931
2932                 /* Interleave this node? */
2933                 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2934                         node_set(nid, interleave_nodes);
2935         }
2936
2937         /* All too small, use the largest */
2938         if (unlikely(nodes_empty(interleave_nodes)))
2939                 node_set(prefer, interleave_nodes);
2940
2941         if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2942                 pr_err("%s: interleaving failed\n", __func__);
2943
2944         check_numabalancing_enable();
2945 }
2946
/*
 * Reset policy of current process to default.
 *
 * Calls do_set_mempolicy() with MPOL_DEFAULT, no flags and no nodemask;
 * its return value is ignored.
 */
void numa_default_policy(void)
{
        do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}
2952
2953 /*
2954  * Parse and format mempolicy from/to strings
2955  */
2956
/*
 * Mode names, indexed by MPOL_* mode value.  mpol_parse_str() matches the
 * mode part of its input against this table and mpol_to_str() prints from
 * it.
 */
static const char * const policy_modes[] =
{
        [MPOL_DEFAULT]    = "default",
        [MPOL_PREFERRED]  = "prefer",
        [MPOL_BIND]       = "bind",
        [MPOL_INTERLEAVE] = "interleave",
        [MPOL_LOCAL]      = "local",
        [MPOL_PREFERRED_MANY]  = "prefer (many)",
};
2966
2967
2968 #ifdef CONFIG_TMPFS
/**
 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
 * @str:  string containing mempolicy to parse
 * @mpol:  pointer to struct mempolicy pointer, returned on success.
 *
 * Format of input:
 *      <mode>[=<flags>][:<nodelist>]
 *
 * @str is split in place while parsing (the '=' and ':' separators are
 * temporarily overwritten with NUL) and restored before returning.
 *
 * On success with mode "default", *@mpol is set to NULL; for every other
 * mode it is set to a newly created policy.  On failure *@mpol is left
 * untouched.
 *
 * Return: %0 on success, else %1
 */
int mpol_parse_str(char *str, struct mempolicy **mpol)
{
        struct mempolicy *new = NULL;
        unsigned short mode_flags;
        nodemask_t nodes;
        /* Both separators are located before either one is overwritten. */
        char *nodelist = strchr(str, ':');
        char *flags = strchr(str, '=');
        int err = 1, mode;

        if (flags)
                *flags++ = '\0';        /* terminate mode string */

        if (nodelist) {
                /* NUL-terminate mode or flags string */
                *nodelist++ = '\0';
                if (nodelist_parse(nodelist, nodes))
                        goto out;
                /* Every requested node must be a node with memory. */
                if (!nodes_subset(nodes, node_states[N_MEMORY]))
                        goto out;
        } else
                nodes_clear(nodes);

        /* str now holds just the mode name. */
        mode = match_string(policy_modes, MPOL_MAX, str);
        if (mode < 0)
                goto out;

        switch (mode) {
        case MPOL_PREFERRED:
                /*
                 * Insist on a nodelist of one node only, although later
                 * we use first_node(nodes) to grab a single node, so here
                 * nodelist (or nodes) cannot be empty.
                 */
                if (nodelist) {
                        /* Only a plain run of digits is accepted. */
                        char *rest = nodelist;
                        while (isdigit(*rest))
                                rest++;
                        if (*rest)
                                goto out;
                        if (nodes_empty(nodes))
                                goto out;
                }
                break;
        case MPOL_INTERLEAVE:
                /*
                 * Default to online nodes with memory if no nodelist
                 */
                if (!nodelist)
                        nodes = node_states[N_MEMORY];
                break;
        case MPOL_LOCAL:
                /*
                 * Don't allow a nodelist;  mpol_new() checks flags
                 */
                if (nodelist)
                        goto out;
                break;
        case MPOL_DEFAULT:
                /*
                 * Insist on a empty nodelist
                 */
                if (!nodelist)
                        err = 0;
                /* No policy object is created; new is still NULL here. */
                goto out;
        case MPOL_PREFERRED_MANY:
        case MPOL_BIND:
                /*
                 * Insist on a nodelist
                 */
                if (!nodelist)
                        goto out;
        }

        mode_flags = 0;
        if (flags) {
                /*
                 * Currently, we only support two mutually exclusive
                 * mode flags.
                 */
                if (!strcmp(flags, "static"))
                        mode_flags |= MPOL_F_STATIC_NODES;
                else if (!strcmp(flags, "relative"))
                        mode_flags |= MPOL_F_RELATIVE_NODES;
                else
                        goto out;
        }

        new = mpol_new(mode, mode_flags, &nodes);
        if (IS_ERR(new))
                goto out;

        /*
         * Save nodes for mpol_to_str() to show the tmpfs mount options
         * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
         */
        if (mode != MPOL_PREFERRED) {
                new->nodes = nodes;
        } else if (nodelist) {
                nodes_clear(new->nodes);
                node_set(first_node(nodes), new->nodes);
        } else {
                /* "prefer" without a nodelist is stored as MPOL_LOCAL. */
                new->mode = MPOL_LOCAL;
        }

        /*
         * Save nodes for contextualization: this will be used to "clone"
         * the mempolicy in a specific context [cpuset] at a later time.
         */
        new->w.user_nodemask = nodes;

        err = 0;

out:
        /* Restore string for error message */
        if (nodelist)
                *--nodelist = ':';
        if (flags)
                *--flags = '=';
        if (!err)
                *mpol = new;
        return err;
}
3101 #endif /* CONFIG_TMPFS */
3102
3103 /**
3104  * mpol_to_str - format a mempolicy structure for printing
3105  * @buffer:  to contain formatted mempolicy string
3106  * @maxlen:  length of @buffer
3107  * @pol:  pointer to mempolicy to be formatted
3108  *
3109  * Convert @pol into a string.  If @buffer is too short, truncate the string.
3110  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
3111  * longest flag, "relative", and to display at least a few node ids.
3112  */
3113 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
3114 {
3115         char *p = buffer;
3116         nodemask_t nodes = NODE_MASK_NONE;
3117         unsigned short mode = MPOL_DEFAULT;
3118         unsigned short flags = 0;
3119
3120         if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
3121                 mode = pol->mode;
3122                 flags = pol->flags;
3123         }
3124
3125         switch (mode) {
3126         case MPOL_DEFAULT:
3127         case MPOL_LOCAL:
3128                 break;
3129         case MPOL_PREFERRED:
3130         case MPOL_PREFERRED_MANY:
3131         case MPOL_BIND:
3132         case MPOL_INTERLEAVE:
3133                 nodes = pol->nodes;
3134                 break;
3135         default:
3136                 WARN_ON_ONCE(1);
3137                 snprintf(p, maxlen, "unknown");
3138                 return;
3139         }
3140
3141         p += snprintf(p, maxlen, "%s", policy_modes[mode]);
3142
3143         if (flags & MPOL_MODE_FLAGS) {
3144                 p += snprintf(p, buffer + maxlen - p, "=");
3145
3146                 /*
3147                  * Currently, the only defined flags are mutually exclusive
3148                  */
3149                 if (flags & MPOL_F_STATIC_NODES)
3150                         p += snprintf(p, buffer + maxlen - p, "static");
3151                 else if (flags & MPOL_F_RELATIVE_NODES)
3152                         p += snprintf(p, buffer + maxlen - p, "relative");
3153         }
3154
3155         if (!nodes_empty(nodes))
3156                 p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3157                                nodemask_pr_args(&nodes));
3158 }