mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
  19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
  49  * on systems with highmem kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always graceful with that.
  66 */
  67
  68 #include <linux/mempolicy.h>
  69 #include <linux/mm.h>
  70 #include <linux/highmem.h>
  71 #include <linux/hugetlb.h>
  72 #include <linux/kernel.h>
  73 #include <linux/sched.h>
  74 #include <linux/nodemask.h>
  75 #include <linux/cpuset.h>
  76 #include <linux/slab.h>
  77 #include <linux/string.h>
  78 #include <linux/export.h>
  79 #include <linux/nsproxy.h>
  80 #include <linux/interrupt.h>
  81 #include <linux/init.h>
  82 #include <linux/compat.h>
  83 #include <linux/swap.h>
  84 #include <linux/seq_file.h>
  85 #include <linux/proc_fs.h>
  86 #include <linux/migrate.h>
  87 #include <linux/ksm.h>
  88 #include <linux/rmap.h>
  89 #include <linux/security.h>
  90 #include <linux/syscalls.h>
  91 #include <linux/ctype.h>
  92 #include <linux/mm_inline.h>
  93 #include <linux/mmu_notifier.h>
  94
  95 #include <asm/tlbflush.h>
  96 #include <asm/uaccess.h>
  97 #include <linux/random.h>
  98
  99 #include "internal.h"
 100
/*
 * Internal flags, built from MPOL_MF_INTERNAL so they live alongside the
 * MPOL_MF_* flag bits that the range-checking helpers in this file test.
 */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */

/* Slab cache that struct mempolicy objects are allocated from and freed to. */
static struct kmem_cache *policy_cache;
/*
 * NOTE(review): not referenced in the portion of the file reviewed here;
 * presumably the cache for shared-policy nodes -- confirm against its users.
 */
static struct kmem_cache *sn_cache;

/*
 * Highest zone. A specific allocation for a zone below that is not
 * policied.
 */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 *
 * Expressed as MPOL_PREFERRED with MPOL_F_LOCAL set.
 */
static struct mempolicy default_policy = {
        .refcnt = ATOMIC_INIT(1), /* never free it */
        .mode = MPOL_PREFERRED,
        .flags = MPOL_F_LOCAL,
};

/*
 * One policy per node, indexed by node id.  get_task_policy() falls back to
 * the entry for the current node when a task has no policy of its own; an
 * entry whose ->mode is still 0 is treated there as not yet initialised.
 */
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 122
 123 static struct mempolicy *get_task_policy(struct task_struct *p)
 124 {
 125         struct mempolicy *pol = p->mempolicy;
 126         int node;
 127
 128         if (!pol) {
 129                 node = numa_node_id();
 130                 if (node != -1)
 131                         pol = &preferred_node_policy[node];
 132
 133                 /* preferred_node_policy is not initialised early in boot */
 134                 if (!pol->mode)
 135                         pol = NULL;
 136         }
 137
 138         return pol;
 139 }
 140
/*
 * Per-mode callbacks.  mpol_ops[] is indexed by policy mode; this is the
 * forward declaration, the table itself is filled in further down the file.
 */
static const struct mempolicy_operations {
        /*
         * Install @nodes into a freshly allocated policy.  The
         * implementations in this file return 0 on success or -EINVAL.
         * @nodes may be NULL: mpol_set_nodemask() passes NULL to request
         * explicit local allocation.
         */
        int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
        /*
         * If the read-side task has no lock to protect task->mempolicy, the
         * write-side task rebinds task->mempolicy in two steps. The first
         * step sets all the newly allowed nodes, and the second step clears
         * all the disallowed nodes. This way we never end up with no node
         * to allocate a page from.
         * If we have a lock to protect task->mempolicy on the read side, we
         * rebind directly.
         *
         * step:
         *      MPOL_REBIND_ONCE - do rebind work at once
         *      MPOL_REBIND_STEP1 - set all the newly nodes
         *      MPOL_REBIND_STEP2 - clean all the disallowed nodes
         */
        void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
                        enum mpol_rebind_step step);
} mpol_ops[MPOL_MAX];
 160
 161 /* Check that the nodemask contains at least one populated zone */
 162 static int is_valid_nodemask(const nodemask_t *nodemask)
 163 {
 164         return nodes_intersects(*nodemask, node_states[N_MEMORY]);
 165 }
 166
 167 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 168 {
 169         return pol->flags & MPOL_MODE_FLAGS;
 170 }
 171
 172 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 173                                    const nodemask_t *rel)
 174 {
 175         nodemask_t tmp;
 176         nodes_fold(tmp, *orig, nodes_weight(*rel));
 177         nodes_onto(*ret, tmp, *rel);
 178 }
 179
 180 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 181 {
 182         if (nodes_empty(*nodes))
 183                 return -EINVAL;
 184         pol->v.nodes = *nodes;
 185         return 0;
 186 }
 187
 188 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 189 {
 190         if (!nodes)
 191                 pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 192         else if (nodes_empty(*nodes))
 193                 return -EINVAL;                 /*  no allowed nodes */
 194         else
 195                 pol->v.preferred_node = first_node(*nodes);
 196         return 0;
 197 }
 198
 199 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 200 {
 201         if (!is_valid_nodemask(nodes))
 202                 return -EINVAL;
 203         pol->v.nodes = *nodes;
 204         return 0;
 205 }
 206
 207 /*
 208  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 209  * any, for the new policy.  mpol_new() has already validated the nodes
 210  * parameter with respect to the policy mode and flags.  But, we need to
 211  * handle an empty nodemask with MPOL_PREFERRED here.
 212  *
 213  * Must be called holding task's alloc_lock to protect task's mems_allowed
 214  * and mempolicy.  May also be called holding the mmap_semaphore for write.
 215  */
 216 static int mpol_set_nodemask(struct mempolicy *pol,
 217                      const nodemask_t *nodes, struct nodemask_scratch *nsc)
 218 {
 219         int ret;
 220
 221         /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
 222         if (pol == NULL)
 223                 return 0;
 224         /* Check N_MEMORY */
 225         nodes_and(nsc->mask1,
 226                   cpuset_current_mems_allowed, node_states[N_MEMORY]);
 227
 228         VM_BUG_ON(!nodes);
 229         if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
 230                 nodes = NULL;   /* explicit local allocation */
 231         else {
 232                 if (pol->flags & MPOL_F_RELATIVE_NODES)
 233                         mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
 234                 else
 235                         nodes_and(nsc->mask2, *nodes, nsc->mask1);
 236
 237                 if (mpol_store_user_nodemask(pol))
 238                         pol->w.user_nodemask = *nodes;
 239                 else
 240                         pol->w.cpuset_mems_allowed =
 241                                                 cpuset_current_mems_allowed;
 242         }
 243
 244         if (nodes)
 245                 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 246         else
 247                 ret = mpol_ops[pol->mode].create(pol, NULL);
 248         return ret;
 249 }
 250
 251 /*
 252  * This function just creates a new policy, does some check and simple
 253  * initialization. You must invoke mpol_set_nodemask() to set nodes.
 254  */
 255 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 256                                   nodemask_t *nodes)
 257 {
 258         struct mempolicy *policy;
 259
 260         pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 261                  mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
 262
 263         if (mode == MPOL_DEFAULT) {
 264                 if (nodes && !nodes_empty(*nodes))
 265                         return ERR_PTR(-EINVAL);
 266                 return NULL;
 267         }
 268         VM_BUG_ON(!nodes);
 269
 270         /*
 271          * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 272          * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 273          * All other modes require a valid pointer to a non-empty nodemask.
 274          */
 275         if (mode == MPOL_PREFERRED) {
 276                 if (nodes_empty(*nodes)) {
 277                         if (((flags & MPOL_F_STATIC_NODES) ||
 278                              (flags & MPOL_F_RELATIVE_NODES)))
 279                                 return ERR_PTR(-EINVAL);
 280                 }
 281         } else if (mode == MPOL_LOCAL) {
 282                 if (!nodes_empty(*nodes))
 283                         return ERR_PTR(-EINVAL);
 284                 mode = MPOL_PREFERRED;
 285         } else if (nodes_empty(*nodes))
 286                 return ERR_PTR(-EINVAL);
 287         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 288         if (!policy)
 289                 return ERR_PTR(-ENOMEM);
 290         atomic_set(&policy->refcnt, 1);
 291         policy->mode = mode;
 292         policy->flags = flags;
 293
 294         return policy;
 295 }
 296
 297 /* Slow path of a mpol destructor. */
 298 void __mpol_put(struct mempolicy *p)
 299 {
 300         if (!atomic_dec_and_test(&p->refcnt))
 301                 return;
 302         kmem_cache_free(policy_cache, p);
 303 }
 304
/*
 * ->rebind() for MPOL_DEFAULT: intentionally does nothing.  It exists so
 * that mpol_ops[MPOL_DEFAULT].rebind is a valid function pointer.
 */
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
                                enum mpol_rebind_step step)
{
}
 309
/*
 * ->rebind() for the policies that keep a nodemask in pol->v.nodes
 * (used by the MPOL_INTERLEAVE and MPOL_BIND entries of mpol_ops[]).
 *
 * @pol:   policy to update
 * @nodes: new set of nodes to rebind onto
 * @step:  which phase of the rebind to perform:
 *      MPOL_REBIND_ONCE  - do rebind work at once
 *      MPOL_REBIND_STEP1 - set all the newly nodes
 *      MPOL_REBIND_STEP2 - clean all the disallowed nodes
 *
 * Any other @step value hits BUG().
 */
static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
                                 enum mpol_rebind_step step)
{
        nodemask_t tmp;

        /* Work out the new nodemask in tmp, depending on the policy flags. */
        if (pol->flags & MPOL_F_STATIC_NODES)
                nodes_and(tmp, pol->w.user_nodemask, *nodes);
        else if (pol->flags & MPOL_F_RELATIVE_NODES)
                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
        else {
                /*
                 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
                 * result
                 */
                if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
                        nodes_remap(tmp, pol->v.nodes,
                                        pol->w.cpuset_mems_allowed, *nodes);
                        /*
                         * STEP1 caches the remapped mask for STEP2, ONCE
                         * records the new allowed set straight away.
                         * NOTE(review): the "step ?" test relies on
                         * MPOL_REBIND_ONCE being 0 -- confirm against the
                         * enum definition.
                         */
                        pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
                } else if (step == MPOL_REBIND_STEP2) {
                        /* pick up the mask cached by STEP1 */
                        tmp = pol->w.cpuset_mems_allowed;
                        pol->w.cpuset_mems_allowed = *nodes;
                } else
                        BUG();
        }

        /* Never leave the policy without nodes: fall back to the new set. */
        if (nodes_empty(tmp))
                tmp = *nodes;

        /* STEP1 only adds nodes; ONCE and STEP2 install the final mask. */
        if (step == MPOL_REBIND_STEP1)
                nodes_or(pol->v.nodes, pol->v.nodes, tmp);
        else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
                pol->v.nodes = tmp;
        else
                BUG();

        /*
         * Keep the interleave cursor inside the new mask: next node in tmp,
         * else the first node of tmp, else the local node.
         * NOTE(review): this always adjusts current->il_next, whichever
         * task or vma @pol belongs to -- confirm that is intended.
         */
        if (!node_isset(current->il_next, tmp)) {
                current->il_next = next_node(current->il_next, tmp);
                if (current->il_next >= MAX_NUMNODES)
                        current->il_next = first_node(tmp);
                if (current->il_next >= MAX_NUMNODES)
                        current->il_next = numa_node_id();
        }
}
 359
 360 static void mpol_rebind_preferred(struct mempolicy *pol,
 361                                   const nodemask_t *nodes,
 362                                   enum mpol_rebind_step step)
 363 {
 364         nodemask_t tmp;
 365
 366         if (pol->flags & MPOL_F_STATIC_NODES) {
 367                 int node = first_node(pol->w.user_nodemask);
 368
 369                 if (node_isset(node, *nodes)) {
 370                         pol->v.preferred_node = node;
 371                         pol->flags &= ~MPOL_F_LOCAL;
 372                 } else
 373                         pol->flags |= MPOL_F_LOCAL;
 374         } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 375                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 376                 pol->v.preferred_node = first_node(tmp);
 377         } else if (!(pol->flags & MPOL_F_LOCAL)) {
 378                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
 379                                                    pol->w.cpuset_mems_allowed,
 380                                                    *nodes);
 381                 pol->w.cpuset_mems_allowed = *nodes;
 382         }
 383 }
 384
 385 /*
 386  * mpol_rebind_policy - Migrate a policy to a different set of nodes
 387  *
 388  * If read-side task has no lock to protect task->mempolicy, write-side
 389  * task will rebind the task->mempolicy by two step. The first step is
 390  * setting all the newly nodes, and the second step is cleaning all the
 391  * disallowed nodes. In this way, we can avoid finding no node to alloc
 392  * page.
 393  * If we have a lock to protect task->mempolicy in read-side, we do
 394  * rebind directly.
 395  *
 396  * step:
 397  *      MPOL_REBIND_ONCE  - do rebind work at once
 398  *      MPOL_REBIND_STEP1 - set all the newly nodes
 399  *      MPOL_REBIND_STEP2 - clean all the disallowed nodes
 400  */
 401 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
 402                                 enum mpol_rebind_step step)
 403 {
 404         if (!pol)
 405                 return;
 406         if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
 407             nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 408                 return;
 409
 410         if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
 411                 return;
 412
 413         if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
 414                 BUG();
 415
 416         if (step == MPOL_REBIND_STEP1)
 417                 pol->flags |= MPOL_F_REBINDING;
 418         else if (step == MPOL_REBIND_STEP2)
 419                 pol->flags &= ~MPOL_F_REBINDING;
 420         else if (step >= MPOL_REBIND_NSTEP)
 421                 BUG();
 422
 423         mpol_ops[pol->mode].rebind(pol, newmask, step);
 424 }
 425
 426 /*
 427  * Wrapper for mpol_rebind_policy() that just requires task
 428  * pointer, and updates task mempolicy.
 429  *
 430  * Called with task's alloc_lock held.
 431  */
 432
 433 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
 434                         enum mpol_rebind_step step)
 435 {
 436         mpol_rebind_policy(tsk->mempolicy, new, step);
 437 }
 438
 439 /*
 440  * Rebind each vma in mm to new nodemask.
 441  *
 442  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 443  */
 444
 445 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 446 {
 447         struct vm_area_struct *vma;
 448
 449         down_write(&mm->mmap_sem);
 450         for (vma = mm->mmap; vma; vma = vma->vm_next)
 451                 mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
 452         up_write(&mm->mmap_sem);
 453 }
 454
/*
 * Per-mode operations table, indexed by policy mode.
 *
 * MPOL_DEFAULT has no ->create(): mpol_new() returns NULL for that mode and
 * mpol_set_nodemask() returns early for a NULL policy, so ->create() is
 * never reached for it through that path.  MPOL_INTERLEAVE and MPOL_BIND
 * share the nodemask-based rebind.
 */
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
        [MPOL_DEFAULT] = {
                .rebind = mpol_rebind_default,
        },
        [MPOL_INTERLEAVE] = {
                .create = mpol_new_interleave,
                .rebind = mpol_rebind_nodemask,
        },
        [MPOL_PREFERRED] = {
                .create = mpol_new_preferred,
                .rebind = mpol_rebind_preferred,
        },
        [MPOL_BIND] = {
                .create = mpol_new_bind,
                .rebind = mpol_rebind_nodemask,
        },
};
 472
 473 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 474                                 unsigned long flags);
 475
 476 /* Scan through pages checking if pages follow certain conditions. */
 477 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 478                 unsigned long addr, unsigned long end,
 479                 const nodemask_t *nodes, unsigned long flags,
 480                 void *private)
 481 {
 482         pte_t *orig_pte;
 483         pte_t *pte;
 484         spinlock_t *ptl;
 485
 486         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 487         do {
 488                 struct page *page;
 489                 int nid;
 490
 491                 if (!pte_present(*pte))
 492                         continue;
 493                 page = vm_normal_page(vma, addr, *pte);
 494                 if (!page)
 495                         continue;
 496                 /*
 497                  * vm_normal_page() filters out zero pages, but there might
 498                  * still be PageReserved pages to skip, perhaps in a VDSO.
 499                  * And we cannot move PageKsm pages sensibly or safely yet.
 500                  */
 501                 if (PageReserved(page) || PageKsm(page))
 502                         continue;
 503                 nid = page_to_nid(page);
 504                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 505                         continue;
 506
 507                 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 508                         migrate_page_add(page, private, flags);
 509                 else
 510                         break;
 511         } while (pte++, addr += PAGE_SIZE, addr != end);
 512         pte_unmap_unlock(orig_pte, ptl);
 513         return addr != end;
 514 }
 515
 516 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 517                 unsigned long addr, unsigned long end,
 518                 const nodemask_t *nodes, unsigned long flags,
 519                 void *private)
 520 {
 521         pmd_t *pmd;
 522         unsigned long next;
 523
 524         pmd = pmd_offset(pud, addr);
 525         do {
 526                 next = pmd_addr_end(addr, end);
 527                 split_huge_page_pmd(vma, addr, pmd);
 528                 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
 529                         continue;
 530                 if (check_pte_range(vma, pmd, addr, next, nodes,
 531                                     flags, private))
 532                         return -EIO;
 533         } while (pmd++, addr = next, addr != end);
 534         return 0;
 535 }
 536
 537 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 538                 unsigned long addr, unsigned long end,
 539                 const nodemask_t *nodes, unsigned long flags,
 540                 void *private)
 541 {
 542         pud_t *pud;
 543         unsigned long next;
 544
 545         pud = pud_offset(pgd, addr);
 546         do {
 547                 next = pud_addr_end(addr, end);
 548                 if (pud_none_or_clear_bad(pud))
 549                         continue;
 550                 if (check_pmd_range(vma, pud, addr, next, nodes,
 551                                     flags, private))
 552                         return -EIO;
 553         } while (pud++, addr = next, addr != end);
 554         return 0;
 555 }
 556
 557 static inline int check_pgd_range(struct vm_area_struct *vma,
 558                 unsigned long addr, unsigned long end,
 559                 const nodemask_t *nodes, unsigned long flags,
 560                 void *private)
 561 {
 562         pgd_t *pgd;
 563         unsigned long next;
 564
 565         pgd = pgd_offset(vma->vm_mm, addr);
 566         do {
 567                 next = pgd_addr_end(addr, end);
 568                 if (pgd_none_or_clear_bad(pgd))
 569                         continue;
 570                 if (check_pud_range(vma, pgd, addr, next, nodes,
 571                                     flags, private))
 572                         return -EIO;
 573         } while (pgd++, addr = next, addr != end);
 574         return 0;
 575 }
 576
 577 #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
 578 /*
 579  * This is used to mark a range of virtual addresses to be inaccessible.
 580  * These are later cleared by a NUMA hinting fault. Depending on these
 581  * faults, pages may be migrated for better NUMA placement.
 582  *
 583  * This is assuming that NUMA faults are handled using PROT_NONE. If
 584  * an architecture makes a different choice, it will need further
 585  * changes to the core.
 586  */
 587 unsigned long change_prot_numa(struct vm_area_struct *vma,
 588                         unsigned long addr, unsigned long end)
 589 {
 590         int nr_updated;
 591         BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
 592
 593         nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
 594         if (nr_updated)
 595                 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 596
 597         return nr_updated;
 598 }
 599 #else
 600 static unsigned long change_prot_numa(struct vm_area_struct *vma,
 601                         unsigned long addr, unsigned long end)
 602 {
 603         return 0;
 604 }
 605 #endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
 606
 607 /*
 608  * Check if all pages in a range are on a set of nodes.
 609  * If pagelist != NULL then isolate pages from the LRU and
 610  * put them on the pagelist.
 611  */
 612 static struct vm_area_struct *
 613 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 614                 const nodemask_t *nodes, unsigned long flags, void *private)
 615 {
 616         int err;
 617         struct vm_area_struct *first, *vma, *prev;
 618
 619
 620         first = find_vma(mm, start);
 621         if (!first)
 622                 return ERR_PTR(-EFAULT);
 623         prev = NULL;
 624         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 625                 unsigned long endvma = vma->vm_end;
 626
 627                 if (endvma > end)
 628                         endvma = end;
 629                 if (vma->vm_start > start)
 630                         start = vma->vm_start;
 631
 632                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 633                         if (!vma->vm_next && vma->vm_end < end)
 634                                 return ERR_PTR(-EFAULT);
 635                         if (prev && prev->vm_end < vma->vm_start)
 636                                 return ERR_PTR(-EFAULT);
 637                 }
 638
 639                 if (is_vm_hugetlb_page(vma))
 640                         goto next;
 641
 642                 if (flags & MPOL_MF_LAZY) {
 643                         change_prot_numa(vma, start, endvma);
 644                         goto next;
 645                 }
 646
 647                 if ((flags & MPOL_MF_STRICT) ||
 648                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 649                       vma_migratable(vma))) {
 650
 651                         err = check_pgd_range(vma, start, endvma, nodes,
 652                                                 flags, private);
 653                         if (err) {
 654                                 first = ERR_PTR(err);
 655                                 break;
 656                         }
 657                 }
 658 next:
 659                 prev = vma;
 660         }
 661         return first;
 662 }
 663
 664 /*
 665  * Apply policy to a single VMA
 666  * This must be called with the mmap_sem held for writing.
 667  */
 668 static int vma_replace_policy(struct vm_area_struct *vma,
 669                                                 struct mempolicy *pol)
 670 {
 671         int err;
 672         struct mempolicy *old;
 673         struct mempolicy *new;
 674
 675         pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 676                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 677                  vma->vm_ops, vma->vm_file,
 678                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 679
 680         new = mpol_dup(pol);
 681         if (IS_ERR(new))
 682                 return PTR_ERR(new);
 683
 684         if (vma->vm_ops && vma->vm_ops->set_policy) {
 685                 err = vma->vm_ops->set_policy(vma, new);
 686                 if (err)
 687                         goto err_out;
 688         }
 689
 690         old = vma->vm_policy;
 691         vma->vm_policy = new; /* protected by mmap_sem */
 692         mpol_put(old);
 693
 694         return 0;
 695  err_out:
 696         mpol_put(new);
 697         return err;
 698 }
 699
 700 /* Step 2: apply policy to a range and do splits. */
 701 static int mbind_range(struct mm_struct *mm, unsigned long start,
 702                        unsigned long end, struct mempolicy *new_pol)
 703 {
 704         struct vm_area_struct *next;
 705         struct vm_area_struct *prev;
 706         struct vm_area_struct *vma;
 707         int err = 0;
 708         pgoff_t pgoff;
 709         unsigned long vmstart;
 710         unsigned long vmend;
 711
 712         vma = find_vma(mm, start);
 713         if (!vma || vma->vm_start > start)
 714                 return -EFAULT;
 715
 716         prev = vma->vm_prev;
 717         if (start > vma->vm_start)
 718                 prev = vma;
 719
 720         for (; vma && vma->vm_start < end; prev = vma, vma = next) {
 721                 next = vma->vm_next;
 722                 vmstart = max(start, vma->vm_start);
 723                 vmend   = min(end, vma->vm_end);
 724
 725                 if (mpol_equal(vma_policy(vma), new_pol))
 726                         continue;
 727
 728                 pgoff = vma->vm_pgoff +
 729                         ((vmstart - vma->vm_start) >> PAGE_SHIFT);
 730                 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
 731                                   vma->anon_vma, vma->vm_file, pgoff,
 732                                   new_pol);
 733                 if (prev) {
 734                         vma = prev;
 735                         next = vma->vm_next;
 736                         continue;
 737                 }
 738                 if (vma->vm_start != vmstart) {
 739                         err = split_vma(vma->vm_mm, vma, vmstart, 1);
 740                         if (err)
 741                                 goto out;
 742                 }
 743                 if (vma->vm_end != vmend) {
 744                         err = split_vma(vma->vm_mm, vma, vmend, 0);
 745                         if (err)
 746                                 goto out;
 747                 }
 748                 err = vma_replace_policy(vma, new_pol);
 749                 if (err)
 750                         goto out;
 751         }
 752
 753  out:
 754         return err;
 755 }
 756
 757 /*
 758  * Update task->flags PF_MEMPOLICY bit: set iff non-default
 759  * mempolicy.  Allows more rapid checking of this (combined perhaps
 760  * with other PF_* flag bits) on memory allocation hot code paths.
 761  *
 762  * If called from outside this file, the task 'p' should -only- be
 763  * a newly forked child not yet visible on the task list, because
 764  * manipulating the task flags of a visible task is not safe.
 765  *
 766  * The above limitation is why this routine has the funny name
 767  * mpol_fix_fork_child_flag().
 768  *
 769  * It is also safe to call this with a task pointer of current,
 770  * which the static wrapper mpol_set_task_struct_flag() does,
 771  * for use within this file.
 772  */
 773
 774 void mpol_fix_fork_child_flag(struct task_struct *p)
 775 {
 776         if (p->mempolicy)
 777                 p->flags |= PF_MEMPOLICY;
 778         else
 779                 p->flags &= ~PF_MEMPOLICY;
 780 }
 781
 782 static void mpol_set_task_struct_flag(void)
 783 {
 784         mpol_fix_fork_child_flag(current);
 785 }
 786
 787 /* Set the process memory policy */
 788 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 789                              nodemask_t *nodes)
 790 {
 791         struct mempolicy *new, *old;
 792         struct mm_struct *mm = current->mm;
 793         NODEMASK_SCRATCH(scratch);
 794         int ret;
 795
 796         if (!scratch)
 797                 return -ENOMEM;
 798
 799         new = mpol_new(mode, flags, nodes);
 800         if (IS_ERR(new)) {
 801                 ret = PTR_ERR(new);
 802                 goto out;
 803         }
 804         /*
 805          * prevent changing our mempolicy while show_numa_maps()
 806          * is using it.
 807          * Note:  do_set_mempolicy() can be called at init time
 808          * with no 'mm'.
 809          */
 810         if (mm)
 811                 down_write(&mm->mmap_sem);
 812         task_lock(current);
 813         ret = mpol_set_nodemask(new, nodes, scratch);
 814         if (ret) {
 815                 task_unlock(current);
 816                 if (mm)
 817                         up_write(&mm->mmap_sem);
 818                 mpol_put(new);
 819                 goto out;
 820         }
 821         old = current->mempolicy;
 822         current->mempolicy = new;
 823         mpol_set_task_struct_flag();
 824         if (new && new->mode == MPOL_INTERLEAVE &&
 825             nodes_weight(new->v.nodes))
 826                 current->il_next = first_node(new->v.nodes);
 827         task_unlock(current);
 828         if (mm)
 829                 up_write(&mm->mmap_sem);
 830
 831         mpol_put(old);
 832         ret = 0;
 833 out:
 834         NODEMASK_SCRATCH_FREE(scratch);
 835         return ret;
 836 }
 837
 838 /*
 839  * Return nodemask for policy for get_mempolicy() query
 840  *
 841  * Called with task's alloc_lock held
 842  */
 843 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 844 {
 845         nodes_clear(*nodes);
 846         if (p == &default_policy)
 847                 return;
 848
 849         switch (p->mode) {
 850         case MPOL_BIND:
 851                 /* Fall through */
 852         case MPOL_INTERLEAVE:
 853                 *nodes = p->v.nodes;
 854                 break;
 855         case MPOL_PREFERRED:
 856                 if (!(p->flags & MPOL_F_LOCAL))
 857                         node_set(p->v.preferred_node, *nodes);
 858                 /* else return empty node mask for local allocation */
 859                 break;
 860         default:
 861                 BUG();
 862         }
 863 }
 864
 865 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 866 {
 867         struct page *p;
 868         int err;
 869
 870         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 871         if (err >= 0) {
 872                 err = page_to_nid(p);
 873                 put_page(p);
 874         }
 875         return err;
 876 }
 877
 878 /* Retrieve NUMA policy */
 879 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 880                              unsigned long addr, unsigned long flags)
 881 {
 882         int err;
 883         struct mm_struct *mm = current->mm;
 884         struct vm_area_struct *vma = NULL;
 885         struct mempolicy *pol = current->mempolicy;
 886
 887         if (flags &
 888                 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 889                 return -EINVAL;
 890
 891         if (flags & MPOL_F_MEMS_ALLOWED) {
 892                 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 893                         return -EINVAL;
 894                 *policy = 0;    /* just so it's initialized */
 895                 task_lock(current);
 896                 *nmask  = cpuset_current_mems_allowed;
 897                 task_unlock(current);
 898                 return 0;
 899         }
 900
 901         if (flags & MPOL_F_ADDR) {
 902                 /*
 903                  * Do NOT fall back to task policy if the
 904                  * vma/shared policy at addr is NULL.  We
 905                  * want to return MPOL_DEFAULT in this case.
 906                  */
 907                 down_read(&mm->mmap_sem);
 908                 vma = find_vma_intersection(mm, addr, addr+1);
 909                 if (!vma) {
 910                         up_read(&mm->mmap_sem);
 911                         return -EFAULT;
 912                 }
 913                 if (vma->vm_ops && vma->vm_ops->get_policy)
 914                         pol = vma->vm_ops->get_policy(vma, addr);
 915                 else
 916                         pol = vma->vm_policy;
 917         } else if (addr)
 918                 return -EINVAL;
 919
 920         if (!pol)
 921                 pol = &default_policy;  /* indicates default behavior */
 922
 923         if (flags & MPOL_F_NODE) {
 924                 if (flags & MPOL_F_ADDR) {
 925                         err = lookup_node(mm, addr);
 926                         if (err < 0)
 927                                 goto out;
 928                         *policy = err;
 929                 } else if (pol == current->mempolicy &&
 930                                 pol->mode == MPOL_INTERLEAVE) {
 931                         *policy = current->il_next;
 932                 } else {
 933                         err = -EINVAL;
 934                         goto out;
 935                 }
 936         } else {
 937                 *policy = pol == &default_policy ? MPOL_DEFAULT :
 938                                                 pol->mode;
 939                 /*
 940                  * Internal mempolicy flags must be masked off before exposing
 941                  * the policy to userspace.
 942                  */
 943                 *policy |= (pol->flags & MPOL_MODE_FLAGS);
 944         }
 945
 946         if (vma) {
 947                 up_read(&current->mm->mmap_sem);
 948                 vma = NULL;
 949         }
 950
 951         err = 0;
 952         if (nmask) {
 953                 if (mpol_store_user_nodemask(pol)) {
 954                         *nmask = pol->w.user_nodemask;
 955                 } else {
 956                         task_lock(current);
 957                         get_policy_nodemask(pol, nmask);
 958                         task_unlock(current);
 959                 }
 960         }
 961
 962  out:
 963         mpol_cond_put(pol);
 964         if (vma)
 965                 up_read(&current->mm->mmap_sem);
 966         return err;
 967 }
 968
 969 #ifdef CONFIG_MIGRATION
 970 /*
 971  * page migration
 972  */
 973 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 974                                 unsigned long flags)
 975 {
 976         /*
 977          * Avoid migrating a page that is shared with others.
 978          */
 979         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
 980                 if (!isolate_lru_page(page)) {
 981                         list_add_tail(&page->lru, pagelist);
 982                         inc_zone_page_state(page, NR_ISOLATED_ANON +
 983                                             page_is_file_cache(page));
 984                 }
 985         }
 986 }
 987
/*
 * Page allocation callback that migrate_to_node() hands to migrate_pages():
 * allocate the replacement page on node @node with GFP_HIGHUSER_MOVABLE.
 * @page and @x are not used here.
 */
static struct page *new_node_page(struct page *page, unsigned long node, int **x)
{
	return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
}
 992
 993 /*
 994  * Migrate pages from one node to a target node.
 995  * Returns error or the number of pages not migrated.
 996  */
 997 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 998                            int flags)
 999 {
1000         nodemask_t nmask;
1001         LIST_HEAD(pagelist);
1002         int err = 0;
1003
1004         nodes_clear(nmask);
1005         node_set(source, nmask);
1006
1007         /*
1008          * This does not "check" the range but isolates all pages that
1009          * need migration.  Between passing in the full user address
1010          * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1011          */
1012         VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1013         check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1014                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1015
1016         if (!list_empty(&pagelist)) {
1017                 err = migrate_pages(&pagelist, new_node_page, dest,
1018                                                         false, MIGRATE_SYNC,
1019                                                         MR_SYSCALL);
1020                 if (err)
1021                         putback_lru_pages(&pagelist);
1022         }
1023
1024         return err;
1025 }
1026
1027 /*
1028  * Move pages between the two nodesets so as to preserve the physical
1029  * layout as much as possible.
1030  *
1031  * Returns the number of page that could not be moved.
1032  */
1033 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1034                      const nodemask_t *to, int flags)
1035 {
1036         int busy = 0;
1037         int err;
1038         nodemask_t tmp;
1039
1040         err = migrate_prep();
1041         if (err)
1042                 return err;
1043
1044         down_read(&mm->mmap_sem);
1045
1046         err = migrate_vmas(mm, from, to, flags);
1047         if (err)
1048                 goto out;
1049
1050         /*
1051          * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1052          * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1053          * bit in 'tmp', and return that <source, dest> pair for migration.
1054          * The pair of nodemasks 'to' and 'from' define the map.
1055          *
1056          * If no pair of bits is found that way, fallback to picking some
1057          * pair of 'source' and 'dest' bits that are not the same.  If the
1058          * 'source' and 'dest' bits are the same, this represents a node
1059          * that will be migrating to itself, so no pages need move.
1060          *
1061          * If no bits are left in 'tmp', or if all remaining bits left
1062          * in 'tmp' correspond to the same bit in 'to', return false
1063          * (nothing left to migrate).
1064          *
1065          * This lets us pick a pair of nodes to migrate between, such that
1066          * if possible the dest node is not already occupied by some other
1067          * source node, minimizing the risk of overloading the memory on a
1068          * node that would happen if we migrated incoming memory to a node
1069          * before migrating outgoing memory source that same node.
1070          *
1071          * A single scan of tmp is sufficient.  As we go, we remember the
1072          * most recent <s, d> pair that moved (s != d).  If we find a pair
1073          * that not only moved, but what's better, moved to an empty slot
1074          * (d is not set in tmp), then we break out then, with that pair.
1075          * Otherwise when we finish scanning from_tmp, we at least have the
1076          * most recent <s, d> pair that moved.  If we get all the way through
1077          * the scan of tmp without finding any node that moved, much less
1078          * moved to an empty node, then there is nothing left worth migrating.
1079          */
1080
1081         tmp = *from;
1082         while (!nodes_empty(tmp)) {
1083                 int s,d;
1084                 int source = -1;
1085                 int dest = 0;
1086
1087                 for_each_node_mask(s, tmp) {
1088
1089                         /*
1090                          * do_migrate_pages() tries to maintain the relative
1091                          * node relationship of the pages established between
1092                          * threads and memory areas.
1093                          *
1094                          * However if the number of source nodes is not equal to
1095                          * the number of destination nodes we can not preserve
1096                          * this node relative relationship.  In that case, skip
1097                          * copying memory from a node that is in the destination
1098                          * mask.
1099                          *
1100                          * Example: [2,3,4] -> [3,4,5] moves everything.
1101                          *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1102                          */
1103
1104                         if ((nodes_weight(*from) != nodes_weight(*to)) &&
1105                                                 (node_isset(s, *to)))
1106                                 continue;
1107
1108                         d = node_remap(s, *from, *to);
1109                         if (s == d)
1110                                 continue;
1111
1112                         source = s;     /* Node moved. Memorize */
1113                         dest = d;
1114
1115                         /* dest not in remaining from nodes? */
1116                         if (!node_isset(dest, tmp))
1117                                 break;
1118                 }
1119                 if (source == -1)
1120                         break;
1121
1122                 node_clear(source, tmp);
1123                 err = migrate_to_node(mm, source, dest, flags);
1124                 if (err > 0)
1125                         busy += err;
1126                 if (err < 0)
1127                         break;
1128         }
1129 out:
1130         up_read(&mm->mmap_sem);
1131         if (err < 0)
1132                 return err;
1133         return busy;
1134
1135 }
1136
1137 /*
1138  * Allocate a new page for page migration based on vma policy.
1139  * Start assuming that page is mapped by vma pointed to by @private.
1140  * Search forward from there, if not.  N.B., this assumes that the
1141  * list of pages handed to migrate_pages()--which is how we get here--
1142  * is in virtual address order.
1143  */
1144 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1145 {
1146         struct vm_area_struct *vma = (struct vm_area_struct *)private;
1147         unsigned long uninitialized_var(address);
1148
1149         while (vma) {
1150                 address = page_address_in_vma(page, vma);
1151                 if (address != -EFAULT)
1152                         break;
1153                 vma = vma->vm_next;
1154         }
1155
1156         /*
1157          * if !vma, alloc_page_vma() will use task or system default policy
1158          */
1159         return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1160 }
1161 #else
1162
/* Stub used when CONFIG_MIGRATION is not set: no page is ever queued. */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
}
1167
/* Stub used when CONFIG_MIGRATION is not set: always fails with -ENOSYS. */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	return -ENOSYS;
}
1173
/* Stub used when CONFIG_MIGRATION is not set: never allocates a page. */
static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
{
	return NULL;
}
1178 #endif
1179
1180 static long do_mbind(unsigned long start, unsigned long len,
1181                      unsigned short mode, unsigned short mode_flags,
1182                      nodemask_t *nmask, unsigned long flags)
1183 {
1184         struct vm_area_struct *vma;
1185         struct mm_struct *mm = current->mm;
1186         struct mempolicy *new;
1187         unsigned long end;
1188         int err;
1189         LIST_HEAD(pagelist);
1190
1191         if (flags & ~(unsigned long)MPOL_MF_VALID)
1192                 return -EINVAL;
1193         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1194                 return -EPERM;
1195
1196         if (start & ~PAGE_MASK)
1197                 return -EINVAL;
1198
1199         if (mode == MPOL_DEFAULT)
1200                 flags &= ~MPOL_MF_STRICT;
1201
1202         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1203         end = start + len;
1204
1205         if (end < start)
1206                 return -EINVAL;
1207         if (end == start)
1208                 return 0;
1209
1210         new = mpol_new(mode, mode_flags, nmask);
1211         if (IS_ERR(new))
1212                 return PTR_ERR(new);
1213
1214         if (flags & MPOL_MF_LAZY)
1215                 new->flags |= MPOL_F_MOF;
1216
1217         /*
1218          * If we are using the default policy then operation
1219          * on discontinuous address spaces is okay after all
1220          */
1221         if (!new)
1222                 flags |= MPOL_MF_DISCONTIG_OK;
1223
1224         pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1225                  start, start + len, mode, mode_flags,
1226                  nmask ? nodes_addr(*nmask)[0] : -1);
1227
1228         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1229
1230                 err = migrate_prep();
1231                 if (err)
1232                         goto mpol_out;
1233         }
1234         {
1235                 NODEMASK_SCRATCH(scratch);
1236                 if (scratch) {
1237                         down_write(&mm->mmap_sem);
1238                         task_lock(current);
1239                         err = mpol_set_nodemask(new, nmask, scratch);
1240                         task_unlock(current);
1241                         if (err)
1242                                 up_write(&mm->mmap_sem);
1243                 } else
1244                         err = -ENOMEM;
1245                 NODEMASK_SCRATCH_FREE(scratch);
1246         }
1247         if (err)
1248                 goto mpol_out;
1249
1250         vma = check_range(mm, start, end, nmask,
1251                           flags | MPOL_MF_INVERT, &pagelist);
1252
1253         err = PTR_ERR(vma);     /* maybe ... */
1254         if (!IS_ERR(vma))
1255                 err = mbind_range(mm, start, end, new);
1256
1257         if (!err) {
1258                 int nr_failed = 0;
1259
1260                 if (!list_empty(&pagelist)) {
1261                         WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1262                         nr_failed = migrate_pages(&pagelist, new_vma_page,
1263                                                 (unsigned long)vma,
1264                                                 false, MIGRATE_SYNC,
1265                                                 MR_MEMPOLICY_MBIND);
1266                         if (nr_failed)
1267                                 putback_lru_pages(&pagelist);
1268                 }
1269
1270                 if (nr_failed && (flags & MPOL_MF_STRICT))
1271                         err = -EIO;
1272         } else
1273                 putback_lru_pages(&pagelist);
1274
1275         up_write(&mm->mmap_sem);
1276  mpol_out:
1277         mpol_put(new);
1278         return err;
1279 }
1280
1281 /*
1282  * User space interface with variable sized bitmaps for nodelists.
1283  */
1284
1285 /* Copy a node mask from user space. */
1286 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1287                      unsigned long maxnode)
1288 {
1289         unsigned long k;
1290         unsigned long nlongs;
1291         unsigned long endmask;
1292
1293         --maxnode;
1294         nodes_clear(*nodes);
1295         if (maxnode == 0 || !nmask)
1296                 return 0;
1297         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1298                 return -EINVAL;
1299
1300         nlongs = BITS_TO_LONGS(maxnode);
1301         if ((maxnode % BITS_PER_LONG) == 0)
1302                 endmask = ~0UL;
1303         else
1304                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1305
1306         /* When the user specified more nodes than supported just check
1307            if the non supported part is all zero. */
1308         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1309                 if (nlongs > PAGE_SIZE/sizeof(long))
1310                         return -EINVAL;
1311                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1312                         unsigned long t;
1313                         if (get_user(t, nmask + k))
1314                                 return -EFAULT;
1315                         if (k == nlongs - 1) {
1316                                 if (t & endmask)
1317                                         return -EINVAL;
1318                         } else if (t)
1319                                 return -EINVAL;
1320                 }
1321                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1322                 endmask = ~0UL;
1323         }
1324
1325         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1326                 return -EFAULT;
1327         nodes_addr(*nodes)[nlongs-1] &= endmask;
1328         return 0;
1329 }
1330
1331 /* Copy a kernel node mask to user space */
1332 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1333                               nodemask_t *nodes)
1334 {
1335         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1336         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1337
1338         if (copy > nbytes) {
1339                 if (copy > PAGE_SIZE)
1340                         return -EINVAL;
1341                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1342                         return -EFAULT;
1343                 copy = nbytes;
1344         }
1345         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1346 }
1347
1348 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1349                 unsigned long, mode, unsigned long __user *, nmask,
1350                 unsigned long, maxnode, unsigned, flags)
1351 {
1352         nodemask_t nodes;
1353         int err;
1354         unsigned short mode_flags;
1355
1356         mode_flags = mode & MPOL_MODE_FLAGS;
1357         mode &= ~MPOL_MODE_FLAGS;
1358         if (mode >= MPOL_MAX)
1359                 return -EINVAL;
1360         if ((mode_flags & MPOL_F_STATIC_NODES) &&
1361             (mode_flags & MPOL_F_RELATIVE_NODES))
1362                 return -EINVAL;
1363         err = get_nodes(&nodes, nmask, maxnode);
1364         if (err)
1365                 return err;
1366         return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1367 }
1368
1369 /* Set the process memory policy */
1370 SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1371                 unsigned long, maxnode)
1372 {
1373         int err;
1374         nodemask_t nodes;
1375         unsigned short flags;
1376
1377         flags = mode & MPOL_MODE_FLAGS;
1378         mode &= ~MPOL_MODE_FLAGS;
1379         if ((unsigned int)mode >= MPOL_MAX)
1380                 return -EINVAL;
1381         if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1382                 return -EINVAL;
1383         err = get_nodes(&nodes, nmask, maxnode);
1384         if (err)
1385                 return err;
1386         return do_set_mempolicy(mode, flags, &nodes);
1387 }
1388
1389 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1390                 const unsigned long __user *, old_nodes,
1391                 const unsigned long __user *, new_nodes)
1392 {
1393         const struct cred *cred = current_cred(), *tcred;
1394         struct mm_struct *mm = NULL;
1395         struct task_struct *task;
1396         nodemask_t task_nodes;
1397         int err;
1398         nodemask_t *old;
1399         nodemask_t *new;
1400         NODEMASK_SCRATCH(scratch);
1401
1402         if (!scratch)
1403                 return -ENOMEM;
1404
1405         old = &scratch->mask1;
1406         new = &scratch->mask2;
1407
1408         err = get_nodes(old, old_nodes, maxnode);
1409         if (err)
1410                 goto out;
1411
1412         err = get_nodes(new, new_nodes, maxnode);
1413         if (err)
1414                 goto out;
1415
1416         /* Find the mm_struct */
1417         rcu_read_lock();
1418         task = pid ? find_task_by_vpid(pid) : current;
1419         if (!task) {
1420                 rcu_read_unlock();
1421                 err = -ESRCH;
1422                 goto out;
1423         }
1424         get_task_struct(task);
1425
1426         err = -EINVAL;
1427
1428         /*
1429          * Check if this process has the right to modify the specified
1430          * process. The right exists if the process has administrative
1431          * capabilities, superuser privileges or the same
1432          * userid as the target process.
1433          */
1434         tcred = __task_cred(task);
1435         if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1436             !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1437             !capable(CAP_SYS_NICE)) {
1438                 rcu_read_unlock();
1439                 err = -EPERM;
1440                 goto out_put;
1441         }
1442         rcu_read_unlock();
1443
1444         task_nodes = cpuset_mems_allowed(task);
1445         /* Is the user allowed to access the target nodes? */
1446         if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1447                 err = -EPERM;
1448                 goto out_put;
1449         }
1450
1451         if (!nodes_subset(*new, node_states[N_MEMORY])) {
1452                 err = -EINVAL;
1453                 goto out_put;
1454         }
1455
1456         err = security_task_movememory(task);
1457         if (err)
1458                 goto out_put;
1459
1460         mm = get_task_mm(task);
1461         put_task_struct(task);
1462
1463         if (!mm) {
1464                 err = -EINVAL;
1465                 goto out;
1466         }
1467
1468         err = do_migrate_pages(mm, old, new,
1469                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1470
1471         mmput(mm);
1472 out:
1473         NODEMASK_SCRATCH_FREE(scratch);
1474
1475         return err;
1476
1477 out_put:
1478         put_task_struct(task);
1479         goto out;
1480
1481 }
1482
1483
1484 /* Retrieve NUMA policy */
1485 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1486                 unsigned long __user *, nmask, unsigned long, maxnode,
1487                 unsigned long, addr, unsigned long, flags)
1488 {
1489         int err;
1490         int uninitialized_var(pval);
1491         nodemask_t nodes;
1492
1493         if (nmask != NULL && maxnode < MAX_NUMNODES)
1494                 return -EINVAL;
1495
1496         err = do_get_mempolicy(&pval, &nodes, addr, flags);
1497
1498         if (err)
1499                 return err;
1500
1501         if (policy && put_user(pval, policy))
1502                 return -EFAULT;
1503
1504         if (nmask)
1505                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1506
1507         return err;
1508 }
1509
1510 #ifdef CONFIG_COMPAT
1511
1512 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1513                                      compat_ulong_t __user *nmask,
1514                                      compat_ulong_t maxnode,
1515                                      compat_ulong_t addr, compat_ulong_t flags)
1516 {
1517         long err;
1518         unsigned long __user *nm = NULL;
1519         unsigned long nr_bits, alloc_size;
1520         DECLARE_BITMAP(bm, MAX_NUMNODES);
1521
1522         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1523         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1524
1525         if (nmask)
1526                 nm = compat_alloc_user_space(alloc_size);
1527
1528         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1529
1530         if (!err && nmask) {
1531                 unsigned long copy_size;
1532                 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1533                 err = copy_from_user(bm, nm, copy_size);
1534                 /* ensure entire bitmap is zeroed */
1535                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1536                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1537         }
1538
1539         return err;
1540 }
1541
1542 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1543                                      compat_ulong_t maxnode)
1544 {
1545         long err = 0;
1546         unsigned long __user *nm = NULL;
1547         unsigned long nr_bits, alloc_size;
1548         DECLARE_BITMAP(bm, MAX_NUMNODES);
1549
1550         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1551         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1552
1553         if (nmask) {
1554                 err = compat_get_bitmap(bm, nmask, nr_bits);
1555                 nm = compat_alloc_user_space(alloc_size);
1556                 err |= copy_to_user(nm, bm, alloc_size);
1557         }
1558
1559         if (err)
1560                 return -EFAULT;
1561
1562         return sys_set_mempolicy(mode, nm, nr_bits+1);
1563 }
1564
1565 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1566                              compat_ulong_t mode, compat_ulong_t __user *nmask,
1567                              compat_ulong_t maxnode, compat_ulong_t flags)
1568 {
1569         long err = 0;
1570         unsigned long __user *nm = NULL;
1571         unsigned long nr_bits, alloc_size;
1572         nodemask_t bm;
1573
1574         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1575         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1576
1577         if (nmask) {
1578                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1579                 nm = compat_alloc_user_space(alloc_size);
1580                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1581         }
1582
1583         if (err)
1584                 return -EFAULT;
1585
1586         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1587 }
1588
1589 #endif
1590
1591 /*
1592  * get_vma_policy(@task, @vma, @addr)
1593  * @task - task for fallback if vma policy == default
1594  * @vma   - virtual memory area whose policy is sought
1595  * @addr  - address in @vma for shared policy lookup
1596  *
1597  * Returns effective policy for a VMA at specified address.
1598  * Falls back to @task or system default policy, as necessary.
1599  * Current or other task's task mempolicy and non-shared vma policies must be
1600  * protected by task_lock(task) by the caller.
1601  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1602  * count--added by the get_policy() vm_op, as appropriate--to protect against
1603  * freeing by another task.  It is the caller's responsibility to free the
1604  * extra reference for shared policies.
1605  */
1606 struct mempolicy *get_vma_policy(struct task_struct *task,
1607                 struct vm_area_struct *vma, unsigned long addr)
1608 {
1609         struct mempolicy *pol = get_task_policy(task);
1610
1611         if (vma) {
1612                 if (vma->vm_ops && vma->vm_ops->get_policy) {
1613                         struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1614                                                                         addr);
1615                         if (vpol)
1616                                 pol = vpol;
1617                 } else if (vma->vm_policy) {
1618                         pol = vma->vm_policy;
1619
1620                         /*
1621                          * shmem_alloc_page() passes MPOL_F_SHARED policy with
1622                          * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1623                          * count on these policies which will be dropped by
1624                          * mpol_cond_put() later
1625                          */
1626                         if (mpol_needs_cond_ref(pol))
1627                                 mpol_get(pol);
1628                 }
1629         }
1630         if (!pol)
1631                 pol = &default_policy;
1632         return pol;
1633 }
1634
1635 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1636 {
1637         enum zone_type dynamic_policy_zone = policy_zone;
1638
1639         BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1640
1641         /*
1642          * if policy->v.nodes has movable memory only,
1643          * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1644          *
1645          * policy->v.nodes is intersect with node_states[N_MEMORY].
1646          * so if the following test faile, it implies
1647          * policy->v.nodes has movable memory only.
1648          */
1649         if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1650                 dynamic_policy_zone = ZONE_MOVABLE;
1651
1652         return zone >= dynamic_policy_zone;
1653 }
1654
1655 /*
1656  * Return a nodemask representing a mempolicy for filtering nodes for
1657  * page allocation
1658  */
1659 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1660 {
1661         /* Lower zones don't get a nodemask applied for MPOL_BIND */
1662         if (unlikely(policy->mode == MPOL_BIND) &&
1663                         apply_policy_zone(policy, gfp_zone(gfp)) &&
1664                         cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1665                 return &policy->v.nodes;
1666
1667         return NULL;
1668 }
1669
1670 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1671 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1672         int nd)
1673 {
1674         switch (policy->mode) {
1675         case MPOL_PREFERRED:
1676                 if (!(policy->flags & MPOL_F_LOCAL))
1677                         nd = policy->v.preferred_node;
1678                 break;
1679         case MPOL_BIND:
1680                 /*
1681                  * Normally, MPOL_BIND allocations are node-local within the
1682                  * allowed nodemask.  However, if __GFP_THISNODE is set and the
1683                  * current node isn't part of the mask, we use the zonelist for
1684                  * the first node in the mask instead.
1685                  */
1686                 if (unlikely(gfp & __GFP_THISNODE) &&
1687                                 unlikely(!node_isset(nd, policy->v.nodes)))
1688                         nd = first_node(policy->v.nodes);
1689                 break;
1690         default:
1691                 BUG();
1692         }
1693         return node_zonelist(nd, gfp);
1694 }
1695
1696 /* Do dynamic interleaving for a process */
1697 static unsigned interleave_nodes(struct mempolicy *policy)
1698 {
1699         unsigned nid, next;
1700         struct task_struct *me = current;
1701
1702         nid = me->il_next;
1703         next = next_node(nid, policy->v.nodes);
1704         if (next >= MAX_NUMNODES)
1705                 next = first_node(policy->v.nodes);
1706         if (next < MAX_NUMNODES)
1707                 me->il_next = next;
1708         return nid;
1709 }
1710
1711 /*
1712  * Depending on the memory policy provide a node from which to allocate the
1713  * next slab entry.
1714  * @policy must be protected by freeing by the caller.  If @policy is
1715  * the current task's mempolicy, this protection is implicit, as only the
1716  * task can change it's policy.  The system default policy requires no
1717  * such protection.
1718  */
1719 unsigned slab_node(void)
1720 {
1721         struct mempolicy *policy;
1722
1723         if (in_interrupt())
1724                 return numa_node_id();
1725
1726         policy = current->mempolicy;
1727         if (!policy || policy->flags & MPOL_F_LOCAL)
1728                 return numa_node_id();
1729
1730         switch (policy->mode) {
1731         case MPOL_PREFERRED:
1732                 /*
1733                  * handled MPOL_F_LOCAL above
1734                  */
1735                 return policy->v.preferred_node;
1736
1737         case MPOL_INTERLEAVE:
1738                 return interleave_nodes(policy);
1739
1740         case MPOL_BIND: {
1741                 /*
1742                  * Follow bind policy behavior and start allocation at the
1743                  * first node.
1744                  */
1745                 struct zonelist *zonelist;
1746                 struct zone *zone;
1747                 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1748                 zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1749                 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1750                                                         &policy->v.nodes,
1751                                                         &zone);
1752                 return zone ? zone->node : numa_node_id();
1753         }
1754
1755         default:
1756                 BUG();
1757         }
1758 }
1759
1760 /* Do static interleaving for a VMA with known offset. */
1761 static unsigned offset_il_node(struct mempolicy *pol,
1762                 struct vm_area_struct *vma, unsigned long off)
1763 {
1764         unsigned nnodes = nodes_weight(pol->v.nodes);
1765         unsigned target;
1766         int c;
1767         int nid = -1;
1768
1769         if (!nnodes)
1770                 return numa_node_id();
1771         target = (unsigned int)off % nnodes;
1772         c = 0;
1773         do {
1774                 nid = next_node(nid, pol->v.nodes);
1775                 c++;
1776         } while (c <= target);
1777         return nid;
1778 }
1779
1780 /* Determine a node number for interleave */
1781 static inline unsigned interleave_nid(struct mempolicy *pol,
1782                  struct vm_area_struct *vma, unsigned long addr, int shift)
1783 {
1784         if (vma) {
1785                 unsigned long off;
1786
1787                 /*
1788                  * for small pages, there is no difference between
1789                  * shift and PAGE_SHIFT, so the bit-shift is safe.
1790                  * for huge pages, since vm_pgoff is in units of small
1791                  * pages, we need to shift off the always 0 bits to get
1792                  * a useful offset.
1793                  */
1794                 BUG_ON(shift < PAGE_SHIFT);
1795                 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1796                 off += (addr - vma->vm_start) >> shift;
1797                 return offset_il_node(pol, vma, off);
1798         } else
1799                 return interleave_nodes(pol);
1800 }
1801
1802 /*
1803  * Return the bit number of a random bit set in the nodemask.
1804  * (returns -1 if nodemask is empty)
1805  */
1806 int node_random(const nodemask_t *maskp)
1807 {
1808         int w, bit = -1;
1809
1810         w = nodes_weight(*maskp);
1811         if (w)
1812                 bit = bitmap_ord_to_pos(maskp->bits,
1813                         get_random_int() % w, MAX_NUMNODES);
1814         return bit;
1815 }
1816
1817 #ifdef CONFIG_HUGETLBFS
1818 /*
1819  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1820  * @vma = virtual memory area whose policy is sought
1821  * @addr = address in @vma for shared policy lookup and interleave policy
1822  * @gfp_flags = for requested zone
1823  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1824  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1825  *
1826  * Returns a zonelist suitable for a huge page allocation and a pointer
1827  * to the struct mempolicy for conditional unref after allocation.
1828  * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1829  * @nodemask for filtering the zonelist.
1830  *
1831  * Must be protected by get_mems_allowed()
1832  */
1833 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1834                                 gfp_t gfp_flags, struct mempolicy **mpol,
1835                                 nodemask_t **nodemask)
1836 {
1837         struct zonelist *zl;
1838
1839         *mpol = get_vma_policy(current, vma, addr);
1840         *nodemask = NULL;       /* assume !MPOL_BIND */
1841
1842         if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1843                 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1844                                 huge_page_shift(hstate_vma(vma))), gfp_flags);
1845         } else {
1846                 zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1847                 if ((*mpol)->mode == MPOL_BIND)
1848                         *nodemask = &(*mpol)->v.nodes;
1849         }
1850         return zl;
1851 }
1852
1853 /*
1854  * init_nodemask_of_mempolicy
1855  *
1856  * If the current task's mempolicy is "default" [NULL], return 'false'
1857  * to indicate default policy.  Otherwise, extract the policy nodemask
1858  * for 'bind' or 'interleave' policy into the argument nodemask, or
1859  * initialize the argument nodemask to contain the single node for
1860  * 'preferred' or 'local' policy and return 'true' to indicate presence
1861  * of non-default mempolicy.
1862  *
1863  * We don't bother with reference counting the mempolicy [mpol_get/put]
1864  * because the current task is examining it's own mempolicy and a task's
1865  * mempolicy is only ever changed by the task itself.
1866  *
1867  * N.B., it is the caller's responsibility to free a returned nodemask.
1868  */
1869 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1870 {
1871         struct mempolicy *mempolicy;
1872         int nid;
1873
1874         if (!(mask && current->mempolicy))
1875                 return false;
1876
1877         task_lock(current);
1878         mempolicy = current->mempolicy;
1879         switch (mempolicy->mode) {
1880         case MPOL_PREFERRED:
1881                 if (mempolicy->flags & MPOL_F_LOCAL)
1882                         nid = numa_node_id();
1883                 else
1884                         nid = mempolicy->v.preferred_node;
1885                 init_nodemask_of_node(mask, nid);
1886                 break;
1887
1888         case MPOL_BIND:
1889                 /* Fall through */
1890         case MPOL_INTERLEAVE:
1891                 *mask =  mempolicy->v.nodes;
1892                 break;
1893
1894         default:
1895                 BUG();
1896         }
1897         task_unlock(current);
1898
1899         return true;
1900 }
1901 #endif
1902
1903 /*
1904  * mempolicy_nodemask_intersects
1905  *
1906  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1907  * policy.  Otherwise, check for intersection between mask and the policy
1908  * nodemask for 'bind' or 'interleave' policy.  For 'perferred' or 'local'
1909  * policy, always return true since it may allocate elsewhere on fallback.
1910  *
1911  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1912  */
1913 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1914                                         const nodemask_t *mask)
1915 {
1916         struct mempolicy *mempolicy;
1917         bool ret = true;
1918
1919         if (!mask)
1920                 return ret;
1921         task_lock(tsk);
1922         mempolicy = tsk->mempolicy;
1923         if (!mempolicy)
1924                 goto out;
1925
1926         switch (mempolicy->mode) {
1927         case MPOL_PREFERRED:
1928                 /*
1929                  * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
1930                  * allocate from, they may fallback to other nodes when oom.
1931                  * Thus, it's possible for tsk to have allocated memory from
1932                  * nodes in mask.
1933                  */
1934                 break;
1935         case MPOL_BIND:
1936         case MPOL_INTERLEAVE:
1937                 ret = nodes_intersects(mempolicy->v.nodes, *mask);
1938                 break;
1939         default:
1940                 BUG();
1941         }
1942 out:
1943         task_unlock(tsk);
1944         return ret;
1945 }
1946
1947 /* Allocate a page in interleaved policy.
1948    Own path because it needs to do special accounting. */
1949 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1950                                         unsigned nid)
1951 {
1952         struct zonelist *zl;
1953         struct page *page;
1954
1955         zl = node_zonelist(nid, gfp);
1956         page = __alloc_pages(gfp, order, zl);
1957         if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1958                 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1959         return page;
1960 }
1961
1962 /**
1963  *      alloc_pages_vma - Allocate a page for a VMA.
1964  *
1965  *      @gfp:
1966  *      %GFP_USER    user allocation.
1967  *      %GFP_KERNEL  kernel allocations,
1968  *      %GFP_HIGHMEM highmem/user allocations,
1969  *      %GFP_FS      allocation should not call back into a file system.
1970  *      %GFP_ATOMIC  don't sleep.
1971  *
1972  *      @order:Order of the GFP allocation.
1973  *      @vma:  Pointer to VMA or NULL if not available.
1974  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1975  *
1976  *      This function allocates a page from the kernel page pool and applies
1977  *      a NUMA policy associated with the VMA or the current process.
1978  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1979  *      mm_struct of the VMA to prevent it from going away. Should be used for
1980  *      all allocations for pages that will be mapped into
1981  *      user space. Returns NULL when no page can be allocated.
1982  *
1983  *      Should be called with the mm_sem of the vma hold.
1984  */
1985 struct page *
1986 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1987                 unsigned long addr, int node)
1988 {
1989         struct mempolicy *pol;
1990         struct page *page;
1991         unsigned int cpuset_mems_cookie;
1992
1993 retry_cpuset:
1994         pol = get_vma_policy(current, vma, addr);
1995         cpuset_mems_cookie = get_mems_allowed();
1996
1997         if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1998                 unsigned nid;
1999
2000                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2001                 mpol_cond_put(pol);
2002                 page = alloc_page_interleave(gfp, order, nid);
2003                 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2004                         goto retry_cpuset;
2005
2006                 return page;
2007         }
2008         page = __alloc_pages_nodemask(gfp, order,
2009                                       policy_zonelist(gfp, pol, node),
2010                                       policy_nodemask(gfp, pol));
2011         if (unlikely(mpol_needs_cond_ref(pol)))
2012                 __mpol_put(pol);
2013         if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2014                 goto retry_cpuset;
2015         return page;
2016 }
2017
2018 /**
2019  *      alloc_pages_current - Allocate pages.
2020  *
2021  *      @gfp:
2022  *              %GFP_USER   user allocation,
2023  *              %GFP_KERNEL kernel allocation,
2024  *              %GFP_HIGHMEM highmem allocation,
2025  *              %GFP_FS     don't call back into a file system.
2026  *              %GFP_ATOMIC don't sleep.
2027  *      @order: Power of two of allocation size in pages. 0 is a single page.
2028  *
2029  *      Allocate a page from the kernel page pool.  When not in
2030  *      interrupt context and apply the current process NUMA policy.
2031  *      Returns NULL when no page can be allocated.
2032  *
2033  *      Don't call cpuset_update_task_memory_state() unless
2034  *      1) it's ok to take cpuset_sem (can WAIT), and
2035  *      2) allocating for current task (not interrupt).
2036  */
2037 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2038 {
2039         struct mempolicy *pol = get_task_policy(current);
2040         struct page *page;
2041         unsigned int cpuset_mems_cookie;
2042
2043         if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
2044                 pol = &default_policy;
2045
2046 retry_cpuset:
2047         cpuset_mems_cookie = get_mems_allowed();
2048
2049         /*
2050          * No reference counting needed for current->mempolicy
2051          * nor system default_policy
2052          */
2053         if (pol->mode == MPOL_INTERLEAVE)
2054                 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2055         else
2056                 page = __alloc_pages_nodemask(gfp, order,
2057                                 policy_zonelist(gfp, pol, numa_node_id()),
2058                                 policy_nodemask(gfp, pol));
2059
2060         if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2061                 goto retry_cpuset;
2062
2063         return page;
2064 }
2065 EXPORT_SYMBOL(alloc_pages_current);
2066
2067 /*
2068  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2069  * rebinds the mempolicy its copying by calling mpol_rebind_policy()
2070  * with the mems_allowed returned by cpuset_mems_allowed().  This
2071  * keeps mempolicies cpuset relative after its cpuset moves.  See
2072  * further kernel/cpuset.c update_nodemask().
2073  *
2074  * current's mempolicy may be rebinded by the other task(the task that changes
2075  * cpuset's mems), so we needn't do rebind work for current task.
2076  */
2077
2078 /* Slow path of a mempolicy duplicate */
2079 struct mempolicy *__mpol_dup(struct mempolicy *old)
2080 {
2081         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2082
2083         if (!new)
2084                 return ERR_PTR(-ENOMEM);
2085
2086         /* task's mempolicy is protected by alloc_lock */
2087         if (old == current->mempolicy) {
2088                 task_lock(current);
2089                 *new = *old;
2090                 task_unlock(current);
2091         } else
2092                 *new = *old;
2093
2094         rcu_read_lock();
2095         if (current_cpuset_is_being_rebound()) {
2096                 nodemask_t mems = cpuset_mems_allowed(current);
2097                 if (new->flags & MPOL_F_REBINDING)
2098                         mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2099                 else
2100                         mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2101         }
2102         rcu_read_unlock();
2103         atomic_set(&new->refcnt, 1);
2104         return new;
2105 }
2106
2107 /* Slow path of a mempolicy comparison */
2108 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2109 {
2110         if (!a || !b)
2111                 return false;
2112         if (a->mode != b->mode)
2113                 return false;
2114         if (a->flags != b->flags)
2115                 return false;
2116         if (mpol_store_user_nodemask(a))
2117                 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2118                         return false;
2119
2120         switch (a->mode) {
2121         case MPOL_BIND:
2122                 /* Fall through */
2123         case MPOL_INTERLEAVE:
2124                 return !!nodes_equal(a->v.nodes, b->v.nodes);
2125         case MPOL_PREFERRED:
2126                 return a->v.preferred_node == b->v.preferred_node;
2127         default:
2128                 BUG();
2129                 return false;
2130         }
2131 }
2132
2133 /*
2134  * Shared memory backing store policy support.
2135  *
2136  * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a red-black tree linked from the inode.
2138  * They are protected by the sp->lock spinlock, which should be held
2139  * for any accesses to the tree.
2140  */
2141
2142 /* lookup first element intersecting start-end */
2143 /* Caller holds sp->lock */
2144 static struct sp_node *
2145 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2146 {
2147         struct rb_node *n = sp->root.rb_node;
2148
2149         while (n) {
2150                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2151
2152                 if (start >= p->end)
2153                         n = n->rb_right;
2154                 else if (end <= p->start)
2155                         n = n->rb_left;
2156                 else
2157                         break;
2158         }
2159         if (!n)
2160                 return NULL;
2161         for (;;) {
2162                 struct sp_node *w = NULL;
2163                 struct rb_node *prev = rb_prev(n);
2164                 if (!prev)
2165                         break;
2166                 w = rb_entry(prev, struct sp_node, nd);
2167                 if (w->end <= start)
2168                         break;
2169                 n = prev;
2170         }
2171         return rb_entry(n, struct sp_node, nd);
2172 }
2173
2174 /* Insert a new shared policy into the list. */
2175 /* Caller holds sp->lock */
2176 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2177 {
2178         struct rb_node **p = &sp->root.rb_node;
2179         struct rb_node *parent = NULL;
2180         struct sp_node *nd;
2181
2182         while (*p) {
2183                 parent = *p;
2184                 nd = rb_entry(parent, struct sp_node, nd);
2185                 if (new->start < nd->start)
2186                         p = &(*p)->rb_left;
2187                 else if (new->end > nd->end)
2188                         p = &(*p)->rb_right;
2189                 else
2190                         BUG();
2191         }
2192         rb_link_node(&new->nd, parent, p);
2193         rb_insert_color(&new->nd, &sp->root);
2194         pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2195                  new->policy ? new->policy->mode : 0);
2196 }
2197
2198 /* Find shared policy intersecting idx */
2199 struct mempolicy *
2200 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2201 {
2202         struct mempolicy *pol = NULL;
2203         struct sp_node *sn;
2204
2205         if (!sp->root.rb_node)
2206                 return NULL;
2207         spin_lock(&sp->lock);
2208         sn = sp_lookup(sp, idx, idx+1);
2209         if (sn) {
2210                 mpol_get(sn->policy);
2211                 pol = sn->policy;
2212         }
2213         spin_unlock(&sp->lock);
2214         return pol;
2215 }
2216
2217 static void sp_free(struct sp_node *n)
2218 {
2219         mpol_put(n->policy);
2220         kmem_cache_free(sn_cache, n);
2221 }
2222
2223 /**
2224  * mpol_misplaced - check whether current page node is valid in policy
2225  *
2226  * @page   - page to be checked
2227  * @vma    - vm area where page mapped
2228  * @addr   - virtual address where page mapped
2229  *
2230  * Lookup current policy node id for vma,addr and "compare to" page's
2231  * node id.
2232  *
2233  * Returns:
2234  *      -1      - not misplaced, page is in the right node
2235  *      node    - node id where the page should be
2236  *
2237  * Policy determination "mimics" alloc_page_vma().
2238  * Called from fault path where we know the vma and faulting address.
2239  */
2240 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2241 {
2242         struct mempolicy *pol;
2243         struct zone *zone;
2244         int curnid = page_to_nid(page);
2245         unsigned long pgoff;
2246         int polnid = -1;
2247         int ret = -1;
2248
2249         BUG_ON(!vma);
2250
2251         pol = get_vma_policy(current, vma, addr);
2252         if (!(pol->flags & MPOL_F_MOF))
2253                 goto out;
2254
2255         switch (pol->mode) {
2256         case MPOL_INTERLEAVE:
2257                 BUG_ON(addr >= vma->vm_end);
2258                 BUG_ON(addr < vma->vm_start);
2259
2260                 pgoff = vma->vm_pgoff;
2261                 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2262                 polnid = offset_il_node(pol, vma, pgoff);
2263                 break;
2264
2265         case MPOL_PREFERRED:
2266                 if (pol->flags & MPOL_F_LOCAL)
2267                         polnid = numa_node_id();
2268                 else
2269                         polnid = pol->v.preferred_node;
2270                 break;
2271
2272         case MPOL_BIND:
2273                 /*
2274                  * allows binding to multiple nodes.
2275                  * use current page if in policy nodemask,
2276                  * else select nearest allowed node, if any.
2277                  * If no allowed nodes, use current [!misplaced].
2278                  */
2279                 if (node_isset(curnid, pol->v.nodes))
2280                         goto out;
2281                 (void)first_zones_zonelist(
2282                                 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2283                                 gfp_zone(GFP_HIGHUSER),
2284                                 &pol->v.nodes, &zone);
2285                 polnid = zone->node;
2286                 break;
2287
2288         default:
2289                 BUG();
2290         }
2291
2292         /* Migrate the page towards the node whose CPU is referencing it */
2293         if (pol->flags & MPOL_F_MORON) {
2294                 int last_nid;
2295
2296                 polnid = numa_node_id();
2297
2298                 /*
2299                  * Multi-stage node selection is used in conjunction
2300                  * with a periodic migration fault to build a temporal
2301                  * task<->page relation. By using a two-stage filter we
2302                  * remove short/unlikely relations.
2303                  *
2304                  * Using P(p) ~ n_p / n_t as per frequentist
2305                  * probability, we can equate a task's usage of a
2306                  * particular page (n_p) per total usage of this
2307                  * page (n_t) (in a given time-span) to a probability.
2308                  *
2309                  * Our periodic faults will sample this probability and
2310                  * getting the same result twice in a row, given these
2311                  * samples are fully independent, is then given by
2312                  * P(n)^2, provided our sample period is sufficiently
2313                  * short compared to the usage pattern.
2314                  *
2315                  * This quadric squishes small probabilities, making
2316                  * it less likely we act on an unlikely task<->page
2317                  * relation.
2318                  */
2319                 last_nid = page_nid_xchg_last(page, polnid);
2320                 if (last_nid != polnid)
2321                         goto out;
2322         }
2323
2324         if (curnid != polnid)
2325                 ret = polnid;
2326 out:
2327         mpol_cond_put(pol);
2328
2329         return ret;
2330 }
2331
2332 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2333 {
2334         pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2335         rb_erase(&n->nd, &sp->root);
2336         sp_free(n);
2337 }
2338
2339 static void sp_node_init(struct sp_node *node, unsigned long start,
2340                         unsigned long end, struct mempolicy *pol)
2341 {
2342         node->start = start;
2343         node->end = end;
2344         node->policy = pol;
2345 }
2346
2347 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2348                                 struct mempolicy *pol)
2349 {
2350         struct sp_node *n;
2351         struct mempolicy *newpol;
2352
2353         n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2354         if (!n)
2355                 return NULL;
2356
2357         newpol = mpol_dup(pol);
2358         if (IS_ERR(newpol)) {
2359                 kmem_cache_free(sn_cache, n);
2360                 return NULL;
2361         }
2362         newpol->flags |= MPOL_F_SHARED;
2363         sp_node_init(n, start, end, newpol);
2364
2365         return n;
2366 }
2367
2368 /* Replace a policy range. */
2369 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2370                                  unsigned long end, struct sp_node *new)
2371 {
2372         struct sp_node *n;
2373         struct sp_node *n_new = NULL;
2374         struct mempolicy *mpol_new = NULL;
2375         int ret = 0;
2376
2377 restart:
2378         spin_lock(&sp->lock);
2379         n = sp_lookup(sp, start, end);
2380         /* Take care of old policies in the same range. */
2381         while (n && n->start < end) {
2382                 struct rb_node *next = rb_next(&n->nd);
2383                 if (n->start >= start) {
2384                         if (n->end <= end)
2385                                 sp_delete(sp, n);
2386                         else
2387                                 n->start = end;
2388                 } else {
2389                         /* Old policy spanning whole new range. */
2390                         if (n->end > end) {
2391                                 if (!n_new)
2392                                         goto alloc_new;
2393
2394                                 *mpol_new = *n->policy;
2395                                 atomic_set(&mpol_new->refcnt, 1);
2396                                 sp_node_init(n_new, n->end, end, mpol_new);
2397                                 sp_insert(sp, n_new);
2398                                 n->end = start;
2399                                 n_new = NULL;
2400                                 mpol_new = NULL;
2401                                 break;
2402                         } else
2403                                 n->end = start;
2404                 }
2405                 if (!next)
2406                         break;
2407                 n = rb_entry(next, struct sp_node, nd);
2408         }
2409         if (new)
2410                 sp_insert(sp, new);
2411         spin_unlock(&sp->lock);
2412         ret = 0;
2413
2414 err_out:
2415         if (mpol_new)
2416                 mpol_put(mpol_new);
2417         if (n_new)
2418                 kmem_cache_free(sn_cache, n_new);
2419
2420         return ret;
2421
2422 alloc_new:
2423         spin_unlock(&sp->lock);
2424         ret = -ENOMEM;
2425         n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2426         if (!n_new)
2427                 goto err_out;
2428         mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2429         if (!mpol_new)
2430                 goto err_out;
2431         goto restart;
2432 }
2433
2434 /**
2435  * mpol_shared_policy_init - initialize shared policy for inode
2436  * @sp: pointer to inode shared policy
2437  * @mpol:  struct mempolicy to install
2438  *
2439  * Install non-NULL @mpol in inode's shared policy rb-tree.
2440  * On entry, the current task has a reference on a non-NULL @mpol.
2441  * This must be released on exit.
2442  * This is called at get_inode() calls and we can use GFP_KERNEL.
2443  */
2444 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2445 {
2446         int ret;
2447
2448         sp->root = RB_ROOT;             /* empty tree == default mempolicy */
2449         spin_lock_init(&sp->lock);
2450
2451         if (mpol) {
2452                 struct vm_area_struct pvma;
2453                 struct mempolicy *new;
2454                 NODEMASK_SCRATCH(scratch);
2455
2456                 if (!scratch)
2457                         goto put_mpol;
2458                 /* contextualize the tmpfs mount point mempolicy */
2459                 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2460                 if (IS_ERR(new))
2461                         goto free_scratch; /* no valid nodemask intersection */
2462
2463                 task_lock(current);
2464                 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2465                 task_unlock(current);
2466                 if (ret)
2467                         goto put_new;
2468
2469                 /* Create pseudo-vma that contains just the policy */
2470                 memset(&pvma, 0, sizeof(struct vm_area_struct));
2471                 pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
2472                 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2473
2474 put_new:
2475                 mpol_put(new);                  /* drop initial ref */
2476 free_scratch:
2477                 NODEMASK_SCRATCH_FREE(scratch);
2478 put_mpol:
2479                 mpol_put(mpol); /* drop our incoming ref on sb mpol */
2480         }
2481 }
2482
2483 int mpol_set_shared_policy(struct shared_policy *info,
2484                         struct vm_area_struct *vma, struct mempolicy *npol)
2485 {
2486         int err;
2487         struct sp_node *new = NULL;
2488         unsigned long sz = vma_pages(vma);
2489
2490         pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2491                  vma->vm_pgoff,
2492                  sz, npol ? npol->mode : -1,
2493                  npol ? npol->flags : -1,
2494                  npol ? nodes_addr(npol->v.nodes)[0] : -1);
2495
2496         if (npol) {
2497                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2498                 if (!new)
2499                         return -ENOMEM;
2500         }
2501         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2502         if (err && new)
2503                 sp_free(new);
2504         return err;
2505 }
2506
2507 /* Free a backing policy store on inode delete. */
2508 void mpol_free_shared_policy(struct shared_policy *p)
2509 {
2510         struct sp_node *n;
2511         struct rb_node *next;
2512
2513         if (!p->root.rb_node)
2514                 return;
2515         spin_lock(&p->lock);
2516         next = rb_first(&p->root);
2517         while (next) {
2518                 n = rb_entry(next, struct sp_node, nd);
2519                 next = rb_next(&n->nd);
2520                 sp_delete(p, n);
2521         }
2522         spin_unlock(&p->lock);
2523 }
2524
2525 #ifdef CONFIG_NUMA_BALANCING
2526 static bool __initdata numabalancing_override;
2527
2528 static void __init check_numabalancing_enable(void)
2529 {
2530         bool numabalancing_default = false;
2531
2532         if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2533                 numabalancing_default = true;
2534
2535         if (nr_node_ids > 1 && !numabalancing_override) {
2536                 printk(KERN_INFO "Enabling automatic NUMA balancing. "
2537                         "Configure with numa_balancing= or sysctl");
2538                 set_numabalancing_state(numabalancing_default);
2539         }
2540 }
2541
2542 static int __init setup_numabalancing(char *str)
2543 {
2544         int ret = 0;
2545         if (!str)
2546                 goto out;
2547         numabalancing_override = true;
2548
2549         if (!strcmp(str, "enable")) {
2550                 set_numabalancing_state(true);
2551                 ret = 1;
2552         } else if (!strcmp(str, "disable")) {
2553                 set_numabalancing_state(false);
2554                 ret = 1;
2555         }
2556 out:
2557         if (!ret)
2558                 printk(KERN_WARNING "Unable to parse numa_balancing=\n");
2559
2560         return ret;
2561 }
2562 __setup("numa_balancing=", setup_numabalancing);
2563 #else
2564 static inline void __init check_numabalancing_enable(void)
2565 {
2566 }
2567 #endif /* CONFIG_NUMA_BALANCING */
2568
2569 /* assumes fs == KERNEL_DS */
2570 void __init numa_policy_init(void)
2571 {
2572         nodemask_t interleave_nodes;
2573         unsigned long largest = 0;
2574         int nid, prefer = 0;
2575
2576         policy_cache = kmem_cache_create("numa_policy",
2577                                          sizeof(struct mempolicy),
2578                                          0, SLAB_PANIC, NULL);
2579
2580         sn_cache = kmem_cache_create("shared_policy_node",
2581                                      sizeof(struct sp_node),
2582                                      0, SLAB_PANIC, NULL);
2583
2584         for_each_node(nid) {
2585                 preferred_node_policy[nid] = (struct mempolicy) {
2586                         .refcnt = ATOMIC_INIT(1),
2587                         .mode = MPOL_PREFERRED,
2588                         .flags = MPOL_F_MOF | MPOL_F_MORON,
2589                         .v = { .preferred_node = nid, },
2590                 };
2591         }
2592
2593         /*
2594          * Set interleaving policy for system init. Interleaving is only
2595          * enabled across suitably sized nodes (default is >= 16MB), or
2596          * fall back to the largest node if they're all smaller.
2597          */
2598         nodes_clear(interleave_nodes);
2599         for_each_node_state(nid, N_MEMORY) {
2600                 unsigned long total_pages = node_present_pages(nid);
2601
2602                 /* Preserve the largest node */
2603                 if (largest < total_pages) {
2604                         largest = total_pages;
2605                         prefer = nid;
2606                 }
2607
2608                 /* Interleave this node? */
2609                 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2610                         node_set(nid, interleave_nodes);
2611         }
2612
2613         /* All too small, use the largest */
2614         if (unlikely(nodes_empty(interleave_nodes)))
2615                 node_set(prefer, interleave_nodes);
2616
2617         if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2618                 printk("numa_policy_init: interleaving failed\n");
2619
2620         check_numabalancing_enable();
2621 }
2622
2623 /* Reset policy of current process to default */
2624 void numa_default_policy(void)
2625 {
2626         do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2627 }
2628
2629 /*
2630  * Parse and format mempolicy from/to strings
2631  */
2632
2633 /*
2634  * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2635  */
2636 static const char * const policy_modes[] =
2637 {
2638         [MPOL_DEFAULT]    = "default",
2639         [MPOL_PREFERRED]  = "prefer",
2640         [MPOL_BIND]       = "bind",
2641         [MPOL_INTERLEAVE] = "interleave",
2642         [MPOL_LOCAL]      = "local",
2643 };
2644
2645
2646 #ifdef CONFIG_TMPFS
2647 /**
2648  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2649  * @str:  string containing mempolicy to parse
2650  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2651  *
2652  * Format of input:
2653  *      <mode>[=<flags>][:<nodelist>]
2654  *
2655  * On success, returns 0, else 1
2656  */
2657 int mpol_parse_str(char *str, struct mempolicy **mpol)
2658 {
2659         struct mempolicy *new = NULL;
2660         unsigned short mode;
2661         unsigned short mode_flags;
2662         nodemask_t nodes;
2663         char *nodelist = strchr(str, ':');
2664         char *flags = strchr(str, '=');
2665         int err = 1;
2666
2667         if (nodelist) {
2668                 /* NUL-terminate mode or flags string */
2669                 *nodelist++ = '\0';
2670                 if (nodelist_parse(nodelist, nodes))
2671                         goto out;
2672                 if (!nodes_subset(nodes, node_states[N_MEMORY]))
2673                         goto out;
2674         } else
2675                 nodes_clear(nodes);
2676
2677         if (flags)
2678                 *flags++ = '\0';        /* terminate mode string */
2679
2680         for (mode = 0; mode < MPOL_MAX; mode++) {
2681                 if (!strcmp(str, policy_modes[mode])) {
2682                         break;
2683                 }
2684         }
2685         if (mode >= MPOL_MAX)
2686                 goto out;
2687
2688         switch (mode) {
2689         case MPOL_PREFERRED:
2690                 /*
2691                  * Insist on a nodelist of one node only
2692                  */
2693                 if (nodelist) {
2694                         char *rest = nodelist;
2695                         while (isdigit(*rest))
2696                                 rest++;
2697                         if (*rest)
2698                                 goto out;
2699                 }
2700                 break;
2701         case MPOL_INTERLEAVE:
2702                 /*
2703                  * Default to online nodes with memory if no nodelist
2704                  */
2705                 if (!nodelist)
2706                         nodes = node_states[N_MEMORY];
2707                 break;
2708         case MPOL_LOCAL:
2709                 /*
2710                  * Don't allow a nodelist;  mpol_new() checks flags
2711                  */
2712                 if (nodelist)
2713                         goto out;
2714                 mode = MPOL_PREFERRED;
2715                 break;
2716         case MPOL_DEFAULT:
2717                 /*
2718                  * Insist on a empty nodelist
2719                  */
2720                 if (!nodelist)
2721                         err = 0;
2722                 goto out;
2723         case MPOL_BIND:
2724                 /*
2725                  * Insist on a nodelist
2726                  */
2727                 if (!nodelist)
2728                         goto out;
2729         }
2730
2731         mode_flags = 0;
2732         if (flags) {
2733                 /*
2734                  * Currently, we only support two mutually exclusive
2735                  * mode flags.
2736                  */
2737                 if (!strcmp(flags, "static"))
2738                         mode_flags |= MPOL_F_STATIC_NODES;
2739                 else if (!strcmp(flags, "relative"))
2740                         mode_flags |= MPOL_F_RELATIVE_NODES;
2741                 else
2742                         goto out;
2743         }
2744
2745         new = mpol_new(mode, mode_flags, &nodes);
2746         if (IS_ERR(new))
2747                 goto out;
2748
2749         /*
2750          * Save nodes for mpol_to_str() to show the tmpfs mount options
2751          * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2752          */
2753         if (mode != MPOL_PREFERRED)
2754                 new->v.nodes = nodes;
2755         else if (nodelist)
2756                 new->v.preferred_node = first_node(nodes);
2757         else
2758                 new->flags |= MPOL_F_LOCAL;
2759
2760         /*
2761          * Save nodes for contextualization: this will be used to "clone"
2762          * the mempolicy in a specific context [cpuset] at a later time.
2763          */
2764         new->w.user_nodemask = nodes;
2765
2766         err = 0;
2767
2768 out:
2769         /* Restore string for error message */
2770         if (nodelist)
2771                 *--nodelist = ':';
2772         if (flags)
2773                 *--flags = '=';
2774         if (!err)
2775                 *mpol = new;
2776         return err;
2777 }
2778 #endif /* CONFIG_TMPFS */
2779
2780 /**
2781  * mpol_to_str - format a mempolicy structure for printing
2782  * @buffer:  to contain formatted mempolicy string
2783  * @maxlen:  length of @buffer
2784  * @pol:  pointer to mempolicy to be formatted
2785  *
2786  * Convert a mempolicy into a string.
2787  * Returns the number of characters in buffer (if positive)
2788  * or an error (negative)
2789  */
2790 int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2791 {
2792         char *p = buffer;
2793         int l;
2794         nodemask_t nodes;
2795         unsigned short mode;
2796         unsigned short flags = pol ? pol->flags : 0;
2797
2798         /*
2799          * Sanity check:  room for longest mode, flag and some nodes
2800          */
2801         VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2802
2803         if (!pol || pol == &default_policy)
2804                 mode = MPOL_DEFAULT;
2805         else
2806                 mode = pol->mode;
2807
2808         switch (mode) {
2809         case MPOL_DEFAULT:
2810                 nodes_clear(nodes);
2811                 break;
2812
2813         case MPOL_PREFERRED:
2814                 nodes_clear(nodes);
2815                 if (flags & MPOL_F_LOCAL)
2816                         mode = MPOL_LOCAL;
2817                 else
2818                         node_set(pol->v.preferred_node, nodes);
2819                 break;
2820
2821         case MPOL_BIND:
2822                 /* Fall through */
2823         case MPOL_INTERLEAVE:
2824                 nodes = pol->v.nodes;
2825                 break;
2826
2827         default:
2828                 return -EINVAL;
2829         }
2830
2831         l = strlen(policy_modes[mode]);
2832         if (buffer + maxlen < p + l + 1)
2833                 return -ENOSPC;
2834
2835         strcpy(p, policy_modes[mode]);
2836         p += l;
2837
2838         if (flags & MPOL_MODE_FLAGS) {
2839                 if (buffer + maxlen < p + 2)
2840                         return -ENOSPC;
2841                 *p++ = '=';
2842
2843                 /*
2844                  * Currently, the only defined flags are mutually exclusive
2845                  */
2846                 if (flags & MPOL_F_STATIC_NODES)
2847                         p += snprintf(p, buffer + maxlen - p, "static");
2848                 else if (flags & MPOL_F_RELATIVE_NODES)
2849                         p += snprintf(p, buffer + maxlen - p, "relative");
2850         }
2851
2852         if (!nodes_empty(nodes)) {
2853                 if (buffer + maxlen < p + 2)
2854                         return -ENOSPC;
2855                 *p++ = ':';
2856                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2857         }
2858         return p - buffer;
2859 }