mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
   19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case NUMA_NO_NODE here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
   49  * on systems with highmem, kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
   64    make bind policy root only? It can trigger oom much faster and the
   65    kernel does not always handle that gracefully.
  66 */
  67
  68 #include <linux/mempolicy.h>
  69 #include <linux/mm.h>
  70 #include <linux/highmem.h>
  71 #include <linux/hugetlb.h>
  72 #include <linux/kernel.h>
  73 #include <linux/sched.h>
  74 #include <linux/nodemask.h>
  75 #include <linux/cpuset.h>
  76 #include <linux/slab.h>
  77 #include <linux/string.h>
  78 #include <linux/export.h>
  79 #include <linux/nsproxy.h>
  80 #include <linux/interrupt.h>
  81 #include <linux/init.h>
  82 #include <linux/compat.h>
  83 #include <linux/swap.h>
  84 #include <linux/seq_file.h>
  85 #include <linux/proc_fs.h>
  86 #include <linux/migrate.h>
  87 #include <linux/ksm.h>
  88 #include <linux/rmap.h>
  89 #include <linux/security.h>
  90 #include <linux/syscalls.h>
  91 #include <linux/ctype.h>
  92 #include <linux/mm_inline.h>
  93 #include <linux/mmu_notifier.h>
  94
  95 #include <asm/tlbflush.h>
  96 #include <asm/uaccess.h>
  97 #include <linux/random.h>
  98
  99 #include "internal.h"
 100
 101 /* Internal flags */
 102 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
 103 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
 104
 105 static struct kmem_cache *policy_cache;
 106 static struct kmem_cache *sn_cache;
 107
 108 /* Highest zone. An specific allocation for a zone below that is not
 109    policied. */
 110 enum zone_type policy_zone = 0;
 111
 112 /*
 113  * run-time system-wide default policy => local allocation
 114  */
 115 static struct mempolicy default_policy = {
 116         .refcnt = ATOMIC_INIT(1), /* never free it */
 117         .mode = MPOL_PREFERRED,
 118         .flags = MPOL_F_LOCAL,
 119 };
 120
 121 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 122
 123 static struct mempolicy *get_task_policy(struct task_struct *p)
 124 {
 125         struct mempolicy *pol = p->mempolicy;
 126
 127         if (!pol) {
 128                 int node = numa_node_id();
 129
 130                 if (node != NUMA_NO_NODE) {
 131                         pol = &preferred_node_policy[node];
 132                         /*
 133                          * preferred_node_policy is not initialised early in
 134                          * boot
 135                          */
 136                         if (!pol->mode)
 137                                 pol = NULL;
 138                 }
 139         }
 140
 141         return pol;
 142 }
 143
 144 static const struct mempolicy_operations {
 145         int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 146         /*
 147          * If read-side task has no lock to protect task->mempolicy, write-side
 148          * task will rebind the task->mempolicy by two step. The first step is
 149          * setting all the newly nodes, and the second step is cleaning all the
 150          * disallowed nodes. In this way, we can avoid finding no node to alloc
 151          * page.
 152          * If we have a lock to protect task->mempolicy in read-side, we do
 153          * rebind directly.
 154          *
 155          * step:
 156          *      MPOL_REBIND_ONCE - do rebind work at once
 157          *      MPOL_REBIND_STEP1 - set all the newly nodes
 158          *      MPOL_REBIND_STEP2 - clean all the disallowed nodes
 159          */
 160         void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
 161                         enum mpol_rebind_step step);
 162 } mpol_ops[MPOL_MAX];
 163
 164 /* Check that the nodemask contains at least one populated zone */
 165 static int is_valid_nodemask(const nodemask_t *nodemask)
 166 {
 167         return nodes_intersects(*nodemask, node_states[N_MEMORY]);
 168 }
 169
 170 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 171 {
 172         return pol->flags & MPOL_MODE_FLAGS;
 173 }
 174
 175 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 176                                    const nodemask_t *rel)
 177 {
 178         nodemask_t tmp;
 179         nodes_fold(tmp, *orig, nodes_weight(*rel));
 180         nodes_onto(*ret, tmp, *rel);
 181 }
 182
 183 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 184 {
 185         if (nodes_empty(*nodes))
 186                 return -EINVAL;
 187         pol->v.nodes = *nodes;
 188         return 0;
 189 }
 190
 191 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 192 {
 193         if (!nodes)
 194                 pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 195         else if (nodes_empty(*nodes))
 196                 return -EINVAL;                 /*  no allowed nodes */
 197         else
 198                 pol->v.preferred_node = first_node(*nodes);
 199         return 0;
 200 }
 201
 202 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 203 {
 204         if (!is_valid_nodemask(nodes))
 205                 return -EINVAL;
 206         pol->v.nodes = *nodes;
 207         return 0;
 208 }
 209
 210 /*
 211  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 212  * any, for the new policy.  mpol_new() has already validated the nodes
 213  * parameter with respect to the policy mode and flags.  But, we need to
 214  * handle an empty nodemask with MPOL_PREFERRED here.
 215  *
 216  * Must be called holding task's alloc_lock to protect task's mems_allowed
 217  * and mempolicy.  May also be called holding the mmap_semaphore for write.
 218  */
 219 static int mpol_set_nodemask(struct mempolicy *pol,
 220                      const nodemask_t *nodes, struct nodemask_scratch *nsc)
 221 {
 222         int ret;
 223
 224         /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
 225         if (pol == NULL)
 226                 return 0;
 227         /* Check N_MEMORY */
 228         nodes_and(nsc->mask1,
 229                   cpuset_current_mems_allowed, node_states[N_MEMORY]);
 230
 231         VM_BUG_ON(!nodes);
 232         if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
 233                 nodes = NULL;   /* explicit local allocation */
 234         else {
 235                 if (pol->flags & MPOL_F_RELATIVE_NODES)
 236                         mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
 237                 else
 238                         nodes_and(nsc->mask2, *nodes, nsc->mask1);
 239
 240                 if (mpol_store_user_nodemask(pol))
 241                         pol->w.user_nodemask = *nodes;
 242                 else
 243                         pol->w.cpuset_mems_allowed =
 244                                                 cpuset_current_mems_allowed;
 245         }
 246
 247         if (nodes)
 248                 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 249         else
 250                 ret = mpol_ops[pol->mode].create(pol, NULL);
 251         return ret;
 252 }
 253
 254 /*
 255  * This function just creates a new policy, does some check and simple
 256  * initialization. You must invoke mpol_set_nodemask() to set nodes.
 257  */
 258 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 259                                   nodemask_t *nodes)
 260 {
 261         struct mempolicy *policy;
 262
 263         pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 264                  mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
 265
 266         if (mode == MPOL_DEFAULT) {
 267                 if (nodes && !nodes_empty(*nodes))
 268                         return ERR_PTR(-EINVAL);
 269                 return NULL;
 270         }
 271         VM_BUG_ON(!nodes);
 272
 273         /*
 274          * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 275          * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 276          * All other modes require a valid pointer to a non-empty nodemask.
 277          */
 278         if (mode == MPOL_PREFERRED) {
 279                 if (nodes_empty(*nodes)) {
 280                         if (((flags & MPOL_F_STATIC_NODES) ||
 281                              (flags & MPOL_F_RELATIVE_NODES)))
 282                                 return ERR_PTR(-EINVAL);
 283                 }
 284         } else if (mode == MPOL_LOCAL) {
 285                 if (!nodes_empty(*nodes))
 286                         return ERR_PTR(-EINVAL);
 287                 mode = MPOL_PREFERRED;
 288         } else if (nodes_empty(*nodes))
 289                 return ERR_PTR(-EINVAL);
 290         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 291         if (!policy)
 292                 return ERR_PTR(-ENOMEM);
 293         atomic_set(&policy->refcnt, 1);
 294         policy->mode = mode;
 295         policy->flags = flags;
 296
 297         return policy;
 298 }
 299
 300 /* Slow path of a mpol destructor. */
 301 void __mpol_put(struct mempolicy *p)
 302 {
 303         if (!atomic_dec_and_test(&p->refcnt))
 304                 return;
 305         kmem_cache_free(policy_cache, p);
 306 }
 307
 308 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
 309                                 enum mpol_rebind_step step)
 310 {
 311 }
 312
 313 /*
 314  * step:
 315  *      MPOL_REBIND_ONCE  - do rebind work at once
 316  *      MPOL_REBIND_STEP1 - set all the newly nodes
 317  *      MPOL_REBIND_STEP2 - clean all the disallowed nodes
 318  */
 319 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
 320                                  enum mpol_rebind_step step)
 321 {
 322         nodemask_t tmp;
 323
 324         if (pol->flags & MPOL_F_STATIC_NODES)
 325                 nodes_and(tmp, pol->w.user_nodemask, *nodes);
 326         else if (pol->flags & MPOL_F_RELATIVE_NODES)
 327                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 328         else {
 329                 /*
 330                  * if step == 1, we use ->w.cpuset_mems_allowed to cache the
 331                  * result
 332                  */
 333                 if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
 334                         nodes_remap(tmp, pol->v.nodes,
 335                                         pol->w.cpuset_mems_allowed, *nodes);
 336                         pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
 337                 } else if (step == MPOL_REBIND_STEP2) {
 338                         tmp = pol->w.cpuset_mems_allowed;
 339                         pol->w.cpuset_mems_allowed = *nodes;
 340                 } else
 341                         BUG();
 342         }
 343
 344         if (nodes_empty(tmp))
 345                 tmp = *nodes;
 346
 347         if (step == MPOL_REBIND_STEP1)
 348                 nodes_or(pol->v.nodes, pol->v.nodes, tmp);
 349         else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
 350                 pol->v.nodes = tmp;
 351         else
 352                 BUG();
 353
 354         if (!node_isset(current->il_next, tmp)) {
 355                 current->il_next = next_node(current->il_next, tmp);
 356                 if (current->il_next >= MAX_NUMNODES)
 357                         current->il_next = first_node(tmp);
 358                 if (current->il_next >= MAX_NUMNODES)
 359                         current->il_next = numa_node_id();
 360         }
 361 }
 362
 363 static void mpol_rebind_preferred(struct mempolicy *pol,
 364                                   const nodemask_t *nodes,
 365                                   enum mpol_rebind_step step)
 366 {
 367         nodemask_t tmp;
 368
 369         if (pol->flags & MPOL_F_STATIC_NODES) {
 370                 int node = first_node(pol->w.user_nodemask);
 371
 372                 if (node_isset(node, *nodes)) {
 373                         pol->v.preferred_node = node;
 374                         pol->flags &= ~MPOL_F_LOCAL;
 375                 } else
 376                         pol->flags |= MPOL_F_LOCAL;
 377         } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 378                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 379                 pol->v.preferred_node = first_node(tmp);
 380         } else if (!(pol->flags & MPOL_F_LOCAL)) {
 381                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
 382                                                    pol->w.cpuset_mems_allowed,
 383                                                    *nodes);
 384                 pol->w.cpuset_mems_allowed = *nodes;
 385         }
 386 }
 387
 388 /*
 389  * mpol_rebind_policy - Migrate a policy to a different set of nodes
 390  *
 391  * If read-side task has no lock to protect task->mempolicy, write-side
 392  * task will rebind the task->mempolicy by two step. The first step is
 393  * setting all the newly nodes, and the second step is cleaning all the
 394  * disallowed nodes. In this way, we can avoid finding no node to alloc
 395  * page.
 396  * If we have a lock to protect task->mempolicy in read-side, we do
 397  * rebind directly.
 398  *
 399  * step:
 400  *      MPOL_REBIND_ONCE  - do rebind work at once
 401  *      MPOL_REBIND_STEP1 - set all the newly nodes
 402  *      MPOL_REBIND_STEP2 - clean all the disallowed nodes
 403  */
 404 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
 405                                 enum mpol_rebind_step step)
 406 {
 407         if (!pol)
 408                 return;
 409         if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
 410             nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 411                 return;
 412
 413         if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
 414                 return;
 415
 416         if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
 417                 BUG();
 418
 419         if (step == MPOL_REBIND_STEP1)
 420                 pol->flags |= MPOL_F_REBINDING;
 421         else if (step == MPOL_REBIND_STEP2)
 422                 pol->flags &= ~MPOL_F_REBINDING;
 423         else if (step >= MPOL_REBIND_NSTEP)
 424                 BUG();
 425
 426         mpol_ops[pol->mode].rebind(pol, newmask, step);
 427 }
 428
 429 /*
 430  * Wrapper for mpol_rebind_policy() that just requires task
 431  * pointer, and updates task mempolicy.
 432  *
 433  * Called with task's alloc_lock held.
 434  */
 435
 436 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
 437                         enum mpol_rebind_step step)
 438 {
 439         mpol_rebind_policy(tsk->mempolicy, new, step);
 440 }
 441
 442 /*
 443  * Rebind each vma in mm to new nodemask.
 444  *
 445  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 446  */
 447
 448 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 449 {
 450         struct vm_area_struct *vma;
 451
 452         down_write(&mm->mmap_sem);
 453         for (vma = mm->mmap; vma; vma = vma->vm_next)
 454                 mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
 455         up_write(&mm->mmap_sem);
 456 }
 457
 458 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 459         [MPOL_DEFAULT] = {
 460                 .rebind = mpol_rebind_default,
 461         },
 462         [MPOL_INTERLEAVE] = {
 463                 .create = mpol_new_interleave,
 464                 .rebind = mpol_rebind_nodemask,
 465         },
 466         [MPOL_PREFERRED] = {
 467                 .create = mpol_new_preferred,
 468                 .rebind = mpol_rebind_preferred,
 469         },
 470         [MPOL_BIND] = {
 471                 .create = mpol_new_bind,
 472                 .rebind = mpol_rebind_nodemask,
 473         },
 474 };
 475
 476 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 477                                 unsigned long flags);
 478
 479 /* Scan through pages checking if pages follow certain conditions. */
 480 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 481                 unsigned long addr, unsigned long end,
 482                 const nodemask_t *nodes, unsigned long flags,
 483                 void *private)
 484 {
 485         pte_t *orig_pte;
 486         pte_t *pte;
 487         spinlock_t *ptl;
 488
 489         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 490         do {
 491                 struct page *page;
 492                 int nid;
 493
 494                 if (!pte_present(*pte))
 495                         continue;
 496                 page = vm_normal_page(vma, addr, *pte);
 497                 if (!page)
 498                         continue;
 499                 /*
 500                  * vm_normal_page() filters out zero pages, but there might
 501                  * still be PageReserved pages to skip, perhaps in a VDSO.
 502                  */
 503                 if (PageReserved(page))
 504                         continue;
 505                 nid = page_to_nid(page);
 506                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 507                         continue;
 508
 509                 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 510                         migrate_page_add(page, private, flags);
 511                 else
 512                         break;
 513         } while (pte++, addr += PAGE_SIZE, addr != end);
 514         pte_unmap_unlock(orig_pte, ptl);
 515         return addr != end;
 516 }
 517
 518 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 519                 unsigned long addr, unsigned long end,
 520                 const nodemask_t *nodes, unsigned long flags,
 521                 void *private)
 522 {
 523         pmd_t *pmd;
 524         unsigned long next;
 525
 526         pmd = pmd_offset(pud, addr);
 527         do {
 528                 next = pmd_addr_end(addr, end);
 529                 split_huge_page_pmd(vma, addr, pmd);
 530                 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
 531                         continue;
 532                 if (check_pte_range(vma, pmd, addr, next, nodes,
 533                                     flags, private))
 534                         return -EIO;
 535         } while (pmd++, addr = next, addr != end);
 536         return 0;
 537 }
 538
 539 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 540                 unsigned long addr, unsigned long end,
 541                 const nodemask_t *nodes, unsigned long flags,
 542                 void *private)
 543 {
 544         pud_t *pud;
 545         unsigned long next;
 546
 547         pud = pud_offset(pgd, addr);
 548         do {
 549                 next = pud_addr_end(addr, end);
 550                 if (pud_none_or_clear_bad(pud))
 551                         continue;
 552                 if (check_pmd_range(vma, pud, addr, next, nodes,
 553                                     flags, private))
 554                         return -EIO;
 555         } while (pud++, addr = next, addr != end);
 556         return 0;
 557 }
 558
 559 static inline int check_pgd_range(struct vm_area_struct *vma,
 560                 unsigned long addr, unsigned long end,
 561                 const nodemask_t *nodes, unsigned long flags,
 562                 void *private)
 563 {
 564         pgd_t *pgd;
 565         unsigned long next;
 566
 567         pgd = pgd_offset(vma->vm_mm, addr);
 568         do {
 569                 next = pgd_addr_end(addr, end);
 570                 if (pgd_none_or_clear_bad(pgd))
 571                         continue;
 572                 if (check_pud_range(vma, pgd, addr, next, nodes,
 573                                     flags, private))
 574                         return -EIO;
 575         } while (pgd++, addr = next, addr != end);
 576         return 0;
 577 }
 578
 579 #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
 580 /*
 581  * This is used to mark a range of virtual addresses to be inaccessible.
 582  * These are later cleared by a NUMA hinting fault. Depending on these
 583  * faults, pages may be migrated for better NUMA placement.
 584  *
 585  * This is assuming that NUMA faults are handled using PROT_NONE. If
 586  * an architecture makes a different choice, it will need further
 587  * changes to the core.
 588  */
 589 unsigned long change_prot_numa(struct vm_area_struct *vma,
 590                         unsigned long addr, unsigned long end)
 591 {
 592         int nr_updated;
 593         BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
 594
 595         nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
 596         if (nr_updated)
 597                 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 598
 599         return nr_updated;
 600 }
 601 #else
 602 static unsigned long change_prot_numa(struct vm_area_struct *vma,
 603                         unsigned long addr, unsigned long end)
 604 {
 605         return 0;
 606 }
 607 #endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
 608
 609 /*
 610  * Check if all pages in a range are on a set of nodes.
 611  * If pagelist != NULL then isolate pages from the LRU and
 612  * put them on the pagelist.
 613  */
 614 static struct vm_area_struct *
 615 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 616                 const nodemask_t *nodes, unsigned long flags, void *private)
 617 {
 618         int err;
 619         struct vm_area_struct *first, *vma, *prev;
 620
 621
 622         first = find_vma(mm, start);
 623         if (!first)
 624                 return ERR_PTR(-EFAULT);
 625         prev = NULL;
 626         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 627                 unsigned long endvma = vma->vm_end;
 628
 629                 if (endvma > end)
 630                         endvma = end;
 631                 if (vma->vm_start > start)
 632                         start = vma->vm_start;
 633
 634                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 635                         if (!vma->vm_next && vma->vm_end < end)
 636                                 return ERR_PTR(-EFAULT);
 637                         if (prev && prev->vm_end < vma->vm_start)
 638                                 return ERR_PTR(-EFAULT);
 639                 }
 640
 641                 if (is_vm_hugetlb_page(vma))
 642                         goto next;
 643
 644                 if (flags & MPOL_MF_LAZY) {
 645                         change_prot_numa(vma, start, endvma);
 646                         goto next;
 647                 }
 648
 649                 if ((flags & MPOL_MF_STRICT) ||
 650                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 651                       vma_migratable(vma))) {
 652
 653                         err = check_pgd_range(vma, start, endvma, nodes,
 654                                                 flags, private);
 655                         if (err) {
 656                                 first = ERR_PTR(err);
 657                                 break;
 658                         }
 659                 }
 660 next:
 661                 prev = vma;
 662         }
 663         return first;
 664 }
 665
 666 /*
 667  * Apply policy to a single VMA
 668  * This must be called with the mmap_sem held for writing.
 669  */
 670 static int vma_replace_policy(struct vm_area_struct *vma,
 671                                                 struct mempolicy *pol)
 672 {
 673         int err;
 674         struct mempolicy *old;
 675         struct mempolicy *new;
 676
 677         pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 678                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 679                  vma->vm_ops, vma->vm_file,
 680                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 681
 682         new = mpol_dup(pol);
 683         if (IS_ERR(new))
 684                 return PTR_ERR(new);
 685
 686         if (vma->vm_ops && vma->vm_ops->set_policy) {
 687                 err = vma->vm_ops->set_policy(vma, new);
 688                 if (err)
 689                         goto err_out;
 690         }
 691
 692         old = vma->vm_policy;
 693         vma->vm_policy = new; /* protected by mmap_sem */
 694         mpol_put(old);
 695
 696         return 0;
 697  err_out:
 698         mpol_put(new);
 699         return err;
 700 }
 701
 702 /* Step 2: apply policy to a range and do splits. */
 703 static int mbind_range(struct mm_struct *mm, unsigned long start,
 704                        unsigned long end, struct mempolicy *new_pol)
 705 {
 706         struct vm_area_struct *next;
 707         struct vm_area_struct *prev;
 708         struct vm_area_struct *vma;
 709         int err = 0;
 710         pgoff_t pgoff;
 711         unsigned long vmstart;
 712         unsigned long vmend;
 713
 714         vma = find_vma(mm, start);
 715         if (!vma || vma->vm_start > start)
 716                 return -EFAULT;
 717
 718         prev = vma->vm_prev;
 719         if (start > vma->vm_start)
 720                 prev = vma;
 721
 722         for (; vma && vma->vm_start < end; prev = vma, vma = next) {
 723                 next = vma->vm_next;
 724                 vmstart = max(start, vma->vm_start);
 725                 vmend   = min(end, vma->vm_end);
 726
 727                 if (mpol_equal(vma_policy(vma), new_pol))
 728                         continue;
 729
 730                 pgoff = vma->vm_pgoff +
 731                         ((vmstart - vma->vm_start) >> PAGE_SHIFT);
 732                 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
 733                                   vma->anon_vma, vma->vm_file, pgoff,
 734                                   new_pol);
 735                 if (prev) {
 736                         vma = prev;
 737                         next = vma->vm_next;
 738                         if (mpol_equal(vma_policy(vma), new_pol))
 739                                 continue;
 740                         /* vma_merge() joined vma && vma->next, case 8 */
 741                         goto replace;
 742                 }
 743                 if (vma->vm_start != vmstart) {
 744                         err = split_vma(vma->vm_mm, vma, vmstart, 1);
 745                         if (err)
 746                                 goto out;
 747                 }
 748                 if (vma->vm_end != vmend) {
 749                         err = split_vma(vma->vm_mm, vma, vmend, 0);
 750                         if (err)
 751                                 goto out;
 752                 }
 753  replace:
 754                 err = vma_replace_policy(vma, new_pol);
 755                 if (err)
 756                         goto out;
 757         }
 758
 759  out:
 760         return err;
 761 }
 762
 763 /*
 764  * Update task->flags PF_MEMPOLICY bit: set iff non-default
 765  * mempolicy.  Allows more rapid checking of this (combined perhaps
 766  * with other PF_* flag bits) on memory allocation hot code paths.
 767  *
 768  * If called from outside this file, the task 'p' should -only- be
 769  * a newly forked child not yet visible on the task list, because
 770  * manipulating the task flags of a visible task is not safe.
 771  *
 772  * The above limitation is why this routine has the funny name
 773  * mpol_fix_fork_child_flag().
 774  *
 775  * It is also safe to call this with a task pointer of current,
 776  * which the static wrapper mpol_set_task_struct_flag() does,
 777  * for use within this file.
 778  */
 779
 780 void mpol_fix_fork_child_flag(struct task_struct *p)
 781 {
 782         if (p->mempolicy)
 783                 p->flags |= PF_MEMPOLICY;
 784         else
 785                 p->flags &= ~PF_MEMPOLICY;
 786 }
 787
 788 static void mpol_set_task_struct_flag(void)
 789 {
 790         mpol_fix_fork_child_flag(current);
 791 }
 792
 793 /* Set the process memory policy */
 794 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 795                              nodemask_t *nodes)
 796 {
 797         struct mempolicy *new, *old;
 798         struct mm_struct *mm = current->mm;
 799         NODEMASK_SCRATCH(scratch);
 800         int ret;
 801
 802         if (!scratch)
 803                 return -ENOMEM;
 804
 805         new = mpol_new(mode, flags, nodes);
 806         if (IS_ERR(new)) {
 807                 ret = PTR_ERR(new);
 808                 goto out;
 809         }
 810         /*
 811          * prevent changing our mempolicy while show_numa_maps()
 812          * is using it.
 813          * Note:  do_set_mempolicy() can be called at init time
 814          * with no 'mm'.
 815          */
 816         if (mm)
 817                 down_write(&mm->mmap_sem);
 818         task_lock(current);
 819         ret = mpol_set_nodemask(new, nodes, scratch);
 820         if (ret) {
 821                 task_unlock(current);
 822                 if (mm)
 823                         up_write(&mm->mmap_sem);
 824                 mpol_put(new);
 825                 goto out;
 826         }
 827         old = current->mempolicy;
 828         current->mempolicy = new;
 829         mpol_set_task_struct_flag();
 830         if (new && new->mode == MPOL_INTERLEAVE &&
 831             nodes_weight(new->v.nodes))
 832                 current->il_next = first_node(new->v.nodes);
 833         task_unlock(current);
 834         if (mm)
 835                 up_write(&mm->mmap_sem);
 836
 837         mpol_put(old);
 838         ret = 0;
 839 out:
 840         NODEMASK_SCRATCH_FREE(scratch);
 841         return ret;
 842 }
 843
 844 /*
 845  * Return nodemask for policy for get_mempolicy() query
 846  *
 847  * Called with task's alloc_lock held
 848  */
 849 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 850 {
 851         nodes_clear(*nodes);
 852         if (p == &default_policy)
 853                 return;
 854
 855         switch (p->mode) {
 856         case MPOL_BIND:
 857                 /* Fall through */
 858         case MPOL_INTERLEAVE:
 859                 *nodes = p->v.nodes;
 860                 break;
 861         case MPOL_PREFERRED:
 862                 if (!(p->flags & MPOL_F_LOCAL))
 863                         node_set(p->v.preferred_node, *nodes);
 864                 /* else return empty node mask for local allocation */
 865                 break;
 866         default:
 867                 BUG();
 868         }
 869 }
 870
 871 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 872 {
 873         struct page *p;
 874         int err;
 875
 876         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 877         if (err >= 0) {
 878                 err = page_to_nid(p);
 879                 put_page(p);
 880         }
 881         return err;
 882 }
 883
 884 /* Retrieve NUMA policy */
 885 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 886                              unsigned long addr, unsigned long flags)
 887 {
 888         int err;
 889         struct mm_struct *mm = current->mm;
 890         struct vm_area_struct *vma = NULL;
 891         struct mempolicy *pol = current->mempolicy;
 892
 893         if (flags &
 894                 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 895                 return -EINVAL;
 896
 897         if (flags & MPOL_F_MEMS_ALLOWED) {
 898                 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 899                         return -EINVAL;
 900                 *policy = 0;    /* just so it's initialized */
 901                 task_lock(current);
 902                 *nmask  = cpuset_current_mems_allowed;
 903                 task_unlock(current);
 904                 return 0;
 905         }
 906
 907         if (flags & MPOL_F_ADDR) {
 908                 /*
 909                  * Do NOT fall back to task policy if the
 910                  * vma/shared policy at addr is NULL.  We
 911                  * want to return MPOL_DEFAULT in this case.
 912                  */
 913                 down_read(&mm->mmap_sem);
 914                 vma = find_vma_intersection(mm, addr, addr+1);
 915                 if (!vma) {
 916                         up_read(&mm->mmap_sem);
 917                         return -EFAULT;
 918                 }
 919                 if (vma->vm_ops && vma->vm_ops->get_policy)
 920                         pol = vma->vm_ops->get_policy(vma, addr);
 921                 else
 922                         pol = vma->vm_policy;
 923         } else if (addr)
 924                 return -EINVAL;
 925
 926         if (!pol)
 927                 pol = &default_policy;  /* indicates default behavior */
 928
 929         if (flags & MPOL_F_NODE) {
 930                 if (flags & MPOL_F_ADDR) {
 931                         err = lookup_node(mm, addr);
 932                         if (err < 0)
 933                                 goto out;
 934                         *policy = err;
 935                 } else if (pol == current->mempolicy &&
 936                                 pol->mode == MPOL_INTERLEAVE) {
 937                         *policy = current->il_next;
 938                 } else {
 939                         err = -EINVAL;
 940                         goto out;
 941                 }
 942         } else {
 943                 *policy = pol == &default_policy ? MPOL_DEFAULT :
 944                                                 pol->mode;
 945                 /*
 946                  * Internal mempolicy flags must be masked off before exposing
 947                  * the policy to userspace.
 948                  */
 949                 *policy |= (pol->flags & MPOL_MODE_FLAGS);
 950         }
 951
 952         if (vma) {
 953                 up_read(&current->mm->mmap_sem);
 954                 vma = NULL;
 955         }
 956
 957         err = 0;
 958         if (nmask) {
 959                 if (mpol_store_user_nodemask(pol)) {
 960                         *nmask = pol->w.user_nodemask;
 961                 } else {
 962                         task_lock(current);
 963                         get_policy_nodemask(pol, nmask);
 964                         task_unlock(current);
 965                 }
 966         }
 967
 968  out:
 969         mpol_cond_put(pol);
 970         if (vma)
 971                 up_read(&current->mm->mmap_sem);
 972         return err;
 973 }
 974
 975 #ifdef CONFIG_MIGRATION
 976 /*
 977  * page migration
 978  */
 979 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 980                                 unsigned long flags)
 981 {
 982         /*
 983          * Avoid migrating a page that is shared with others.
 984          */
 985         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
 986                 if (!isolate_lru_page(page)) {
 987                         list_add_tail(&page->lru, pagelist);
 988                         inc_zone_page_state(page, NR_ISOLATED_ANON +
 989                                             page_is_file_cache(page));
 990                 }
 991         }
 992 }
 993
 994 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 995 {
 996         return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
 997 }
 998
 999 /*
1000  * Migrate pages from one node to a target node.
1001  * Returns error or the number of pages not migrated.
1002  */
1003 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1004                            int flags)
1005 {
1006         nodemask_t nmask;
1007         LIST_HEAD(pagelist);
1008         int err = 0;
1009
1010         nodes_clear(nmask);
1011         node_set(source, nmask);
1012
1013         /*
1014          * This does not "check" the range but isolates all pages that
1015          * need migration.  Between passing in the full user address
1016          * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1017          */
1018         VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1019         check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1020                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1021
1022         if (!list_empty(&pagelist)) {
1023                 err = migrate_pages(&pagelist, new_node_page, dest,
1024                                         MIGRATE_SYNC, MR_SYSCALL);
1025                 if (err)
1026                         putback_lru_pages(&pagelist);
1027         }
1028
1029         return err;
1030 }
1031
1032 /*
1033  * Move pages between the two nodesets so as to preserve the physical
1034  * layout as much as possible.
1035  *
1036  * Returns the number of page that could not be moved.
1037  */
1038 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1039                      const nodemask_t *to, int flags)
1040 {
1041         int busy = 0;
1042         int err;
1043         nodemask_t tmp;
1044
1045         err = migrate_prep();
1046         if (err)
1047                 return err;
1048
1049         down_read(&mm->mmap_sem);
1050
1051         err = migrate_vmas(mm, from, to, flags);
1052         if (err)
1053                 goto out;
1054
1055         /*
1056          * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1057          * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1058          * bit in 'tmp', and return that <source, dest> pair for migration.
1059          * The pair of nodemasks 'to' and 'from' define the map.
1060          *
1061          * If no pair of bits is found that way, fallback to picking some
1062          * pair of 'source' and 'dest' bits that are not the same.  If the
1063          * 'source' and 'dest' bits are the same, this represents a node
1064          * that will be migrating to itself, so no pages need move.
1065          *
1066          * If no bits are left in 'tmp', or if all remaining bits left
1067          * in 'tmp' correspond to the same bit in 'to', return false
1068          * (nothing left to migrate).
1069          *
1070          * This lets us pick a pair of nodes to migrate between, such that
1071          * if possible the dest node is not already occupied by some other
1072          * source node, minimizing the risk of overloading the memory on a
1073          * node that would happen if we migrated incoming memory to a node
1074          * before migrating outgoing memory source that same node.
1075          *
1076          * A single scan of tmp is sufficient.  As we go, we remember the
1077          * most recent <s, d> pair that moved (s != d).  If we find a pair
1078          * that not only moved, but what's better, moved to an empty slot
1079          * (d is not set in tmp), then we break out then, with that pair.
1080          * Otherwise when we finish scanning from_tmp, we at least have the
1081          * most recent <s, d> pair that moved.  If we get all the way through
1082          * the scan of tmp without finding any node that moved, much less
1083          * moved to an empty node, then there is nothing left worth migrating.
1084          */
1085
1086         tmp = *from;
1087         while (!nodes_empty(tmp)) {
1088                 int s,d;
1089                 int source = -1;
1090                 int dest = 0;
1091
1092                 for_each_node_mask(s, tmp) {
1093
1094                         /*
1095                          * do_migrate_pages() tries to maintain the relative
1096                          * node relationship of the pages established between
1097                          * threads and memory areas.
1098                          *
1099                          * However if the number of source nodes is not equal to
1100                          * the number of destination nodes we can not preserve
1101                          * this node relative relationship.  In that case, skip
1102                          * copying memory from a node that is in the destination
1103                          * mask.
1104                          *
1105                          * Example: [2,3,4] -> [3,4,5] moves everything.
1106                          *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1107                          */
1108
1109                         if ((nodes_weight(*from) != nodes_weight(*to)) &&
1110                                                 (node_isset(s, *to)))
1111                                 continue;
1112
1113                         d = node_remap(s, *from, *to);
1114                         if (s == d)
1115                                 continue;
1116
1117                         source = s;     /* Node moved. Memorize */
1118                         dest = d;
1119
1120                         /* dest not in remaining from nodes? */
1121                         if (!node_isset(dest, tmp))
1122                                 break;
1123                 }
1124                 if (source == -1)
1125                         break;
1126
1127                 node_clear(source, tmp);
1128                 err = migrate_to_node(mm, source, dest, flags);
1129                 if (err > 0)
1130                         busy += err;
1131                 if (err < 0)
1132                         break;
1133         }
1134 out:
1135         up_read(&mm->mmap_sem);
1136         if (err < 0)
1137                 return err;
1138         return busy;
1139
1140 }
1141
1142 /*
1143  * Allocate a new page for page migration based on vma policy.
1144  * Start assuming that page is mapped by vma pointed to by @private.
1145  * Search forward from there, if not.  N.B., this assumes that the
1146  * list of pages handed to migrate_pages()--which is how we get here--
1147  * is in virtual address order.
1148  */
1149 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1150 {
1151         struct vm_area_struct *vma = (struct vm_area_struct *)private;
1152         unsigned long uninitialized_var(address);
1153
1154         while (vma) {
1155                 address = page_address_in_vma(page, vma);
1156                 if (address != -EFAULT)
1157                         break;
1158                 vma = vma->vm_next;
1159         }
1160
1161         /*
1162          * if !vma, alloc_page_vma() will use task or system default policy
1163          */
1164         return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1165 }
1166 #else
1167
/* Stub for builds without CONFIG_MIGRATION: no page is ever queued. */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
			     unsigned long flags)
{
	/* intentionally empty */
}
1172
1173 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1174                      const nodemask_t *to, int flags)
1175 {
1176         return -ENOSYS;
1177 }
1178
1179 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1180 {
1181         return NULL;
1182 }
1183 #endif
1184
1185 static long do_mbind(unsigned long start, unsigned long len,
1186                      unsigned short mode, unsigned short mode_flags,
1187                      nodemask_t *nmask, unsigned long flags)
1188 {
1189         struct vm_area_struct *vma;
1190         struct mm_struct *mm = current->mm;
1191         struct mempolicy *new;
1192         unsigned long end;
1193         int err;
1194         LIST_HEAD(pagelist);
1195
1196         if (flags & ~(unsigned long)MPOL_MF_VALID)
1197                 return -EINVAL;
1198         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1199                 return -EPERM;
1200
1201         if (start & ~PAGE_MASK)
1202                 return -EINVAL;
1203
1204         if (mode == MPOL_DEFAULT)
1205                 flags &= ~MPOL_MF_STRICT;
1206
1207         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1208         end = start + len;
1209
1210         if (end < start)
1211                 return -EINVAL;
1212         if (end == start)
1213                 return 0;
1214
1215         new = mpol_new(mode, mode_flags, nmask);
1216         if (IS_ERR(new))
1217                 return PTR_ERR(new);
1218
1219         if (flags & MPOL_MF_LAZY)
1220                 new->flags |= MPOL_F_MOF;
1221
1222         /*
1223          * If we are using the default policy then operation
1224          * on discontinuous address spaces is okay after all
1225          */
1226         if (!new)
1227                 flags |= MPOL_MF_DISCONTIG_OK;
1228
1229         pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1230                  start, start + len, mode, mode_flags,
1231                  nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1232
1233         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1234
1235                 err = migrate_prep();
1236                 if (err)
1237                         goto mpol_out;
1238         }
1239         {
1240                 NODEMASK_SCRATCH(scratch);
1241                 if (scratch) {
1242                         down_write(&mm->mmap_sem);
1243                         task_lock(current);
1244                         err = mpol_set_nodemask(new, nmask, scratch);
1245                         task_unlock(current);
1246                         if (err)
1247                                 up_write(&mm->mmap_sem);
1248                 } else
1249                         err = -ENOMEM;
1250                 NODEMASK_SCRATCH_FREE(scratch);
1251         }
1252         if (err)
1253                 goto mpol_out;
1254
1255         vma = check_range(mm, start, end, nmask,
1256                           flags | MPOL_MF_INVERT, &pagelist);
1257
1258         err = PTR_ERR(vma);     /* maybe ... */
1259         if (!IS_ERR(vma))
1260                 err = mbind_range(mm, start, end, new);
1261
1262         if (!err) {
1263                 int nr_failed = 0;
1264
1265                 if (!list_empty(&pagelist)) {
1266                         WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1267                         nr_failed = migrate_pages(&pagelist, new_vma_page,
1268                                         (unsigned long)vma,
1269                                         MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1270                         if (nr_failed)
1271                                 putback_lru_pages(&pagelist);
1272                 }
1273
1274                 if (nr_failed && (flags & MPOL_MF_STRICT))
1275                         err = -EIO;
1276         } else
1277                 putback_lru_pages(&pagelist);
1278
1279         up_write(&mm->mmap_sem);
1280  mpol_out:
1281         mpol_put(new);
1282         return err;
1283 }
1284
1285 /*
1286  * User space interface with variable sized bitmaps for nodelists.
1287  */
1288
1289 /* Copy a node mask from user space. */
1290 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1291                      unsigned long maxnode)
1292 {
1293         unsigned long k;
1294         unsigned long nlongs;
1295         unsigned long endmask;
1296
1297         --maxnode;
1298         nodes_clear(*nodes);
1299         if (maxnode == 0 || !nmask)
1300                 return 0;
1301         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1302                 return -EINVAL;
1303
1304         nlongs = BITS_TO_LONGS(maxnode);
1305         if ((maxnode % BITS_PER_LONG) == 0)
1306                 endmask = ~0UL;
1307         else
1308                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1309
1310         /* When the user specified more nodes than supported just check
1311            if the non supported part is all zero. */
1312         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1313                 if (nlongs > PAGE_SIZE/sizeof(long))
1314                         return -EINVAL;
1315                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1316                         unsigned long t;
1317                         if (get_user(t, nmask + k))
1318                                 return -EFAULT;
1319                         if (k == nlongs - 1) {
1320                                 if (t & endmask)
1321                                         return -EINVAL;
1322                         } else if (t)
1323                                 return -EINVAL;
1324                 }
1325                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1326                 endmask = ~0UL;
1327         }
1328
1329         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1330                 return -EFAULT;
1331         nodes_addr(*nodes)[nlongs-1] &= endmask;
1332         return 0;
1333 }
1334
1335 /* Copy a kernel node mask to user space */
1336 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1337                               nodemask_t *nodes)
1338 {
1339         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1340         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1341
1342         if (copy > nbytes) {
1343                 if (copy > PAGE_SIZE)
1344                         return -EINVAL;
1345                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1346                         return -EFAULT;
1347                 copy = nbytes;
1348         }
1349         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1350 }
1351
1352 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1353                 unsigned long, mode, unsigned long __user *, nmask,
1354                 unsigned long, maxnode, unsigned, flags)
1355 {
1356         nodemask_t nodes;
1357         int err;
1358         unsigned short mode_flags;
1359
1360         mode_flags = mode & MPOL_MODE_FLAGS;
1361         mode &= ~MPOL_MODE_FLAGS;
1362         if (mode >= MPOL_MAX)
1363                 return -EINVAL;
1364         if ((mode_flags & MPOL_F_STATIC_NODES) &&
1365             (mode_flags & MPOL_F_RELATIVE_NODES))
1366                 return -EINVAL;
1367         err = get_nodes(&nodes, nmask, maxnode);
1368         if (err)
1369                 return err;
1370         return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1371 }
1372
1373 /* Set the process memory policy */
1374 SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1375                 unsigned long, maxnode)
1376 {
1377         int err;
1378         nodemask_t nodes;
1379         unsigned short flags;
1380
1381         flags = mode & MPOL_MODE_FLAGS;
1382         mode &= ~MPOL_MODE_FLAGS;
1383         if ((unsigned int)mode >= MPOL_MAX)
1384                 return -EINVAL;
1385         if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1386                 return -EINVAL;
1387         err = get_nodes(&nodes, nmask, maxnode);
1388         if (err)
1389                 return err;
1390         return do_set_mempolicy(mode, flags, &nodes);
1391 }
1392
1393 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1394                 const unsigned long __user *, old_nodes,
1395                 const unsigned long __user *, new_nodes)
1396 {
1397         const struct cred *cred = current_cred(), *tcred;
1398         struct mm_struct *mm = NULL;
1399         struct task_struct *task;
1400         nodemask_t task_nodes;
1401         int err;
1402         nodemask_t *old;
1403         nodemask_t *new;
1404         NODEMASK_SCRATCH(scratch);
1405
1406         if (!scratch)
1407                 return -ENOMEM;
1408
1409         old = &scratch->mask1;
1410         new = &scratch->mask2;
1411
1412         err = get_nodes(old, old_nodes, maxnode);
1413         if (err)
1414                 goto out;
1415
1416         err = get_nodes(new, new_nodes, maxnode);
1417         if (err)
1418                 goto out;
1419
1420         /* Find the mm_struct */
1421         rcu_read_lock();
1422         task = pid ? find_task_by_vpid(pid) : current;
1423         if (!task) {
1424                 rcu_read_unlock();
1425                 err = -ESRCH;
1426                 goto out;
1427         }
1428         get_task_struct(task);
1429
1430         err = -EINVAL;
1431
1432         /*
1433          * Check if this process has the right to modify the specified
1434          * process. The right exists if the process has administrative
1435          * capabilities, superuser privileges or the same
1436          * userid as the target process.
1437          */
1438         tcred = __task_cred(task);
1439         if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1440             !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1441             !capable(CAP_SYS_NICE)) {
1442                 rcu_read_unlock();
1443                 err = -EPERM;
1444                 goto out_put;
1445         }
1446         rcu_read_unlock();
1447
1448         task_nodes = cpuset_mems_allowed(task);
1449         /* Is the user allowed to access the target nodes? */
1450         if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1451                 err = -EPERM;
1452                 goto out_put;
1453         }
1454
1455         if (!nodes_subset(*new, node_states[N_MEMORY])) {
1456                 err = -EINVAL;
1457                 goto out_put;
1458         }
1459
1460         err = security_task_movememory(task);
1461         if (err)
1462                 goto out_put;
1463
1464         mm = get_task_mm(task);
1465         put_task_struct(task);
1466
1467         if (!mm) {
1468                 err = -EINVAL;
1469                 goto out;
1470         }
1471
1472         err = do_migrate_pages(mm, old, new,
1473                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1474
1475         mmput(mm);
1476 out:
1477         NODEMASK_SCRATCH_FREE(scratch);
1478
1479         return err;
1480
1481 out_put:
1482         put_task_struct(task);
1483         goto out;
1484
1485 }
1486
1487
1488 /* Retrieve NUMA policy */
1489 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1490                 unsigned long __user *, nmask, unsigned long, maxnode,
1491                 unsigned long, addr, unsigned long, flags)
1492 {
1493         int err;
1494         int uninitialized_var(pval);
1495         nodemask_t nodes;
1496
1497         if (nmask != NULL && maxnode < MAX_NUMNODES)
1498                 return -EINVAL;
1499
1500         err = do_get_mempolicy(&pval, &nodes, addr, flags);
1501
1502         if (err)
1503                 return err;
1504
1505         if (policy && put_user(pval, policy))
1506                 return -EFAULT;
1507
1508         if (nmask)
1509                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1510
1511         return err;
1512 }
1513
1514 #ifdef CONFIG_COMPAT
1515
1516 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1517                                      compat_ulong_t __user *nmask,
1518                                      compat_ulong_t maxnode,
1519                                      compat_ulong_t addr, compat_ulong_t flags)
1520 {
1521         long err;
1522         unsigned long __user *nm = NULL;
1523         unsigned long nr_bits, alloc_size;
1524         DECLARE_BITMAP(bm, MAX_NUMNODES);
1525
1526         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1527         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1528
1529         if (nmask)
1530                 nm = compat_alloc_user_space(alloc_size);
1531
1532         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1533
1534         if (!err && nmask) {
1535                 unsigned long copy_size;
1536                 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1537                 err = copy_from_user(bm, nm, copy_size);
1538                 /* ensure entire bitmap is zeroed */
1539                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1540                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1541         }
1542
1543         return err;
1544 }
1545
1546 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1547                                      compat_ulong_t maxnode)
1548 {
1549         long err = 0;
1550         unsigned long __user *nm = NULL;
1551         unsigned long nr_bits, alloc_size;
1552         DECLARE_BITMAP(bm, MAX_NUMNODES);
1553
1554         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1555         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1556
1557         if (nmask) {
1558                 err = compat_get_bitmap(bm, nmask, nr_bits);
1559                 nm = compat_alloc_user_space(alloc_size);
1560                 err |= copy_to_user(nm, bm, alloc_size);
1561         }
1562
1563         if (err)
1564                 return -EFAULT;
1565
1566         return sys_set_mempolicy(mode, nm, nr_bits+1);
1567 }
1568
1569 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1570                              compat_ulong_t mode, compat_ulong_t __user *nmask,
1571                              compat_ulong_t maxnode, compat_ulong_t flags)
1572 {
1573         long err = 0;
1574         unsigned long __user *nm = NULL;
1575         unsigned long nr_bits, alloc_size;
1576         nodemask_t bm;
1577
1578         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1579         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1580
1581         if (nmask) {
1582                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1583                 nm = compat_alloc_user_space(alloc_size);
1584                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1585         }
1586
1587         if (err)
1588                 return -EFAULT;
1589
1590         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1591 }
1592
1593 #endif
1594
1595 /*
1596  * get_vma_policy(@task, @vma, @addr)
1597  * @task - task for fallback if vma policy == default
1598  * @vma   - virtual memory area whose policy is sought
1599  * @addr  - address in @vma for shared policy lookup
1600  *
1601  * Returns effective policy for a VMA at specified address.
1602  * Falls back to @task or system default policy, as necessary.
1603  * Current or other task's task mempolicy and non-shared vma policies must be
1604  * protected by task_lock(task) by the caller.
1605  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1606  * count--added by the get_policy() vm_op, as appropriate--to protect against
1607  * freeing by another task.  It is the caller's responsibility to free the
1608  * extra reference for shared policies.
1609  */
1610 struct mempolicy *get_vma_policy(struct task_struct *task,
1611                 struct vm_area_struct *vma, unsigned long addr)
1612 {
1613         struct mempolicy *pol = get_task_policy(task);
1614
1615         if (vma) {
1616                 if (vma->vm_ops && vma->vm_ops->get_policy) {
1617                         struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1618                                                                         addr);
1619                         if (vpol)
1620                                 pol = vpol;
1621                 } else if (vma->vm_policy) {
1622                         pol = vma->vm_policy;
1623
1624                         /*
1625                          * shmem_alloc_page() passes MPOL_F_SHARED policy with
1626                          * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1627                          * count on these policies which will be dropped by
1628                          * mpol_cond_put() later
1629                          */
1630                         if (mpol_needs_cond_ref(pol))
1631                                 mpol_get(pol);
1632                 }
1633         }
1634         if (!pol)
1635                 pol = &default_policy;
1636         return pol;
1637 }
1638
1639 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1640 {
1641         enum zone_type dynamic_policy_zone = policy_zone;
1642
1643         BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1644
1645         /*
1646          * if policy->v.nodes has movable memory only,
1647          * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1648          *
1649          * policy->v.nodes is intersect with node_states[N_MEMORY].
1650          * so if the following test faile, it implies
1651          * policy->v.nodes has movable memory only.
1652          */
1653         if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1654                 dynamic_policy_zone = ZONE_MOVABLE;
1655
1656         return zone >= dynamic_policy_zone;
1657 }
1658
1659 /*
1660  * Return a nodemask representing a mempolicy for filtering nodes for
1661  * page allocation
1662  */
1663 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1664 {
1665         /* Lower zones don't get a nodemask applied for MPOL_BIND */
1666         if (unlikely(policy->mode == MPOL_BIND) &&
1667                         apply_policy_zone(policy, gfp_zone(gfp)) &&
1668                         cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1669                 return &policy->v.nodes;
1670
1671         return NULL;
1672 }
1673
1674 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1675 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1676         int nd)
1677 {
1678         switch (policy->mode) {
1679         case MPOL_PREFERRED:
1680                 if (!(policy->flags & MPOL_F_LOCAL))
1681                         nd = policy->v.preferred_node;
1682                 break;
1683         case MPOL_BIND:
1684                 /*
1685                  * Normally, MPOL_BIND allocations are node-local within the
1686                  * allowed nodemask.  However, if __GFP_THISNODE is set and the
1687                  * current node isn't part of the mask, we use the zonelist for
1688                  * the first node in the mask instead.
1689                  */
1690                 if (unlikely(gfp & __GFP_THISNODE) &&
1691                                 unlikely(!node_isset(nd, policy->v.nodes)))
1692                         nd = first_node(policy->v.nodes);
1693                 break;
1694         default:
1695                 BUG();
1696         }
1697         return node_zonelist(nd, gfp);
1698 }
1699
1700 /* Do dynamic interleaving for a process */
1701 static unsigned interleave_nodes(struct mempolicy *policy)
1702 {
1703         unsigned nid, next;
1704         struct task_struct *me = current;
1705
1706         nid = me->il_next;
1707         next = next_node(nid, policy->v.nodes);
1708         if (next >= MAX_NUMNODES)
1709                 next = first_node(policy->v.nodes);
1710         if (next < MAX_NUMNODES)
1711                 me->il_next = next;
1712         return nid;
1713 }
1714
1715 /*
1716  * Depending on the memory policy provide a node from which to allocate the
1717  * next slab entry.
1718  * @policy must be protected by freeing by the caller.  If @policy is
1719  * the current task's mempolicy, this protection is implicit, as only the
1720  * task can change it's policy.  The system default policy requires no
1721  * such protection.
1722  */
1723 unsigned slab_node(void)
1724 {
1725         struct mempolicy *policy;
1726
1727         if (in_interrupt())
1728                 return numa_node_id();
1729
1730         policy = current->mempolicy;
1731         if (!policy || policy->flags & MPOL_F_LOCAL)
1732                 return numa_node_id();
1733
1734         switch (policy->mode) {
1735         case MPOL_PREFERRED:
1736                 /*
1737                  * handled MPOL_F_LOCAL above
1738                  */
1739                 return policy->v.preferred_node;
1740
1741         case MPOL_INTERLEAVE:
1742                 return interleave_nodes(policy);
1743
1744         case MPOL_BIND: {
1745                 /*
1746                  * Follow bind policy behavior and start allocation at the
1747                  * first node.
1748                  */
1749                 struct zonelist *zonelist;
1750                 struct zone *zone;
1751                 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1752                 zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1753                 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1754                                                         &policy->v.nodes,
1755                                                         &zone);
1756                 return zone ? zone->node : numa_node_id();
1757         }
1758
1759         default:
1760                 BUG();
1761         }
1762 }
1763
1764 /* Do static interleaving for a VMA with known offset. */
1765 static unsigned offset_il_node(struct mempolicy *pol,
1766                 struct vm_area_struct *vma, unsigned long off)
1767 {
1768         unsigned nnodes = nodes_weight(pol->v.nodes);
1769         unsigned target;
1770         int c;
1771         int nid = -1;
1772
1773         if (!nnodes)
1774                 return numa_node_id();
1775         target = (unsigned int)off % nnodes;
1776         c = 0;
1777         do {
1778                 nid = next_node(nid, pol->v.nodes);
1779                 c++;
1780         } while (c <= target);
1781         return nid;
1782 }
1783
1784 /* Determine a node number for interleave */
1785 static inline unsigned interleave_nid(struct mempolicy *pol,
1786                  struct vm_area_struct *vma, unsigned long addr, int shift)
1787 {
1788         if (vma) {
1789                 unsigned long off;
1790
1791                 /*
1792                  * for small pages, there is no difference between
1793                  * shift and PAGE_SHIFT, so the bit-shift is safe.
1794                  * for huge pages, since vm_pgoff is in units of small
1795                  * pages, we need to shift off the always 0 bits to get
1796                  * a useful offset.
1797                  */
1798                 BUG_ON(shift < PAGE_SHIFT);
1799                 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1800                 off += (addr - vma->vm_start) >> shift;
1801                 return offset_il_node(pol, vma, off);
1802         } else
1803                 return interleave_nodes(pol);
1804 }
1805
1806 /*
1807  * Return the bit number of a random bit set in the nodemask.
1808  * (returns -1 if nodemask is empty)
1809  */
1810 int node_random(const nodemask_t *maskp)
1811 {
1812         int w, bit = -1;
1813
1814         w = nodes_weight(*maskp);
1815         if (w)
1816                 bit = bitmap_ord_to_pos(maskp->bits,
1817                         get_random_int() % w, MAX_NUMNODES);
1818         return bit;
1819 }
1820
1821 #ifdef CONFIG_HUGETLBFS
1822 /*
1823  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1824  * @vma = virtual memory area whose policy is sought
1825  * @addr = address in @vma for shared policy lookup and interleave policy
1826  * @gfp_flags = for requested zone
1827  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1828  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1829  *
1830  * Returns a zonelist suitable for a huge page allocation and a pointer
1831  * to the struct mempolicy for conditional unref after allocation.
1832  * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1833  * @nodemask for filtering the zonelist.
1834  *
1835  * Must be protected by get_mems_allowed()
1836  */
1837 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1838                                 gfp_t gfp_flags, struct mempolicy **mpol,
1839                                 nodemask_t **nodemask)
1840 {
1841         struct zonelist *zl;
1842
1843         *mpol = get_vma_policy(current, vma, addr);
1844         *nodemask = NULL;       /* assume !MPOL_BIND */
1845
1846         if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1847                 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1848                                 huge_page_shift(hstate_vma(vma))), gfp_flags);
1849         } else {
1850                 zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1851                 if ((*mpol)->mode == MPOL_BIND)
1852                         *nodemask = &(*mpol)->v.nodes;
1853         }
1854         return zl;
1855 }
1856
1857 /*
1858  * init_nodemask_of_mempolicy
1859  *
1860  * If the current task's mempolicy is "default" [NULL], return 'false'
1861  * to indicate default policy.  Otherwise, extract the policy nodemask
1862  * for 'bind' or 'interleave' policy into the argument nodemask, or
1863  * initialize the argument nodemask to contain the single node for
1864  * 'preferred' or 'local' policy and return 'true' to indicate presence
1865  * of non-default mempolicy.
1866  *
1867  * We don't bother with reference counting the mempolicy [mpol_get/put]
1868  * because the current task is examining it's own mempolicy and a task's
1869  * mempolicy is only ever changed by the task itself.
1870  *
1871  * N.B., it is the caller's responsibility to free a returned nodemask.
1872  */
1873 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1874 {
1875         struct mempolicy *mempolicy;
1876         int nid;
1877
1878         if (!(mask && current->mempolicy))
1879                 return false;
1880
1881         task_lock(current);
1882         mempolicy = current->mempolicy;
1883         switch (mempolicy->mode) {
1884         case MPOL_PREFERRED:
1885                 if (mempolicy->flags & MPOL_F_LOCAL)
1886                         nid = numa_node_id();
1887                 else
1888                         nid = mempolicy->v.preferred_node;
1889                 init_nodemask_of_node(mask, nid);
1890                 break;
1891
1892         case MPOL_BIND:
1893                 /* Fall through */
1894         case MPOL_INTERLEAVE:
1895                 *mask =  mempolicy->v.nodes;
1896                 break;
1897
1898         default:
1899                 BUG();
1900         }
1901         task_unlock(current);
1902
1903         return true;
1904 }
1905 #endif
1906
1907 /*
1908  * mempolicy_nodemask_intersects
1909  *
1910  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1911  * policy.  Otherwise, check for intersection between mask and the policy
1912  * nodemask for 'bind' or 'interleave' policy.  For 'perferred' or 'local'
1913  * policy, always return true since it may allocate elsewhere on fallback.
1914  *
1915  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1916  */
1917 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1918                                         const nodemask_t *mask)
1919 {
1920         struct mempolicy *mempolicy;
1921         bool ret = true;
1922
1923         if (!mask)
1924                 return ret;
1925         task_lock(tsk);
1926         mempolicy = tsk->mempolicy;
1927         if (!mempolicy)
1928                 goto out;
1929
1930         switch (mempolicy->mode) {
1931         case MPOL_PREFERRED:
1932                 /*
1933                  * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
1934                  * allocate from, they may fallback to other nodes when oom.
1935                  * Thus, it's possible for tsk to have allocated memory from
1936                  * nodes in mask.
1937                  */
1938                 break;
1939         case MPOL_BIND:
1940         case MPOL_INTERLEAVE:
1941                 ret = nodes_intersects(mempolicy->v.nodes, *mask);
1942                 break;
1943         default:
1944                 BUG();
1945         }
1946 out:
1947         task_unlock(tsk);
1948         return ret;
1949 }
1950
1951 /* Allocate a page in interleaved policy.
1952    Own path because it needs to do special accounting. */
1953 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1954                                         unsigned nid)
1955 {
1956         struct zonelist *zl;
1957         struct page *page;
1958
1959         zl = node_zonelist(nid, gfp);
1960         page = __alloc_pages(gfp, order, zl);
1961         if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1962                 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1963         return page;
1964 }
1965
1966 /**
1967  *      alloc_pages_vma - Allocate a page for a VMA.
1968  *
1969  *      @gfp:
1970  *      %GFP_USER    user allocation.
1971  *      %GFP_KERNEL  kernel allocations,
1972  *      %GFP_HIGHMEM highmem/user allocations,
1973  *      %GFP_FS      allocation should not call back into a file system.
1974  *      %GFP_ATOMIC  don't sleep.
1975  *
1976  *      @order:Order of the GFP allocation.
1977  *      @vma:  Pointer to VMA or NULL if not available.
1978  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1979  *
1980  *      This function allocates a page from the kernel page pool and applies
1981  *      a NUMA policy associated with the VMA or the current process.
1982  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1983  *      mm_struct of the VMA to prevent it from going away. Should be used for
1984  *      all allocations for pages that will be mapped into
1985  *      user space. Returns NULL when no page can be allocated.
1986  *
1987  *      Should be called with the mm_sem of the vma hold.
1988  */
1989 struct page *
1990 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1991                 unsigned long addr, int node)
1992 {
1993         struct mempolicy *pol;
1994         struct page *page;
1995         unsigned int cpuset_mems_cookie;
1996
1997 retry_cpuset:
1998         pol = get_vma_policy(current, vma, addr);
1999         cpuset_mems_cookie = get_mems_allowed();
2000
2001         if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
2002                 unsigned nid;
2003
2004                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2005                 mpol_cond_put(pol);
2006                 page = alloc_page_interleave(gfp, order, nid);
2007                 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2008                         goto retry_cpuset;
2009
2010                 return page;
2011         }
2012         page = __alloc_pages_nodemask(gfp, order,
2013                                       policy_zonelist(gfp, pol, node),
2014                                       policy_nodemask(gfp, pol));
2015         if (unlikely(mpol_needs_cond_ref(pol)))
2016                 __mpol_put(pol);
2017         if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2018                 goto retry_cpuset;
2019         return page;
2020 }
2021
2022 /**
2023  *      alloc_pages_current - Allocate pages.
2024  *
2025  *      @gfp:
2026  *              %GFP_USER   user allocation,
2027  *              %GFP_KERNEL kernel allocation,
2028  *              %GFP_HIGHMEM highmem allocation,
2029  *              %GFP_FS     don't call back into a file system.
2030  *              %GFP_ATOMIC don't sleep.
2031  *      @order: Power of two of allocation size in pages. 0 is a single page.
2032  *
2033  *      Allocate a page from the kernel page pool.  When not in
2034  *      interrupt context and apply the current process NUMA policy.
2035  *      Returns NULL when no page can be allocated.
2036  *
2037  *      Don't call cpuset_update_task_memory_state() unless
2038  *      1) it's ok to take cpuset_sem (can WAIT), and
2039  *      2) allocating for current task (not interrupt).
2040  */
2041 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2042 {
2043         struct mempolicy *pol = get_task_policy(current);
2044         struct page *page;
2045         unsigned int cpuset_mems_cookie;
2046
2047         if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
2048                 pol = &default_policy;
2049
2050 retry_cpuset:
2051         cpuset_mems_cookie = get_mems_allowed();
2052
2053         /*
2054          * No reference counting needed for current->mempolicy
2055          * nor system default_policy
2056          */
2057         if (pol->mode == MPOL_INTERLEAVE)
2058                 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2059         else
2060                 page = __alloc_pages_nodemask(gfp, order,
2061                                 policy_zonelist(gfp, pol, numa_node_id()),
2062                                 policy_nodemask(gfp, pol));
2063
2064         if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2065                 goto retry_cpuset;
2066
2067         return page;
2068 }
2069 EXPORT_SYMBOL(alloc_pages_current);
2070
2071 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2072 {
2073         struct mempolicy *pol = mpol_dup(vma_policy(src));
2074
2075         if (IS_ERR(pol))
2076                 return PTR_ERR(pol);
2077         dst->vm_policy = pol;
2078         return 0;
2079 }
2080
2081 /*
2082  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2083  * rebinds the mempolicy its copying by calling mpol_rebind_policy()
2084  * with the mems_allowed returned by cpuset_mems_allowed().  This
2085  * keeps mempolicies cpuset relative after its cpuset moves.  See
2086  * further kernel/cpuset.c update_nodemask().
2087  *
2088  * current's mempolicy may be rebinded by the other task(the task that changes
2089  * cpuset's mems), so we needn't do rebind work for current task.
2090  */
2091
2092 /* Slow path of a mempolicy duplicate */
2093 struct mempolicy *__mpol_dup(struct mempolicy *old)
2094 {
2095         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2096
2097         if (!new)
2098                 return ERR_PTR(-ENOMEM);
2099
2100         /* task's mempolicy is protected by alloc_lock */
2101         if (old == current->mempolicy) {
2102                 task_lock(current);
2103                 *new = *old;
2104                 task_unlock(current);
2105         } else
2106                 *new = *old;
2107
2108         rcu_read_lock();
2109         if (current_cpuset_is_being_rebound()) {
2110                 nodemask_t mems = cpuset_mems_allowed(current);
2111                 if (new->flags & MPOL_F_REBINDING)
2112                         mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2113                 else
2114                         mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2115         }
2116         rcu_read_unlock();
2117         atomic_set(&new->refcnt, 1);
2118         return new;
2119 }
2120
2121 /* Slow path of a mempolicy comparison */
2122 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2123 {
2124         if (!a || !b)
2125                 return false;
2126         if (a->mode != b->mode)
2127                 return false;
2128         if (a->flags != b->flags)
2129                 return false;
2130         if (mpol_store_user_nodemask(a))
2131                 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2132                         return false;
2133
2134         switch (a->mode) {
2135         case MPOL_BIND:
2136                 /* Fall through */
2137         case MPOL_INTERLEAVE:
2138                 return !!nodes_equal(a->v.nodes, b->v.nodes);
2139         case MPOL_PREFERRED:
2140                 return a->v.preferred_node == b->v.preferred_node;
2141         default:
2142                 BUG();
2143                 return false;
2144         }
2145 }
2146
2147 /*
2148  * Shared memory backing store policy support.
2149  *
2150  * Remember policies even when nobody has shared memory mapped.
2151  * The policies are kept in Red-Black tree linked from the inode.
2152  * They are protected by the sp->lock spinlock, which should be held
2153  * for any accesses to the tree.
2154  */
2155
2156 /* lookup first element intersecting start-end */
2157 /* Caller holds sp->lock */
2158 static struct sp_node *
2159 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2160 {
2161         struct rb_node *n = sp->root.rb_node;
2162
2163         while (n) {
2164                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2165
2166                 if (start >= p->end)
2167                         n = n->rb_right;
2168                 else if (end <= p->start)
2169                         n = n->rb_left;
2170                 else
2171                         break;
2172         }
2173         if (!n)
2174                 return NULL;
2175         for (;;) {
2176                 struct sp_node *w = NULL;
2177                 struct rb_node *prev = rb_prev(n);
2178                 if (!prev)
2179                         break;
2180                 w = rb_entry(prev, struct sp_node, nd);
2181                 if (w->end <= start)
2182                         break;
2183                 n = prev;
2184         }
2185         return rb_entry(n, struct sp_node, nd);
2186 }
2187
2188 /* Insert a new shared policy into the list. */
2189 /* Caller holds sp->lock */
2190 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2191 {
2192         struct rb_node **p = &sp->root.rb_node;
2193         struct rb_node *parent = NULL;
2194         struct sp_node *nd;
2195
2196         while (*p) {
2197                 parent = *p;
2198                 nd = rb_entry(parent, struct sp_node, nd);
2199                 if (new->start < nd->start)
2200                         p = &(*p)->rb_left;
2201                 else if (new->end > nd->end)
2202                         p = &(*p)->rb_right;
2203                 else
2204                         BUG();
2205         }
2206         rb_link_node(&new->nd, parent, p);
2207         rb_insert_color(&new->nd, &sp->root);
2208         pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2209                  new->policy ? new->policy->mode : 0);
2210 }
2211
2212 /* Find shared policy intersecting idx */
2213 struct mempolicy *
2214 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2215 {
2216         struct mempolicy *pol = NULL;
2217         struct sp_node *sn;
2218
2219         if (!sp->root.rb_node)
2220                 return NULL;
2221         spin_lock(&sp->lock);
2222         sn = sp_lookup(sp, idx, idx+1);
2223         if (sn) {
2224                 mpol_get(sn->policy);
2225                 pol = sn->policy;
2226         }
2227         spin_unlock(&sp->lock);
2228         return pol;
2229 }
2230
2231 static void sp_free(struct sp_node *n)
2232 {
2233         mpol_put(n->policy);
2234         kmem_cache_free(sn_cache, n);
2235 }
2236
2237 /**
2238  * mpol_misplaced - check whether current page node is valid in policy
2239  *
2240  * @page   - page to be checked
2241  * @vma    - vm area where page mapped
2242  * @addr   - virtual address where page mapped
2243  *
2244  * Lookup current policy node id for vma,addr and "compare to" page's
2245  * node id.
2246  *
2247  * Returns:
2248  *      -1      - not misplaced, page is in the right node
2249  *      node    - node id where the page should be
2250  *
2251  * Policy determination "mimics" alloc_page_vma().
2252  * Called from fault path where we know the vma and faulting address.
2253  */
2254 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2255 {
2256         struct mempolicy *pol;
2257         struct zone *zone;
2258         int curnid = page_to_nid(page);
2259         unsigned long pgoff;
2260         int polnid = -1;
2261         int ret = -1;
2262
2263         BUG_ON(!vma);
2264
2265         pol = get_vma_policy(current, vma, addr);
2266         if (!(pol->flags & MPOL_F_MOF))
2267                 goto out;
2268
2269         switch (pol->mode) {
2270         case MPOL_INTERLEAVE:
2271                 BUG_ON(addr >= vma->vm_end);
2272                 BUG_ON(addr < vma->vm_start);
2273
2274                 pgoff = vma->vm_pgoff;
2275                 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2276                 polnid = offset_il_node(pol, vma, pgoff);
2277                 break;
2278
2279         case MPOL_PREFERRED:
2280                 if (pol->flags & MPOL_F_LOCAL)
2281                         polnid = numa_node_id();
2282                 else
2283                         polnid = pol->v.preferred_node;
2284                 break;
2285
2286         case MPOL_BIND:
2287                 /*
2288                  * allows binding to multiple nodes.
2289                  * use current page if in policy nodemask,
2290                  * else select nearest allowed node, if any.
2291                  * If no allowed nodes, use current [!misplaced].
2292                  */
2293                 if (node_isset(curnid, pol->v.nodes))
2294                         goto out;
2295                 (void)first_zones_zonelist(
2296                                 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2297                                 gfp_zone(GFP_HIGHUSER),
2298                                 &pol->v.nodes, &zone);
2299                 polnid = zone->node;
2300                 break;
2301
2302         default:
2303                 BUG();
2304         }
2305
2306         /* Migrate the page towards the node whose CPU is referencing it */
2307         if (pol->flags & MPOL_F_MORON) {
2308                 int last_nid;
2309
2310                 polnid = numa_node_id();
2311
2312                 /*
2313                  * Multi-stage node selection is used in conjunction
2314                  * with a periodic migration fault to build a temporal
2315                  * task<->page relation. By using a two-stage filter we
2316                  * remove short/unlikely relations.
2317                  *
2318                  * Using P(p) ~ n_p / n_t as per frequentist
2319                  * probability, we can equate a task's usage of a
2320                  * particular page (n_p) per total usage of this
2321                  * page (n_t) (in a given time-span) to a probability.
2322                  *
2323                  * Our periodic faults will sample this probability and
2324                  * getting the same result twice in a row, given these
2325                  * samples are fully independent, is then given by
2326                  * P(n)^2, provided our sample period is sufficiently
2327                  * short compared to the usage pattern.
2328                  *
2329                  * This quadric squishes small probabilities, making
2330                  * it less likely we act on an unlikely task<->page
2331                  * relation.
2332                  */
2333                 last_nid = page_nid_xchg_last(page, polnid);
2334                 if (last_nid != polnid)
2335                         goto out;
2336         }
2337
2338         if (curnid != polnid)
2339                 ret = polnid;
2340 out:
2341         mpol_cond_put(pol);
2342
2343         return ret;
2344 }
2345
2346 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2347 {
2348         pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2349         rb_erase(&n->nd, &sp->root);
2350         sp_free(n);
2351 }
2352
2353 static void sp_node_init(struct sp_node *node, unsigned long start,
2354                         unsigned long end, struct mempolicy *pol)
2355 {
2356         node->start = start;
2357         node->end = end;
2358         node->policy = pol;
2359 }
2360
2361 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2362                                 struct mempolicy *pol)
2363 {
2364         struct sp_node *n;
2365         struct mempolicy *newpol;
2366
2367         n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2368         if (!n)
2369                 return NULL;
2370
2371         newpol = mpol_dup(pol);
2372         if (IS_ERR(newpol)) {
2373                 kmem_cache_free(sn_cache, n);
2374                 return NULL;
2375         }
2376         newpol->flags |= MPOL_F_SHARED;
2377         sp_node_init(n, start, end, newpol);
2378
2379         return n;
2380 }
2381
2382 /* Replace a policy range. */
2383 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2384                                  unsigned long end, struct sp_node *new)
2385 {
2386         struct sp_node *n;
2387         struct sp_node *n_new = NULL;
2388         struct mempolicy *mpol_new = NULL;
2389         int ret = 0;
2390
2391 restart:
2392         spin_lock(&sp->lock);
2393         n = sp_lookup(sp, start, end);
2394         /* Take care of old policies in the same range. */
2395         while (n && n->start < end) {
2396                 struct rb_node *next = rb_next(&n->nd);
2397                 if (n->start >= start) {
2398                         if (n->end <= end)
2399                                 sp_delete(sp, n);
2400                         else
2401                                 n->start = end;
2402                 } else {
2403                         /* Old policy spanning whole new range. */
2404                         if (n->end > end) {
2405                                 if (!n_new)
2406                                         goto alloc_new;
2407
2408                                 *mpol_new = *n->policy;
2409                                 atomic_set(&mpol_new->refcnt, 1);
2410                                 sp_node_init(n_new, end, n->end, mpol_new);
2411                                 n->end = start;
2412                                 sp_insert(sp, n_new);
2413                                 n_new = NULL;
2414                                 mpol_new = NULL;
2415                                 break;
2416                         } else
2417                                 n->end = start;
2418                 }
2419                 if (!next)
2420                         break;
2421                 n = rb_entry(next, struct sp_node, nd);
2422         }
2423         if (new)
2424                 sp_insert(sp, new);
2425         spin_unlock(&sp->lock);
2426         ret = 0;
2427
2428 err_out:
2429         if (mpol_new)
2430                 mpol_put(mpol_new);
2431         if (n_new)
2432                 kmem_cache_free(sn_cache, n_new);
2433
2434         return ret;
2435
2436 alloc_new:
2437         spin_unlock(&sp->lock);
2438         ret = -ENOMEM;
2439         n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2440         if (!n_new)
2441                 goto err_out;
2442         mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2443         if (!mpol_new)
2444                 goto err_out;
2445         goto restart;
2446 }
2447
2448 /**
2449  * mpol_shared_policy_init - initialize shared policy for inode
2450  * @sp: pointer to inode shared policy
2451  * @mpol:  struct mempolicy to install
2452  *
2453  * Install non-NULL @mpol in inode's shared policy rb-tree.
2454  * On entry, the current task has a reference on a non-NULL @mpol.
2455  * This must be released on exit.
2456  * This is called at get_inode() calls and we can use GFP_KERNEL.
2457  */
2458 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2459 {
2460         int ret;
2461
2462         sp->root = RB_ROOT;             /* empty tree == default mempolicy */
2463         spin_lock_init(&sp->lock);
2464
2465         if (mpol) {
2466                 struct vm_area_struct pvma;
2467                 struct mempolicy *new;
2468                 NODEMASK_SCRATCH(scratch);
2469
2470                 if (!scratch)
2471                         goto put_mpol;
2472                 /* contextualize the tmpfs mount point mempolicy */
2473                 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2474                 if (IS_ERR(new))
2475                         goto free_scratch; /* no valid nodemask intersection */
2476
2477                 task_lock(current);
2478                 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2479                 task_unlock(current);
2480                 if (ret)
2481                         goto put_new;
2482
2483                 /* Create pseudo-vma that contains just the policy */
2484                 memset(&pvma, 0, sizeof(struct vm_area_struct));
2485                 pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
2486                 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2487
2488 put_new:
2489                 mpol_put(new);                  /* drop initial ref */
2490 free_scratch:
2491                 NODEMASK_SCRATCH_FREE(scratch);
2492 put_mpol:
2493                 mpol_put(mpol); /* drop our incoming ref on sb mpol */
2494         }
2495 }
2496
2497 int mpol_set_shared_policy(struct shared_policy *info,
2498                         struct vm_area_struct *vma, struct mempolicy *npol)
2499 {
2500         int err;
2501         struct sp_node *new = NULL;
2502         unsigned long sz = vma_pages(vma);
2503
2504         pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2505                  vma->vm_pgoff,
2506                  sz, npol ? npol->mode : -1,
2507                  npol ? npol->flags : -1,
2508                  npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2509
2510         if (npol) {
2511                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2512                 if (!new)
2513                         return -ENOMEM;
2514         }
2515         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2516         if (err && new)
2517                 sp_free(new);
2518         return err;
2519 }
2520
2521 /* Free a backing policy store on inode delete. */
2522 void mpol_free_shared_policy(struct shared_policy *p)
2523 {
2524         struct sp_node *n;
2525         struct rb_node *next;
2526
2527         if (!p->root.rb_node)
2528                 return;
2529         spin_lock(&p->lock);
2530         next = rb_first(&p->root);
2531         while (next) {
2532                 n = rb_entry(next, struct sp_node, nd);
2533                 next = rb_next(&n->nd);
2534                 sp_delete(p, n);
2535         }
2536         spin_unlock(&p->lock);
2537 }
2538
2539 #ifdef CONFIG_NUMA_BALANCING
2540 static bool __initdata numabalancing_override;
2541
2542 static void __init check_numabalancing_enable(void)
2543 {
2544         bool numabalancing_default = false;
2545
2546         if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2547                 numabalancing_default = true;
2548
2549         if (nr_node_ids > 1 && !numabalancing_override) {
2550                 printk(KERN_INFO "Enabling automatic NUMA balancing. "
2551                         "Configure with numa_balancing= or sysctl");
2552                 set_numabalancing_state(numabalancing_default);
2553         }
2554 }
2555
2556 static int __init setup_numabalancing(char *str)
2557 {
2558         int ret = 0;
2559         if (!str)
2560                 goto out;
2561         numabalancing_override = true;
2562
2563         if (!strcmp(str, "enable")) {
2564                 set_numabalancing_state(true);
2565                 ret = 1;
2566         } else if (!strcmp(str, "disable")) {
2567                 set_numabalancing_state(false);
2568                 ret = 1;
2569         }
2570 out:
2571         if (!ret)
2572                 printk(KERN_WARNING "Unable to parse numa_balancing=\n");
2573
2574         return ret;
2575 }
2576 __setup("numa_balancing=", setup_numabalancing);
2577 #else
2578 static inline void __init check_numabalancing_enable(void)
2579 {
2580 }
2581 #endif /* CONFIG_NUMA_BALANCING */
2582
2583 /* assumes fs == KERNEL_DS */
2584 void __init numa_policy_init(void)
2585 {
2586         nodemask_t interleave_nodes;
2587         unsigned long largest = 0;
2588         int nid, prefer = 0;
2589
2590         policy_cache = kmem_cache_create("numa_policy",
2591                                          sizeof(struct mempolicy),
2592                                          0, SLAB_PANIC, NULL);
2593
2594         sn_cache = kmem_cache_create("shared_policy_node",
2595                                      sizeof(struct sp_node),
2596                                      0, SLAB_PANIC, NULL);
2597
2598         for_each_node(nid) {
2599                 preferred_node_policy[nid] = (struct mempolicy) {
2600                         .refcnt = ATOMIC_INIT(1),
2601                         .mode = MPOL_PREFERRED,
2602                         .flags = MPOL_F_MOF | MPOL_F_MORON,
2603                         .v = { .preferred_node = nid, },
2604                 };
2605         }
2606
2607         /*
2608          * Set interleaving policy for system init. Interleaving is only
2609          * enabled across suitably sized nodes (default is >= 16MB), or
2610          * fall back to the largest node if they're all smaller.
2611          */
2612         nodes_clear(interleave_nodes);
2613         for_each_node_state(nid, N_MEMORY) {
2614                 unsigned long total_pages = node_present_pages(nid);
2615
2616                 /* Preserve the largest node */
2617                 if (largest < total_pages) {
2618                         largest = total_pages;
2619                         prefer = nid;
2620                 }
2621
2622                 /* Interleave this node? */
2623                 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2624                         node_set(nid, interleave_nodes);
2625         }
2626
2627         /* All too small, use the largest */
2628         if (unlikely(nodes_empty(interleave_nodes)))
2629                 node_set(prefer, interleave_nodes);
2630
2631         if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2632                 printk("numa_policy_init: interleaving failed\n");
2633
2634         check_numabalancing_enable();
2635 }
2636
2637 /* Reset policy of current process to default */
2638 void numa_default_policy(void)
2639 {
2640         do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2641 }
2642
2643 /*
2644  * Parse and format mempolicy from/to strings
2645  */
2646
2647 /*
2648  * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2649  */
2650 static const char * const policy_modes[] =
2651 {
2652         [MPOL_DEFAULT]    = "default",
2653         [MPOL_PREFERRED]  = "prefer",
2654         [MPOL_BIND]       = "bind",
2655         [MPOL_INTERLEAVE] = "interleave",
2656         [MPOL_LOCAL]      = "local",
2657 };
2658
2659
2660 #ifdef CONFIG_TMPFS
2661 /**
2662  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2663  * @str:  string containing mempolicy to parse
2664  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2665  *
2666  * Format of input:
2667  *      <mode>[=<flags>][:<nodelist>]
2668  *
2669  * On success, returns 0, else 1
2670  */
2671 int mpol_parse_str(char *str, struct mempolicy **mpol)
2672 {
2673         struct mempolicy *new = NULL;
2674         unsigned short mode;
2675         unsigned short mode_flags;
2676         nodemask_t nodes;
2677         char *nodelist = strchr(str, ':');
2678         char *flags = strchr(str, '=');
2679         int err = 1;
2680
2681         if (nodelist) {
2682                 /* NUL-terminate mode or flags string */
2683                 *nodelist++ = '\0';
2684                 if (nodelist_parse(nodelist, nodes))
2685                         goto out;
2686                 if (!nodes_subset(nodes, node_states[N_MEMORY]))
2687                         goto out;
2688         } else
2689                 nodes_clear(nodes);
2690
2691         if (flags)
2692                 *flags++ = '\0';        /* terminate mode string */
2693
2694         for (mode = 0; mode < MPOL_MAX; mode++) {
2695                 if (!strcmp(str, policy_modes[mode])) {
2696                         break;
2697                 }
2698         }
2699         if (mode >= MPOL_MAX)
2700                 goto out;
2701
2702         switch (mode) {
2703         case MPOL_PREFERRED:
2704                 /*
2705                  * Insist on a nodelist of one node only
2706                  */
2707                 if (nodelist) {
2708                         char *rest = nodelist;
2709                         while (isdigit(*rest))
2710                                 rest++;
2711                         if (*rest)
2712                                 goto out;
2713                 }
2714                 break;
2715         case MPOL_INTERLEAVE:
2716                 /*
2717                  * Default to online nodes with memory if no nodelist
2718                  */
2719                 if (!nodelist)
2720                         nodes = node_states[N_MEMORY];
2721                 break;
2722         case MPOL_LOCAL:
2723                 /*
2724                  * Don't allow a nodelist;  mpol_new() checks flags
2725                  */
2726                 if (nodelist)
2727                         goto out;
2728                 mode = MPOL_PREFERRED;
2729                 break;
2730         case MPOL_DEFAULT:
2731                 /*
2732                  * Insist on a empty nodelist
2733                  */
2734                 if (!nodelist)
2735                         err = 0;
2736                 goto out;
2737         case MPOL_BIND:
2738                 /*
2739                  * Insist on a nodelist
2740                  */
2741                 if (!nodelist)
2742                         goto out;
2743         }
2744
2745         mode_flags = 0;
2746         if (flags) {
2747                 /*
2748                  * Currently, we only support two mutually exclusive
2749                  * mode flags.
2750                  */
2751                 if (!strcmp(flags, "static"))
2752                         mode_flags |= MPOL_F_STATIC_NODES;
2753                 else if (!strcmp(flags, "relative"))
2754                         mode_flags |= MPOL_F_RELATIVE_NODES;
2755                 else
2756                         goto out;
2757         }
2758
2759         new = mpol_new(mode, mode_flags, &nodes);
2760         if (IS_ERR(new))
2761                 goto out;
2762
2763         /*
2764          * Save nodes for mpol_to_str() to show the tmpfs mount options
2765          * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2766          */
2767         if (mode != MPOL_PREFERRED)
2768                 new->v.nodes = nodes;
2769         else if (nodelist)
2770                 new->v.preferred_node = first_node(nodes);
2771         else
2772                 new->flags |= MPOL_F_LOCAL;
2773
2774         /*
2775          * Save nodes for contextualization: this will be used to "clone"
2776          * the mempolicy in a specific context [cpuset] at a later time.
2777          */
2778         new->w.user_nodemask = nodes;
2779
2780         err = 0;
2781
2782 out:
2783         /* Restore string for error message */
2784         if (nodelist)
2785                 *--nodelist = ':';
2786         if (flags)
2787                 *--flags = '=';
2788         if (!err)
2789                 *mpol = new;
2790         return err;
2791 }
2792 #endif /* CONFIG_TMPFS */
2793
2794 /**
2795  * mpol_to_str - format a mempolicy structure for printing
2796  * @buffer:  to contain formatted mempolicy string
2797  * @maxlen:  length of @buffer
2798  * @pol:  pointer to mempolicy to be formatted
2799  *
2800  * Convert a mempolicy into a string.
2801  * Returns the number of characters in buffer (if positive)
2802  * or an error (negative)
2803  */
2804 int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2805 {
2806         char *p = buffer;
2807         int l;
2808         nodemask_t nodes;
2809         unsigned short mode;
2810         unsigned short flags = pol ? pol->flags : 0;
2811
2812         /*
2813          * Sanity check:  room for longest mode, flag and some nodes
2814          */
2815         VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2816
2817         if (!pol || pol == &default_policy)
2818                 mode = MPOL_DEFAULT;
2819         else
2820                 mode = pol->mode;
2821
2822         switch (mode) {
2823         case MPOL_DEFAULT:
2824                 nodes_clear(nodes);
2825                 break;
2826
2827         case MPOL_PREFERRED:
2828                 nodes_clear(nodes);
2829                 if (flags & MPOL_F_LOCAL)
2830                         mode = MPOL_LOCAL;
2831                 else
2832                         node_set(pol->v.preferred_node, nodes);
2833                 break;
2834
2835         case MPOL_BIND:
2836                 /* Fall through */
2837         case MPOL_INTERLEAVE:
2838                 nodes = pol->v.nodes;
2839                 break;
2840
2841         default:
2842                 return -EINVAL;
2843         }
2844
2845         l = strlen(policy_modes[mode]);
2846         if (buffer + maxlen < p + l + 1)
2847                 return -ENOSPC;
2848
2849         strcpy(p, policy_modes[mode]);
2850         p += l;
2851
2852         if (flags & MPOL_MODE_FLAGS) {
2853                 if (buffer + maxlen < p + 2)
2854                         return -ENOSPC;
2855                 *p++ = '=';
2856
2857                 /*
2858                  * Currently, the only defined flags are mutually exclusive
2859                  */
2860                 if (flags & MPOL_F_STATIC_NODES)
2861                         p += snprintf(p, buffer + maxlen - p, "static");
2862                 else if (flags & MPOL_F_RELATIVE_NODES)
2863                         p += snprintf(p, buffer + maxlen - p, "relative");
2864         }
2865
2866         if (!nodes_empty(nodes)) {
2867                 if (buffer + maxlen < p + 2)
2868                         return -ENOSPC;
2869                 *p++ = ':';
2870                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2871         }
2872         return p - buffer;
2873 }