mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
  19  *                for anonymous memory. For process policy an process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
  49  * on systems with highmem kernel lowmem allocation don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66 */
  67
  68 #include <linux/mempolicy.h>
  69 #include <linux/mm.h>
  70 #include <linux/highmem.h>
  71 #include <linux/hugetlb.h>
  72 #include <linux/kernel.h>
  73 #include <linux/sched.h>
  74 #include <linux/nodemask.h>
  75 #include <linux/cpuset.h>
  76 #include <linux/slab.h>
  77 #include <linux/string.h>
  78 #include <linux/export.h>
  79 #include <linux/nsproxy.h>
  80 #include <linux/interrupt.h>
  81 #include <linux/init.h>
  82 #include <linux/compat.h>
  83 #include <linux/swap.h>
  84 #include <linux/seq_file.h>
  85 #include <linux/proc_fs.h>
  86 #include <linux/migrate.h>
  87 #include <linux/ksm.h>
  88 #include <linux/rmap.h>
  89 #include <linux/security.h>
  90 #include <linux/syscalls.h>
  91 #include <linux/ctype.h>
  92 #include <linux/mm_inline.h>
  93 #include <linux/mmu_notifier.h>
  94
  95 #include <asm/tlbflush.h>
  96 #include <asm/uaccess.h>
  97 #include <linux/random.h>
  98
  99 #include "internal.h"
 100
 101 /* Internal flags */
 102 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
 103 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
 104
 105 static struct kmem_cache *policy_cache;
 106 static struct kmem_cache *sn_cache;
 107
 108 /* Highest zone. An specific allocation for a zone below that is not
 109    policied. */
 110 enum zone_type policy_zone = 0;
 111
 112 /*
 113  * run-time system-wide default policy => local allocation
 114  */
 115 static struct mempolicy default_policy = {
 116         .refcnt = ATOMIC_INIT(1), /* never free it */
 117         .mode = MPOL_PREFERRED,
 118         .flags = MPOL_F_LOCAL,
 119 };
 120
 121 static const struct mempolicy_operations {
 122         int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 123         /*
 124          * If read-side task has no lock to protect task->mempolicy, write-side
 125          * task will rebind the task->mempolicy by two step. The first step is
 126          * setting all the newly nodes, and the second step is cleaning all the
 127          * disallowed nodes. In this way, we can avoid finding no node to alloc
 128          * page.
 129          * If we have a lock to protect task->mempolicy in read-side, we do
 130          * rebind directly.
 131          *
 132          * step:
 133          *      MPOL_REBIND_ONCE - do rebind work at once
 134          *      MPOL_REBIND_STEP1 - set all the newly nodes
 135          *      MPOL_REBIND_STEP2 - clean all the disallowed nodes
 136          */
 137         void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
 138                         enum mpol_rebind_step step);
 139 } mpol_ops[MPOL_MAX];
 140
 141 /* Check that the nodemask contains at least one populated zone */
 142 static int is_valid_nodemask(const nodemask_t *nodemask)
 143 {
 144         int nd, k;
 145
 146         for_each_node_mask(nd, *nodemask) {
 147                 struct zone *z;
 148
 149                 for (k = 0; k <= policy_zone; k++) {
 150                         z = &NODE_DATA(nd)->node_zones[k];
 151                         if (z->present_pages > 0)
 152                                 return 1;
 153                 }
 154         }
 155
 156         return 0;
 157 }
 158
 159 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 160 {
 161         return pol->flags & MPOL_MODE_FLAGS;
 162 }
 163
 164 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 165                                    const nodemask_t *rel)
 166 {
 167         nodemask_t tmp;
 168         nodes_fold(tmp, *orig, nodes_weight(*rel));
 169         nodes_onto(*ret, tmp, *rel);
 170 }
 171
 172 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 173 {
 174         if (nodes_empty(*nodes))
 175                 return -EINVAL;
 176         pol->v.nodes = *nodes;
 177         return 0;
 178 }
 179
 180 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 181 {
 182         if (!nodes)
 183                 pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 184         else if (nodes_empty(*nodes))
 185                 return -EINVAL;                 /*  no allowed nodes */
 186         else
 187                 pol->v.preferred_node = first_node(*nodes);
 188         return 0;
 189 }
 190
 191 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 192 {
 193         if (!is_valid_nodemask(nodes))
 194                 return -EINVAL;
 195         pol->v.nodes = *nodes;
 196         return 0;
 197 }
 198
 199 /*
 200  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 201  * any, for the new policy.  mpol_new() has already validated the nodes
 202  * parameter with respect to the policy mode and flags.  But, we need to
 203  * handle an empty nodemask with MPOL_PREFERRED here.
 204  *
 205  * Must be called holding task's alloc_lock to protect task's mems_allowed
 206  * and mempolicy.  May also be called holding the mmap_semaphore for write.
 207  */
 208 static int mpol_set_nodemask(struct mempolicy *pol,
 209                      const nodemask_t *nodes, struct nodemask_scratch *nsc)
 210 {
 211         int ret;
 212
 213         /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
 214         if (pol == NULL)
 215                 return 0;
 216         /* Check N_HIGH_MEMORY */
 217         nodes_and(nsc->mask1,
 218                   cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
 219
 220         VM_BUG_ON(!nodes);
 221         if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
 222                 nodes = NULL;   /* explicit local allocation */
 223         else {
 224                 if (pol->flags & MPOL_F_RELATIVE_NODES)
 225                         mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
 226                 else
 227                         nodes_and(nsc->mask2, *nodes, nsc->mask1);
 228
 229                 if (mpol_store_user_nodemask(pol))
 230                         pol->w.user_nodemask = *nodes;
 231                 else
 232                         pol->w.cpuset_mems_allowed =
 233                                                 cpuset_current_mems_allowed;
 234         }
 235
 236         if (nodes)
 237                 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 238         else
 239                 ret = mpol_ops[pol->mode].create(pol, NULL);
 240         return ret;
 241 }
 242
 243 /*
 244  * This function just creates a new policy, does some check and simple
 245  * initialization. You must invoke mpol_set_nodemask() to set nodes.
 246  */
 247 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 248                                   nodemask_t *nodes)
 249 {
 250         struct mempolicy *policy;
 251
 252         pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 253                  mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
 254
 255         if (mode == MPOL_DEFAULT) {
 256                 if (nodes && !nodes_empty(*nodes))
 257                         return ERR_PTR(-EINVAL);
 258                 return NULL;
 259         }
 260         VM_BUG_ON(!nodes);
 261
 262         /*
 263          * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 264          * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 265          * All other modes require a valid pointer to a non-empty nodemask.
 266          */
 267         if (mode == MPOL_PREFERRED) {
 268                 if (nodes_empty(*nodes)) {
 269                         if (((flags & MPOL_F_STATIC_NODES) ||
 270                              (flags & MPOL_F_RELATIVE_NODES)))
 271                                 return ERR_PTR(-EINVAL);
 272                 }
 273         } else if (mode == MPOL_LOCAL) {
 274                 if (!nodes_empty(*nodes))
 275                         return ERR_PTR(-EINVAL);
 276                 mode = MPOL_PREFERRED;
 277         } else if (nodes_empty(*nodes))
 278                 return ERR_PTR(-EINVAL);
 279         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 280         if (!policy)
 281                 return ERR_PTR(-ENOMEM);
 282         atomic_set(&policy->refcnt, 1);
 283         policy->mode = mode;
 284         policy->flags = flags;
 285
 286         return policy;
 287 }
 288
 289 /* Slow path of a mpol destructor. */
 290 void __mpol_put(struct mempolicy *p)
 291 {
 292         if (!atomic_dec_and_test(&p->refcnt))
 293                 return;
 294         kmem_cache_free(policy_cache, p);
 295 }
 296
 297 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
 298                                 enum mpol_rebind_step step)
 299 {
 300 }
 301
 302 /*
 303  * step:
 304  *      MPOL_REBIND_ONCE  - do rebind work at once
 305  *      MPOL_REBIND_STEP1 - set all the newly nodes
 306  *      MPOL_REBIND_STEP2 - clean all the disallowed nodes
 307  */
 308 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
 309                                  enum mpol_rebind_step step)
 310 {
 311         nodemask_t tmp;
 312
 313         if (pol->flags & MPOL_F_STATIC_NODES)
 314                 nodes_and(tmp, pol->w.user_nodemask, *nodes);
 315         else if (pol->flags & MPOL_F_RELATIVE_NODES)
 316                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 317         else {
 318                 /*
 319                  * if step == 1, we use ->w.cpuset_mems_allowed to cache the
 320                  * result
 321                  */
 322                 if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
 323                         nodes_remap(tmp, pol->v.nodes,
 324                                         pol->w.cpuset_mems_allowed, *nodes);
 325                         pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
 326                 } else if (step == MPOL_REBIND_STEP2) {
 327                         tmp = pol->w.cpuset_mems_allowed;
 328                         pol->w.cpuset_mems_allowed = *nodes;
 329                 } else
 330                         BUG();
 331         }
 332
 333         if (nodes_empty(tmp))
 334                 tmp = *nodes;
 335
 336         if (step == MPOL_REBIND_STEP1)
 337                 nodes_or(pol->v.nodes, pol->v.nodes, tmp);
 338         else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
 339                 pol->v.nodes = tmp;
 340         else
 341                 BUG();
 342
 343         if (!node_isset(current->il_next, tmp)) {
 344                 current->il_next = next_node(current->il_next, tmp);
 345                 if (current->il_next >= MAX_NUMNODES)
 346                         current->il_next = first_node(tmp);
 347                 if (current->il_next >= MAX_NUMNODES)
 348                         current->il_next = numa_node_id();
 349         }
 350 }
 351
 352 static void mpol_rebind_preferred(struct mempolicy *pol,
 353                                   const nodemask_t *nodes,
 354                                   enum mpol_rebind_step step)
 355 {
 356         nodemask_t tmp;
 357
 358         if (pol->flags & MPOL_F_STATIC_NODES) {
 359                 int node = first_node(pol->w.user_nodemask);
 360
 361                 if (node_isset(node, *nodes)) {
 362                         pol->v.preferred_node = node;
 363                         pol->flags &= ~MPOL_F_LOCAL;
 364                 } else
 365                         pol->flags |= MPOL_F_LOCAL;
 366         } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 367                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 368                 pol->v.preferred_node = first_node(tmp);
 369         } else if (!(pol->flags & MPOL_F_LOCAL)) {
 370                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
 371                                                    pol->w.cpuset_mems_allowed,
 372                                                    *nodes);
 373                 pol->w.cpuset_mems_allowed = *nodes;
 374         }
 375 }
 376
 377 /*
 378  * mpol_rebind_policy - Migrate a policy to a different set of nodes
 379  *
 380  * If read-side task has no lock to protect task->mempolicy, write-side
 381  * task will rebind the task->mempolicy by two step. The first step is
 382  * setting all the newly nodes, and the second step is cleaning all the
 383  * disallowed nodes. In this way, we can avoid finding no node to alloc
 384  * page.
 385  * If we have a lock to protect task->mempolicy in read-side, we do
 386  * rebind directly.
 387  *
 388  * step:
 389  *      MPOL_REBIND_ONCE  - do rebind work at once
 390  *      MPOL_REBIND_STEP1 - set all the newly nodes
 391  *      MPOL_REBIND_STEP2 - clean all the disallowed nodes
 392  */
 393 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
 394                                 enum mpol_rebind_step step)
 395 {
 396         if (!pol)
 397                 return;
 398         if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
 399             nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 400                 return;
 401
 402         if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
 403                 return;
 404
 405         if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
 406                 BUG();
 407
 408         if (step == MPOL_REBIND_STEP1)
 409                 pol->flags |= MPOL_F_REBINDING;
 410         else if (step == MPOL_REBIND_STEP2)
 411                 pol->flags &= ~MPOL_F_REBINDING;
 412         else if (step >= MPOL_REBIND_NSTEP)
 413                 BUG();
 414
 415         mpol_ops[pol->mode].rebind(pol, newmask, step);
 416 }
 417
 418 /*
 419  * Wrapper for mpol_rebind_policy() that just requires task
 420  * pointer, and updates task mempolicy.
 421  *
 422  * Called with task's alloc_lock held.
 423  */
 424
 425 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
 426                         enum mpol_rebind_step step)
 427 {
 428         mpol_rebind_policy(tsk->mempolicy, new, step);
 429 }
 430
 431 /*
 432  * Rebind each vma in mm to new nodemask.
 433  *
 434  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 435  */
 436
 437 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 438 {
 439         struct vm_area_struct *vma;
 440
 441         down_write(&mm->mmap_sem);
 442         for (vma = mm->mmap; vma; vma = vma->vm_next)
 443                 mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
 444         up_write(&mm->mmap_sem);
 445 }
 446
 447 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 448         [MPOL_DEFAULT] = {
 449                 .rebind = mpol_rebind_default,
 450         },
 451         [MPOL_INTERLEAVE] = {
 452                 .create = mpol_new_interleave,
 453                 .rebind = mpol_rebind_nodemask,
 454         },
 455         [MPOL_PREFERRED] = {
 456                 .create = mpol_new_preferred,
 457                 .rebind = mpol_rebind_preferred,
 458         },
 459         [MPOL_BIND] = {
 460                 .create = mpol_new_bind,
 461                 .rebind = mpol_rebind_nodemask,
 462         },
 463 };
 464
 465 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 466                                 unsigned long flags);
 467
 468 /* Scan through pages checking if pages follow certain conditions. */
 469 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 470                 unsigned long addr, unsigned long end,
 471                 const nodemask_t *nodes, unsigned long flags,
 472                 void *private)
 473 {
 474         pte_t *orig_pte;
 475         pte_t *pte;
 476         spinlock_t *ptl;
 477
 478         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 479         do {
 480                 struct page *page;
 481                 int nid;
 482
 483                 if (!pte_present(*pte))
 484                         continue;
 485                 page = vm_normal_page(vma, addr, *pte);
 486                 if (!page)
 487                         continue;
 488                 /*
 489                  * vm_normal_page() filters out zero pages, but there might
 490                  * still be PageReserved pages to skip, perhaps in a VDSO.
 491                  * And we cannot move PageKsm pages sensibly or safely yet.
 492                  */
 493                 if (PageReserved(page) || PageKsm(page))
 494                         continue;
 495                 nid = page_to_nid(page);
 496                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 497                         continue;
 498
 499                 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 500                         migrate_page_add(page, private, flags);
 501                 else
 502                         break;
 503         } while (pte++, addr += PAGE_SIZE, addr != end);
 504         pte_unmap_unlock(orig_pte, ptl);
 505         return addr != end;
 506 }
 507
 508 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 509                 unsigned long addr, unsigned long end,
 510                 const nodemask_t *nodes, unsigned long flags,
 511                 void *private)
 512 {
 513         pmd_t *pmd;
 514         unsigned long next;
 515
 516         pmd = pmd_offset(pud, addr);
 517         do {
 518                 next = pmd_addr_end(addr, end);
 519                 split_huge_page_pmd(vma->vm_mm, pmd);
 520                 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
 521                         continue;
 522                 if (check_pte_range(vma, pmd, addr, next, nodes,
 523                                     flags, private))
 524                         return -EIO;
 525         } while (pmd++, addr = next, addr != end);
 526         return 0;
 527 }
 528
 529 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 530                 unsigned long addr, unsigned long end,
 531                 const nodemask_t *nodes, unsigned long flags,
 532                 void *private)
 533 {
 534         pud_t *pud;
 535         unsigned long next;
 536
 537         pud = pud_offset(pgd, addr);
 538         do {
 539                 next = pud_addr_end(addr, end);
 540                 if (pud_none_or_clear_bad(pud))
 541                         continue;
 542                 if (check_pmd_range(vma, pud, addr, next, nodes,
 543                                     flags, private))
 544                         return -EIO;
 545         } while (pud++, addr = next, addr != end);
 546         return 0;
 547 }
 548
 549 static inline int check_pgd_range(struct vm_area_struct *vma,
 550                 unsigned long addr, unsigned long end,
 551                 const nodemask_t *nodes, unsigned long flags,
 552                 void *private)
 553 {
 554         pgd_t *pgd;
 555         unsigned long next;
 556
 557         pgd = pgd_offset(vma->vm_mm, addr);
 558         do {
 559                 next = pgd_addr_end(addr, end);
 560                 if (pgd_none_or_clear_bad(pgd))
 561                         continue;
 562                 if (check_pud_range(vma, pgd, addr, next, nodes,
 563                                     flags, private))
 564                         return -EIO;
 565         } while (pgd++, addr = next, addr != end);
 566         return 0;
 567 }
 568
 569 #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
 570 /*
 571  * This is used to mark a range of virtual addresses to be inaccessible.
 572  * These are later cleared by a NUMA hinting fault. Depending on these
 573  * faults, pages may be migrated for better NUMA placement.
 574  *
 575  * This is assuming that NUMA faults are handled using PROT_NONE. If
 576  * an architecture makes a different choice, it will need further
 577  * changes to the core.
 578  */
 579 unsigned long change_prot_numa(struct vm_area_struct *vma,
 580                         unsigned long addr, unsigned long end)
 581 {
 582         int nr_updated;
 583         BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
 584
 585         nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
 586         if (nr_updated)
 587                 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 588
 589         return nr_updated;
 590 }
 591 #else
 592 static unsigned long change_prot_numa(struct vm_area_struct *vma,
 593                         unsigned long addr, unsigned long end)
 594 {
 595         return 0;
 596 }
 597 #endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
 598
 599 /*
 600  * Check if all pages in a range are on a set of nodes.
 601  * If pagelist != NULL then isolate pages from the LRU and
 602  * put them on the pagelist.
 603  */
 604 static struct vm_area_struct *
 605 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 606                 const nodemask_t *nodes, unsigned long flags, void *private)
 607 {
 608         int err;
 609         struct vm_area_struct *first, *vma, *prev;
 610
 611
 612         first = find_vma(mm, start);
 613         if (!first)
 614                 return ERR_PTR(-EFAULT);
 615         prev = NULL;
 616         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 617                 unsigned long endvma = vma->vm_end;
 618
 619                 if (endvma > end)
 620                         endvma = end;
 621                 if (vma->vm_start > start)
 622                         start = vma->vm_start;
 623
 624                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 625                         if (!vma->vm_next && vma->vm_end < end)
 626                                 return ERR_PTR(-EFAULT);
 627                         if (prev && prev->vm_end < vma->vm_start)
 628                                 return ERR_PTR(-EFAULT);
 629                 }
 630
 631                 if (is_vm_hugetlb_page(vma))
 632                         goto next;
 633
 634                 if (flags & MPOL_MF_LAZY) {
 635                         change_prot_numa(vma, start, endvma);
 636                         goto next;
 637                 }
 638
 639                 if ((flags & MPOL_MF_STRICT) ||
 640                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 641                       vma_migratable(vma))) {
 642
 643                         err = check_pgd_range(vma, start, endvma, nodes,
 644                                                 flags, private);
 645                         if (err) {
 646                                 first = ERR_PTR(err);
 647                                 break;
 648                         }
 649                 }
 650 next:
 651                 prev = vma;
 652         }
 653         return first;
 654 }
 655
 656 /*
 657  * Apply policy to a single VMA
 658  * This must be called with the mmap_sem held for writing.
 659  */
 660 static int vma_replace_policy(struct vm_area_struct *vma,
 661                                                 struct mempolicy *pol)
 662 {
 663         int err;
 664         struct mempolicy *old;
 665         struct mempolicy *new;
 666
 667         pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 668                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 669                  vma->vm_ops, vma->vm_file,
 670                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 671
 672         new = mpol_dup(pol);
 673         if (IS_ERR(new))
 674                 return PTR_ERR(new);
 675
 676         if (vma->vm_ops && vma->vm_ops->set_policy) {
 677                 err = vma->vm_ops->set_policy(vma, new);
 678                 if (err)
 679                         goto err_out;
 680         }
 681
 682         old = vma->vm_policy;
 683         vma->vm_policy = new; /* protected by mmap_sem */
 684         mpol_put(old);
 685
 686         return 0;
 687  err_out:
 688         mpol_put(new);
 689         return err;
 690 }
 691
 692 /* Step 2: apply policy to a range and do splits. */
 693 static int mbind_range(struct mm_struct *mm, unsigned long start,
 694                        unsigned long end, struct mempolicy *new_pol)
 695 {
 696         struct vm_area_struct *next;
 697         struct vm_area_struct *prev;
 698         struct vm_area_struct *vma;
 699         int err = 0;
 700         pgoff_t pgoff;
 701         unsigned long vmstart;
 702         unsigned long vmend;
 703
 704         vma = find_vma(mm, start);
 705         if (!vma || vma->vm_start > start)
 706                 return -EFAULT;
 707
 708         prev = vma->vm_prev;
 709         if (start > vma->vm_start)
 710                 prev = vma;
 711
 712         for (; vma && vma->vm_start < end; prev = vma, vma = next) {
 713                 next = vma->vm_next;
 714                 vmstart = max(start, vma->vm_start);
 715                 vmend   = min(end, vma->vm_end);
 716
 717                 if (mpol_equal(vma_policy(vma), new_pol))
 718                         continue;
 719
 720                 pgoff = vma->vm_pgoff +
 721                         ((vmstart - vma->vm_start) >> PAGE_SHIFT);
 722                 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
 723                                   vma->anon_vma, vma->vm_file, pgoff,
 724                                   new_pol);
 725                 if (prev) {
 726                         vma = prev;
 727                         next = vma->vm_next;
 728                         continue;
 729                 }
 730                 if (vma->vm_start != vmstart) {
 731                         err = split_vma(vma->vm_mm, vma, vmstart, 1);
 732                         if (err)
 733                                 goto out;
 734                 }
 735                 if (vma->vm_end != vmend) {
 736                         err = split_vma(vma->vm_mm, vma, vmend, 0);
 737                         if (err)
 738                                 goto out;
 739                 }
 740                 err = vma_replace_policy(vma, new_pol);
 741                 if (err)
 742                         goto out;
 743         }
 744
 745  out:
 746         return err;
 747 }
 748
 749 /*
 750  * Update task->flags PF_MEMPOLICY bit: set iff non-default
 751  * mempolicy.  Allows more rapid checking of this (combined perhaps
 752  * with other PF_* flag bits) on memory allocation hot code paths.
 753  *
 754  * If called from outside this file, the task 'p' should -only- be
 755  * a newly forked child not yet visible on the task list, because
 756  * manipulating the task flags of a visible task is not safe.
 757  *
 758  * The above limitation is why this routine has the funny name
 759  * mpol_fix_fork_child_flag().
 760  *
 761  * It is also safe to call this with a task pointer of current,
 762  * which the static wrapper mpol_set_task_struct_flag() does,
 763  * for use within this file.
 764  */
 765
 766 void mpol_fix_fork_child_flag(struct task_struct *p)
 767 {
 768         if (p->mempolicy)
 769                 p->flags |= PF_MEMPOLICY;
 770         else
 771                 p->flags &= ~PF_MEMPOLICY;
 772 }
 773
 774 static void mpol_set_task_struct_flag(void)
 775 {
 776         mpol_fix_fork_child_flag(current);
 777 }
 778
 779 /* Set the process memory policy */
 780 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 781                              nodemask_t *nodes)
 782 {
 783         struct mempolicy *new, *old;
 784         struct mm_struct *mm = current->mm;
 785         NODEMASK_SCRATCH(scratch);
 786         int ret;
 787
 788         if (!scratch)
 789                 return -ENOMEM;
 790
 791         new = mpol_new(mode, flags, nodes);
 792         if (IS_ERR(new)) {
 793                 ret = PTR_ERR(new);
 794                 goto out;
 795         }
 796         /*
 797          * prevent changing our mempolicy while show_numa_maps()
 798          * is using it.
 799          * Note:  do_set_mempolicy() can be called at init time
 800          * with no 'mm'.
 801          */
 802         if (mm)
 803                 down_write(&mm->mmap_sem);
 804         task_lock(current);
 805         ret = mpol_set_nodemask(new, nodes, scratch);
 806         if (ret) {
 807                 task_unlock(current);
 808                 if (mm)
 809                         up_write(&mm->mmap_sem);
 810                 mpol_put(new);
 811                 goto out;
 812         }
 813         old = current->mempolicy;
 814         current->mempolicy = new;
 815         mpol_set_task_struct_flag();
 816         if (new && new->mode == MPOL_INTERLEAVE &&
 817             nodes_weight(new->v.nodes))
 818                 current->il_next = first_node(new->v.nodes);
 819         task_unlock(current);
 820         if (mm)
 821                 up_write(&mm->mmap_sem);
 822
 823         mpol_put(old);
 824         ret = 0;
 825 out:
 826         NODEMASK_SCRATCH_FREE(scratch);
 827         return ret;
 828 }
 829
 830 /*
 831  * Return nodemask for policy for get_mempolicy() query
 832  *
 833  * Called with task's alloc_lock held
 834  */
 835 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 836 {
 837         nodes_clear(*nodes);
 838         if (p == &default_policy)
 839                 return;
 840
 841         switch (p->mode) {
 842         case MPOL_BIND:
 843                 /* Fall through */
 844         case MPOL_INTERLEAVE:
 845                 *nodes = p->v.nodes;
 846                 break;
 847         case MPOL_PREFERRED:
 848                 if (!(p->flags & MPOL_F_LOCAL))
 849                         node_set(p->v.preferred_node, *nodes);
 850                 /* else return empty node mask for local allocation */
 851                 break;
 852         default:
 853                 BUG();
 854         }
 855 }
 856
 857 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 858 {
 859         struct page *p;
 860         int err;
 861
 862         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 863         if (err >= 0) {
 864                 err = page_to_nid(p);
 865                 put_page(p);
 866         }
 867         return err;
 868 }
 869
 870 /* Retrieve NUMA policy */
 871 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 872                              unsigned long addr, unsigned long flags)
 873 {
 874         int err;
 875         struct mm_struct *mm = current->mm;
 876         struct vm_area_struct *vma = NULL;
 877         struct mempolicy *pol = current->mempolicy;
 878
 879         if (flags &
 880                 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 881                 return -EINVAL;
 882
 883         if (flags & MPOL_F_MEMS_ALLOWED) {
 884                 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 885                         return -EINVAL;
 886                 *policy = 0;    /* just so it's initialized */
 887                 task_lock(current);
 888                 *nmask  = cpuset_current_mems_allowed;
 889                 task_unlock(current);
 890                 return 0;
 891         }
 892
 893         if (flags & MPOL_F_ADDR) {
 894                 /*
 895                  * Do NOT fall back to task policy if the
 896                  * vma/shared policy at addr is NULL.  We
 897                  * want to return MPOL_DEFAULT in this case.
 898                  */
 899                 down_read(&mm->mmap_sem);
 900                 vma = find_vma_intersection(mm, addr, addr+1);
 901                 if (!vma) {
 902                         up_read(&mm->mmap_sem);
 903                         return -EFAULT;
 904                 }
 905                 if (vma->vm_ops && vma->vm_ops->get_policy)
 906                         pol = vma->vm_ops->get_policy(vma, addr);
 907                 else
 908                         pol = vma->vm_policy;
 909         } else if (addr)
 910                 return -EINVAL;
 911
 912         if (!pol)
 913                 pol = &default_policy;  /* indicates default behavior */
 914
 915         if (flags & MPOL_F_NODE) {
 916                 if (flags & MPOL_F_ADDR) {
 917                         err = lookup_node(mm, addr);
 918                         if (err < 0)
 919                                 goto out;
 920                         *policy = err;
 921                 } else if (pol == current->mempolicy &&
 922                                 pol->mode == MPOL_INTERLEAVE) {
 923                         *policy = current->il_next;
 924                 } else {
 925                         err = -EINVAL;
 926                         goto out;
 927                 }
 928         } else {
 929                 *policy = pol == &default_policy ? MPOL_DEFAULT :
 930                                                 pol->mode;
 931                 /*
 932                  * Internal mempolicy flags must be masked off before exposing
 933                  * the policy to userspace.
 934                  */
 935                 *policy |= (pol->flags & MPOL_MODE_FLAGS);
 936         }
 937
 938         if (vma) {
 939                 up_read(&current->mm->mmap_sem);
 940                 vma = NULL;
 941         }
 942
 943         err = 0;
 944         if (nmask) {
 945                 if (mpol_store_user_nodemask(pol)) {
 946                         *nmask = pol->w.user_nodemask;
 947                 } else {
 948                         task_lock(current);
 949                         get_policy_nodemask(pol, nmask);
 950                         task_unlock(current);
 951                 }
 952         }
 953
 954  out:
 955         mpol_cond_put(pol);
 956         if (vma)
 957                 up_read(&current->mm->mmap_sem);
 958         return err;
 959 }
 960
 961 #ifdef CONFIG_MIGRATION
 962 /*
 963  * page migration
 964  */
 965 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 966                                 unsigned long flags)
 967 {
 968         /*
 969          * Avoid migrating a page that is shared with others.
 970          */
 971         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
 972                 if (!isolate_lru_page(page)) {
 973                         list_add_tail(&page->lru, pagelist);
 974                         inc_zone_page_state(page, NR_ISOLATED_ANON +
 975                                             page_is_file_cache(page));
 976                 }
 977         }
 978 }
 979
 980 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 981 {
 982         return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
 983 }
 984
 985 /*
 986  * Migrate pages from one node to a target node.
 987  * Returns error or the number of pages not migrated.
 988  */
 989 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 990                            int flags)
 991 {
 992         nodemask_t nmask;
 993         LIST_HEAD(pagelist);
 994         int err = 0;
 995
 996         nodes_clear(nmask);
 997         node_set(source, nmask);
 998
 999         /*
1000          * This does not "check" the range but isolates all pages that
1001          * need migration.  Between passing in the full user address
1002          * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1003          */
1004         VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1005         check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1006                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1007
1008         if (!list_empty(&pagelist)) {
1009                 err = migrate_pages(&pagelist, new_node_page, dest,
1010                                                         false, MIGRATE_SYNC,
1011                                                         MR_SYSCALL);
1012                 if (err)
1013                         putback_lru_pages(&pagelist);
1014         }
1015
1016         return err;
1017 }
1018
1019 /*
1020  * Move pages between the two nodesets so as to preserve the physical
1021  * layout as much as possible.
1022  *
1023  * Returns the number of page that could not be moved.
1024  */
1025 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1026                      const nodemask_t *to, int flags)
1027 {
1028         int busy = 0;
1029         int err;
1030         nodemask_t tmp;
1031
1032         err = migrate_prep();
1033         if (err)
1034                 return err;
1035
1036         down_read(&mm->mmap_sem);
1037
1038         err = migrate_vmas(mm, from, to, flags);
1039         if (err)
1040                 goto out;
1041
1042         /*
1043          * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1044          * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1045          * bit in 'tmp', and return that <source, dest> pair for migration.
1046          * The pair of nodemasks 'to' and 'from' define the map.
1047          *
1048          * If no pair of bits is found that way, fallback to picking some
1049          * pair of 'source' and 'dest' bits that are not the same.  If the
1050          * 'source' and 'dest' bits are the same, this represents a node
1051          * that will be migrating to itself, so no pages need move.
1052          *
1053          * If no bits are left in 'tmp', or if all remaining bits left
1054          * in 'tmp' correspond to the same bit in 'to', return false
1055          * (nothing left to migrate).
1056          *
1057          * This lets us pick a pair of nodes to migrate between, such that
1058          * if possible the dest node is not already occupied by some other
1059          * source node, minimizing the risk of overloading the memory on a
1060          * node that would happen if we migrated incoming memory to a node
1061          * before migrating outgoing memory source that same node.
1062          *
1063          * A single scan of tmp is sufficient.  As we go, we remember the
1064          * most recent <s, d> pair that moved (s != d).  If we find a pair
1065          * that not only moved, but what's better, moved to an empty slot
1066          * (d is not set in tmp), then we break out then, with that pair.
1067          * Otherwise when we finish scanning from_tmp, we at least have the
1068          * most recent <s, d> pair that moved.  If we get all the way through
1069          * the scan of tmp without finding any node that moved, much less
1070          * moved to an empty node, then there is nothing left worth migrating.
1071          */
1072
1073         tmp = *from;
1074         while (!nodes_empty(tmp)) {
1075                 int s,d;
1076                 int source = -1;
1077                 int dest = 0;
1078
1079                 for_each_node_mask(s, tmp) {
1080
1081                         /*
1082                          * do_migrate_pages() tries to maintain the relative
1083                          * node relationship of the pages established between
1084                          * threads and memory areas.
1085                          *
1086                          * However if the number of source nodes is not equal to
1087                          * the number of destination nodes we can not preserve
1088                          * this node relative relationship.  In that case, skip
1089                          * copying memory from a node that is in the destination
1090                          * mask.
1091                          *
1092                          * Example: [2,3,4] -> [3,4,5] moves everything.
1093                          *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1094                          */
1095
1096                         if ((nodes_weight(*from) != nodes_weight(*to)) &&
1097                                                 (node_isset(s, *to)))
1098                                 continue;
1099
1100                         d = node_remap(s, *from, *to);
1101                         if (s == d)
1102                                 continue;
1103
1104                         source = s;     /* Node moved. Memorize */
1105                         dest = d;
1106
1107                         /* dest not in remaining from nodes? */
1108                         if (!node_isset(dest, tmp))
1109                                 break;
1110                 }
1111                 if (source == -1)
1112                         break;
1113
1114                 node_clear(source, tmp);
1115                 err = migrate_to_node(mm, source, dest, flags);
1116                 if (err > 0)
1117                         busy += err;
1118                 if (err < 0)
1119                         break;
1120         }
1121 out:
1122         up_read(&mm->mmap_sem);
1123         if (err < 0)
1124                 return err;
1125         return busy;
1126
1127 }
1128
1129 /*
1130  * Allocate a new page for page migration based on vma policy.
1131  * Start assuming that page is mapped by vma pointed to by @private.
1132  * Search forward from there, if not.  N.B., this assumes that the
1133  * list of pages handed to migrate_pages()--which is how we get here--
1134  * is in virtual address order.
1135  */
1136 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1137 {
1138         struct vm_area_struct *vma = (struct vm_area_struct *)private;
1139         unsigned long uninitialized_var(address);
1140
1141         while (vma) {
1142                 address = page_address_in_vma(page, vma);
1143                 if (address != -EFAULT)
1144                         break;
1145                 vma = vma->vm_next;
1146         }
1147
1148         /*
1149          * if !vma, alloc_page_vma() will use task or system default policy
1150          */
1151         return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1152 }
1153 #else
1154
1155 static void migrate_page_add(struct page *page, struct list_head *pagelist,
1156                                 unsigned long flags)
1157 {
1158 }
1159
1160 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1161                      const nodemask_t *to, int flags)
1162 {
1163         return -ENOSYS;
1164 }
1165
1166 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1167 {
1168         return NULL;
1169 }
1170 #endif
1171
1172 static long do_mbind(unsigned long start, unsigned long len,
1173                      unsigned short mode, unsigned short mode_flags,
1174                      nodemask_t *nmask, unsigned long flags)
1175 {
1176         struct vm_area_struct *vma;
1177         struct mm_struct *mm = current->mm;
1178         struct mempolicy *new;
1179         unsigned long end;
1180         int err;
1181         LIST_HEAD(pagelist);
1182
1183         if (flags & ~(unsigned long)MPOL_MF_VALID)
1184                 return -EINVAL;
1185         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1186                 return -EPERM;
1187
1188         if (start & ~PAGE_MASK)
1189                 return -EINVAL;
1190
1191         if (mode == MPOL_DEFAULT)
1192                 flags &= ~MPOL_MF_STRICT;
1193
1194         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1195         end = start + len;
1196
1197         if (end < start)
1198                 return -EINVAL;
1199         if (end == start)
1200                 return 0;
1201
1202         new = mpol_new(mode, mode_flags, nmask);
1203         if (IS_ERR(new))
1204                 return PTR_ERR(new);
1205
1206         if (flags & MPOL_MF_LAZY)
1207                 new->flags |= MPOL_F_MOF;
1208
1209         /*
1210          * If we are using the default policy then operation
1211          * on discontinuous address spaces is okay after all
1212          */
1213         if (!new)
1214                 flags |= MPOL_MF_DISCONTIG_OK;
1215
1216         pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1217                  start, start + len, mode, mode_flags,
1218                  nmask ? nodes_addr(*nmask)[0] : -1);
1219
1220         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1221
1222                 err = migrate_prep();
1223                 if (err)
1224                         goto mpol_out;
1225         }
1226         {
1227                 NODEMASK_SCRATCH(scratch);
1228                 if (scratch) {
1229                         down_write(&mm->mmap_sem);
1230                         task_lock(current);
1231                         err = mpol_set_nodemask(new, nmask, scratch);
1232                         task_unlock(current);
1233                         if (err)
1234                                 up_write(&mm->mmap_sem);
1235                 } else
1236                         err = -ENOMEM;
1237                 NODEMASK_SCRATCH_FREE(scratch);
1238         }
1239         if (err)
1240                 goto mpol_out;
1241
1242         vma = check_range(mm, start, end, nmask,
1243                           flags | MPOL_MF_INVERT, &pagelist);
1244
1245         err = PTR_ERR(vma);     /* maybe ... */
1246         if (!IS_ERR(vma))
1247                 err = mbind_range(mm, start, end, new);
1248
1249         if (!err) {
1250                 int nr_failed = 0;
1251
1252                 if (!list_empty(&pagelist)) {
1253                         WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1254                         nr_failed = migrate_pages(&pagelist, new_vma_page,
1255                                                 (unsigned long)vma,
1256                                                 false, MIGRATE_SYNC,
1257                                                 MR_MEMPOLICY_MBIND);
1258                         if (nr_failed)
1259                                 putback_lru_pages(&pagelist);
1260                 }
1261
1262                 if (nr_failed && (flags & MPOL_MF_STRICT))
1263                         err = -EIO;
1264         } else
1265                 putback_lru_pages(&pagelist);
1266
1267         up_write(&mm->mmap_sem);
1268  mpol_out:
1269         mpol_put(new);
1270         return err;
1271 }
1272
1273 /*
1274  * User space interface with variable sized bitmaps for nodelists.
1275  */
1276
1277 /* Copy a node mask from user space. */
1278 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1279                      unsigned long maxnode)
1280 {
1281         unsigned long k;
1282         unsigned long nlongs;
1283         unsigned long endmask;
1284
1285         --maxnode;
1286         nodes_clear(*nodes);
1287         if (maxnode == 0 || !nmask)
1288                 return 0;
1289         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1290                 return -EINVAL;
1291
1292         nlongs = BITS_TO_LONGS(maxnode);
1293         if ((maxnode % BITS_PER_LONG) == 0)
1294                 endmask = ~0UL;
1295         else
1296                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1297
1298         /* When the user specified more nodes than supported just check
1299            if the non supported part is all zero. */
1300         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1301                 if (nlongs > PAGE_SIZE/sizeof(long))
1302                         return -EINVAL;
1303                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1304                         unsigned long t;
1305                         if (get_user(t, nmask + k))
1306                                 return -EFAULT;
1307                         if (k == nlongs - 1) {
1308                                 if (t & endmask)
1309                                         return -EINVAL;
1310                         } else if (t)
1311                                 return -EINVAL;
1312                 }
1313                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1314                 endmask = ~0UL;
1315         }
1316
1317         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1318                 return -EFAULT;
1319         nodes_addr(*nodes)[nlongs-1] &= endmask;
1320         return 0;
1321 }
1322
1323 /* Copy a kernel node mask to user space */
1324 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1325                               nodemask_t *nodes)
1326 {
1327         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1328         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1329
1330         if (copy > nbytes) {
1331                 if (copy > PAGE_SIZE)
1332                         return -EINVAL;
1333                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1334                         return -EFAULT;
1335                 copy = nbytes;
1336         }
1337         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1338 }
1339
1340 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1341                 unsigned long, mode, unsigned long __user *, nmask,
1342                 unsigned long, maxnode, unsigned, flags)
1343 {
1344         nodemask_t nodes;
1345         int err;
1346         unsigned short mode_flags;
1347
1348         mode_flags = mode & MPOL_MODE_FLAGS;
1349         mode &= ~MPOL_MODE_FLAGS;
1350         if (mode >= MPOL_MAX)
1351                 return -EINVAL;
1352         if ((mode_flags & MPOL_F_STATIC_NODES) &&
1353             (mode_flags & MPOL_F_RELATIVE_NODES))
1354                 return -EINVAL;
1355         err = get_nodes(&nodes, nmask, maxnode);
1356         if (err)
1357                 return err;
1358         return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1359 }
1360
1361 /* Set the process memory policy */
1362 SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1363                 unsigned long, maxnode)
1364 {
1365         int err;
1366         nodemask_t nodes;
1367         unsigned short flags;
1368
1369         flags = mode & MPOL_MODE_FLAGS;
1370         mode &= ~MPOL_MODE_FLAGS;
1371         if ((unsigned int)mode >= MPOL_MAX)
1372                 return -EINVAL;
1373         if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1374                 return -EINVAL;
1375         err = get_nodes(&nodes, nmask, maxnode);
1376         if (err)
1377                 return err;
1378         return do_set_mempolicy(mode, flags, &nodes);
1379 }
1380
1381 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1382                 const unsigned long __user *, old_nodes,
1383                 const unsigned long __user *, new_nodes)
1384 {
1385         const struct cred *cred = current_cred(), *tcred;
1386         struct mm_struct *mm = NULL;
1387         struct task_struct *task;
1388         nodemask_t task_nodes;
1389         int err;
1390         nodemask_t *old;
1391         nodemask_t *new;
1392         NODEMASK_SCRATCH(scratch);
1393
1394         if (!scratch)
1395                 return -ENOMEM;
1396
1397         old = &scratch->mask1;
1398         new = &scratch->mask2;
1399
1400         err = get_nodes(old, old_nodes, maxnode);
1401         if (err)
1402                 goto out;
1403
1404         err = get_nodes(new, new_nodes, maxnode);
1405         if (err)
1406                 goto out;
1407
1408         /* Find the mm_struct */
1409         rcu_read_lock();
1410         task = pid ? find_task_by_vpid(pid) : current;
1411         if (!task) {
1412                 rcu_read_unlock();
1413                 err = -ESRCH;
1414                 goto out;
1415         }
1416         get_task_struct(task);
1417
1418         err = -EINVAL;
1419
1420         /*
1421          * Check if this process has the right to modify the specified
1422          * process. The right exists if the process has administrative
1423          * capabilities, superuser privileges or the same
1424          * userid as the target process.
1425          */
1426         tcred = __task_cred(task);
1427         if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1428             !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1429             !capable(CAP_SYS_NICE)) {
1430                 rcu_read_unlock();
1431                 err = -EPERM;
1432                 goto out_put;
1433         }
1434         rcu_read_unlock();
1435
1436         task_nodes = cpuset_mems_allowed(task);
1437         /* Is the user allowed to access the target nodes? */
1438         if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1439                 err = -EPERM;
1440                 goto out_put;
1441         }
1442
1443         if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
1444                 err = -EINVAL;
1445                 goto out_put;
1446         }
1447
1448         err = security_task_movememory(task);
1449         if (err)
1450                 goto out_put;
1451
1452         mm = get_task_mm(task);
1453         put_task_struct(task);
1454
1455         if (!mm) {
1456                 err = -EINVAL;
1457                 goto out;
1458         }
1459
1460         err = do_migrate_pages(mm, old, new,
1461                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1462
1463         mmput(mm);
1464 out:
1465         NODEMASK_SCRATCH_FREE(scratch);
1466
1467         return err;
1468
1469 out_put:
1470         put_task_struct(task);
1471         goto out;
1472
1473 }
1474
1475
1476 /* Retrieve NUMA policy */
1477 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1478                 unsigned long __user *, nmask, unsigned long, maxnode,
1479                 unsigned long, addr, unsigned long, flags)
1480 {
1481         int err;
1482         int uninitialized_var(pval);
1483         nodemask_t nodes;
1484
1485         if (nmask != NULL && maxnode < MAX_NUMNODES)
1486                 return -EINVAL;
1487
1488         err = do_get_mempolicy(&pval, &nodes, addr, flags);
1489
1490         if (err)
1491                 return err;
1492
1493         if (policy && put_user(pval, policy))
1494                 return -EFAULT;
1495
1496         if (nmask)
1497                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1498
1499         return err;
1500 }
1501
1502 #ifdef CONFIG_COMPAT
1503
1504 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1505                                      compat_ulong_t __user *nmask,
1506                                      compat_ulong_t maxnode,
1507                                      compat_ulong_t addr, compat_ulong_t flags)
1508 {
1509         long err;
1510         unsigned long __user *nm = NULL;
1511         unsigned long nr_bits, alloc_size;
1512         DECLARE_BITMAP(bm, MAX_NUMNODES);
1513
1514         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1515         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1516
1517         if (nmask)
1518                 nm = compat_alloc_user_space(alloc_size);
1519
1520         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1521
1522         if (!err && nmask) {
1523                 unsigned long copy_size;
1524                 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1525                 err = copy_from_user(bm, nm, copy_size);
1526                 /* ensure entire bitmap is zeroed */
1527                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1528                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1529         }
1530
1531         return err;
1532 }
1533
1534 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1535                                      compat_ulong_t maxnode)
1536 {
1537         long err = 0;
1538         unsigned long __user *nm = NULL;
1539         unsigned long nr_bits, alloc_size;
1540         DECLARE_BITMAP(bm, MAX_NUMNODES);
1541
1542         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1543         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1544
1545         if (nmask) {
1546                 err = compat_get_bitmap(bm, nmask, nr_bits);
1547                 nm = compat_alloc_user_space(alloc_size);
1548                 err |= copy_to_user(nm, bm, alloc_size);
1549         }
1550
1551         if (err)
1552                 return -EFAULT;
1553
1554         return sys_set_mempolicy(mode, nm, nr_bits+1);
1555 }
1556
1557 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1558                              compat_ulong_t mode, compat_ulong_t __user *nmask,
1559                              compat_ulong_t maxnode, compat_ulong_t flags)
1560 {
1561         long err = 0;
1562         unsigned long __user *nm = NULL;
1563         unsigned long nr_bits, alloc_size;
1564         nodemask_t bm;
1565
1566         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1567         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1568
1569         if (nmask) {
1570                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1571                 nm = compat_alloc_user_space(alloc_size);
1572                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1573         }
1574
1575         if (err)
1576                 return -EFAULT;
1577
1578         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1579 }
1580
1581 #endif
1582
1583 /*
1584  * get_vma_policy(@task, @vma, @addr)
1585  * @task - task for fallback if vma policy == default
1586  * @vma   - virtual memory area whose policy is sought
1587  * @addr  - address in @vma for shared policy lookup
1588  *
1589  * Returns effective policy for a VMA at specified address.
1590  * Falls back to @task or system default policy, as necessary.
1591  * Current or other task's task mempolicy and non-shared vma policies must be
1592  * protected by task_lock(task) by the caller.
1593  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1594  * count--added by the get_policy() vm_op, as appropriate--to protect against
1595  * freeing by another task.  It is the caller's responsibility to free the
1596  * extra reference for shared policies.
1597  */
1598 struct mempolicy *get_vma_policy(struct task_struct *task,
1599                 struct vm_area_struct *vma, unsigned long addr)
1600 {
1601         struct mempolicy *pol = task->mempolicy;
1602
1603         if (vma) {
1604                 if (vma->vm_ops && vma->vm_ops->get_policy) {
1605                         struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1606                                                                         addr);
1607                         if (vpol)
1608                                 pol = vpol;
1609                 } else if (vma->vm_policy) {
1610                         pol = vma->vm_policy;
1611
1612                         /*
1613                          * shmem_alloc_page() passes MPOL_F_SHARED policy with
1614                          * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1615                          * count on these policies which will be dropped by
1616                          * mpol_cond_put() later
1617                          */
1618                         if (mpol_needs_cond_ref(pol))
1619                                 mpol_get(pol);
1620                 }
1621         }
1622         if (!pol)
1623                 pol = &default_policy;
1624         return pol;
1625 }
1626
1627 /*
1628  * Return a nodemask representing a mempolicy for filtering nodes for
1629  * page allocation
1630  */
1631 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1632 {
1633         /* Lower zones don't get a nodemask applied for MPOL_BIND */
1634         if (unlikely(policy->mode == MPOL_BIND) &&
1635                         gfp_zone(gfp) >= policy_zone &&
1636                         cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1637                 return &policy->v.nodes;
1638
1639         return NULL;
1640 }
1641
1642 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1643 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1644         int nd)
1645 {
1646         switch (policy->mode) {
1647         case MPOL_PREFERRED:
1648                 if (!(policy->flags & MPOL_F_LOCAL))
1649                         nd = policy->v.preferred_node;
1650                 break;
1651         case MPOL_BIND:
1652                 /*
1653                  * Normally, MPOL_BIND allocations are node-local within the
1654                  * allowed nodemask.  However, if __GFP_THISNODE is set and the
1655                  * current node isn't part of the mask, we use the zonelist for
1656                  * the first node in the mask instead.
1657                  */
1658                 if (unlikely(gfp & __GFP_THISNODE) &&
1659                                 unlikely(!node_isset(nd, policy->v.nodes)))
1660                         nd = first_node(policy->v.nodes);
1661                 break;
1662         default:
1663                 BUG();
1664         }
1665         return node_zonelist(nd, gfp);
1666 }
1667
1668 /* Do dynamic interleaving for a process */
1669 static unsigned interleave_nodes(struct mempolicy *policy)
1670 {
1671         unsigned nid, next;
1672         struct task_struct *me = current;
1673
1674         nid = me->il_next;
1675         next = next_node(nid, policy->v.nodes);
1676         if (next >= MAX_NUMNODES)
1677                 next = first_node(policy->v.nodes);
1678         if (next < MAX_NUMNODES)
1679                 me->il_next = next;
1680         return nid;
1681 }
1682
1683 /*
1684  * Depending on the memory policy provide a node from which to allocate the
1685  * next slab entry.
1686  * @policy must be protected by freeing by the caller.  If @policy is
1687  * the current task's mempolicy, this protection is implicit, as only the
1688  * task can change it's policy.  The system default policy requires no
1689  * such protection.
1690  */
1691 unsigned slab_node(void)
1692 {
1693         struct mempolicy *policy;
1694
1695         if (in_interrupt())
1696                 return numa_node_id();
1697
1698         policy = current->mempolicy;
1699         if (!policy || policy->flags & MPOL_F_LOCAL)
1700                 return numa_node_id();
1701
1702         switch (policy->mode) {
1703         case MPOL_PREFERRED:
1704                 /*
1705                  * handled MPOL_F_LOCAL above
1706                  */
1707                 return policy->v.preferred_node;
1708
1709         case MPOL_INTERLEAVE:
1710                 return interleave_nodes(policy);
1711
1712         case MPOL_BIND: {
1713                 /*
1714                  * Follow bind policy behavior and start allocation at the
1715                  * first node.
1716                  */
1717                 struct zonelist *zonelist;
1718                 struct zone *zone;
1719                 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1720                 zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1721                 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1722                                                         &policy->v.nodes,
1723                                                         &zone);
1724                 return zone ? zone->node : numa_node_id();
1725         }
1726
1727         default:
1728                 BUG();
1729         }
1730 }
1731
1732 /* Do static interleaving for a VMA with known offset. */
1733 static unsigned offset_il_node(struct mempolicy *pol,
1734                 struct vm_area_struct *vma, unsigned long off)
1735 {
1736         unsigned nnodes = nodes_weight(pol->v.nodes);
1737         unsigned target;
1738         int c;
1739         int nid = -1;
1740
1741         if (!nnodes)
1742                 return numa_node_id();
1743         target = (unsigned int)off % nnodes;
1744         c = 0;
1745         do {
1746                 nid = next_node(nid, pol->v.nodes);
1747                 c++;
1748         } while (c <= target);
1749         return nid;
1750 }
1751
1752 /* Determine a node number for interleave */
1753 static inline unsigned interleave_nid(struct mempolicy *pol,
1754                  struct vm_area_struct *vma, unsigned long addr, int shift)
1755 {
1756         if (vma) {
1757                 unsigned long off;
1758
1759                 /*
1760                  * for small pages, there is no difference between
1761                  * shift and PAGE_SHIFT, so the bit-shift is safe.
1762                  * for huge pages, since vm_pgoff is in units of small
1763                  * pages, we need to shift off the always 0 bits to get
1764                  * a useful offset.
1765                  */
1766                 BUG_ON(shift < PAGE_SHIFT);
1767                 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1768                 off += (addr - vma->vm_start) >> shift;
1769                 return offset_il_node(pol, vma, off);
1770         } else
1771                 return interleave_nodes(pol);
1772 }
1773
1774 /*
1775  * Return the bit number of a random bit set in the nodemask.
1776  * (returns -1 if nodemask is empty)
1777  */
1778 int node_random(const nodemask_t *maskp)
1779 {
1780         int w, bit = -1;
1781
1782         w = nodes_weight(*maskp);
1783         if (w)
1784                 bit = bitmap_ord_to_pos(maskp->bits,
1785                         get_random_int() % w, MAX_NUMNODES);
1786         return bit;
1787 }
1788
1789 #ifdef CONFIG_HUGETLBFS
1790 /*
1791  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1792  * @vma = virtual memory area whose policy is sought
1793  * @addr = address in @vma for shared policy lookup and interleave policy
1794  * @gfp_flags = for requested zone
1795  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1796  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1797  *
1798  * Returns a zonelist suitable for a huge page allocation and a pointer
1799  * to the struct mempolicy for conditional unref after allocation.
1800  * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1801  * @nodemask for filtering the zonelist.
1802  *
1803  * Must be protected by get_mems_allowed()
1804  */
1805 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1806                                 gfp_t gfp_flags, struct mempolicy **mpol,
1807                                 nodemask_t **nodemask)
1808 {
1809         struct zonelist *zl;
1810
1811         *mpol = get_vma_policy(current, vma, addr);
1812         *nodemask = NULL;       /* assume !MPOL_BIND */
1813
1814         if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1815                 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1816                                 huge_page_shift(hstate_vma(vma))), gfp_flags);
1817         } else {
1818                 zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1819                 if ((*mpol)->mode == MPOL_BIND)
1820                         *nodemask = &(*mpol)->v.nodes;
1821         }
1822         return zl;
1823 }
1824
1825 /*
1826  * init_nodemask_of_mempolicy
1827  *
1828  * If the current task's mempolicy is "default" [NULL], return 'false'
1829  * to indicate default policy.  Otherwise, extract the policy nodemask
1830  * for 'bind' or 'interleave' policy into the argument nodemask, or
1831  * initialize the argument nodemask to contain the single node for
1832  * 'preferred' or 'local' policy and return 'true' to indicate presence
1833  * of non-default mempolicy.
1834  *
1835  * We don't bother with reference counting the mempolicy [mpol_get/put]
1836  * because the current task is examining it's own mempolicy and a task's
1837  * mempolicy is only ever changed by the task itself.
1838  *
1839  * N.B., it is the caller's responsibility to free a returned nodemask.
1840  */
1841 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1842 {
1843         struct mempolicy *mempolicy;
1844         int nid;
1845
1846         if (!(mask && current->mempolicy))
1847                 return false;
1848
1849         task_lock(current);
1850         mempolicy = current->mempolicy;
1851         switch (mempolicy->mode) {
1852         case MPOL_PREFERRED:
1853                 if (mempolicy->flags & MPOL_F_LOCAL)
1854                         nid = numa_node_id();
1855                 else
1856                         nid = mempolicy->v.preferred_node;
1857                 init_nodemask_of_node(mask, nid);
1858                 break;
1859
1860         case MPOL_BIND:
1861                 /* Fall through */
1862         case MPOL_INTERLEAVE:
1863                 *mask =  mempolicy->v.nodes;
1864                 break;
1865
1866         default:
1867                 BUG();
1868         }
1869         task_unlock(current);
1870
1871         return true;
1872 }
1873 #endif
1874
1875 /*
1876  * mempolicy_nodemask_intersects
1877  *
1878  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1879  * policy.  Otherwise, check for intersection between mask and the policy
1880  * nodemask for 'bind' or 'interleave' policy.  For 'perferred' or 'local'
1881  * policy, always return true since it may allocate elsewhere on fallback.
1882  *
1883  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1884  */
1885 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1886                                         const nodemask_t *mask)
1887 {
1888         struct mempolicy *mempolicy;
1889         bool ret = true;
1890
1891         if (!mask)
1892                 return ret;
1893         task_lock(tsk);
1894         mempolicy = tsk->mempolicy;
1895         if (!mempolicy)
1896                 goto out;
1897
1898         switch (mempolicy->mode) {
1899         case MPOL_PREFERRED:
1900                 /*
1901                  * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
1902                  * allocate from, they may fallback to other nodes when oom.
1903                  * Thus, it's possible for tsk to have allocated memory from
1904                  * nodes in mask.
1905                  */
1906                 break;
1907         case MPOL_BIND:
1908         case MPOL_INTERLEAVE:
1909                 ret = nodes_intersects(mempolicy->v.nodes, *mask);
1910                 break;
1911         default:
1912                 BUG();
1913         }
1914 out:
1915         task_unlock(tsk);
1916         return ret;
1917 }
1918
1919 /* Allocate a page in interleaved policy.
1920    Own path because it needs to do special accounting. */
1921 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1922                                         unsigned nid)
1923 {
1924         struct zonelist *zl;
1925         struct page *page;
1926
1927         zl = node_zonelist(nid, gfp);
1928         page = __alloc_pages(gfp, order, zl);
1929         if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1930                 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1931         return page;
1932 }
1933
1934 /**
1935  *      alloc_pages_vma - Allocate a page for a VMA.
1936  *
1937  *      @gfp:
1938  *      %GFP_USER    user allocation.
1939  *      %GFP_KERNEL  kernel allocations,
1940  *      %GFP_HIGHMEM highmem/user allocations,
1941  *      %GFP_FS      allocation should not call back into a file system.
1942  *      %GFP_ATOMIC  don't sleep.
1943  *
1944  *      @order:Order of the GFP allocation.
1945  *      @vma:  Pointer to VMA or NULL if not available.
1946  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1947  *
1948  *      This function allocates a page from the kernel page pool and applies
1949  *      a NUMA policy associated with the VMA or the current process.
1950  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1951  *      mm_struct of the VMA to prevent it from going away. Should be used for
1952  *      all allocations for pages that will be mapped into
1953  *      user space. Returns NULL when no page can be allocated.
1954  *
1955  *      Should be called with the mm_sem of the vma hold.
1956  */
1957 struct page *
1958 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1959                 unsigned long addr, int node)
1960 {
1961         struct mempolicy *pol;
1962         struct zonelist *zl;
1963         struct page *page;
1964         unsigned int cpuset_mems_cookie;
1965
1966 retry_cpuset:
1967         pol = get_vma_policy(current, vma, addr);
1968         cpuset_mems_cookie = get_mems_allowed();
1969
1970         if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1971                 unsigned nid;
1972
1973                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1974                 mpol_cond_put(pol);
1975                 page = alloc_page_interleave(gfp, order, nid);
1976                 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1977                         goto retry_cpuset;
1978
1979                 return page;
1980         }
1981         zl = policy_zonelist(gfp, pol, node);
1982         if (unlikely(mpol_needs_cond_ref(pol))) {
1983                 /*
1984                  * slow path: ref counted shared policy
1985                  */
1986                 struct page *page =  __alloc_pages_nodemask(gfp, order,
1987                                                 zl, policy_nodemask(gfp, pol));
1988                 __mpol_put(pol);
1989                 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1990                         goto retry_cpuset;
1991                 return page;
1992         }
1993         /*
1994          * fast path:  default or task policy
1995          */
1996         page = __alloc_pages_nodemask(gfp, order, zl,
1997                                       policy_nodemask(gfp, pol));
1998         if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1999                 goto retry_cpuset;
2000         return page;
2001 }
2002
2003 /**
2004  *      alloc_pages_current - Allocate pages.
2005  *
2006  *      @gfp:
2007  *              %GFP_USER   user allocation,
2008  *              %GFP_KERNEL kernel allocation,
2009  *              %GFP_HIGHMEM highmem allocation,
2010  *              %GFP_FS     don't call back into a file system.
2011  *              %GFP_ATOMIC don't sleep.
2012  *      @order: Power of two of allocation size in pages. 0 is a single page.
2013  *
2014  *      Allocate a page from the kernel page pool.  When not in
2015  *      interrupt context and apply the current process NUMA policy.
2016  *      Returns NULL when no page can be allocated.
2017  *
2018  *      Don't call cpuset_update_task_memory_state() unless
2019  *      1) it's ok to take cpuset_sem (can WAIT), and
2020  *      2) allocating for current task (not interrupt).
2021  */
2022 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2023 {
2024         struct mempolicy *pol = current->mempolicy;
2025         struct page *page;
2026         unsigned int cpuset_mems_cookie;
2027
2028         if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
2029                 pol = &default_policy;
2030
2031 retry_cpuset:
2032         cpuset_mems_cookie = get_mems_allowed();
2033
2034         /*
2035          * No reference counting needed for current->mempolicy
2036          * nor system default_policy
2037          */
2038         if (pol->mode == MPOL_INTERLEAVE)
2039                 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2040         else
2041                 page = __alloc_pages_nodemask(gfp, order,
2042                                 policy_zonelist(gfp, pol, numa_node_id()),
2043                                 policy_nodemask(gfp, pol));
2044
2045         if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2046                 goto retry_cpuset;
2047
2048         return page;
2049 }
2050 EXPORT_SYMBOL(alloc_pages_current);
2051
2052 /*
2053  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2054  * rebinds the mempolicy its copying by calling mpol_rebind_policy()
2055  * with the mems_allowed returned by cpuset_mems_allowed().  This
2056  * keeps mempolicies cpuset relative after its cpuset moves.  See
2057  * further kernel/cpuset.c update_nodemask().
2058  *
2059  * current's mempolicy may be rebinded by the other task(the task that changes
2060  * cpuset's mems), so we needn't do rebind work for current task.
2061  */
2062
2063 /* Slow path of a mempolicy duplicate */
2064 struct mempolicy *__mpol_dup(struct mempolicy *old)
2065 {
2066         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2067
2068         if (!new)
2069                 return ERR_PTR(-ENOMEM);
2070
2071         /* task's mempolicy is protected by alloc_lock */
2072         if (old == current->mempolicy) {
2073                 task_lock(current);
2074                 *new = *old;
2075                 task_unlock(current);
2076         } else
2077                 *new = *old;
2078
2079         rcu_read_lock();
2080         if (current_cpuset_is_being_rebound()) {
2081                 nodemask_t mems = cpuset_mems_allowed(current);
2082                 if (new->flags & MPOL_F_REBINDING)
2083                         mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2084                 else
2085                         mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2086         }
2087         rcu_read_unlock();
2088         atomic_set(&new->refcnt, 1);
2089         return new;
2090 }
2091
2092 /*
2093  * If *frompol needs [has] an extra ref, copy *frompol to *tompol ,
2094  * eliminate the * MPOL_F_* flags that require conditional ref and
2095  * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
2096  * after return.  Use the returned value.
2097  *
2098  * Allows use of a mempolicy for, e.g., multiple allocations with a single
2099  * policy lookup, even if the policy needs/has extra ref on lookup.
2100  * shmem_readahead needs this.
2101  */
2102 struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
2103                                                 struct mempolicy *frompol)
2104 {
2105         if (!mpol_needs_cond_ref(frompol))
2106                 return frompol;
2107
2108         *tompol = *frompol;
2109         tompol->flags &= ~MPOL_F_SHARED;        /* copy doesn't need unref */
2110         __mpol_put(frompol);
2111         return tompol;
2112 }
2113
2114 /* Slow path of a mempolicy comparison */
2115 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2116 {
2117         if (!a || !b)
2118                 return false;
2119         if (a->mode != b->mode)
2120                 return false;
2121         if (a->flags != b->flags)
2122                 return false;
2123         if (mpol_store_user_nodemask(a))
2124                 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2125                         return false;
2126
2127         switch (a->mode) {
2128         case MPOL_BIND:
2129                 /* Fall through */
2130         case MPOL_INTERLEAVE:
2131                 return !!nodes_equal(a->v.nodes, b->v.nodes);
2132         case MPOL_PREFERRED:
2133                 return a->v.preferred_node == b->v.preferred_node;
2134         default:
2135                 BUG();
2136                 return false;
2137         }
2138 }
2139
2140 /*
2141  * Shared memory backing store policy support.
2142  *
2143  * Remember policies even when nobody has shared memory mapped.
2144  * The policies are kept in Red-Black tree linked from the inode.
2145  * They are protected by the sp->lock spinlock, which should be held
2146  * for any accesses to the tree.
2147  */
2148
2149 /* lookup first element intersecting start-end */
2150 /* Caller holds sp->mutex */
2151 static struct sp_node *
2152 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2153 {
2154         struct rb_node *n = sp->root.rb_node;
2155
2156         while (n) {
2157                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2158
2159                 if (start >= p->end)
2160                         n = n->rb_right;
2161                 else if (end <= p->start)
2162                         n = n->rb_left;
2163                 else
2164                         break;
2165         }
2166         if (!n)
2167                 return NULL;
2168         for (;;) {
2169                 struct sp_node *w = NULL;
2170                 struct rb_node *prev = rb_prev(n);
2171                 if (!prev)
2172                         break;
2173                 w = rb_entry(prev, struct sp_node, nd);
2174                 if (w->end <= start)
2175                         break;
2176                 n = prev;
2177         }
2178         return rb_entry(n, struct sp_node, nd);
2179 }
2180
2181 /* Insert a new shared policy into the list. */
2182 /* Caller holds sp->lock */
2183 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2184 {
2185         struct rb_node **p = &sp->root.rb_node;
2186         struct rb_node *parent = NULL;
2187         struct sp_node *nd;
2188
2189         while (*p) {
2190                 parent = *p;
2191                 nd = rb_entry(parent, struct sp_node, nd);
2192                 if (new->start < nd->start)
2193                         p = &(*p)->rb_left;
2194                 else if (new->end > nd->end)
2195                         p = &(*p)->rb_right;
2196                 else
2197                         BUG();
2198         }
2199         rb_link_node(&new->nd, parent, p);
2200         rb_insert_color(&new->nd, &sp->root);
2201         pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2202                  new->policy ? new->policy->mode : 0);
2203 }
2204
2205 /* Find shared policy intersecting idx */
2206 struct mempolicy *
2207 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2208 {
2209         struct mempolicy *pol = NULL;
2210         struct sp_node *sn;
2211
2212         if (!sp->root.rb_node)
2213                 return NULL;
2214         mutex_lock(&sp->mutex);
2215         sn = sp_lookup(sp, idx, idx+1);
2216         if (sn) {
2217                 mpol_get(sn->policy);
2218                 pol = sn->policy;
2219         }
2220         mutex_unlock(&sp->mutex);
2221         return pol;
2222 }
2223
2224 static void sp_free(struct sp_node *n)
2225 {
2226         mpol_put(n->policy);
2227         kmem_cache_free(sn_cache, n);
2228 }
2229
2230 /**
2231  * mpol_misplaced - check whether current page node is valid in policy
2232  *
2233  * @page   - page to be checked
2234  * @vma    - vm area where page mapped
2235  * @addr   - virtual address where page mapped
2236  *
2237  * Lookup current policy node id for vma,addr and "compare to" page's
2238  * node id.
2239  *
2240  * Returns:
2241  *      -1      - not misplaced, page is in the right node
2242  *      node    - node id where the page should be
2243  *
2244  * Policy determination "mimics" alloc_page_vma().
2245  * Called from fault path where we know the vma and faulting address.
2246  */
2247 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2248 {
2249         struct mempolicy *pol;
2250         struct zone *zone;
2251         int curnid = page_to_nid(page);
2252         unsigned long pgoff;
2253         int polnid = -1;
2254         int ret = -1;
2255
2256         BUG_ON(!vma);
2257
2258         pol = get_vma_policy(current, vma, addr);
2259         if (!(pol->flags & MPOL_F_MOF))
2260                 goto out;
2261
2262         switch (pol->mode) {
2263         case MPOL_INTERLEAVE:
2264                 BUG_ON(addr >= vma->vm_end);
2265                 BUG_ON(addr < vma->vm_start);
2266
2267                 pgoff = vma->vm_pgoff;
2268                 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2269                 polnid = offset_il_node(pol, vma, pgoff);
2270                 break;
2271
2272         case MPOL_PREFERRED:
2273                 if (pol->flags & MPOL_F_LOCAL)
2274                         polnid = numa_node_id();
2275                 else
2276                         polnid = pol->v.preferred_node;
2277                 break;
2278
2279         case MPOL_BIND:
2280                 /*
2281                  * allows binding to multiple nodes.
2282                  * use current page if in policy nodemask,
2283                  * else select nearest allowed node, if any.
2284                  * If no allowed nodes, use current [!misplaced].
2285                  */
2286                 if (node_isset(curnid, pol->v.nodes))
2287                         goto out;
2288                 (void)first_zones_zonelist(
2289                                 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2290                                 gfp_zone(GFP_HIGHUSER),
2291                                 &pol->v.nodes, &zone);
2292                 polnid = zone->node;
2293                 break;
2294
2295         default:
2296                 BUG();
2297         }
2298         if (curnid != polnid)
2299                 ret = polnid;
2300 out:
2301         mpol_cond_put(pol);
2302
2303         return ret;
2304 }
2305
2306 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2307 {
2308         pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2309         rb_erase(&n->nd, &sp->root);
2310         sp_free(n);
2311 }
2312
2313 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2314                                 struct mempolicy *pol)
2315 {
2316         struct sp_node *n;
2317         struct mempolicy *newpol;
2318
2319         n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2320         if (!n)
2321                 return NULL;
2322
2323         newpol = mpol_dup(pol);
2324         if (IS_ERR(newpol)) {
2325                 kmem_cache_free(sn_cache, n);
2326                 return NULL;
2327         }
2328         newpol->flags |= MPOL_F_SHARED;
2329
2330         n->start = start;
2331         n->end = end;
2332         n->policy = newpol;
2333
2334         return n;
2335 }
2336
2337 /* Replace a policy range. */
2338 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2339                                  unsigned long end, struct sp_node *new)
2340 {
2341         struct sp_node *n;
2342         int ret = 0;
2343
2344         mutex_lock(&sp->mutex);
2345         n = sp_lookup(sp, start, end);
2346         /* Take care of old policies in the same range. */
2347         while (n && n->start < end) {
2348                 struct rb_node *next = rb_next(&n->nd);
2349                 if (n->start >= start) {
2350                         if (n->end <= end)
2351                                 sp_delete(sp, n);
2352                         else
2353                                 n->start = end;
2354                 } else {
2355                         /* Old policy spanning whole new range. */
2356                         if (n->end > end) {
2357                                 struct sp_node *new2;
2358                                 new2 = sp_alloc(end, n->end, n->policy);
2359                                 if (!new2) {
2360                                         ret = -ENOMEM;
2361                                         goto out;
2362                                 }
2363                                 n->end = start;
2364                                 sp_insert(sp, new2);
2365                                 break;
2366                         } else
2367                                 n->end = start;
2368                 }
2369                 if (!next)
2370                         break;
2371                 n = rb_entry(next, struct sp_node, nd);
2372         }
2373         if (new)
2374                 sp_insert(sp, new);
2375 out:
2376         mutex_unlock(&sp->mutex);
2377         return ret;
2378 }
2379
2380 /**
2381  * mpol_shared_policy_init - initialize shared policy for inode
2382  * @sp: pointer to inode shared policy
2383  * @mpol:  struct mempolicy to install
2384  *
2385  * Install non-NULL @mpol in inode's shared policy rb-tree.
2386  * On entry, the current task has a reference on a non-NULL @mpol.
2387  * This must be released on exit.
2388  * This is called at get_inode() calls and we can use GFP_KERNEL.
2389  */
2390 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2391 {
2392         int ret;
2393
2394         sp->root = RB_ROOT;             /* empty tree == default mempolicy */
2395         mutex_init(&sp->mutex);
2396
2397         if (mpol) {
2398                 struct vm_area_struct pvma;
2399                 struct mempolicy *new;
2400                 NODEMASK_SCRATCH(scratch);
2401
2402                 if (!scratch)
2403                         goto put_mpol;
2404                 /* contextualize the tmpfs mount point mempolicy */
2405                 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2406                 if (IS_ERR(new))
2407                         goto free_scratch; /* no valid nodemask intersection */
2408
2409                 task_lock(current);
2410                 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2411                 task_unlock(current);
2412                 if (ret)
2413                         goto put_new;
2414
2415                 /* Create pseudo-vma that contains just the policy */
2416                 memset(&pvma, 0, sizeof(struct vm_area_struct));
2417                 pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
2418                 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2419
2420 put_new:
2421                 mpol_put(new);                  /* drop initial ref */
2422 free_scratch:
2423                 NODEMASK_SCRATCH_FREE(scratch);
2424 put_mpol:
2425                 mpol_put(mpol); /* drop our incoming ref on sb mpol */
2426         }
2427 }
2428
2429 int mpol_set_shared_policy(struct shared_policy *info,
2430                         struct vm_area_struct *vma, struct mempolicy *npol)
2431 {
2432         int err;
2433         struct sp_node *new = NULL;
2434         unsigned long sz = vma_pages(vma);
2435
2436         pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2437                  vma->vm_pgoff,
2438                  sz, npol ? npol->mode : -1,
2439                  npol ? npol->flags : -1,
2440                  npol ? nodes_addr(npol->v.nodes)[0] : -1);
2441
2442         if (npol) {
2443                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2444                 if (!new)
2445                         return -ENOMEM;
2446         }
2447         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2448         if (err && new)
2449                 sp_free(new);
2450         return err;
2451 }
2452
2453 /* Free a backing policy store on inode delete. */
2454 void mpol_free_shared_policy(struct shared_policy *p)
2455 {
2456         struct sp_node *n;
2457         struct rb_node *next;
2458
2459         if (!p->root.rb_node)
2460                 return;
2461         mutex_lock(&p->mutex);
2462         next = rb_first(&p->root);
2463         while (next) {
2464                 n = rb_entry(next, struct sp_node, nd);
2465                 next = rb_next(&n->nd);
2466                 sp_delete(p, n);
2467         }
2468         mutex_unlock(&p->mutex);
2469 }
2470
2471 /* assumes fs == KERNEL_DS */
2472 void __init numa_policy_init(void)
2473 {
2474         nodemask_t interleave_nodes;
2475         unsigned long largest = 0;
2476         int nid, prefer = 0;
2477
2478         policy_cache = kmem_cache_create("numa_policy",
2479                                          sizeof(struct mempolicy),
2480                                          0, SLAB_PANIC, NULL);
2481
2482         sn_cache = kmem_cache_create("shared_policy_node",
2483                                      sizeof(struct sp_node),
2484                                      0, SLAB_PANIC, NULL);
2485
2486         /*
2487          * Set interleaving policy for system init. Interleaving is only
2488          * enabled across suitably sized nodes (default is >= 16MB), or
2489          * fall back to the largest node if they're all smaller.
2490          */
2491         nodes_clear(interleave_nodes);
2492         for_each_node_state(nid, N_HIGH_MEMORY) {
2493                 unsigned long total_pages = node_present_pages(nid);
2494
2495                 /* Preserve the largest node */
2496                 if (largest < total_pages) {
2497                         largest = total_pages;
2498                         prefer = nid;
2499                 }
2500
2501                 /* Interleave this node? */
2502                 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2503                         node_set(nid, interleave_nodes);
2504         }
2505
2506         /* All too small, use the largest */
2507         if (unlikely(nodes_empty(interleave_nodes)))
2508                 node_set(prefer, interleave_nodes);
2509
2510         if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2511                 printk("numa_policy_init: interleaving failed\n");
2512 }
2513
2514 /* Reset policy of current process to default */
2515 void numa_default_policy(void)
2516 {
2517         do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2518 }
2519
2520 /*
2521  * Parse and format mempolicy from/to strings
2522  */
2523
2524 /*
2525  * "local" is pseudo-policy:  MPOL_PREFERRED with MPOL_F_LOCAL flag
2526  * Used only for mpol_parse_str() and mpol_to_str()
2527  */
2528 static const char * const policy_modes[] =
2529 {
2530         [MPOL_DEFAULT]    = "default",
2531         [MPOL_PREFERRED]  = "prefer",
2532         [MPOL_BIND]       = "bind",
2533         [MPOL_INTERLEAVE] = "interleave",
2534         [MPOL_LOCAL]      = "local",
2535 };
2536
2537
2538 #ifdef CONFIG_TMPFS
2539 /**
2540  * mpol_parse_str - parse string to mempolicy
2541  * @str:  string containing mempolicy to parse
2542  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2543  * @no_context:  flag whether to "contextualize" the mempolicy
2544  *
2545  * Format of input:
2546  *      <mode>[=<flags>][:<nodelist>]
2547  *
2548  * if @no_context is true, save the input nodemask in w.user_nodemask in
2549  * the returned mempolicy.  This will be used to "clone" the mempolicy in
2550  * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
2551  * mount option.  Note that if 'static' or 'relative' mode flags were
2552  * specified, the input nodemask will already have been saved.  Saving
2553  * it again is redundant, but safe.
2554  *
2555  * On success, returns 0, else 1
2556  */
2557 int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2558 {
2559         struct mempolicy *new = NULL;
2560         unsigned short mode;
2561         unsigned short uninitialized_var(mode_flags);
2562         nodemask_t nodes;
2563         char *nodelist = strchr(str, ':');
2564         char *flags = strchr(str, '=');
2565         int err = 1;
2566
2567         if (nodelist) {
2568                 /* NUL-terminate mode or flags string */
2569                 *nodelist++ = '\0';
2570                 if (nodelist_parse(nodelist, nodes))
2571                         goto out;
2572                 if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
2573                         goto out;
2574         } else
2575                 nodes_clear(nodes);
2576
2577         if (flags)
2578                 *flags++ = '\0';        /* terminate mode string */
2579
2580         for (mode = 0; mode < MPOL_MAX; mode++) {
2581                 if (!strcmp(str, policy_modes[mode])) {
2582                         break;
2583                 }
2584         }
2585         if (mode >= MPOL_MAX)
2586                 goto out;
2587
2588         switch (mode) {
2589         case MPOL_PREFERRED:
2590                 /*
2591                  * Insist on a nodelist of one node only
2592                  */
2593                 if (nodelist) {
2594                         char *rest = nodelist;
2595                         while (isdigit(*rest))
2596                                 rest++;
2597                         if (*rest)
2598                                 goto out;
2599                 }
2600                 break;
2601         case MPOL_INTERLEAVE:
2602                 /*
2603                  * Default to online nodes with memory if no nodelist
2604                  */
2605                 if (!nodelist)
2606                         nodes = node_states[N_HIGH_MEMORY];
2607                 break;
2608         case MPOL_LOCAL:
2609                 /*
2610                  * Don't allow a nodelist;  mpol_new() checks flags
2611                  */
2612                 if (nodelist)
2613                         goto out;
2614                 mode = MPOL_PREFERRED;
2615                 break;
2616         case MPOL_DEFAULT:
2617                 /*
2618                  * Insist on a empty nodelist
2619                  */
2620                 if (!nodelist)
2621                         err = 0;
2622                 goto out;
2623         case MPOL_BIND:
2624                 /*
2625                  * Insist on a nodelist
2626                  */
2627                 if (!nodelist)
2628                         goto out;
2629         }
2630
2631         mode_flags = 0;
2632         if (flags) {
2633                 /*
2634                  * Currently, we only support two mutually exclusive
2635                  * mode flags.
2636                  */
2637                 if (!strcmp(flags, "static"))
2638                         mode_flags |= MPOL_F_STATIC_NODES;
2639                 else if (!strcmp(flags, "relative"))
2640                         mode_flags |= MPOL_F_RELATIVE_NODES;
2641                 else
2642                         goto out;
2643         }
2644
2645         new = mpol_new(mode, mode_flags, &nodes);
2646         if (IS_ERR(new))
2647                 goto out;
2648
2649         if (no_context) {
2650                 /* save for contextualization */
2651                 new->w.user_nodemask = nodes;
2652         } else {
2653                 int ret;
2654                 NODEMASK_SCRATCH(scratch);
2655                 if (scratch) {
2656                         task_lock(current);
2657                         ret = mpol_set_nodemask(new, &nodes, scratch);
2658                         task_unlock(current);
2659                 } else
2660                         ret = -ENOMEM;
2661                 NODEMASK_SCRATCH_FREE(scratch);
2662                 if (ret) {
2663                         mpol_put(new);
2664                         goto out;
2665                 }
2666         }
2667         err = 0;
2668
2669 out:
2670         /* Restore string for error message */
2671         if (nodelist)
2672                 *--nodelist = ':';
2673         if (flags)
2674                 *--flags = '=';
2675         if (!err)
2676                 *mpol = new;
2677         return err;
2678 }
2679 #endif /* CONFIG_TMPFS */
2680
2681 /**
2682  * mpol_to_str - format a mempolicy structure for printing
2683  * @buffer:  to contain formatted mempolicy string
2684  * @maxlen:  length of @buffer
2685  * @pol:  pointer to mempolicy to be formatted
2686  * @no_context:  "context free" mempolicy - use nodemask in w.user_nodemask
2687  *
2688  * Convert a mempolicy into a string.
2689  * Returns the number of characters in buffer (if positive)
2690  * or an error (negative)
2691  */
2692 int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2693 {
2694         char *p = buffer;
2695         int l;
2696         nodemask_t nodes;
2697         unsigned short mode;
2698         unsigned short flags = pol ? pol->flags : 0;
2699
2700         /*
2701          * Sanity check:  room for longest mode, flag and some nodes
2702          */
2703         VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2704
2705         if (!pol || pol == &default_policy)
2706                 mode = MPOL_DEFAULT;
2707         else
2708                 mode = pol->mode;
2709
2710         switch (mode) {
2711         case MPOL_DEFAULT:
2712                 nodes_clear(nodes);
2713                 break;
2714
2715         case MPOL_PREFERRED:
2716                 nodes_clear(nodes);
2717                 if (flags & MPOL_F_LOCAL)
2718                         mode = MPOL_LOCAL;      /* pseudo-policy */
2719                 else
2720                         node_set(pol->v.preferred_node, nodes);
2721                 break;
2722
2723         case MPOL_BIND:
2724                 /* Fall through */
2725         case MPOL_INTERLEAVE:
2726                 if (no_context)
2727                         nodes = pol->w.user_nodemask;
2728                 else
2729                         nodes = pol->v.nodes;
2730                 break;
2731
2732         default:
2733                 return -EINVAL;
2734         }
2735
2736         l = strlen(policy_modes[mode]);
2737         if (buffer + maxlen < p + l + 1)
2738                 return -ENOSPC;
2739
2740         strcpy(p, policy_modes[mode]);
2741         p += l;
2742
2743         if (flags & MPOL_MODE_FLAGS) {
2744                 if (buffer + maxlen < p + 2)
2745                         return -ENOSPC;
2746                 *p++ = '=';
2747
2748                 /*
2749                  * Currently, the only defined flags are mutually exclusive
2750                  */
2751                 if (flags & MPOL_F_STATIC_NODES)
2752                         p += snprintf(p, buffer + maxlen - p, "static");
2753                 else if (flags & MPOL_F_RELATIVE_NODES)
2754                         p += snprintf(p, buffer + maxlen - p, "relative");
2755         }
2756
2757         if (!nodes_empty(nodes)) {
2758                 if (buffer + maxlen < p + 2)
2759                         return -ENOSPC;
2760                 *p++ = ':';
2761                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2762         }
2763         return p - buffer;
2764 }