mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
   19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
   49  * on systems with highmem kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66 */
  67
  68 #include <linux/mempolicy.h>
  69 #include <linux/mm.h>
  70 #include <linux/highmem.h>
  71 #include <linux/hugetlb.h>
  72 #include <linux/kernel.h>
  73 #include <linux/sched.h>
  74 #include <linux/nodemask.h>
  75 #include <linux/cpuset.h>
  76 #include <linux/slab.h>
  77 #include <linux/string.h>
  78 #include <linux/export.h>
  79 #include <linux/nsproxy.h>
  80 #include <linux/interrupt.h>
  81 #include <linux/init.h>
  82 #include <linux/compat.h>
  83 #include <linux/swap.h>
  84 #include <linux/seq_file.h>
  85 #include <linux/proc_fs.h>
  86 #include <linux/migrate.h>
  87 #include <linux/ksm.h>
  88 #include <linux/rmap.h>
  89 #include <linux/security.h>
  90 #include <linux/syscalls.h>
  91 #include <linux/ctype.h>
  92 #include <linux/mm_inline.h>
  93 #include <linux/mmu_notifier.h>
  94
  95 #include <asm/tlbflush.h>
  96 #include <asm/uaccess.h>
  97 #include <linux/random.h>
  98
  99 #include "internal.h"
 100
 101 /* Internal flags */
 102 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
 103 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
 104
 105 static struct kmem_cache *policy_cache;
 106 static struct kmem_cache *sn_cache;
 107
 108 /* Highest zone. An specific allocation for a zone below that is not
 109    policied. */
 110 enum zone_type policy_zone = 0;
 111
 112 /*
 113  * run-time system-wide default policy => local allocation
 114  */
 115 static struct mempolicy default_policy = {
 116         .refcnt = ATOMIC_INIT(1), /* never free it */
 117         .mode = MPOL_PREFERRED,
 118         .flags = MPOL_F_LOCAL,
 119 };
 120
 121 static const struct mempolicy_operations {
 122         int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 123         /*
 124          * If read-side task has no lock to protect task->mempolicy, write-side
 125          * task will rebind the task->mempolicy by two step. The first step is
 126          * setting all the newly nodes, and the second step is cleaning all the
 127          * disallowed nodes. In this way, we can avoid finding no node to alloc
 128          * page.
 129          * If we have a lock to protect task->mempolicy in read-side, we do
 130          * rebind directly.
 131          *
 132          * step:
 133          *      MPOL_REBIND_ONCE - do rebind work at once
 134          *      MPOL_REBIND_STEP1 - set all the newly nodes
 135          *      MPOL_REBIND_STEP2 - clean all the disallowed nodes
 136          */
 137         void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
 138                         enum mpol_rebind_step step);
 139 } mpol_ops[MPOL_MAX];
 140
 141 /* Check that the nodemask contains at least one populated zone */
 142 static int is_valid_nodemask(const nodemask_t *nodemask)
 143 {
 144         int nd, k;
 145
 146         for_each_node_mask(nd, *nodemask) {
 147                 struct zone *z;
 148
 149                 for (k = 0; k <= policy_zone; k++) {
 150                         z = &NODE_DATA(nd)->node_zones[k];
 151                         if (z->present_pages > 0)
 152                                 return 1;
 153                 }
 154         }
 155
 156         return 0;
 157 }
 158
 159 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 160 {
 161         return pol->flags & MPOL_MODE_FLAGS;
 162 }
 163
 164 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 165                                    const nodemask_t *rel)
 166 {
 167         nodemask_t tmp;
 168         nodes_fold(tmp, *orig, nodes_weight(*rel));
 169         nodes_onto(*ret, tmp, *rel);
 170 }
 171
 172 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 173 {
 174         if (nodes_empty(*nodes))
 175                 return -EINVAL;
 176         pol->v.nodes = *nodes;
 177         return 0;
 178 }
 179
 180 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 181 {
 182         if (!nodes)
 183                 pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 184         else if (nodes_empty(*nodes))
 185                 return -EINVAL;                 /*  no allowed nodes */
 186         else
 187                 pol->v.preferred_node = first_node(*nodes);
 188         return 0;
 189 }
 190
 191 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 192 {
 193         if (!is_valid_nodemask(nodes))
 194                 return -EINVAL;
 195         pol->v.nodes = *nodes;
 196         return 0;
 197 }
 198
 199 /*
 200  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 201  * any, for the new policy.  mpol_new() has already validated the nodes
 202  * parameter with respect to the policy mode and flags.  But, we need to
 203  * handle an empty nodemask with MPOL_PREFERRED here.
 204  *
 205  * Must be called holding task's alloc_lock to protect task's mems_allowed
 206  * and mempolicy.  May also be called holding the mmap_semaphore for write.
 207  */
 208 static int mpol_set_nodemask(struct mempolicy *pol,
 209                      const nodemask_t *nodes, struct nodemask_scratch *nsc)
 210 {
 211         int ret;
 212
 213         /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
 214         if (pol == NULL)
 215                 return 0;
 216         /* Check N_HIGH_MEMORY */
 217         nodes_and(nsc->mask1,
 218                   cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
 219
 220         VM_BUG_ON(!nodes);
 221         if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
 222                 nodes = NULL;   /* explicit local allocation */
 223         else {
 224                 if (pol->flags & MPOL_F_RELATIVE_NODES)
 225                         mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
 226                 else
 227                         nodes_and(nsc->mask2, *nodes, nsc->mask1);
 228
 229                 if (mpol_store_user_nodemask(pol))
 230                         pol->w.user_nodemask = *nodes;
 231                 else
 232                         pol->w.cpuset_mems_allowed =
 233                                                 cpuset_current_mems_allowed;
 234         }
 235
 236         if (nodes)
 237                 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 238         else
 239                 ret = mpol_ops[pol->mode].create(pol, NULL);
 240         return ret;
 241 }
 242
 243 /*
 244  * This function just creates a new policy, does some check and simple
 245  * initialization. You must invoke mpol_set_nodemask() to set nodes.
 246  */
 247 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 248                                   nodemask_t *nodes)
 249 {
 250         struct mempolicy *policy;
 251
 252         pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 253                  mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
 254
 255         if (mode == MPOL_DEFAULT || mode == MPOL_NOOP) {
 256                 if (nodes && !nodes_empty(*nodes))
 257                         return ERR_PTR(-EINVAL);
 258                 return NULL;
 259         }
 260         VM_BUG_ON(!nodes);
 261
 262         /*
 263          * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 264          * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 265          * All other modes require a valid pointer to a non-empty nodemask.
 266          */
 267         if (mode == MPOL_PREFERRED) {
 268                 if (nodes_empty(*nodes)) {
 269                         if (((flags & MPOL_F_STATIC_NODES) ||
 270                              (flags & MPOL_F_RELATIVE_NODES)))
 271                                 return ERR_PTR(-EINVAL);
 272                 }
 273         } else if (mode == MPOL_LOCAL) {
 274                 if (!nodes_empty(*nodes))
 275                         return ERR_PTR(-EINVAL);
 276                 mode = MPOL_PREFERRED;
 277         } else if (nodes_empty(*nodes))
 278                 return ERR_PTR(-EINVAL);
 279         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 280         if (!policy)
 281                 return ERR_PTR(-ENOMEM);
 282         atomic_set(&policy->refcnt, 1);
 283         policy->mode = mode;
 284         policy->flags = flags;
 285
 286         return policy;
 287 }
 288
 289 /* Slow path of a mpol destructor. */
 290 void __mpol_put(struct mempolicy *p)
 291 {
 292         if (!atomic_dec_and_test(&p->refcnt))
 293                 return;
 294         kmem_cache_free(policy_cache, p);
 295 }
 296
 297 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
 298                                 enum mpol_rebind_step step)
 299 {
 300 }
 301
 302 /*
 303  * step:
 304  *      MPOL_REBIND_ONCE  - do rebind work at once
 305  *      MPOL_REBIND_STEP1 - set all the newly nodes
 306  *      MPOL_REBIND_STEP2 - clean all the disallowed nodes
 307  */
 308 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
 309                                  enum mpol_rebind_step step)
 310 {
 311         nodemask_t tmp;
 312
 313         if (pol->flags & MPOL_F_STATIC_NODES)
 314                 nodes_and(tmp, pol->w.user_nodemask, *nodes);
 315         else if (pol->flags & MPOL_F_RELATIVE_NODES)
 316                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 317         else {
 318                 /*
 319                  * if step == 1, we use ->w.cpuset_mems_allowed to cache the
 320                  * result
 321                  */
 322                 if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
 323                         nodes_remap(tmp, pol->v.nodes,
 324                                         pol->w.cpuset_mems_allowed, *nodes);
 325                         pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
 326                 } else if (step == MPOL_REBIND_STEP2) {
 327                         tmp = pol->w.cpuset_mems_allowed;
 328                         pol->w.cpuset_mems_allowed = *nodes;
 329                 } else
 330                         BUG();
 331         }
 332
 333         if (nodes_empty(tmp))
 334                 tmp = *nodes;
 335
 336         if (step == MPOL_REBIND_STEP1)
 337                 nodes_or(pol->v.nodes, pol->v.nodes, tmp);
 338         else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
 339                 pol->v.nodes = tmp;
 340         else
 341                 BUG();
 342
 343         if (!node_isset(current->il_next, tmp)) {
 344                 current->il_next = next_node(current->il_next, tmp);
 345                 if (current->il_next >= MAX_NUMNODES)
 346                         current->il_next = first_node(tmp);
 347                 if (current->il_next >= MAX_NUMNODES)
 348                         current->il_next = numa_node_id();
 349         }
 350 }
 351
 352 static void mpol_rebind_preferred(struct mempolicy *pol,
 353                                   const nodemask_t *nodes,
 354                                   enum mpol_rebind_step step)
 355 {
 356         nodemask_t tmp;
 357
 358         if (pol->flags & MPOL_F_STATIC_NODES) {
 359                 int node = first_node(pol->w.user_nodemask);
 360
 361                 if (node_isset(node, *nodes)) {
 362                         pol->v.preferred_node = node;
 363                         pol->flags &= ~MPOL_F_LOCAL;
 364                 } else
 365                         pol->flags |= MPOL_F_LOCAL;
 366         } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 367                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 368                 pol->v.preferred_node = first_node(tmp);
 369         } else if (!(pol->flags & MPOL_F_LOCAL)) {
 370                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
 371                                                    pol->w.cpuset_mems_allowed,
 372                                                    *nodes);
 373                 pol->w.cpuset_mems_allowed = *nodes;
 374         }
 375 }
 376
 377 /*
 378  * mpol_rebind_policy - Migrate a policy to a different set of nodes
 379  *
 380  * If read-side task has no lock to protect task->mempolicy, write-side
 381  * task will rebind the task->mempolicy by two step. The first step is
 382  * setting all the newly nodes, and the second step is cleaning all the
 383  * disallowed nodes. In this way, we can avoid finding no node to alloc
 384  * page.
 385  * If we have a lock to protect task->mempolicy in read-side, we do
 386  * rebind directly.
 387  *
 388  * step:
 389  *      MPOL_REBIND_ONCE  - do rebind work at once
 390  *      MPOL_REBIND_STEP1 - set all the newly nodes
 391  *      MPOL_REBIND_STEP2 - clean all the disallowed nodes
 392  */
 393 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
 394                                 enum mpol_rebind_step step)
 395 {
 396         if (!pol)
 397                 return;
 398         if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
 399             nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 400                 return;
 401
 402         if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
 403                 return;
 404
 405         if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
 406                 BUG();
 407
 408         if (step == MPOL_REBIND_STEP1)
 409                 pol->flags |= MPOL_F_REBINDING;
 410         else if (step == MPOL_REBIND_STEP2)
 411                 pol->flags &= ~MPOL_F_REBINDING;
 412         else if (step >= MPOL_REBIND_NSTEP)
 413                 BUG();
 414
 415         mpol_ops[pol->mode].rebind(pol, newmask, step);
 416 }
 417
 418 /*
 419  * Wrapper for mpol_rebind_policy() that just requires task
 420  * pointer, and updates task mempolicy.
 421  *
 422  * Called with task's alloc_lock held.
 423  */
 424
 425 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
 426                         enum mpol_rebind_step step)
 427 {
 428         mpol_rebind_policy(tsk->mempolicy, new, step);
 429 }
 430
 431 /*
 432  * Rebind each vma in mm to new nodemask.
 433  *
 434  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 435  */
 436
 437 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 438 {
 439         struct vm_area_struct *vma;
 440
 441         down_write(&mm->mmap_sem);
 442         for (vma = mm->mmap; vma; vma = vma->vm_next)
 443                 mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
 444         up_write(&mm->mmap_sem);
 445 }
 446
 447 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 448         [MPOL_DEFAULT] = {
 449                 .rebind = mpol_rebind_default,
 450         },
 451         [MPOL_INTERLEAVE] = {
 452                 .create = mpol_new_interleave,
 453                 .rebind = mpol_rebind_nodemask,
 454         },
 455         [MPOL_PREFERRED] = {
 456                 .create = mpol_new_preferred,
 457                 .rebind = mpol_rebind_preferred,
 458         },
 459         [MPOL_BIND] = {
 460                 .create = mpol_new_bind,
 461                 .rebind = mpol_rebind_nodemask,
 462         },
 463 };
 464
 465 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 466                                 unsigned long flags);
 467
 468 /* Scan through pages checking if pages follow certain conditions. */
 469 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 470                 unsigned long addr, unsigned long end,
 471                 const nodemask_t *nodes, unsigned long flags,
 472                 void *private)
 473 {
 474         pte_t *orig_pte;
 475         pte_t *pte;
 476         spinlock_t *ptl;
 477
 478         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 479         do {
 480                 struct page *page;
 481                 int nid;
 482
 483                 if (!pte_present(*pte))
 484                         continue;
 485                 page = vm_normal_page(vma, addr, *pte);
 486                 if (!page)
 487                         continue;
 488                 /*
 489                  * vm_normal_page() filters out zero pages, but there might
 490                  * still be PageReserved pages to skip, perhaps in a VDSO.
 491                  * And we cannot move PageKsm pages sensibly or safely yet.
 492                  */
 493                 if (PageReserved(page) || PageKsm(page))
 494                         continue;
 495                 nid = page_to_nid(page);
 496                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 497                         continue;
 498
 499                 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 500                         migrate_page_add(page, private, flags);
 501                 else
 502                         break;
 503         } while (pte++, addr += PAGE_SIZE, addr != end);
 504         pte_unmap_unlock(orig_pte, ptl);
 505         return addr != end;
 506 }
 507
 508 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 509                 unsigned long addr, unsigned long end,
 510                 const nodemask_t *nodes, unsigned long flags,
 511                 void *private)
 512 {
 513         pmd_t *pmd;
 514         unsigned long next;
 515
 516         pmd = pmd_offset(pud, addr);
 517         do {
 518                 next = pmd_addr_end(addr, end);
 519                 split_huge_page_pmd(vma->vm_mm, pmd);
 520                 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
 521                         continue;
 522                 if (check_pte_range(vma, pmd, addr, next, nodes,
 523                                     flags, private))
 524                         return -EIO;
 525         } while (pmd++, addr = next, addr != end);
 526         return 0;
 527 }
 528
 529 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 530                 unsigned long addr, unsigned long end,
 531                 const nodemask_t *nodes, unsigned long flags,
 532                 void *private)
 533 {
 534         pud_t *pud;
 535         unsigned long next;
 536
 537         pud = pud_offset(pgd, addr);
 538         do {
 539                 next = pud_addr_end(addr, end);
 540                 if (pud_none_or_clear_bad(pud))
 541                         continue;
 542                 if (check_pmd_range(vma, pud, addr, next, nodes,
 543                                     flags, private))
 544                         return -EIO;
 545         } while (pud++, addr = next, addr != end);
 546         return 0;
 547 }
 548
 549 static inline int check_pgd_range(struct vm_area_struct *vma,
 550                 unsigned long addr, unsigned long end,
 551                 const nodemask_t *nodes, unsigned long flags,
 552                 void *private)
 553 {
 554         pgd_t *pgd;
 555         unsigned long next;
 556
 557         pgd = pgd_offset(vma->vm_mm, addr);
 558         do {
 559                 next = pgd_addr_end(addr, end);
 560                 if (pgd_none_or_clear_bad(pgd))
 561                         continue;
 562                 if (check_pud_range(vma, pgd, addr, next, nodes,
 563                                     flags, private))
 564                         return -EIO;
 565         } while (pgd++, addr = next, addr != end);
 566         return 0;
 567 }
 568
 569 #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
 570 /*
 571  * This is used to mark a range of virtual addresses to be inaccessible.
 572  * These are later cleared by a NUMA hinting fault. Depending on these
 573  * faults, pages may be migrated for better NUMA placement.
 574  *
 575  * This is assuming that NUMA faults are handled using PROT_NONE. If
 576  * an architecture makes a different choice, it will need further
 577  * changes to the core.
 578  */
 579 unsigned long change_prot_numa(struct vm_area_struct *vma,
 580                         unsigned long addr, unsigned long end)
 581 {
 582         int nr_updated;
 583         BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
 584
 585         nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
 586
 587         return nr_updated;
 588 }
 589 #else
 590 static unsigned long change_prot_numa(struct vm_area_struct *vma,
 591                         unsigned long addr, unsigned long end)
 592 {
 593         return 0;
 594 }
 595 #endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
 596
 597 /*
 598  * Check if all pages in a range are on a set of nodes.
 599  * If pagelist != NULL then isolate pages from the LRU and
 600  * put them on the pagelist.
 601  */
 602 static struct vm_area_struct *
 603 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 604                 const nodemask_t *nodes, unsigned long flags, void *private)
 605 {
 606         int err;
 607         struct vm_area_struct *first, *vma, *prev;
 608
 609
 610         first = find_vma(mm, start);
 611         if (!first)
 612                 return ERR_PTR(-EFAULT);
 613         prev = NULL;
 614         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 615                 unsigned long endvma = vma->vm_end;
 616
 617                 if (endvma > end)
 618                         endvma = end;
 619                 if (vma->vm_start > start)
 620                         start = vma->vm_start;
 621
 622                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 623                         if (!vma->vm_next && vma->vm_end < end)
 624                                 return ERR_PTR(-EFAULT);
 625                         if (prev && prev->vm_end < vma->vm_start)
 626                                 return ERR_PTR(-EFAULT);
 627                 }
 628
 629                 if (is_vm_hugetlb_page(vma))
 630                         goto next;
 631
 632                 if (flags & MPOL_MF_LAZY) {
 633                         change_prot_numa(vma, start, endvma);
 634                         goto next;
 635                 }
 636
 637                 if ((flags & MPOL_MF_STRICT) ||
 638                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 639                       vma_migratable(vma))) {
 640
 641                         err = check_pgd_range(vma, start, endvma, nodes,
 642                                                 flags, private);
 643                         if (err) {
 644                                 first = ERR_PTR(err);
 645                                 break;
 646                         }
 647                 }
 648 next:
 649                 prev = vma;
 650         }
 651         return first;
 652 }
 653
 654 /*
 655  * Apply policy to a single VMA
 656  * This must be called with the mmap_sem held for writing.
 657  */
 658 static int vma_replace_policy(struct vm_area_struct *vma,
 659                                                 struct mempolicy *pol)
 660 {
 661         int err;
 662         struct mempolicy *old;
 663         struct mempolicy *new;
 664
 665         pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 666                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 667                  vma->vm_ops, vma->vm_file,
 668                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 669
 670         new = mpol_dup(pol);
 671         if (IS_ERR(new))
 672                 return PTR_ERR(new);
 673
 674         if (vma->vm_ops && vma->vm_ops->set_policy) {
 675                 err = vma->vm_ops->set_policy(vma, new);
 676                 if (err)
 677                         goto err_out;
 678         }
 679
 680         old = vma->vm_policy;
 681         vma->vm_policy = new; /* protected by mmap_sem */
 682         mpol_put(old);
 683
 684         return 0;
 685  err_out:
 686         mpol_put(new);
 687         return err;
 688 }
 689
 690 /* Step 2: apply policy to a range and do splits. */
 691 static int mbind_range(struct mm_struct *mm, unsigned long start,
 692                        unsigned long end, struct mempolicy *new_pol)
 693 {
 694         struct vm_area_struct *next;
 695         struct vm_area_struct *prev;
 696         struct vm_area_struct *vma;
 697         int err = 0;
 698         pgoff_t pgoff;
 699         unsigned long vmstart;
 700         unsigned long vmend;
 701
 702         vma = find_vma(mm, start);
 703         if (!vma || vma->vm_start > start)
 704                 return -EFAULT;
 705
 706         prev = vma->vm_prev;
 707         if (start > vma->vm_start)
 708                 prev = vma;
 709
 710         for (; vma && vma->vm_start < end; prev = vma, vma = next) {
 711                 next = vma->vm_next;
 712                 vmstart = max(start, vma->vm_start);
 713                 vmend   = min(end, vma->vm_end);
 714
 715                 if (mpol_equal(vma_policy(vma), new_pol))
 716                         continue;
 717
 718                 pgoff = vma->vm_pgoff +
 719                         ((vmstart - vma->vm_start) >> PAGE_SHIFT);
 720                 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
 721                                   vma->anon_vma, vma->vm_file, pgoff,
 722                                   new_pol);
 723                 if (prev) {
 724                         vma = prev;
 725                         next = vma->vm_next;
 726                         continue;
 727                 }
 728                 if (vma->vm_start != vmstart) {
 729                         err = split_vma(vma->vm_mm, vma, vmstart, 1);
 730                         if (err)
 731                                 goto out;
 732                 }
 733                 if (vma->vm_end != vmend) {
 734                         err = split_vma(vma->vm_mm, vma, vmend, 0);
 735                         if (err)
 736                                 goto out;
 737                 }
 738                 err = vma_replace_policy(vma, new_pol);
 739                 if (err)
 740                         goto out;
 741         }
 742
 743  out:
 744         return err;
 745 }
 746
 747 /*
 748  * Update task->flags PF_MEMPOLICY bit: set iff non-default
 749  * mempolicy.  Allows more rapid checking of this (combined perhaps
 750  * with other PF_* flag bits) on memory allocation hot code paths.
 751  *
 752  * If called from outside this file, the task 'p' should -only- be
 753  * a newly forked child not yet visible on the task list, because
 754  * manipulating the task flags of a visible task is not safe.
 755  *
 756  * The above limitation is why this routine has the funny name
 757  * mpol_fix_fork_child_flag().
 758  *
 759  * It is also safe to call this with a task pointer of current,
 760  * which the static wrapper mpol_set_task_struct_flag() does,
 761  * for use within this file.
 762  */
 763
 764 void mpol_fix_fork_child_flag(struct task_struct *p)
 765 {
 766         if (p->mempolicy)
 767                 p->flags |= PF_MEMPOLICY;
 768         else
 769                 p->flags &= ~PF_MEMPOLICY;
 770 }
 771
 772 static void mpol_set_task_struct_flag(void)
 773 {
 774         mpol_fix_fork_child_flag(current);
 775 }
 776
 777 /* Set the process memory policy */
 778 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 779                              nodemask_t *nodes)
 780 {
 781         struct mempolicy *new, *old;
 782         struct mm_struct *mm = current->mm;
 783         NODEMASK_SCRATCH(scratch);
 784         int ret;
 785
 786         if (!scratch)
 787                 return -ENOMEM;
 788
 789         new = mpol_new(mode, flags, nodes);
 790         if (IS_ERR(new)) {
 791                 ret = PTR_ERR(new);
 792                 goto out;
 793         }
 794         /*
 795          * prevent changing our mempolicy while show_numa_maps()
 796          * is using it.
 797          * Note:  do_set_mempolicy() can be called at init time
 798          * with no 'mm'.
 799          */
 800         if (mm)
 801                 down_write(&mm->mmap_sem);
 802         task_lock(current);
 803         ret = mpol_set_nodemask(new, nodes, scratch);
 804         if (ret) {
 805                 task_unlock(current);
 806                 if (mm)
 807                         up_write(&mm->mmap_sem);
 808                 mpol_put(new);
 809                 goto out;
 810         }
 811         old = current->mempolicy;
 812         current->mempolicy = new;
 813         mpol_set_task_struct_flag();
 814         if (new && new->mode == MPOL_INTERLEAVE &&
 815             nodes_weight(new->v.nodes))
 816                 current->il_next = first_node(new->v.nodes);
 817         task_unlock(current);
 818         if (mm)
 819                 up_write(&mm->mmap_sem);
 820
 821         mpol_put(old);
 822         ret = 0;
 823 out:
 824         NODEMASK_SCRATCH_FREE(scratch);
 825         return ret;
 826 }
 827
 828 /*
 829  * Return nodemask for policy for get_mempolicy() query
 830  *
 831  * Called with task's alloc_lock held
 832  */
 833 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 834 {
 835         nodes_clear(*nodes);
 836         if (p == &default_policy)
 837                 return;
 838
 839         switch (p->mode) {
 840         case MPOL_BIND:
 841                 /* Fall through */
 842         case MPOL_INTERLEAVE:
 843                 *nodes = p->v.nodes;
 844                 break;
 845         case MPOL_PREFERRED:
 846                 if (!(p->flags & MPOL_F_LOCAL))
 847                         node_set(p->v.preferred_node, *nodes);
 848                 /* else return empty node mask for local allocation */
 849                 break;
 850         default:
 851                 BUG();
 852         }
 853 }
 854
 855 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 856 {
 857         struct page *p;
 858         int err;
 859
 860         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 861         if (err >= 0) {
 862                 err = page_to_nid(p);
 863                 put_page(p);
 864         }
 865         return err;
 866 }
 867
 868 /* Retrieve NUMA policy */
 869 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 870                              unsigned long addr, unsigned long flags)
 871 {
 872         int err;
 873         struct mm_struct *mm = current->mm;
 874         struct vm_area_struct *vma = NULL;
 875         struct mempolicy *pol = current->mempolicy;
 876
 877         if (flags &
 878                 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 879                 return -EINVAL;
 880
 881         if (flags & MPOL_F_MEMS_ALLOWED) {
 882                 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 883                         return -EINVAL;
 884                 *policy = 0;    /* just so it's initialized */
 885                 task_lock(current);
 886                 *nmask  = cpuset_current_mems_allowed;
 887                 task_unlock(current);
 888                 return 0;
 889         }
 890
 891         if (flags & MPOL_F_ADDR) {
 892                 /*
 893                  * Do NOT fall back to task policy if the
 894                  * vma/shared policy at addr is NULL.  We
 895                  * want to return MPOL_DEFAULT in this case.
 896                  */
 897                 down_read(&mm->mmap_sem);
 898                 vma = find_vma_intersection(mm, addr, addr+1);
 899                 if (!vma) {
 900                         up_read(&mm->mmap_sem);
 901                         return -EFAULT;
 902                 }
 903                 if (vma->vm_ops && vma->vm_ops->get_policy)
 904                         pol = vma->vm_ops->get_policy(vma, addr);
 905                 else
 906                         pol = vma->vm_policy;
 907         } else if (addr)
 908                 return -EINVAL;
 909
 910         if (!pol)
 911                 pol = &default_policy;  /* indicates default behavior */
 912
 913         if (flags & MPOL_F_NODE) {
 914                 if (flags & MPOL_F_ADDR) {
 915                         err = lookup_node(mm, addr);
 916                         if (err < 0)
 917                                 goto out;
 918                         *policy = err;
 919                 } else if (pol == current->mempolicy &&
 920                                 pol->mode == MPOL_INTERLEAVE) {
 921                         *policy = current->il_next;
 922                 } else {
 923                         err = -EINVAL;
 924                         goto out;
 925                 }
 926         } else {
 927                 *policy = pol == &default_policy ? MPOL_DEFAULT :
 928                                                 pol->mode;
 929                 /*
 930                  * Internal mempolicy flags must be masked off before exposing
 931                  * the policy to userspace.
 932                  */
 933                 *policy |= (pol->flags & MPOL_MODE_FLAGS);
 934         }
 935
 936         if (vma) {
 937                 up_read(&current->mm->mmap_sem);
 938                 vma = NULL;
 939         }
 940
 941         err = 0;
 942         if (nmask) {
 943                 if (mpol_store_user_nodemask(pol)) {
 944                         *nmask = pol->w.user_nodemask;
 945                 } else {
 946                         task_lock(current);
 947                         get_policy_nodemask(pol, nmask);
 948                         task_unlock(current);
 949                 }
 950         }
 951
 952  out:
 953         mpol_cond_put(pol);
 954         if (vma)
 955                 up_read(&current->mm->mmap_sem);
 956         return err;
 957 }
 958
 959 #ifdef CONFIG_MIGRATION
 960 /*
 961  * page migration
 962  */
 963 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 964                                 unsigned long flags)
 965 {
 966         /*
 967          * Avoid migrating a page that is shared with others.
 968          */
 969         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
 970                 if (!isolate_lru_page(page)) {
 971                         list_add_tail(&page->lru, pagelist);
 972                         inc_zone_page_state(page, NR_ISOLATED_ANON +
 973                                             page_is_file_cache(page));
 974                 }
 975         }
 976 }
 977
 978 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 979 {
 980         return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
 981 }
 982
 983 /*
 984  * Migrate pages from one node to a target node.
 985  * Returns error or the number of pages not migrated.
 986  */
 987 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 988                            int flags)
 989 {
 990         nodemask_t nmask;
 991         LIST_HEAD(pagelist);
 992         int err = 0;
 993
 994         nodes_clear(nmask);
 995         node_set(source, nmask);
 996
 997         /*
 998          * This does not "check" the range but isolates all pages that
 999          * need migration.  Between passing in the full user address
1000          * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1001          */
1002         VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1003         check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1004                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1005
1006         if (!list_empty(&pagelist)) {
1007                 err = migrate_pages(&pagelist, new_node_page, dest,
1008                                                         false, MIGRATE_SYNC,
1009                                                         MR_SYSCALL);
1010                 if (err)
1011                         putback_lru_pages(&pagelist);
1012         }
1013
1014         return err;
1015 }
1016
1017 /*
1018  * Move pages between the two nodesets so as to preserve the physical
1019  * layout as much as possible.
1020  *
1021  * Returns the number of page that could not be moved.
1022  */
1023 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1024                      const nodemask_t *to, int flags)
1025 {
1026         int busy = 0;
1027         int err;
1028         nodemask_t tmp;
1029
1030         err = migrate_prep();
1031         if (err)
1032                 return err;
1033
1034         down_read(&mm->mmap_sem);
1035
1036         err = migrate_vmas(mm, from, to, flags);
1037         if (err)
1038                 goto out;
1039
1040         /*
1041          * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1042          * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1043          * bit in 'tmp', and return that <source, dest> pair for migration.
1044          * The pair of nodemasks 'to' and 'from' define the map.
1045          *
1046          * If no pair of bits is found that way, fallback to picking some
1047          * pair of 'source' and 'dest' bits that are not the same.  If the
1048          * 'source' and 'dest' bits are the same, this represents a node
1049          * that will be migrating to itself, so no pages need move.
1050          *
1051          * If no bits are left in 'tmp', or if all remaining bits left
1052          * in 'tmp' correspond to the same bit in 'to', return false
1053          * (nothing left to migrate).
1054          *
1055          * This lets us pick a pair of nodes to migrate between, such that
1056          * if possible the dest node is not already occupied by some other
1057          * source node, minimizing the risk of overloading the memory on a
1058          * node that would happen if we migrated incoming memory to a node
1059          * before migrating outgoing memory source that same node.
1060          *
1061          * A single scan of tmp is sufficient.  As we go, we remember the
1062          * most recent <s, d> pair that moved (s != d).  If we find a pair
1063          * that not only moved, but what's better, moved to an empty slot
1064          * (d is not set in tmp), then we break out then, with that pair.
1065          * Otherwise when we finish scanning from_tmp, we at least have the
1066          * most recent <s, d> pair that moved.  If we get all the way through
1067          * the scan of tmp without finding any node that moved, much less
1068          * moved to an empty node, then there is nothing left worth migrating.
1069          */
1070
1071         tmp = *from;
1072         while (!nodes_empty(tmp)) {
1073                 int s,d;
1074                 int source = -1;
1075                 int dest = 0;
1076
1077                 for_each_node_mask(s, tmp) {
1078
1079                         /*
1080                          * do_migrate_pages() tries to maintain the relative
1081                          * node relationship of the pages established between
1082                          * threads and memory areas.
1083                          *
1084                          * However if the number of source nodes is not equal to
1085                          * the number of destination nodes we can not preserve
1086                          * this node relative relationship.  In that case, skip
1087                          * copying memory from a node that is in the destination
1088                          * mask.
1089                          *
1090                          * Example: [2,3,4] -> [3,4,5] moves everything.
1091                          *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1092                          */
1093
1094                         if ((nodes_weight(*from) != nodes_weight(*to)) &&
1095                                                 (node_isset(s, *to)))
1096                                 continue;
1097
1098                         d = node_remap(s, *from, *to);
1099                         if (s == d)
1100                                 continue;
1101
1102                         source = s;     /* Node moved. Memorize */
1103                         dest = d;
1104
1105                         /* dest not in remaining from nodes? */
1106                         if (!node_isset(dest, tmp))
1107                                 break;
1108                 }
1109                 if (source == -1)
1110                         break;
1111
1112                 node_clear(source, tmp);
1113                 err = migrate_to_node(mm, source, dest, flags);
1114                 if (err > 0)
1115                         busy += err;
1116                 if (err < 0)
1117                         break;
1118         }
1119 out:
1120         up_read(&mm->mmap_sem);
1121         if (err < 0)
1122                 return err;
1123         return busy;
1124
1125 }
1126
1127 /*
1128  * Allocate a new page for page migration based on vma policy.
1129  * Start assuming that page is mapped by vma pointed to by @private.
1130  * Search forward from there, if not.  N.B., this assumes that the
1131  * list of pages handed to migrate_pages()--which is how we get here--
1132  * is in virtual address order.
1133  */
1134 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1135 {
1136         struct vm_area_struct *vma = (struct vm_area_struct *)private;
1137         unsigned long uninitialized_var(address);
1138
1139         while (vma) {
1140                 address = page_address_in_vma(page, vma);
1141                 if (address != -EFAULT)
1142                         break;
1143                 vma = vma->vm_next;
1144         }
1145
1146         /*
1147          * if !vma, alloc_page_vma() will use task or system default policy
1148          */
1149         return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1150 }
1151 #else
1152
/* !CONFIG_MIGRATION stub: pages are never queued for migration. */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
	return;
}
1157
1158 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1159                      const nodemask_t *to, int flags)
1160 {
1161         return -ENOSYS;
1162 }
1163
1164 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1165 {
1166         return NULL;
1167 }
1168 #endif
1169
1170 static long do_mbind(unsigned long start, unsigned long len,
1171                      unsigned short mode, unsigned short mode_flags,
1172                      nodemask_t *nmask, unsigned long flags)
1173 {
1174         struct vm_area_struct *vma;
1175         struct mm_struct *mm = current->mm;
1176         struct mempolicy *new;
1177         unsigned long end;
1178         int err;
1179         LIST_HEAD(pagelist);
1180
1181         if (flags & ~(unsigned long)MPOL_MF_VALID)
1182                 return -EINVAL;
1183         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1184                 return -EPERM;
1185
1186         if (start & ~PAGE_MASK)
1187                 return -EINVAL;
1188
1189         if (mode == MPOL_DEFAULT || mode == MPOL_NOOP)
1190                 flags &= ~MPOL_MF_STRICT;
1191
1192         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1193         end = start + len;
1194
1195         if (end < start)
1196                 return -EINVAL;
1197         if (end == start)
1198                 return 0;
1199
1200         new = mpol_new(mode, mode_flags, nmask);
1201         if (IS_ERR(new))
1202                 return PTR_ERR(new);
1203
1204         if (flags & MPOL_MF_LAZY)
1205                 new->flags |= MPOL_F_MOF;
1206
1207         /*
1208          * If we are using the default policy then operation
1209          * on discontinuous address spaces is okay after all
1210          */
1211         if (!new)
1212                 flags |= MPOL_MF_DISCONTIG_OK;
1213
1214         pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1215                  start, start + len, mode, mode_flags,
1216                  nmask ? nodes_addr(*nmask)[0] : -1);
1217
1218         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1219
1220                 err = migrate_prep();
1221                 if (err)
1222                         goto mpol_out;
1223         }
1224         {
1225                 NODEMASK_SCRATCH(scratch);
1226                 if (scratch) {
1227                         down_write(&mm->mmap_sem);
1228                         task_lock(current);
1229                         err = mpol_set_nodemask(new, nmask, scratch);
1230                         task_unlock(current);
1231                         if (err)
1232                                 up_write(&mm->mmap_sem);
1233                 } else
1234                         err = -ENOMEM;
1235                 NODEMASK_SCRATCH_FREE(scratch);
1236         }
1237         if (err)
1238                 goto mpol_out;
1239
1240         vma = check_range(mm, start, end, nmask,
1241                           flags | MPOL_MF_INVERT, &pagelist);
1242
1243         err = PTR_ERR(vma);     /* maybe ... */
1244         if (!IS_ERR(vma) && mode != MPOL_NOOP)
1245                 err = mbind_range(mm, start, end, new);
1246
1247         if (!err) {
1248                 int nr_failed = 0;
1249
1250                 if (!list_empty(&pagelist)) {
1251                         WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1252                         nr_failed = migrate_pages(&pagelist, new_vma_page,
1253                                                 (unsigned long)vma,
1254                                                 false, MIGRATE_SYNC,
1255                                                 MR_MEMPOLICY_MBIND);
1256                         if (nr_failed)
1257                                 putback_lru_pages(&pagelist);
1258                 }
1259
1260                 if (nr_failed && (flags & MPOL_MF_STRICT))
1261                         err = -EIO;
1262         } else
1263                 putback_lru_pages(&pagelist);
1264
1265         up_write(&mm->mmap_sem);
1266  mpol_out:
1267         mpol_put(new);
1268         return err;
1269 }
1270
1271 /*
1272  * User space interface with variable sized bitmaps for nodelists.
1273  */
1274
1275 /* Copy a node mask from user space. */
1276 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1277                      unsigned long maxnode)
1278 {
1279         unsigned long k;
1280         unsigned long nlongs;
1281         unsigned long endmask;
1282
1283         --maxnode;
1284         nodes_clear(*nodes);
1285         if (maxnode == 0 || !nmask)
1286                 return 0;
1287         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1288                 return -EINVAL;
1289
1290         nlongs = BITS_TO_LONGS(maxnode);
1291         if ((maxnode % BITS_PER_LONG) == 0)
1292                 endmask = ~0UL;
1293         else
1294                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1295
1296         /* When the user specified more nodes than supported just check
1297            if the non supported part is all zero. */
1298         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1299                 if (nlongs > PAGE_SIZE/sizeof(long))
1300                         return -EINVAL;
1301                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1302                         unsigned long t;
1303                         if (get_user(t, nmask + k))
1304                                 return -EFAULT;
1305                         if (k == nlongs - 1) {
1306                                 if (t & endmask)
1307                                         return -EINVAL;
1308                         } else if (t)
1309                                 return -EINVAL;
1310                 }
1311                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1312                 endmask = ~0UL;
1313         }
1314
1315         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1316                 return -EFAULT;
1317         nodes_addr(*nodes)[nlongs-1] &= endmask;
1318         return 0;
1319 }
1320
1321 /* Copy a kernel node mask to user space */
1322 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1323                               nodemask_t *nodes)
1324 {
1325         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1326         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1327
1328         if (copy > nbytes) {
1329                 if (copy > PAGE_SIZE)
1330                         return -EINVAL;
1331                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1332                         return -EFAULT;
1333                 copy = nbytes;
1334         }
1335         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1336 }
1337
1338 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1339                 unsigned long, mode, unsigned long __user *, nmask,
1340                 unsigned long, maxnode, unsigned, flags)
1341 {
1342         nodemask_t nodes;
1343         int err;
1344         unsigned short mode_flags;
1345
1346         mode_flags = mode & MPOL_MODE_FLAGS;
1347         mode &= ~MPOL_MODE_FLAGS;
1348         if (mode >= MPOL_MAX)
1349                 return -EINVAL;
1350         if ((mode_flags & MPOL_F_STATIC_NODES) &&
1351             (mode_flags & MPOL_F_RELATIVE_NODES))
1352                 return -EINVAL;
1353         err = get_nodes(&nodes, nmask, maxnode);
1354         if (err)
1355                 return err;
1356         return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1357 }
1358
1359 /* Set the process memory policy */
1360 SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1361                 unsigned long, maxnode)
1362 {
1363         int err;
1364         nodemask_t nodes;
1365         unsigned short flags;
1366
1367         flags = mode & MPOL_MODE_FLAGS;
1368         mode &= ~MPOL_MODE_FLAGS;
1369         if ((unsigned int)mode >= MPOL_MAX)
1370                 return -EINVAL;
1371         if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1372                 return -EINVAL;
1373         err = get_nodes(&nodes, nmask, maxnode);
1374         if (err)
1375                 return err;
1376         return do_set_mempolicy(mode, flags, &nodes);
1377 }
1378
1379 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1380                 const unsigned long __user *, old_nodes,
1381                 const unsigned long __user *, new_nodes)
1382 {
1383         const struct cred *cred = current_cred(), *tcred;
1384         struct mm_struct *mm = NULL;
1385         struct task_struct *task;
1386         nodemask_t task_nodes;
1387         int err;
1388         nodemask_t *old;
1389         nodemask_t *new;
1390         NODEMASK_SCRATCH(scratch);
1391
1392         if (!scratch)
1393                 return -ENOMEM;
1394
1395         old = &scratch->mask1;
1396         new = &scratch->mask2;
1397
1398         err = get_nodes(old, old_nodes, maxnode);
1399         if (err)
1400                 goto out;
1401
1402         err = get_nodes(new, new_nodes, maxnode);
1403         if (err)
1404                 goto out;
1405
1406         /* Find the mm_struct */
1407         rcu_read_lock();
1408         task = pid ? find_task_by_vpid(pid) : current;
1409         if (!task) {
1410                 rcu_read_unlock();
1411                 err = -ESRCH;
1412                 goto out;
1413         }
1414         get_task_struct(task);
1415
1416         err = -EINVAL;
1417
1418         /*
1419          * Check if this process has the right to modify the specified
1420          * process. The right exists if the process has administrative
1421          * capabilities, superuser privileges or the same
1422          * userid as the target process.
1423          */
1424         tcred = __task_cred(task);
1425         if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1426             !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1427             !capable(CAP_SYS_NICE)) {
1428                 rcu_read_unlock();
1429                 err = -EPERM;
1430                 goto out_put;
1431         }
1432         rcu_read_unlock();
1433
1434         task_nodes = cpuset_mems_allowed(task);
1435         /* Is the user allowed to access the target nodes? */
1436         if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1437                 err = -EPERM;
1438                 goto out_put;
1439         }
1440
1441         if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
1442                 err = -EINVAL;
1443                 goto out_put;
1444         }
1445
1446         err = security_task_movememory(task);
1447         if (err)
1448                 goto out_put;
1449
1450         mm = get_task_mm(task);
1451         put_task_struct(task);
1452
1453         if (!mm) {
1454                 err = -EINVAL;
1455                 goto out;
1456         }
1457
1458         err = do_migrate_pages(mm, old, new,
1459                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1460
1461         mmput(mm);
1462 out:
1463         NODEMASK_SCRATCH_FREE(scratch);
1464
1465         return err;
1466
1467 out_put:
1468         put_task_struct(task);
1469         goto out;
1470
1471 }
1472
1473
1474 /* Retrieve NUMA policy */
1475 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1476                 unsigned long __user *, nmask, unsigned long, maxnode,
1477                 unsigned long, addr, unsigned long, flags)
1478 {
1479         int err;
1480         int uninitialized_var(pval);
1481         nodemask_t nodes;
1482
1483         if (nmask != NULL && maxnode < MAX_NUMNODES)
1484                 return -EINVAL;
1485
1486         err = do_get_mempolicy(&pval, &nodes, addr, flags);
1487
1488         if (err)
1489                 return err;
1490
1491         if (policy && put_user(pval, policy))
1492                 return -EFAULT;
1493
1494         if (nmask)
1495                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1496
1497         return err;
1498 }
1499
1500 #ifdef CONFIG_COMPAT
1501
1502 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1503                                      compat_ulong_t __user *nmask,
1504                                      compat_ulong_t maxnode,
1505                                      compat_ulong_t addr, compat_ulong_t flags)
1506 {
1507         long err;
1508         unsigned long __user *nm = NULL;
1509         unsigned long nr_bits, alloc_size;
1510         DECLARE_BITMAP(bm, MAX_NUMNODES);
1511
1512         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1513         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1514
1515         if (nmask)
1516                 nm = compat_alloc_user_space(alloc_size);
1517
1518         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1519
1520         if (!err && nmask) {
1521                 unsigned long copy_size;
1522                 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1523                 err = copy_from_user(bm, nm, copy_size);
1524                 /* ensure entire bitmap is zeroed */
1525                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1526                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1527         }
1528
1529         return err;
1530 }
1531
1532 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1533                                      compat_ulong_t maxnode)
1534 {
1535         long err = 0;
1536         unsigned long __user *nm = NULL;
1537         unsigned long nr_bits, alloc_size;
1538         DECLARE_BITMAP(bm, MAX_NUMNODES);
1539
1540         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1541         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1542
1543         if (nmask) {
1544                 err = compat_get_bitmap(bm, nmask, nr_bits);
1545                 nm = compat_alloc_user_space(alloc_size);
1546                 err |= copy_to_user(nm, bm, alloc_size);
1547         }
1548
1549         if (err)
1550                 return -EFAULT;
1551
1552         return sys_set_mempolicy(mode, nm, nr_bits+1);
1553 }
1554
1555 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1556                              compat_ulong_t mode, compat_ulong_t __user *nmask,
1557                              compat_ulong_t maxnode, compat_ulong_t flags)
1558 {
1559         long err = 0;
1560         unsigned long __user *nm = NULL;
1561         unsigned long nr_bits, alloc_size;
1562         nodemask_t bm;
1563
1564         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1565         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1566
1567         if (nmask) {
1568                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1569                 nm = compat_alloc_user_space(alloc_size);
1570                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1571         }
1572
1573         if (err)
1574                 return -EFAULT;
1575
1576         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1577 }
1578
1579 #endif
1580
1581 /*
1582  * get_vma_policy(@task, @vma, @addr)
1583  * @task - task for fallback if vma policy == default
1584  * @vma   - virtual memory area whose policy is sought
1585  * @addr  - address in @vma for shared policy lookup
1586  *
1587  * Returns effective policy for a VMA at specified address.
1588  * Falls back to @task or system default policy, as necessary.
1589  * Current or other task's task mempolicy and non-shared vma policies must be
1590  * protected by task_lock(task) by the caller.
1591  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1592  * count--added by the get_policy() vm_op, as appropriate--to protect against
1593  * freeing by another task.  It is the caller's responsibility to free the
1594  * extra reference for shared policies.
1595  */
1596 struct mempolicy *get_vma_policy(struct task_struct *task,
1597                 struct vm_area_struct *vma, unsigned long addr)
1598 {
1599         struct mempolicy *pol = task->mempolicy;
1600
1601         if (vma) {
1602                 if (vma->vm_ops && vma->vm_ops->get_policy) {
1603                         struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1604                                                                         addr);
1605                         if (vpol)
1606                                 pol = vpol;
1607                 } else if (vma->vm_policy) {
1608                         pol = vma->vm_policy;
1609
1610                         /*
1611                          * shmem_alloc_page() passes MPOL_F_SHARED policy with
1612                          * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1613                          * count on these policies which will be dropped by
1614                          * mpol_cond_put() later
1615                          */
1616                         if (mpol_needs_cond_ref(pol))
1617                                 mpol_get(pol);
1618                 }
1619         }
1620         if (!pol)
1621                 pol = &default_policy;
1622         return pol;
1623 }
1624
1625 /*
1626  * Return a nodemask representing a mempolicy for filtering nodes for
1627  * page allocation
1628  */
1629 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1630 {
1631         /* Lower zones don't get a nodemask applied for MPOL_BIND */
1632         if (unlikely(policy->mode == MPOL_BIND) &&
1633                         gfp_zone(gfp) >= policy_zone &&
1634                         cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1635                 return &policy->v.nodes;
1636
1637         return NULL;
1638 }
1639
1640 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1641 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1642         int nd)
1643 {
1644         switch (policy->mode) {
1645         case MPOL_PREFERRED:
1646                 if (!(policy->flags & MPOL_F_LOCAL))
1647                         nd = policy->v.preferred_node;
1648                 break;
1649         case MPOL_BIND:
1650                 /*
1651                  * Normally, MPOL_BIND allocations are node-local within the
1652                  * allowed nodemask.  However, if __GFP_THISNODE is set and the
1653                  * current node isn't part of the mask, we use the zonelist for
1654                  * the first node in the mask instead.
1655                  */
1656                 if (unlikely(gfp & __GFP_THISNODE) &&
1657                                 unlikely(!node_isset(nd, policy->v.nodes)))
1658                         nd = first_node(policy->v.nodes);
1659                 break;
1660         default:
1661                 BUG();
1662         }
1663         return node_zonelist(nd, gfp);
1664 }
1665
1666 /* Do dynamic interleaving for a process */
1667 static unsigned interleave_nodes(struct mempolicy *policy)
1668 {
1669         unsigned nid, next;
1670         struct task_struct *me = current;
1671
1672         nid = me->il_next;
1673         next = next_node(nid, policy->v.nodes);
1674         if (next >= MAX_NUMNODES)
1675                 next = first_node(policy->v.nodes);
1676         if (next < MAX_NUMNODES)
1677                 me->il_next = next;
1678         return nid;
1679 }
1680
1681 /*
1682  * Depending on the memory policy provide a node from which to allocate the
1683  * next slab entry.
1684  * @policy must be protected by freeing by the caller.  If @policy is
1685  * the current task's mempolicy, this protection is implicit, as only the
1686  * task can change it's policy.  The system default policy requires no
1687  * such protection.
1688  */
1689 unsigned slab_node(void)
1690 {
1691         struct mempolicy *policy;
1692
1693         if (in_interrupt())
1694                 return numa_node_id();
1695
1696         policy = current->mempolicy;
1697         if (!policy || policy->flags & MPOL_F_LOCAL)
1698                 return numa_node_id();
1699
1700         switch (policy->mode) {
1701         case MPOL_PREFERRED:
1702                 /*
1703                  * handled MPOL_F_LOCAL above
1704                  */
1705                 return policy->v.preferred_node;
1706
1707         case MPOL_INTERLEAVE:
1708                 return interleave_nodes(policy);
1709
1710         case MPOL_BIND: {
1711                 /*
1712                  * Follow bind policy behavior and start allocation at the
1713                  * first node.
1714                  */
1715                 struct zonelist *zonelist;
1716                 struct zone *zone;
1717                 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1718                 zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1719                 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1720                                                         &policy->v.nodes,
1721                                                         &zone);
1722                 return zone ? zone->node : numa_node_id();
1723         }
1724
1725         default:
1726                 BUG();
1727         }
1728 }
1729
1730 /* Do static interleaving for a VMA with known offset. */
1731 static unsigned offset_il_node(struct mempolicy *pol,
1732                 struct vm_area_struct *vma, unsigned long off)
1733 {
1734         unsigned nnodes = nodes_weight(pol->v.nodes);
1735         unsigned target;
1736         int c;
1737         int nid = -1;
1738
1739         if (!nnodes)
1740                 return numa_node_id();
1741         target = (unsigned int)off % nnodes;
1742         c = 0;
1743         do {
1744                 nid = next_node(nid, pol->v.nodes);
1745                 c++;
1746         } while (c <= target);
1747         return nid;
1748 }
1749
1750 /* Determine a node number for interleave */
1751 static inline unsigned interleave_nid(struct mempolicy *pol,
1752                  struct vm_area_struct *vma, unsigned long addr, int shift)
1753 {
1754         if (vma) {
1755                 unsigned long off;
1756
1757                 /*
1758                  * for small pages, there is no difference between
1759                  * shift and PAGE_SHIFT, so the bit-shift is safe.
1760                  * for huge pages, since vm_pgoff is in units of small
1761                  * pages, we need to shift off the always 0 bits to get
1762                  * a useful offset.
1763                  */
1764                 BUG_ON(shift < PAGE_SHIFT);
1765                 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1766                 off += (addr - vma->vm_start) >> shift;
1767                 return offset_il_node(pol, vma, off);
1768         } else
1769                 return interleave_nodes(pol);
1770 }
1771
1772 /*
1773  * Return the bit number of a random bit set in the nodemask.
1774  * (returns -1 if nodemask is empty)
1775  */
1776 int node_random(const nodemask_t *maskp)
1777 {
1778         int w, bit = -1;
1779
1780         w = nodes_weight(*maskp);
1781         if (w)
1782                 bit = bitmap_ord_to_pos(maskp->bits,
1783                         get_random_int() % w, MAX_NUMNODES);
1784         return bit;
1785 }
1786
1787 #ifdef CONFIG_HUGETLBFS
1788 /*
1789  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1790  * @vma = virtual memory area whose policy is sought
1791  * @addr = address in @vma for shared policy lookup and interleave policy
1792  * @gfp_flags = for requested zone
1793  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1794  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1795  *
1796  * Returns a zonelist suitable for a huge page allocation and a pointer
1797  * to the struct mempolicy for conditional unref after allocation.
1798  * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1799  * @nodemask for filtering the zonelist.
1800  *
1801  * Must be protected by get_mems_allowed()
1802  */
1803 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1804                                 gfp_t gfp_flags, struct mempolicy **mpol,
1805                                 nodemask_t **nodemask)
1806 {
1807         struct zonelist *zl;
1808
1809         *mpol = get_vma_policy(current, vma, addr);
1810         *nodemask = NULL;       /* assume !MPOL_BIND */
1811
1812         if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1813                 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1814                                 huge_page_shift(hstate_vma(vma))), gfp_flags);
1815         } else {
1816                 zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1817                 if ((*mpol)->mode == MPOL_BIND)
1818                         *nodemask = &(*mpol)->v.nodes;
1819         }
1820         return zl;
1821 }
1822
1823 /*
1824  * init_nodemask_of_mempolicy
1825  *
1826  * If the current task's mempolicy is "default" [NULL], return 'false'
1827  * to indicate default policy.  Otherwise, extract the policy nodemask
1828  * for 'bind' or 'interleave' policy into the argument nodemask, or
1829  * initialize the argument nodemask to contain the single node for
1830  * 'preferred' or 'local' policy and return 'true' to indicate presence
1831  * of non-default mempolicy.
1832  *
1833  * We don't bother with reference counting the mempolicy [mpol_get/put]
1834  * because the current task is examining it's own mempolicy and a task's
1835  * mempolicy is only ever changed by the task itself.
1836  *
1837  * N.B., it is the caller's responsibility to free a returned nodemask.
1838  */
1839 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1840 {
1841         struct mempolicy *mempolicy;
1842         int nid;
1843
1844         if (!(mask && current->mempolicy))
1845                 return false;
1846
1847         task_lock(current);
1848         mempolicy = current->mempolicy;
1849         switch (mempolicy->mode) {
1850         case MPOL_PREFERRED:
1851                 if (mempolicy->flags & MPOL_F_LOCAL)
1852                         nid = numa_node_id();
1853                 else
1854                         nid = mempolicy->v.preferred_node;
1855                 init_nodemask_of_node(mask, nid);
1856                 break;
1857
1858         case MPOL_BIND:
1859                 /* Fall through */
1860         case MPOL_INTERLEAVE:
1861                 *mask =  mempolicy->v.nodes;
1862                 break;
1863
1864         default:
1865                 BUG();
1866         }
1867         task_unlock(current);
1868
1869         return true;
1870 }
1871 #endif
1872
1873 /*
1874  * mempolicy_nodemask_intersects
1875  *
1876  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1877  * policy.  Otherwise, check for intersection between mask and the policy
1878  * nodemask for 'bind' or 'interleave' policy.  For 'perferred' or 'local'
1879  * policy, always return true since it may allocate elsewhere on fallback.
1880  *
1881  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1882  */
1883 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1884                                         const nodemask_t *mask)
1885 {
1886         struct mempolicy *mempolicy;
1887         bool ret = true;
1888
1889         if (!mask)
1890                 return ret;
1891         task_lock(tsk);
1892         mempolicy = tsk->mempolicy;
1893         if (!mempolicy)
1894                 goto out;
1895
1896         switch (mempolicy->mode) {
1897         case MPOL_PREFERRED:
1898                 /*
1899                  * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
1900                  * allocate from, they may fallback to other nodes when oom.
1901                  * Thus, it's possible for tsk to have allocated memory from
1902                  * nodes in mask.
1903                  */
1904                 break;
1905         case MPOL_BIND:
1906         case MPOL_INTERLEAVE:
1907                 ret = nodes_intersects(mempolicy->v.nodes, *mask);
1908                 break;
1909         default:
1910                 BUG();
1911         }
1912 out:
1913         task_unlock(tsk);
1914         return ret;
1915 }
1916
1917 /* Allocate a page in interleaved policy.
1918    Own path because it needs to do special accounting. */
1919 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1920                                         unsigned nid)
1921 {
1922         struct zonelist *zl;
1923         struct page *page;
1924
1925         zl = node_zonelist(nid, gfp);
1926         page = __alloc_pages(gfp, order, zl);
1927         if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1928                 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1929         return page;
1930 }
1931
1932 /**
1933  *      alloc_pages_vma - Allocate a page for a VMA.
1934  *
1935  *      @gfp:
1936  *      %GFP_USER    user allocation.
1937  *      %GFP_KERNEL  kernel allocations,
1938  *      %GFP_HIGHMEM highmem/user allocations,
1939  *      %GFP_FS      allocation should not call back into a file system.
1940  *      %GFP_ATOMIC  don't sleep.
1941  *
1942  *      @order:Order of the GFP allocation.
1943  *      @vma:  Pointer to VMA or NULL if not available.
1944  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1945  *
1946  *      This function allocates a page from the kernel page pool and applies
1947  *      a NUMA policy associated with the VMA or the current process.
1948  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1949  *      mm_struct of the VMA to prevent it from going away. Should be used for
1950  *      all allocations for pages that will be mapped into
1951  *      user space. Returns NULL when no page can be allocated.
1952  *
1953  *      Should be called with the mm_sem of the vma hold.
1954  */
1955 struct page *
1956 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1957                 unsigned long addr, int node)
1958 {
1959         struct mempolicy *pol;
1960         struct zonelist *zl;
1961         struct page *page;
1962         unsigned int cpuset_mems_cookie;
1963
1964 retry_cpuset:
1965         pol = get_vma_policy(current, vma, addr);
1966         cpuset_mems_cookie = get_mems_allowed();
1967
1968         if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1969                 unsigned nid;
1970
1971                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1972                 mpol_cond_put(pol);
1973                 page = alloc_page_interleave(gfp, order, nid);
1974                 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1975                         goto retry_cpuset;
1976
1977                 return page;
1978         }
1979         zl = policy_zonelist(gfp, pol, node);
1980         if (unlikely(mpol_needs_cond_ref(pol))) {
1981                 /*
1982                  * slow path: ref counted shared policy
1983                  */
1984                 struct page *page =  __alloc_pages_nodemask(gfp, order,
1985                                                 zl, policy_nodemask(gfp, pol));
1986                 __mpol_put(pol);
1987                 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1988                         goto retry_cpuset;
1989                 return page;
1990         }
1991         /*
1992          * fast path:  default or task policy
1993          */
1994         page = __alloc_pages_nodemask(gfp, order, zl,
1995                                       policy_nodemask(gfp, pol));
1996         if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1997                 goto retry_cpuset;
1998         return page;
1999 }
2000
2001 /**
2002  *      alloc_pages_current - Allocate pages.
2003  *
2004  *      @gfp:
2005  *              %GFP_USER   user allocation,
2006  *              %GFP_KERNEL kernel allocation,
2007  *              %GFP_HIGHMEM highmem allocation,
2008  *              %GFP_FS     don't call back into a file system.
2009  *              %GFP_ATOMIC don't sleep.
2010  *      @order: Power of two of allocation size in pages. 0 is a single page.
2011  *
2012  *      Allocate a page from the kernel page pool.  When not in
2013  *      interrupt context and apply the current process NUMA policy.
2014  *      Returns NULL when no page can be allocated.
2015  *
2016  *      Don't call cpuset_update_task_memory_state() unless
2017  *      1) it's ok to take cpuset_sem (can WAIT), and
2018  *      2) allocating for current task (not interrupt).
2019  */
2020 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2021 {
2022         struct mempolicy *pol = current->mempolicy;
2023         struct page *page;
2024         unsigned int cpuset_mems_cookie;
2025
2026         if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
2027                 pol = &default_policy;
2028
2029 retry_cpuset:
2030         cpuset_mems_cookie = get_mems_allowed();
2031
2032         /*
2033          * No reference counting needed for current->mempolicy
2034          * nor system default_policy
2035          */
2036         if (pol->mode == MPOL_INTERLEAVE)
2037                 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2038         else
2039                 page = __alloc_pages_nodemask(gfp, order,
2040                                 policy_zonelist(gfp, pol, numa_node_id()),
2041                                 policy_nodemask(gfp, pol));
2042
2043         if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2044                 goto retry_cpuset;
2045
2046         return page;
2047 }
2048 EXPORT_SYMBOL(alloc_pages_current);
2049
/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after their cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 *
 * current's mempolicy may be rebound by another task (the task that changes
 * the cpuset's mems), so we needn't do rebind work for the current task.
 */
2060
2061 /* Slow path of a mempolicy duplicate */
2062 struct mempolicy *__mpol_dup(struct mempolicy *old)
2063 {
2064         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2065
2066         if (!new)
2067                 return ERR_PTR(-ENOMEM);
2068
2069         /* task's mempolicy is protected by alloc_lock */
2070         if (old == current->mempolicy) {
2071                 task_lock(current);
2072                 *new = *old;
2073                 task_unlock(current);
2074         } else
2075                 *new = *old;
2076
2077         rcu_read_lock();
2078         if (current_cpuset_is_being_rebound()) {
2079                 nodemask_t mems = cpuset_mems_allowed(current);
2080                 if (new->flags & MPOL_F_REBINDING)
2081                         mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2082                 else
2083                         mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2084         }
2085         rcu_read_unlock();
2086         atomic_set(&new->refcnt, 1);
2087         return new;
2088 }
2089
2090 /*
2091  * If *frompol needs [has] an extra ref, copy *frompol to *tompol ,
2092  * eliminate the * MPOL_F_* flags that require conditional ref and
2093  * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
2094  * after return.  Use the returned value.
2095  *
2096  * Allows use of a mempolicy for, e.g., multiple allocations with a single
2097  * policy lookup, even if the policy needs/has extra ref on lookup.
2098  * shmem_readahead needs this.
2099  */
2100 struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
2101                                                 struct mempolicy *frompol)
2102 {
2103         if (!mpol_needs_cond_ref(frompol))
2104                 return frompol;
2105
2106         *tompol = *frompol;
2107         tompol->flags &= ~MPOL_F_SHARED;        /* copy doesn't need unref */
2108         __mpol_put(frompol);
2109         return tompol;
2110 }
2111
2112 /* Slow path of a mempolicy comparison */
2113 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2114 {
2115         if (!a || !b)
2116                 return false;
2117         if (a->mode != b->mode)
2118                 return false;
2119         if (a->flags != b->flags)
2120                 return false;
2121         if (mpol_store_user_nodemask(a))
2122                 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2123                         return false;
2124
2125         switch (a->mode) {
2126         case MPOL_BIND:
2127                 /* Fall through */
2128         case MPOL_INTERLEAVE:
2129                 return !!nodes_equal(a->v.nodes, b->v.nodes);
2130         case MPOL_PREFERRED:
2131                 return a->v.preferred_node == b->v.preferred_node;
2132         default:
2133                 BUG();
2134                 return false;
2135         }
2136 }
2137
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a Red-Black tree linked from the inode.
 * They are protected by the sp->mutex mutex, which should be held
 * for any accesses to the tree.
 */
2146
2147 /* lookup first element intersecting start-end */
2148 /* Caller holds sp->mutex */
2149 static struct sp_node *
2150 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2151 {
2152         struct rb_node *n = sp->root.rb_node;
2153
2154         while (n) {
2155                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2156
2157                 if (start >= p->end)
2158                         n = n->rb_right;
2159                 else if (end <= p->start)
2160                         n = n->rb_left;
2161                 else
2162                         break;
2163         }
2164         if (!n)
2165                 return NULL;
2166         for (;;) {
2167                 struct sp_node *w = NULL;
2168                 struct rb_node *prev = rb_prev(n);
2169                 if (!prev)
2170                         break;
2171                 w = rb_entry(prev, struct sp_node, nd);
2172                 if (w->end <= start)
2173                         break;
2174                 n = prev;
2175         }
2176         return rb_entry(n, struct sp_node, nd);
2177 }
2178
2179 /* Insert a new shared policy into the list. */
2180 /* Caller holds sp->lock */
2181 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2182 {
2183         struct rb_node **p = &sp->root.rb_node;
2184         struct rb_node *parent = NULL;
2185         struct sp_node *nd;
2186
2187         while (*p) {
2188                 parent = *p;
2189                 nd = rb_entry(parent, struct sp_node, nd);
2190                 if (new->start < nd->start)
2191                         p = &(*p)->rb_left;
2192                 else if (new->end > nd->end)
2193                         p = &(*p)->rb_right;
2194                 else
2195                         BUG();
2196         }
2197         rb_link_node(&new->nd, parent, p);
2198         rb_insert_color(&new->nd, &sp->root);
2199         pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2200                  new->policy ? new->policy->mode : 0);
2201 }
2202
2203 /* Find shared policy intersecting idx */
2204 struct mempolicy *
2205 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2206 {
2207         struct mempolicy *pol = NULL;
2208         struct sp_node *sn;
2209
2210         if (!sp->root.rb_node)
2211                 return NULL;
2212         mutex_lock(&sp->mutex);
2213         sn = sp_lookup(sp, idx, idx+1);
2214         if (sn) {
2215                 mpol_get(sn->policy);
2216                 pol = sn->policy;
2217         }
2218         mutex_unlock(&sp->mutex);
2219         return pol;
2220 }
2221
2222 static void sp_free(struct sp_node *n)
2223 {
2224         mpol_put(n->policy);
2225         kmem_cache_free(sn_cache, n);
2226 }
2227
2228 /**
2229  * mpol_misplaced - check whether current page node is valid in policy
2230  *
2231  * @page   - page to be checked
2232  * @vma    - vm area where page mapped
2233  * @addr   - virtual address where page mapped
2234  *
2235  * Lookup current policy node id for vma,addr and "compare to" page's
2236  * node id.
2237  *
2238  * Returns:
2239  *      -1      - not misplaced, page is in the right node
2240  *      node    - node id where the page should be
2241  *
2242  * Policy determination "mimics" alloc_page_vma().
2243  * Called from fault path where we know the vma and faulting address.
2244  */
2245 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2246 {
2247         struct mempolicy *pol;
2248         struct zone *zone;
2249         int curnid = page_to_nid(page);
2250         unsigned long pgoff;
2251         int polnid = -1;
2252         int ret = -1;
2253
2254         BUG_ON(!vma);
2255
2256         pol = get_vma_policy(current, vma, addr);
2257         if (!(pol->flags & MPOL_F_MOF))
2258                 goto out;
2259
2260         switch (pol->mode) {
2261         case MPOL_INTERLEAVE:
2262                 BUG_ON(addr >= vma->vm_end);
2263                 BUG_ON(addr < vma->vm_start);
2264
2265                 pgoff = vma->vm_pgoff;
2266                 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2267                 polnid = offset_il_node(pol, vma, pgoff);
2268                 break;
2269
2270         case MPOL_PREFERRED:
2271                 if (pol->flags & MPOL_F_LOCAL)
2272                         polnid = numa_node_id();
2273                 else
2274                         polnid = pol->v.preferred_node;
2275                 break;
2276
2277         case MPOL_BIND:
2278                 /*
2279                  * allows binding to multiple nodes.
2280                  * use current page if in policy nodemask,
2281                  * else select nearest allowed node, if any.
2282                  * If no allowed nodes, use current [!misplaced].
2283                  */
2284                 if (node_isset(curnid, pol->v.nodes))
2285                         goto out;
2286                 (void)first_zones_zonelist(
2287                                 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2288                                 gfp_zone(GFP_HIGHUSER),
2289                                 &pol->v.nodes, &zone);
2290                 polnid = zone->node;
2291                 break;
2292
2293         default:
2294                 BUG();
2295         }
2296         if (curnid != polnid)
2297                 ret = polnid;
2298 out:
2299         mpol_cond_put(pol);
2300
2301         return ret;
2302 }
2303
2304 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2305 {
2306         pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2307         rb_erase(&n->nd, &sp->root);
2308         sp_free(n);
2309 }
2310
2311 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2312                                 struct mempolicy *pol)
2313 {
2314         struct sp_node *n;
2315         struct mempolicy *newpol;
2316
2317         n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2318         if (!n)
2319                 return NULL;
2320
2321         newpol = mpol_dup(pol);
2322         if (IS_ERR(newpol)) {
2323                 kmem_cache_free(sn_cache, n);
2324                 return NULL;
2325         }
2326         newpol->flags |= MPOL_F_SHARED;
2327
2328         n->start = start;
2329         n->end = end;
2330         n->policy = newpol;
2331
2332         return n;
2333 }
2334
2335 /* Replace a policy range. */
2336 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2337                                  unsigned long end, struct sp_node *new)
2338 {
2339         struct sp_node *n;
2340         int ret = 0;
2341
2342         mutex_lock(&sp->mutex);
2343         n = sp_lookup(sp, start, end);
2344         /* Take care of old policies in the same range. */
2345         while (n && n->start < end) {
2346                 struct rb_node *next = rb_next(&n->nd);
2347                 if (n->start >= start) {
2348                         if (n->end <= end)
2349                                 sp_delete(sp, n);
2350                         else
2351                                 n->start = end;
2352                 } else {
2353                         /* Old policy spanning whole new range. */
2354                         if (n->end > end) {
2355                                 struct sp_node *new2;
2356                                 new2 = sp_alloc(end, n->end, n->policy);
2357                                 if (!new2) {
2358                                         ret = -ENOMEM;
2359                                         goto out;
2360                                 }
2361                                 n->end = start;
2362                                 sp_insert(sp, new2);
2363                                 break;
2364                         } else
2365                                 n->end = start;
2366                 }
2367                 if (!next)
2368                         break;
2369                 n = rb_entry(next, struct sp_node, nd);
2370         }
2371         if (new)
2372                 sp_insert(sp, new);
2373 out:
2374         mutex_unlock(&sp->mutex);
2375         return ret;
2376 }
2377
2378 /**
2379  * mpol_shared_policy_init - initialize shared policy for inode
2380  * @sp: pointer to inode shared policy
2381  * @mpol:  struct mempolicy to install
2382  *
2383  * Install non-NULL @mpol in inode's shared policy rb-tree.
2384  * On entry, the current task has a reference on a non-NULL @mpol.
2385  * This must be released on exit.
2386  * This is called at get_inode() calls and we can use GFP_KERNEL.
2387  */
2388 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2389 {
2390         int ret;
2391
2392         sp->root = RB_ROOT;             /* empty tree == default mempolicy */
2393         mutex_init(&sp->mutex);
2394
2395         if (mpol) {
2396                 struct vm_area_struct pvma;
2397                 struct mempolicy *new;
2398                 NODEMASK_SCRATCH(scratch);
2399
2400                 if (!scratch)
2401                         goto put_mpol;
2402                 /* contextualize the tmpfs mount point mempolicy */
2403                 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2404                 if (IS_ERR(new))
2405                         goto free_scratch; /* no valid nodemask intersection */
2406
2407                 task_lock(current);
2408                 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2409                 task_unlock(current);
2410                 if (ret)
2411                         goto put_new;
2412
2413                 /* Create pseudo-vma that contains just the policy */
2414                 memset(&pvma, 0, sizeof(struct vm_area_struct));
2415                 pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
2416                 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2417
2418 put_new:
2419                 mpol_put(new);                  /* drop initial ref */
2420 free_scratch:
2421                 NODEMASK_SCRATCH_FREE(scratch);
2422 put_mpol:
2423                 mpol_put(mpol); /* drop our incoming ref on sb mpol */
2424         }
2425 }
2426
2427 int mpol_set_shared_policy(struct shared_policy *info,
2428                         struct vm_area_struct *vma, struct mempolicy *npol)
2429 {
2430         int err;
2431         struct sp_node *new = NULL;
2432         unsigned long sz = vma_pages(vma);
2433
2434         pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2435                  vma->vm_pgoff,
2436                  sz, npol ? npol->mode : -1,
2437                  npol ? npol->flags : -1,
2438                  npol ? nodes_addr(npol->v.nodes)[0] : -1);
2439
2440         if (npol) {
2441                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2442                 if (!new)
2443                         return -ENOMEM;
2444         }
2445         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2446         if (err && new)
2447                 sp_free(new);
2448         return err;
2449 }
2450
2451 /* Free a backing policy store on inode delete. */
2452 void mpol_free_shared_policy(struct shared_policy *p)
2453 {
2454         struct sp_node *n;
2455         struct rb_node *next;
2456
2457         if (!p->root.rb_node)
2458                 return;
2459         mutex_lock(&p->mutex);
2460         next = rb_first(&p->root);
2461         while (next) {
2462                 n = rb_entry(next, struct sp_node, nd);
2463                 next = rb_next(&n->nd);
2464                 sp_delete(p, n);
2465         }
2466         mutex_unlock(&p->mutex);
2467 }
2468
2469 /* assumes fs == KERNEL_DS */
2470 void __init numa_policy_init(void)
2471 {
2472         nodemask_t interleave_nodes;
2473         unsigned long largest = 0;
2474         int nid, prefer = 0;
2475
2476         policy_cache = kmem_cache_create("numa_policy",
2477                                          sizeof(struct mempolicy),
2478                                          0, SLAB_PANIC, NULL);
2479
2480         sn_cache = kmem_cache_create("shared_policy_node",
2481                                      sizeof(struct sp_node),
2482                                      0, SLAB_PANIC, NULL);
2483
2484         /*
2485          * Set interleaving policy for system init. Interleaving is only
2486          * enabled across suitably sized nodes (default is >= 16MB), or
2487          * fall back to the largest node if they're all smaller.
2488          */
2489         nodes_clear(interleave_nodes);
2490         for_each_node_state(nid, N_HIGH_MEMORY) {
2491                 unsigned long total_pages = node_present_pages(nid);
2492
2493                 /* Preserve the largest node */
2494                 if (largest < total_pages) {
2495                         largest = total_pages;
2496                         prefer = nid;
2497                 }
2498
2499                 /* Interleave this node? */
2500                 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2501                         node_set(nid, interleave_nodes);
2502         }
2503
2504         /* All too small, use the largest */
2505         if (unlikely(nodes_empty(interleave_nodes)))
2506                 node_set(prefer, interleave_nodes);
2507
2508         if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2509                 printk("numa_policy_init: interleaving failed\n");
2510 }
2511
2512 /* Reset policy of current process to default */
2513 void numa_default_policy(void)
2514 {
2515         do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2516 }
2517
2518 /*
2519  * Parse and format mempolicy from/to strings
2520  */
2521
2522 /*
2523  * "local" is pseudo-policy:  MPOL_PREFERRED with MPOL_F_LOCAL flag
2524  * Used only for mpol_parse_str() and mpol_to_str()
2525  */
2526 static const char * const policy_modes[] =
2527 {
2528         [MPOL_DEFAULT]    = "default",
2529         [MPOL_PREFERRED]  = "prefer",
2530         [MPOL_BIND]       = "bind",
2531         [MPOL_INTERLEAVE] = "interleave",
2532         [MPOL_LOCAL]      = "local",
2533         [MPOL_NOOP]       = "noop",     /* should not actually be used */
2534 };
2535
2536
2537 #ifdef CONFIG_TMPFS
2538 /**
2539  * mpol_parse_str - parse string to mempolicy
2540  * @str:  string containing mempolicy to parse
2541  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2542  * @no_context:  flag whether to "contextualize" the mempolicy
2543  *
2544  * Format of input:
2545  *      <mode>[=<flags>][:<nodelist>]
2546  *
2547  * if @no_context is true, save the input nodemask in w.user_nodemask in
2548  * the returned mempolicy.  This will be used to "clone" the mempolicy in
2549  * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
2550  * mount option.  Note that if 'static' or 'relative' mode flags were
2551  * specified, the input nodemask will already have been saved.  Saving
2552  * it again is redundant, but safe.
2553  *
2554  * On success, returns 0, else 1
2555  */
2556 int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2557 {
2558         struct mempolicy *new = NULL;
2559         unsigned short mode;
2560         unsigned short uninitialized_var(mode_flags);
2561         nodemask_t nodes;
2562         char *nodelist = strchr(str, ':');
2563         char *flags = strchr(str, '=');
2564         int err = 1;
2565
2566         if (nodelist) {
2567                 /* NUL-terminate mode or flags string */
2568                 *nodelist++ = '\0';
2569                 if (nodelist_parse(nodelist, nodes))
2570                         goto out;
2571                 if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
2572                         goto out;
2573         } else
2574                 nodes_clear(nodes);
2575
2576         if (flags)
2577                 *flags++ = '\0';        /* terminate mode string */
2578
2579         for (mode = 0; mode < MPOL_MAX; mode++) {
2580                 if (!strcmp(str, policy_modes[mode])) {
2581                         break;
2582                 }
2583         }
2584         if (mode >= MPOL_MAX || mode == MPOL_NOOP)
2585                 goto out;
2586
2587         switch (mode) {
2588         case MPOL_PREFERRED:
2589                 /*
2590                  * Insist on a nodelist of one node only
2591                  */
2592                 if (nodelist) {
2593                         char *rest = nodelist;
2594                         while (isdigit(*rest))
2595                                 rest++;
2596                         if (*rest)
2597                                 goto out;
2598                 }
2599                 break;
2600         case MPOL_INTERLEAVE:
2601                 /*
2602                  * Default to online nodes with memory if no nodelist
2603                  */
2604                 if (!nodelist)
2605                         nodes = node_states[N_HIGH_MEMORY];
2606                 break;
2607         case MPOL_LOCAL:
2608                 /*
2609                  * Don't allow a nodelist;  mpol_new() checks flags
2610                  */
2611                 if (nodelist)
2612                         goto out;
2613                 mode = MPOL_PREFERRED;
2614                 break;
2615         case MPOL_DEFAULT:
2616                 /*
2617                  * Insist on a empty nodelist
2618                  */
2619                 if (!nodelist)
2620                         err = 0;
2621                 goto out;
2622         case MPOL_BIND:
2623                 /*
2624                  * Insist on a nodelist
2625                  */
2626                 if (!nodelist)
2627                         goto out;
2628         }
2629
2630         mode_flags = 0;
2631         if (flags) {
2632                 /*
2633                  * Currently, we only support two mutually exclusive
2634                  * mode flags.
2635                  */
2636                 if (!strcmp(flags, "static"))
2637                         mode_flags |= MPOL_F_STATIC_NODES;
2638                 else if (!strcmp(flags, "relative"))
2639                         mode_flags |= MPOL_F_RELATIVE_NODES;
2640                 else
2641                         goto out;
2642         }
2643
2644         new = mpol_new(mode, mode_flags, &nodes);
2645         if (IS_ERR(new))
2646                 goto out;
2647
2648         if (no_context) {
2649                 /* save for contextualization */
2650                 new->w.user_nodemask = nodes;
2651         } else {
2652                 int ret;
2653                 NODEMASK_SCRATCH(scratch);
2654                 if (scratch) {
2655                         task_lock(current);
2656                         ret = mpol_set_nodemask(new, &nodes, scratch);
2657                         task_unlock(current);
2658                 } else
2659                         ret = -ENOMEM;
2660                 NODEMASK_SCRATCH_FREE(scratch);
2661                 if (ret) {
2662                         mpol_put(new);
2663                         goto out;
2664                 }
2665         }
2666         err = 0;
2667
2668 out:
2669         /* Restore string for error message */
2670         if (nodelist)
2671                 *--nodelist = ':';
2672         if (flags)
2673                 *--flags = '=';
2674         if (!err)
2675                 *mpol = new;
2676         return err;
2677 }
2678 #endif /* CONFIG_TMPFS */
2679
2680 /**
2681  * mpol_to_str - format a mempolicy structure for printing
2682  * @buffer:  to contain formatted mempolicy string
2683  * @maxlen:  length of @buffer
2684  * @pol:  pointer to mempolicy to be formatted
2685  * @no_context:  "context free" mempolicy - use nodemask in w.user_nodemask
2686  *
2687  * Convert a mempolicy into a string.
2688  * Returns the number of characters in buffer (if positive)
2689  * or an error (negative)
2690  */
2691 int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2692 {
2693         char *p = buffer;
2694         int l;
2695         nodemask_t nodes;
2696         unsigned short mode;
2697         unsigned short flags = pol ? pol->flags : 0;
2698
2699         /*
2700          * Sanity check:  room for longest mode, flag and some nodes
2701          */
2702         VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2703
2704         if (!pol || pol == &default_policy)
2705                 mode = MPOL_DEFAULT;
2706         else
2707                 mode = pol->mode;
2708
2709         switch (mode) {
2710         case MPOL_DEFAULT:
2711                 nodes_clear(nodes);
2712                 break;
2713
2714         case MPOL_PREFERRED:
2715                 nodes_clear(nodes);
2716                 if (flags & MPOL_F_LOCAL)
2717                         mode = MPOL_LOCAL;      /* pseudo-policy */
2718                 else
2719                         node_set(pol->v.preferred_node, nodes);
2720                 break;
2721
2722         case MPOL_BIND:
2723                 /* Fall through */
2724         case MPOL_INTERLEAVE:
2725                 if (no_context)
2726                         nodes = pol->w.user_nodemask;
2727                 else
2728                         nodes = pol->v.nodes;
2729                 break;
2730
2731         default:
2732                 return -EINVAL;
2733         }
2734
2735         l = strlen(policy_modes[mode]);
2736         if (buffer + maxlen < p + l + 1)
2737                 return -ENOSPC;
2738
2739         strcpy(p, policy_modes[mode]);
2740         p += l;
2741
2742         if (flags & MPOL_MODE_FLAGS) {
2743                 if (buffer + maxlen < p + 2)
2744                         return -ENOSPC;
2745                 *p++ = '=';
2746
2747                 /*
2748                  * Currently, the only defined flags are mutually exclusive
2749                  */
2750                 if (flags & MPOL_F_STATIC_NODES)
2751                         p += snprintf(p, buffer + maxlen - p, "static");
2752                 else if (flags & MPOL_F_RELATIVE_NODES)
2753                         p += snprintf(p, buffer + maxlen - p, "relative");
2754         }
2755
2756         if (!nodes_empty(nodes)) {
2757                 if (buffer + maxlen < p + 2)
2758                         return -ENOSPC;
2759                 *p++ = ':';
2760                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2761         }
2762         return p - buffer;
2763 }