mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
  19  *                for anonymous memory. For process policy an process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
  49  * on systems with highmem kernel lowmem allocation don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66    could replace all the switch()es with a mempolicy_ops structure.
  67 */
  68
  69 #include <linux/mempolicy.h>
  70 #include <linux/mm.h>
  71 #include <linux/highmem.h>
  72 #include <linux/hugetlb.h>
  73 #include <linux/kernel.h>
  74 #include <linux/sched.h>
  75 #include <linux/mm.h>
  76 #include <linux/nodemask.h>
  77 #include <linux/cpuset.h>
  78 #include <linux/gfp.h>
  79 #include <linux/slab.h>
  80 #include <linux/string.h>
  81 #include <linux/module.h>
  82 #include <linux/interrupt.h>
  83 #include <linux/init.h>
  84 #include <linux/compat.h>
  85 #include <linux/mempolicy.h>
  86 #include <linux/swap.h>
  87 #include <linux/seq_file.h>
  88 #include <linux/proc_fs.h>
  89 #include <linux/migrate.h>
  90
  91 #include <asm/tlbflush.h>
  92 #include <asm/uaccess.h>
  93
  94 /* Internal flags */
  95 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
  96 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
  97 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */
  98
  99 static struct kmem_cache *policy_cache;
 100 static struct kmem_cache *sn_cache;
 101
 102 #define PDprintk(fmt...)
 103
 104 /* Highest zone. An specific allocation for a zone below that is not
 105    policied. */
 106 int policy_zone = ZONE_DMA;
 107
 108 struct mempolicy default_policy = {
 109         .refcnt = ATOMIC_INIT(1), /* never free it */
 110         .policy = MPOL_DEFAULT,
 111 };
 112
 113 /* Do sanity checking on a policy */
 114 static int mpol_check_policy(int mode, nodemask_t *nodes)
 115 {
 116         int empty = nodes_empty(*nodes);
 117
 118         switch (mode) {
 119         case MPOL_DEFAULT:
 120                 if (!empty)
 121                         return -EINVAL;
 122                 break;
 123         case MPOL_BIND:
 124         case MPOL_INTERLEAVE:
 125                 /* Preferred will only use the first bit, but allow
 126                    more for now. */
 127                 if (empty)
 128                         return -EINVAL;
 129                 break;
 130         }
 131         return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 132 }
 133
 134 /* Generate a custom zonelist for the BIND policy. */
 135 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 136 {
 137         struct zonelist *zl;
 138         int num, max, nd, k;
 139
 140         max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
 141         zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
 142         if (!zl)
 143                 return NULL;
 144         num = 0;
 145         /* First put in the highest zones from all nodes, then all the next
 146            lower zones etc. Avoid empty zones because the memory allocator
 147            doesn't like them. If you implement node hot removal you
 148            have to fix that. */
 149         for (k = policy_zone; k >= 0; k--) {
 150                 for_each_node_mask(nd, *nodes) {
 151                         struct zone *z = &NODE_DATA(nd)->node_zones[k];
 152                         if (z->present_pages > 0)
 153                                 zl->zones[num++] = z;
 154                 }
 155         }
 156         zl->zones[num] = NULL;
 157         return zl;
 158 }
 159
 160 /* Create a new policy */
 161 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 162 {
 163         struct mempolicy *policy;
 164
 165         PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
 166         if (mode == MPOL_DEFAULT)
 167                 return NULL;
 168         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 169         if (!policy)
 170                 return ERR_PTR(-ENOMEM);
 171         atomic_set(&policy->refcnt, 1);
 172         switch (mode) {
 173         case MPOL_INTERLEAVE:
 174                 policy->v.nodes = *nodes;
 175                 if (nodes_weight(*nodes) == 0) {
 176                         kmem_cache_free(policy_cache, policy);
 177                         return ERR_PTR(-EINVAL);
 178                 }
 179                 break;
 180         case MPOL_PREFERRED:
 181                 policy->v.preferred_node = first_node(*nodes);
 182                 if (policy->v.preferred_node >= MAX_NUMNODES)
 183                         policy->v.preferred_node = -1;
 184                 break;
 185         case MPOL_BIND:
 186                 policy->v.zonelist = bind_zonelist(nodes);
 187                 if (policy->v.zonelist == NULL) {
 188                         kmem_cache_free(policy_cache, policy);
 189                         return ERR_PTR(-ENOMEM);
 190                 }
 191                 break;
 192         }
 193         policy->policy = mode;
 194         policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
 195         return policy;
 196 }
 197
 198 static void gather_stats(struct page *, void *, int pte_dirty);
 199 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 200                                 unsigned long flags);
 201
 202 /* Scan through pages checking if pages follow certain conditions. */
 203 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 204                 unsigned long addr, unsigned long end,
 205                 const nodemask_t *nodes, unsigned long flags,
 206                 void *private)
 207 {
 208         pte_t *orig_pte;
 209         pte_t *pte;
 210         spinlock_t *ptl;
 211
 212         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 213         do {
 214                 struct page *page;
 215                 unsigned int nid;
 216
 217                 if (!pte_present(*pte))
 218                         continue;
 219                 page = vm_normal_page(vma, addr, *pte);
 220                 if (!page)
 221                         continue;
 222                 /*
 223                  * The check for PageReserved here is important to avoid
 224                  * handling zero pages and other pages that may have been
 225                  * marked special by the system.
 226                  *
 227                  * If the PageReserved would not be checked here then f.e.
 228                  * the location of the zero page could have an influence
 229                  * on MPOL_MF_STRICT, zero pages would be counted for
 230                  * the per node stats, and there would be useless attempts
 231                  * to put zero pages on the migration list.
 232                  */
 233                 if (PageReserved(page))
 234                         continue;
 235                 nid = page_to_nid(page);
 236                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 237                         continue;
 238
 239                 if (flags & MPOL_MF_STATS)
 240                         gather_stats(page, private, pte_dirty(*pte));
 241                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 242                         migrate_page_add(page, private, flags);
 243                 else
 244                         break;
 245         } while (pte++, addr += PAGE_SIZE, addr != end);
 246         pte_unmap_unlock(orig_pte, ptl);
 247         return addr != end;
 248 }
 249
 250 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 251                 unsigned long addr, unsigned long end,
 252                 const nodemask_t *nodes, unsigned long flags,
 253                 void *private)
 254 {
 255         pmd_t *pmd;
 256         unsigned long next;
 257
 258         pmd = pmd_offset(pud, addr);
 259         do {
 260                 next = pmd_addr_end(addr, end);
 261                 if (pmd_none_or_clear_bad(pmd))
 262                         continue;
 263                 if (check_pte_range(vma, pmd, addr, next, nodes,
 264                                     flags, private))
 265                         return -EIO;
 266         } while (pmd++, addr = next, addr != end);
 267         return 0;
 268 }
 269
 270 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 271                 unsigned long addr, unsigned long end,
 272                 const nodemask_t *nodes, unsigned long flags,
 273                 void *private)
 274 {
 275         pud_t *pud;
 276         unsigned long next;
 277
 278         pud = pud_offset(pgd, addr);
 279         do {
 280                 next = pud_addr_end(addr, end);
 281                 if (pud_none_or_clear_bad(pud))
 282                         continue;
 283                 if (check_pmd_range(vma, pud, addr, next, nodes,
 284                                     flags, private))
 285                         return -EIO;
 286         } while (pud++, addr = next, addr != end);
 287         return 0;
 288 }
 289
 290 static inline int check_pgd_range(struct vm_area_struct *vma,
 291                 unsigned long addr, unsigned long end,
 292                 const nodemask_t *nodes, unsigned long flags,
 293                 void *private)
 294 {
 295         pgd_t *pgd;
 296         unsigned long next;
 297
 298         pgd = pgd_offset(vma->vm_mm, addr);
 299         do {
 300                 next = pgd_addr_end(addr, end);
 301                 if (pgd_none_or_clear_bad(pgd))
 302                         continue;
 303                 if (check_pud_range(vma, pgd, addr, next, nodes,
 304                                     flags, private))
 305                         return -EIO;
 306         } while (pgd++, addr = next, addr != end);
 307         return 0;
 308 }
 309
 310 /* Check if a vma is migratable */
 311 static inline int vma_migratable(struct vm_area_struct *vma)
 312 {
 313         if (vma->vm_flags & (
 314                 VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
 315                 return 0;
 316         return 1;
 317 }
 318
 319 /*
 320  * Check if all pages in a range are on a set of nodes.
 321  * If pagelist != NULL then isolate pages from the LRU and
 322  * put them on the pagelist.
 323  */
 324 static struct vm_area_struct *
 325 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 326                 const nodemask_t *nodes, unsigned long flags, void *private)
 327 {
 328         int err;
 329         struct vm_area_struct *first, *vma, *prev;
 330
 331         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 332
 333                 err = migrate_prep();
 334                 if (err)
 335                         return ERR_PTR(err);
 336         }
 337
 338         first = find_vma(mm, start);
 339         if (!first)
 340                 return ERR_PTR(-EFAULT);
 341         prev = NULL;
 342         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 343                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 344                         if (!vma->vm_next && vma->vm_end < end)
 345                                 return ERR_PTR(-EFAULT);
 346                         if (prev && prev->vm_end < vma->vm_start)
 347                                 return ERR_PTR(-EFAULT);
 348                 }
 349                 if (!is_vm_hugetlb_page(vma) &&
 350                     ((flags & MPOL_MF_STRICT) ||
 351                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 352                                 vma_migratable(vma)))) {
 353                         unsigned long endvma = vma->vm_end;
 354
 355                         if (endvma > end)
 356                                 endvma = end;
 357                         if (vma->vm_start > start)
 358                                 start = vma->vm_start;
 359                         err = check_pgd_range(vma, start, endvma, nodes,
 360                                                 flags, private);
 361                         if (err) {
 362                                 first = ERR_PTR(err);
 363                                 break;
 364                         }
 365                 }
 366                 prev = vma;
 367         }
 368         return first;
 369 }
 370
 371 /* Apply policy to a single VMA */
 372 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 373 {
 374         int err = 0;
 375         struct mempolicy *old = vma->vm_policy;
 376
 377         PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 378                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 379                  vma->vm_ops, vma->vm_file,
 380                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 381
 382         if (vma->vm_ops && vma->vm_ops->set_policy)
 383                 err = vma->vm_ops->set_policy(vma, new);
 384         if (!err) {
 385                 mpol_get(new);
 386                 vma->vm_policy = new;
 387                 mpol_free(old);
 388         }
 389         return err;
 390 }
 391
 392 /* Step 2: apply policy to a range and do splits. */
 393 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 394                        unsigned long end, struct mempolicy *new)
 395 {
 396         struct vm_area_struct *next;
 397         int err;
 398
 399         err = 0;
 400         for (; vma && vma->vm_start < end; vma = next) {
 401                 next = vma->vm_next;
 402                 if (vma->vm_start < start)
 403                         err = split_vma(vma->vm_mm, vma, start, 1);
 404                 if (!err && vma->vm_end > end)
 405                         err = split_vma(vma->vm_mm, vma, end, 0);
 406                 if (!err)
 407                         err = policy_vma(vma, new);
 408                 if (err)
 409                         break;
 410         }
 411         return err;
 412 }
 413
 414 static int contextualize_policy(int mode, nodemask_t *nodes)
 415 {
 416         if (!nodes)
 417                 return 0;
 418
 419         cpuset_update_task_memory_state();
 420         if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
 421                 return -EINVAL;
 422         return mpol_check_policy(mode, nodes);
 423 }
 424
 425
 426 /*
 427  * Update task->flags PF_MEMPOLICY bit: set iff non-default
 428  * mempolicy.  Allows more rapid checking of this (combined perhaps
 429  * with other PF_* flag bits) on memory allocation hot code paths.
 430  *
 431  * If called from outside this file, the task 'p' should -only- be
 432  * a newly forked child not yet visible on the task list, because
 433  * manipulating the task flags of a visible task is not safe.
 434  *
 435  * The above limitation is why this routine has the funny name
 436  * mpol_fix_fork_child_flag().
 437  *
 438  * It is also safe to call this with a task pointer of current,
 439  * which the static wrapper mpol_set_task_struct_flag() does,
 440  * for use within this file.
 441  */
 442
 443 void mpol_fix_fork_child_flag(struct task_struct *p)
 444 {
 445         if (p->mempolicy)
 446                 p->flags |= PF_MEMPOLICY;
 447         else
 448                 p->flags &= ~PF_MEMPOLICY;
 449 }
 450
 451 static void mpol_set_task_struct_flag(void)
 452 {
 453         mpol_fix_fork_child_flag(current);
 454 }
 455
 456 /* Set the process memory policy */
 457 long do_set_mempolicy(int mode, nodemask_t *nodes)
 458 {
 459         struct mempolicy *new;
 460
 461         if (contextualize_policy(mode, nodes))
 462                 return -EINVAL;
 463         new = mpol_new(mode, nodes);
 464         if (IS_ERR(new))
 465                 return PTR_ERR(new);
 466         mpol_free(current->mempolicy);
 467         current->mempolicy = new;
 468         mpol_set_task_struct_flag();
 469         if (new && new->policy == MPOL_INTERLEAVE)
 470                 current->il_next = first_node(new->v.nodes);
 471         return 0;
 472 }
 473
 474 /* Fill a zone bitmap for a policy */
 475 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 476 {
 477         int i;
 478
 479         nodes_clear(*nodes);
 480         switch (p->policy) {
 481         case MPOL_BIND:
 482                 for (i = 0; p->v.zonelist->zones[i]; i++)
 483                         node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
 484                                 *nodes);
 485                 break;
 486         case MPOL_DEFAULT:
 487                 break;
 488         case MPOL_INTERLEAVE:
 489                 *nodes = p->v.nodes;
 490                 break;
 491         case MPOL_PREFERRED:
 492                 /* or use current node instead of online map? */
 493                 if (p->v.preferred_node < 0)
 494                         *nodes = node_online_map;
 495                 else
 496                         node_set(p->v.preferred_node, *nodes);
 497                 break;
 498         default:
 499                 BUG();
 500         }
 501 }
 502
 503 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 504 {
 505         struct page *p;
 506         int err;
 507
 508         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 509         if (err >= 0) {
 510                 err = page_to_nid(p);
 511                 put_page(p);
 512         }
 513         return err;
 514 }
 515
 516 /* Retrieve NUMA policy */
 517 long do_get_mempolicy(int *policy, nodemask_t *nmask,
 518                         unsigned long addr, unsigned long flags)
 519 {
 520         int err;
 521         struct mm_struct *mm = current->mm;
 522         struct vm_area_struct *vma = NULL;
 523         struct mempolicy *pol = current->mempolicy;
 524
 525         cpuset_update_task_memory_state();
 526         if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 527                 return -EINVAL;
 528         if (flags & MPOL_F_ADDR) {
 529                 down_read(&mm->mmap_sem);
 530                 vma = find_vma_intersection(mm, addr, addr+1);
 531                 if (!vma) {
 532                         up_read(&mm->mmap_sem);
 533                         return -EFAULT;
 534                 }
 535                 if (vma->vm_ops && vma->vm_ops->get_policy)
 536                         pol = vma->vm_ops->get_policy(vma, addr);
 537                 else
 538                         pol = vma->vm_policy;
 539         } else if (addr)
 540                 return -EINVAL;
 541
 542         if (!pol)
 543                 pol = &default_policy;
 544
 545         if (flags & MPOL_F_NODE) {
 546                 if (flags & MPOL_F_ADDR) {
 547                         err = lookup_node(mm, addr);
 548                         if (err < 0)
 549                                 goto out;
 550                         *policy = err;
 551                 } else if (pol == current->mempolicy &&
 552                                 pol->policy == MPOL_INTERLEAVE) {
 553                         *policy = current->il_next;
 554                 } else {
 555                         err = -EINVAL;
 556                         goto out;
 557                 }
 558         } else
 559                 *policy = pol->policy;
 560
 561         if (vma) {
 562                 up_read(&current->mm->mmap_sem);
 563                 vma = NULL;
 564         }
 565
 566         err = 0;
 567         if (nmask)
 568                 get_zonemask(pol, nmask);
 569
 570  out:
 571         if (vma)
 572                 up_read(&current->mm->mmap_sem);
 573         return err;
 574 }
 575
 576 #ifdef CONFIG_MIGRATION
 577 /*
 578  * page migration
 579  */
 580 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 581                                 unsigned long flags)
 582 {
 583         /*
 584          * Avoid migrating a page that is shared with others.
 585          */
 586         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
 587                 isolate_lru_page(page, pagelist);
 588 }
 589
 590 /*
 591  * Migrate pages from one node to a target node.
 592  * Returns error or the number of pages not migrated.
 593  */
 594 int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
 595 {
 596         nodemask_t nmask;
 597         LIST_HEAD(pagelist);
 598         int err = 0;
 599
 600         nodes_clear(nmask);
 601         node_set(source, nmask);
 602
 603         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 604                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 605
 606         if (!list_empty(&pagelist))
 607                 err = migrate_pages_to(&pagelist, NULL, dest);
 608         return err;
 609 }
 610
 611 /*
 612  * Move pages between the two nodesets so as to preserve the physical
 613  * layout as much as possible.
 614  *
 615  * Returns the number of page that could not be moved.
 616  */
 617 int do_migrate_pages(struct mm_struct *mm,
 618         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 619 {
 620         LIST_HEAD(pagelist);
 621         int busy = 0;
 622         int err = 0;
 623         nodemask_t tmp;
 624
 625         down_read(&mm->mmap_sem);
 626
 627 /*
 628  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 629  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 630  * bit in 'tmp', and return that <source, dest> pair for migration.
 631  * The pair of nodemasks 'to' and 'from' define the map.
 632  *
 633  * If no pair of bits is found that way, fallback to picking some
 634  * pair of 'source' and 'dest' bits that are not the same.  If the
 635  * 'source' and 'dest' bits are the same, this represents a node
 636  * that will be migrating to itself, so no pages need move.
 637  *
 638  * If no bits are left in 'tmp', or if all remaining bits left
 639  * in 'tmp' correspond to the same bit in 'to', return false
 640  * (nothing left to migrate).
 641  *
 642  * This lets us pick a pair of nodes to migrate between, such that
 643  * if possible the dest node is not already occupied by some other
 644  * source node, minimizing the risk of overloading the memory on a
 645  * node that would happen if we migrated incoming memory to a node
 646  * before migrating outgoing memory source that same node.
 647  *
 648  * A single scan of tmp is sufficient.  As we go, we remember the
 649  * most recent <s, d> pair that moved (s != d).  If we find a pair
 650  * that not only moved, but what's better, moved to an empty slot
 651  * (d is not set in tmp), then we break out then, with that pair.
 652  * Otherwise when we finish scannng from_tmp, we at least have the
 653  * most recent <s, d> pair that moved.  If we get all the way through
 654  * the scan of tmp without finding any node that moved, much less
 655  * moved to an empty node, then there is nothing left worth migrating.
 656  */
 657
 658         tmp = *from_nodes;
 659         while (!nodes_empty(tmp)) {
 660                 int s,d;
 661                 int source = -1;
 662                 int dest = 0;
 663
 664                 for_each_node_mask(s, tmp) {
 665                         d = node_remap(s, *from_nodes, *to_nodes);
 666                         if (s == d)
 667                                 continue;
 668
 669                         source = s;     /* Node moved. Memorize */
 670                         dest = d;
 671
 672                         /* dest not in remaining from nodes? */
 673                         if (!node_isset(dest, tmp))
 674                                 break;
 675                 }
 676                 if (source == -1)
 677                         break;
 678
 679                 node_clear(source, tmp);
 680                 err = migrate_to_node(mm, source, dest, flags);
 681                 if (err > 0)
 682                         busy += err;
 683                 if (err < 0)
 684                         break;
 685         }
 686
 687         up_read(&mm->mmap_sem);
 688         if (err < 0)
 689                 return err;
 690         return busy;
 691
 692 }
 693
 694 #else
 695
 696 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 697                                 unsigned long flags)
 698 {
 699 }
 700
 701 int do_migrate_pages(struct mm_struct *mm,
 702         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 703 {
 704         return -ENOSYS;
 705 }
 706 #endif
 707
 708 long do_mbind(unsigned long start, unsigned long len,
 709                 unsigned long mode, nodemask_t *nmask, unsigned long flags)
 710 {
 711         struct vm_area_struct *vma;
 712         struct mm_struct *mm = current->mm;
 713         struct mempolicy *new;
 714         unsigned long end;
 715         int err;
 716         LIST_HEAD(pagelist);
 717
 718         if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
 719                                       MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 720             || mode > MPOL_MAX)
 721                 return -EINVAL;
 722         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 723                 return -EPERM;
 724
 725         if (start & ~PAGE_MASK)
 726                 return -EINVAL;
 727
 728         if (mode == MPOL_DEFAULT)
 729                 flags &= ~MPOL_MF_STRICT;
 730
 731         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 732         end = start + len;
 733
 734         if (end < start)
 735                 return -EINVAL;
 736         if (end == start)
 737                 return 0;
 738
 739         if (mpol_check_policy(mode, nmask))
 740                 return -EINVAL;
 741
 742         new = mpol_new(mode, nmask);
 743         if (IS_ERR(new))
 744                 return PTR_ERR(new);
 745
 746         /*
 747          * If we are using the default policy then operation
 748          * on discontinuous address spaces is okay after all
 749          */
 750         if (!new)
 751                 flags |= MPOL_MF_DISCONTIG_OK;
 752
 753         PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 754                         mode,nodes_addr(nodes)[0]);
 755
 756         down_write(&mm->mmap_sem);
 757         vma = check_range(mm, start, end, nmask,
 758                           flags | MPOL_MF_INVERT, &pagelist);
 759
 760         err = PTR_ERR(vma);
 761         if (!IS_ERR(vma)) {
 762                 int nr_failed = 0;
 763
 764                 err = mbind_range(vma, start, end, new);
 765
 766                 if (!list_empty(&pagelist))
 767                         nr_failed = migrate_pages_to(&pagelist, vma, -1);
 768
 769                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 770                         err = -EIO;
 771         }
 772
 773         up_write(&mm->mmap_sem);
 774         mpol_free(new);
 775         return err;
 776 }
 777
 778 /*
 779  * User space interface with variable sized bitmaps for nodelists.
 780  */
 781
 782 /* Copy a node mask from user space. */
 783 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 784                      unsigned long maxnode)
 785 {
 786         unsigned long k;
 787         unsigned long nlongs;
 788         unsigned long endmask;
 789
 790         --maxnode;
 791         nodes_clear(*nodes);
 792         if (maxnode == 0 || !nmask)
 793                 return 0;
 794         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
 795                 return -EINVAL;
 796
 797         nlongs = BITS_TO_LONGS(maxnode);
 798         if ((maxnode % BITS_PER_LONG) == 0)
 799                 endmask = ~0UL;
 800         else
 801                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
 802
 803         /* When the user specified more nodes than supported just check
 804            if the non supported part is all zero. */
 805         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
 806                 if (nlongs > PAGE_SIZE/sizeof(long))
 807                         return -EINVAL;
 808                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
 809                         unsigned long t;
 810                         if (get_user(t, nmask + k))
 811                                 return -EFAULT;
 812                         if (k == nlongs - 1) {
 813                                 if (t & endmask)
 814                                         return -EINVAL;
 815                         } else if (t)
 816                                 return -EINVAL;
 817                 }
 818                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
 819                 endmask = ~0UL;
 820         }
 821
 822         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
 823                 return -EFAULT;
 824         nodes_addr(*nodes)[nlongs-1] &= endmask;
 825         return 0;
 826 }
 827
 828 /* Copy a kernel node mask to user space */
 829 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 830                               nodemask_t *nodes)
 831 {
 832         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
 833         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
 834
 835         if (copy > nbytes) {
 836                 if (copy > PAGE_SIZE)
 837                         return -EINVAL;
 838                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
 839                         return -EFAULT;
 840                 copy = nbytes;
 841         }
 842         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
 843 }
 844
 845 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 846                         unsigned long mode,
 847                         unsigned long __user *nmask, unsigned long maxnode,
 848                         unsigned flags)
 849 {
 850         nodemask_t nodes;
 851         int err;
 852
 853         err = get_nodes(&nodes, nmask, maxnode);
 854         if (err)
 855                 return err;
 856         return do_mbind(start, len, mode, &nodes, flags);
 857 }
 858
 859 /* Set the process memory policy */
 860 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 861                 unsigned long maxnode)
 862 {
 863         int err;
 864         nodemask_t nodes;
 865
 866         if (mode < 0 || mode > MPOL_MAX)
 867                 return -EINVAL;
 868         err = get_nodes(&nodes, nmask, maxnode);
 869         if (err)
 870                 return err;
 871         return do_set_mempolicy(mode, &nodes);
 872 }
 873
 874 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 875                 const unsigned long __user *old_nodes,
 876                 const unsigned long __user *new_nodes)
 877 {
 878         struct mm_struct *mm;
 879         struct task_struct *task;
 880         nodemask_t old;
 881         nodemask_t new;
 882         nodemask_t task_nodes;
 883         int err;
 884
 885         err = get_nodes(&old, old_nodes, maxnode);
 886         if (err)
 887                 return err;
 888
 889         err = get_nodes(&new, new_nodes, maxnode);
 890         if (err)
 891                 return err;
 892
 893         /* Find the mm_struct */
 894         read_lock(&tasklist_lock);
 895         task = pid ? find_task_by_pid(pid) : current;
 896         if (!task) {
 897                 read_unlock(&tasklist_lock);
 898                 return -ESRCH;
 899         }
 900         mm = get_task_mm(task);
 901         read_unlock(&tasklist_lock);
 902
 903         if (!mm)
 904                 return -EINVAL;
 905
 906         /*
 907          * Check if this process has the right to modify the specified
 908          * process. The right exists if the process has administrative
 909          * capabilities, superuser privileges or the same
 910          * userid as the target process.
 911          */
 912         if ((current->euid != task->suid) && (current->euid != task->uid) &&
 913             (current->uid != task->suid) && (current->uid != task->uid) &&
 914             !capable(CAP_SYS_NICE)) {
 915                 err = -EPERM;
 916                 goto out;
 917         }
 918
 919         task_nodes = cpuset_mems_allowed(task);
 920         /* Is the user allowed to access the target nodes? */
 921         if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
 922                 err = -EPERM;
 923                 goto out;
 924         }
 925
 926         err = do_migrate_pages(mm, &old, &new,
 927                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
 928 out:
 929         mmput(mm);
 930         return err;
 931 }
 932
 933
 934 /* Retrieve NUMA policy */
 935 asmlinkage long sys_get_mempolicy(int __user *policy,
 936                                 unsigned long __user *nmask,
 937                                 unsigned long maxnode,
 938                                 unsigned long addr, unsigned long flags)
 939 {
 940         int err, pval;
 941         nodemask_t nodes;
 942
 943         if (nmask != NULL && maxnode < MAX_NUMNODES)
 944                 return -EINVAL;
 945
 946         err = do_get_mempolicy(&pval, &nodes, addr, flags);
 947
 948         if (err)
 949                 return err;
 950
 951         if (policy && put_user(pval, policy))
 952                 return -EFAULT;
 953
 954         if (nmask)
 955                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
 956
 957         return err;
 958 }
 959
 960 #ifdef CONFIG_COMPAT
 961
 962 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
 963                                      compat_ulong_t __user *nmask,
 964                                      compat_ulong_t maxnode,
 965                                      compat_ulong_t addr, compat_ulong_t flags)
 966 {
 967         long err;
 968         unsigned long __user *nm = NULL;
 969         unsigned long nr_bits, alloc_size;
 970         DECLARE_BITMAP(bm, MAX_NUMNODES);
 971
 972         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 973         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 974
 975         if (nmask)
 976                 nm = compat_alloc_user_space(alloc_size);
 977
 978         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
 979
 980         if (!err && nmask) {
 981                 err = copy_from_user(bm, nm, alloc_size);
 982                 /* ensure entire bitmap is zeroed */
 983                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
 984                 err |= compat_put_bitmap(nmask, bm, nr_bits);
 985         }
 986
 987         return err;
 988 }
 989
 990 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
 991                                      compat_ulong_t maxnode)
 992 {
 993         long err = 0;
 994         unsigned long __user *nm = NULL;
 995         unsigned long nr_bits, alloc_size;
 996         DECLARE_BITMAP(bm, MAX_NUMNODES);
 997
 998         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 999         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1000
1001         if (nmask) {
1002                 err = compat_get_bitmap(bm, nmask, nr_bits);
1003                 nm = compat_alloc_user_space(alloc_size);
1004                 err |= copy_to_user(nm, bm, alloc_size);
1005         }
1006
1007         if (err)
1008                 return -EFAULT;
1009
1010         return sys_set_mempolicy(mode, nm, nr_bits+1);
1011 }
1012
1013 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1014                              compat_ulong_t mode, compat_ulong_t __user *nmask,
1015                              compat_ulong_t maxnode, compat_ulong_t flags)
1016 {
1017         long err = 0;
1018         unsigned long __user *nm = NULL;
1019         unsigned long nr_bits, alloc_size;
1020         nodemask_t bm;
1021
1022         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1023         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1024
1025         if (nmask) {
1026                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1027                 nm = compat_alloc_user_space(alloc_size);
1028                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1029         }
1030
1031         if (err)
1032                 return -EFAULT;
1033
1034         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1035 }
1036
1037 #endif
1038
1039 /* Return effective policy for a VMA */
1040 static struct mempolicy * get_vma_policy(struct task_struct *task,
1041                 struct vm_area_struct *vma, unsigned long addr)
1042 {
1043         struct mempolicy *pol = task->mempolicy;
1044
1045         if (vma) {
1046                 if (vma->vm_ops && vma->vm_ops->get_policy)
1047                         pol = vma->vm_ops->get_policy(vma, addr);
1048                 else if (vma->vm_policy &&
1049                                 vma->vm_policy->policy != MPOL_DEFAULT)
1050                         pol = vma->vm_policy;
1051         }
1052         if (!pol)
1053                 pol = &default_policy;
1054         return pol;
1055 }
1056
1057 /* Return a zonelist representing a mempolicy */
1058 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1059 {
1060         int nd;
1061
1062         switch (policy->policy) {
1063         case MPOL_PREFERRED:
1064                 nd = policy->v.preferred_node;
1065                 if (nd < 0)
1066                         nd = numa_node_id();
1067                 break;
1068         case MPOL_BIND:
1069                 /* Lower zones don't get a policy applied */
1070                 /* Careful: current->mems_allowed might have moved */
1071                 if (gfp_zone(gfp) >= policy_zone)
1072                         if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
1073                                 return policy->v.zonelist;
1074                 /*FALL THROUGH*/
1075         case MPOL_INTERLEAVE: /* should not happen */
1076         case MPOL_DEFAULT:
1077                 nd = numa_node_id();
1078                 break;
1079         default:
1080                 nd = 0;
1081                 BUG();
1082         }
1083         return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
1084 }
1085
1086 /* Do dynamic interleaving for a process */
1087 static unsigned interleave_nodes(struct mempolicy *policy)
1088 {
1089         unsigned nid, next;
1090         struct task_struct *me = current;
1091
1092         nid = me->il_next;
1093         next = next_node(nid, policy->v.nodes);
1094         if (next >= MAX_NUMNODES)
1095                 next = first_node(policy->v.nodes);
1096         me->il_next = next;
1097         return nid;
1098 }
1099
1100 /*
1101  * Depending on the memory policy provide a node from which to allocate the
1102  * next slab entry.
1103  */
1104 unsigned slab_node(struct mempolicy *policy)
1105 {
1106         switch (policy->policy) {
1107         case MPOL_INTERLEAVE:
1108                 return interleave_nodes(policy);
1109
1110         case MPOL_BIND:
1111                 /*
1112                  * Follow bind policy behavior and start allocation at the
1113                  * first node.
1114                  */
1115                 return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
1116
1117         case MPOL_PREFERRED:
1118                 if (policy->v.preferred_node >= 0)
1119                         return policy->v.preferred_node;
1120                 /* Fall through */
1121
1122         default:
1123                 return numa_node_id();
1124         }
1125 }
1126
1127 /* Do static interleaving for a VMA with known offset. */
1128 static unsigned offset_il_node(struct mempolicy *pol,
1129                 struct vm_area_struct *vma, unsigned long off)
1130 {
1131         unsigned nnodes = nodes_weight(pol->v.nodes);
1132         unsigned target = (unsigned)off % nnodes;
1133         int c;
1134         int nid = -1;
1135
1136         c = 0;
1137         do {
1138                 nid = next_node(nid, pol->v.nodes);
1139                 c++;
1140         } while (c <= target);
1141         return nid;
1142 }
1143
1144 /* Determine a node number for interleave */
1145 static inline unsigned interleave_nid(struct mempolicy *pol,
1146                  struct vm_area_struct *vma, unsigned long addr, int shift)
1147 {
1148         if (vma) {
1149                 unsigned long off;
1150
1151                 off = vma->vm_pgoff;
1152                 off += (addr - vma->vm_start) >> shift;
1153                 return offset_il_node(pol, vma, off);
1154         } else
1155                 return interleave_nodes(pol);
1156 }
1157
1158 #ifdef CONFIG_HUGETLBFS
1159 /* Return a zonelist suitable for a huge page allocation. */
1160 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1161 {
1162         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1163
1164         if (pol->policy == MPOL_INTERLEAVE) {
1165                 unsigned nid;
1166
1167                 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1168                 return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1169         }
1170         return zonelist_policy(GFP_HIGHUSER, pol);
1171 }
1172 #endif
1173
1174 /* Allocate a page in interleaved policy.
1175    Own path because it needs to do special accounting. */
1176 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1177                                         unsigned nid)
1178 {
1179         struct zonelist *zl;
1180         struct page *page;
1181
1182         zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1183         page = __alloc_pages(gfp, order, zl);
1184         if (page && page_zone(page) == zl->zones[0]) {
1185                 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
1186                 put_cpu();
1187         }
1188         return page;
1189 }
1190
1191 /**
1192  *      alloc_page_vma  - Allocate a page for a VMA.
1193  *
1194  *      @gfp:
1195  *      %GFP_USER    user allocation.
1196  *      %GFP_KERNEL  kernel allocations,
1197  *      %GFP_HIGHMEM highmem/user allocations,
1198  *      %GFP_FS      allocation should not call back into a file system.
1199  *      %GFP_ATOMIC  don't sleep.
1200  *
1201  *      @vma:  Pointer to VMA or NULL if not available.
1202  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1203  *
1204  *      This function allocates a page from the kernel page pool and applies
1205  *      a NUMA policy associated with the VMA or the current process.
1206  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1207  *      mm_struct of the VMA to prevent it from going away. Should be used for
1208  *      all allocations for pages that will be mapped into
1209  *      user space. Returns NULL when no page can be allocated.
1210  *
1211  *      Should be called with the mm_sem of the vma hold.
1212  */
1213 struct page *
1214 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1215 {
1216         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1217
1218         cpuset_update_task_memory_state();
1219
1220         if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1221                 unsigned nid;
1222
1223                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1224                 return alloc_page_interleave(gfp, 0, nid);
1225         }
1226         return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
1227 }
1228
1229 /**
1230  *      alloc_pages_current - Allocate pages.
1231  *
1232  *      @gfp:
1233  *              %GFP_USER   user allocation,
1234  *              %GFP_KERNEL kernel allocation,
1235  *              %GFP_HIGHMEM highmem allocation,
1236  *              %GFP_FS     don't call back into a file system.
1237  *              %GFP_ATOMIC don't sleep.
1238  *      @order: Power of two of allocation size in pages. 0 is a single page.
1239  *
1240  *      Allocate a page from the kernel page pool.  When not in
1241  *      interrupt context and apply the current process NUMA policy.
1242  *      Returns NULL when no page can be allocated.
1243  *
1244  *      Don't call cpuset_update_task_memory_state() unless
1245  *      1) it's ok to take cpuset_sem (can WAIT), and
1246  *      2) allocating for current task (not interrupt).
1247  */
1248 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1249 {
1250         struct mempolicy *pol = current->mempolicy;
1251
1252         if ((gfp & __GFP_WAIT) && !in_interrupt())
1253                 cpuset_update_task_memory_state();
1254         if (!pol || in_interrupt())
1255                 pol = &default_policy;
1256         if (pol->policy == MPOL_INTERLEAVE)
1257                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1258         return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1259 }
1260 EXPORT_SYMBOL(alloc_pages_current);
1261
1262 /*
1263  * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
1264  * rebinds the mempolicy its copying by calling mpol_rebind_policy()
1265  * with the mems_allowed returned by cpuset_mems_allowed().  This
1266  * keeps mempolicies cpuset relative after its cpuset moves.  See
1267  * further kernel/cpuset.c update_nodemask().
1268  */
1269 void *cpuset_being_rebound;
1270
1271 /* Slow path of a mempolicy copy */
1272 struct mempolicy *__mpol_copy(struct mempolicy *old)
1273 {
1274         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1275
1276         if (!new)
1277                 return ERR_PTR(-ENOMEM);
1278         if (current_cpuset_is_being_rebound()) {
1279                 nodemask_t mems = cpuset_mems_allowed(current);
1280                 mpol_rebind_policy(old, &mems);
1281         }
1282         *new = *old;
1283         atomic_set(&new->refcnt, 1);
1284         if (new->policy == MPOL_BIND) {
1285                 int sz = ksize(old->v.zonelist);
1286                 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
1287                 if (!new->v.zonelist) {
1288                         kmem_cache_free(policy_cache, new);
1289                         return ERR_PTR(-ENOMEM);
1290                 }
1291                 memcpy(new->v.zonelist, old->v.zonelist, sz);
1292         }
1293         return new;
1294 }
1295
1296 /* Slow path of a mempolicy comparison */
1297 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1298 {
1299         if (!a || !b)
1300                 return 0;
1301         if (a->policy != b->policy)
1302                 return 0;
1303         switch (a->policy) {
1304         case MPOL_DEFAULT:
1305                 return 1;
1306         case MPOL_INTERLEAVE:
1307                 return nodes_equal(a->v.nodes, b->v.nodes);
1308         case MPOL_PREFERRED:
1309                 return a->v.preferred_node == b->v.preferred_node;
1310         case MPOL_BIND: {
1311                 int i;
1312                 for (i = 0; a->v.zonelist->zones[i]; i++)
1313                         if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1314                                 return 0;
1315                 return b->v.zonelist->zones[i] == NULL;
1316         }
1317         default:
1318                 BUG();
1319                 return 0;
1320         }
1321 }
1322
1323 /* Slow path of a mpol destructor. */
1324 void __mpol_free(struct mempolicy *p)
1325 {
1326         if (!atomic_dec_and_test(&p->refcnt))
1327                 return;
1328         if (p->policy == MPOL_BIND)
1329                 kfree(p->v.zonelist);
1330         p->policy = MPOL_DEFAULT;
1331         kmem_cache_free(policy_cache, p);
1332 }
1333
1334 /*
1335  * Shared memory backing store policy support.
1336  *
1337  * Remember policies even when nobody has shared memory mapped.
1338  * The policies are kept in Red-Black tree linked from the inode.
1339  * They are protected by the sp->lock spinlock, which should be held
1340  * for any accesses to the tree.
1341  */
1342
1343 /* lookup first element intersecting start-end */
1344 /* Caller holds sp->lock */
1345 static struct sp_node *
1346 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1347 {
1348         struct rb_node *n = sp->root.rb_node;
1349
1350         while (n) {
1351                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1352
1353                 if (start >= p->end)
1354                         n = n->rb_right;
1355                 else if (end <= p->start)
1356                         n = n->rb_left;
1357                 else
1358                         break;
1359         }
1360         if (!n)
1361                 return NULL;
1362         for (;;) {
1363                 struct sp_node *w = NULL;
1364                 struct rb_node *prev = rb_prev(n);
1365                 if (!prev)
1366                         break;
1367                 w = rb_entry(prev, struct sp_node, nd);
1368                 if (w->end <= start)
1369                         break;
1370                 n = prev;
1371         }
1372         return rb_entry(n, struct sp_node, nd);
1373 }
1374
1375 /* Insert a new shared policy into the list. */
1376 /* Caller holds sp->lock */
1377 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1378 {
1379         struct rb_node **p = &sp->root.rb_node;
1380         struct rb_node *parent = NULL;
1381         struct sp_node *nd;
1382
1383         while (*p) {
1384                 parent = *p;
1385                 nd = rb_entry(parent, struct sp_node, nd);
1386                 if (new->start < nd->start)
1387                         p = &(*p)->rb_left;
1388                 else if (new->end > nd->end)
1389                         p = &(*p)->rb_right;
1390                 else
1391                         BUG();
1392         }
1393         rb_link_node(&new->nd, parent, p);
1394         rb_insert_color(&new->nd, &sp->root);
1395         PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1396                  new->policy ? new->policy->policy : 0);
1397 }
1398
1399 /* Find shared policy intersecting idx */
1400 struct mempolicy *
1401 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1402 {
1403         struct mempolicy *pol = NULL;
1404         struct sp_node *sn;
1405
1406         if (!sp->root.rb_node)
1407                 return NULL;
1408         spin_lock(&sp->lock);
1409         sn = sp_lookup(sp, idx, idx+1);
1410         if (sn) {
1411                 mpol_get(sn->policy);
1412                 pol = sn->policy;
1413         }
1414         spin_unlock(&sp->lock);
1415         return pol;
1416 }
1417
1418 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1419 {
1420         PDprintk("deleting %lx-l%x\n", n->start, n->end);
1421         rb_erase(&n->nd, &sp->root);
1422         mpol_free(n->policy);
1423         kmem_cache_free(sn_cache, n);
1424 }
1425
1426 struct sp_node *
1427 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1428 {
1429         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1430
1431         if (!n)
1432                 return NULL;
1433         n->start = start;
1434         n->end = end;
1435         mpol_get(pol);
1436         n->policy = pol;
1437         return n;
1438 }
1439
1440 /* Replace a policy range. */
1441 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1442                                  unsigned long end, struct sp_node *new)
1443 {
1444         struct sp_node *n, *new2 = NULL;
1445
1446 restart:
1447         spin_lock(&sp->lock);
1448         n = sp_lookup(sp, start, end);
1449         /* Take care of old policies in the same range. */
1450         while (n && n->start < end) {
1451                 struct rb_node *next = rb_next(&n->nd);
1452                 if (n->start >= start) {
1453                         if (n->end <= end)
1454                                 sp_delete(sp, n);
1455                         else
1456                                 n->start = end;
1457                 } else {
1458                         /* Old policy spanning whole new range. */
1459                         if (n->end > end) {
1460                                 if (!new2) {
1461                                         spin_unlock(&sp->lock);
1462                                         new2 = sp_alloc(end, n->end, n->policy);
1463                                         if (!new2)
1464                                                 return -ENOMEM;
1465                                         goto restart;
1466                                 }
1467                                 n->end = start;
1468                                 sp_insert(sp, new2);
1469                                 new2 = NULL;
1470                                 break;
1471                         } else
1472                                 n->end = start;
1473                 }
1474                 if (!next)
1475                         break;
1476                 n = rb_entry(next, struct sp_node, nd);
1477         }
1478         if (new)
1479                 sp_insert(sp, new);
1480         spin_unlock(&sp->lock);
1481         if (new2) {
1482                 mpol_free(new2->policy);
1483                 kmem_cache_free(sn_cache, new2);
1484         }
1485         return 0;
1486 }
1487
1488 void mpol_shared_policy_init(struct shared_policy *info, int policy,
1489                                 nodemask_t *policy_nodes)
1490 {
1491         info->root = RB_ROOT;
1492         spin_lock_init(&info->lock);
1493
1494         if (policy != MPOL_DEFAULT) {
1495                 struct mempolicy *newpol;
1496
1497                 /* Falls back to MPOL_DEFAULT on any error */
1498                 newpol = mpol_new(policy, policy_nodes);
1499                 if (!IS_ERR(newpol)) {
1500                         /* Create pseudo-vma that contains just the policy */
1501                         struct vm_area_struct pvma;
1502
1503                         memset(&pvma, 0, sizeof(struct vm_area_struct));
1504                         /* Policy covers entire file */
1505                         pvma.vm_end = TASK_SIZE;
1506                         mpol_set_shared_policy(info, &pvma, newpol);
1507                         mpol_free(newpol);
1508                 }
1509         }
1510 }
1511
1512 int mpol_set_shared_policy(struct shared_policy *info,
1513                         struct vm_area_struct *vma, struct mempolicy *npol)
1514 {
1515         int err;
1516         struct sp_node *new = NULL;
1517         unsigned long sz = vma_pages(vma);
1518
1519         PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1520                  vma->vm_pgoff,
1521                  sz, npol? npol->policy : -1,
1522                 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1523
1524         if (npol) {
1525                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1526                 if (!new)
1527                         return -ENOMEM;
1528         }
1529         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1530         if (err && new)
1531                 kmem_cache_free(sn_cache, new);
1532         return err;
1533 }
1534
1535 /* Free a backing policy store on inode delete. */
1536 void mpol_free_shared_policy(struct shared_policy *p)
1537 {
1538         struct sp_node *n;
1539         struct rb_node *next;
1540
1541         if (!p->root.rb_node)
1542                 return;
1543         spin_lock(&p->lock);
1544         next = rb_first(&p->root);
1545         while (next) {
1546                 n = rb_entry(next, struct sp_node, nd);
1547                 next = rb_next(&n->nd);
1548                 rb_erase(&n->nd, &p->root);
1549                 mpol_free(n->policy);
1550                 kmem_cache_free(sn_cache, n);
1551         }
1552         spin_unlock(&p->lock);
1553 }
1554
1555 /* assumes fs == KERNEL_DS */
1556 void __init numa_policy_init(void)
1557 {
1558         policy_cache = kmem_cache_create("numa_policy",
1559                                          sizeof(struct mempolicy),
1560                                          0, SLAB_PANIC, NULL, NULL);
1561
1562         sn_cache = kmem_cache_create("shared_policy_node",
1563                                      sizeof(struct sp_node),
1564                                      0, SLAB_PANIC, NULL, NULL);
1565
1566         /* Set interleaving policy for system init. This way not all
1567            the data structures allocated at system boot end up in node zero. */
1568
1569         if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1570                 printk("numa_policy_init: interleaving failed\n");
1571 }
1572
1573 /* Reset policy of current process to default */
1574 void numa_default_policy(void)
1575 {
1576         do_set_mempolicy(MPOL_DEFAULT, NULL);
1577 }
1578
1579 /* Migrate a policy to a different set of nodes */
1580 void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1581 {
1582         nodemask_t *mpolmask;
1583         nodemask_t tmp;
1584
1585         if (!pol)
1586                 return;
1587         mpolmask = &pol->cpuset_mems_allowed;
1588         if (nodes_equal(*mpolmask, *newmask))
1589                 return;
1590
1591         switch (pol->policy) {
1592         case MPOL_DEFAULT:
1593                 break;
1594         case MPOL_INTERLEAVE:
1595                 nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1596                 pol->v.nodes = tmp;
1597                 *mpolmask = *newmask;
1598                 current->il_next = node_remap(current->il_next,
1599                                                 *mpolmask, *newmask);
1600                 break;
1601         case MPOL_PREFERRED:
1602                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1603                                                 *mpolmask, *newmask);
1604                 *mpolmask = *newmask;
1605                 break;
1606         case MPOL_BIND: {
1607                 nodemask_t nodes;
1608                 struct zone **z;
1609                 struct zonelist *zonelist;
1610
1611                 nodes_clear(nodes);
1612                 for (z = pol->v.zonelist->zones; *z; z++)
1613                         node_set((*z)->zone_pgdat->node_id, nodes);
1614                 nodes_remap(tmp, nodes, *mpolmask, *newmask);
1615                 nodes = tmp;
1616
1617                 zonelist = bind_zonelist(&nodes);
1618
1619                 /* If no mem, then zonelist is NULL and we keep old zonelist.
1620                  * If that old zonelist has no remaining mems_allowed nodes,
1621                  * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1622                  */
1623
1624                 if (zonelist) {
1625                         /* Good - got mem - substitute new zonelist */
1626                         kfree(pol->v.zonelist);
1627                         pol->v.zonelist = zonelist;
1628                 }
1629                 *mpolmask = *newmask;
1630                 break;
1631         }
1632         default:
1633                 BUG();
1634                 break;
1635         }
1636 }
1637
1638 /*
1639  * Wrapper for mpol_rebind_policy() that just requires task
1640  * pointer, and updates task mempolicy.
1641  */
1642
1643 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1644 {
1645         mpol_rebind_policy(tsk->mempolicy, new);
1646 }
1647
1648 /*
1649  * Rebind each vma in mm to new nodemask.
1650  *
1651  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1652  */
1653
1654 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1655 {
1656         struct vm_area_struct *vma;
1657
1658         down_write(&mm->mmap_sem);
1659         for (vma = mm->mmap; vma; vma = vma->vm_next)
1660                 mpol_rebind_policy(vma->vm_policy, new);
1661         up_write(&mm->mmap_sem);
1662 }
1663
1664 /*
1665  * Display pages allocated per node and memory policy via /proc.
1666  */
1667
1668 static const char *policy_types[] = { "default", "prefer", "bind",
1669                                       "interleave" };
1670
1671 /*
1672  * Convert a mempolicy into a string.
1673  * Returns the number of characters in buffer (if positive)
1674  * or an error (negative)
1675  */
1676 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1677 {
1678         char *p = buffer;
1679         int l;
1680         nodemask_t nodes;
1681         int mode = pol ? pol->policy : MPOL_DEFAULT;
1682
1683         switch (mode) {
1684         case MPOL_DEFAULT:
1685                 nodes_clear(nodes);
1686                 break;
1687
1688         case MPOL_PREFERRED:
1689                 nodes_clear(nodes);
1690                 node_set(pol->v.preferred_node, nodes);
1691                 break;
1692
1693         case MPOL_BIND:
1694                 get_zonemask(pol, &nodes);
1695                 break;
1696
1697         case MPOL_INTERLEAVE:
1698                 nodes = pol->v.nodes;
1699                 break;
1700
1701         default:
1702                 BUG();
1703                 return -EFAULT;
1704         }
1705
1706         l = strlen(policy_types[mode]);
1707         if (buffer + maxlen < p + l + 1)
1708                 return -ENOSPC;
1709
1710         strcpy(p, policy_types[mode]);
1711         p += l;
1712
1713         if (!nodes_empty(nodes)) {
1714                 if (buffer + maxlen < p + 2)
1715                         return -ENOSPC;
1716                 *p++ = '=';
1717                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1718         }
1719         return p - buffer;
1720 }
1721
1722 struct numa_maps {
1723         unsigned long pages;
1724         unsigned long anon;
1725         unsigned long active;
1726         unsigned long writeback;
1727         unsigned long mapcount_max;
1728         unsigned long dirty;
1729         unsigned long swapcache;
1730         unsigned long node[MAX_NUMNODES];
1731 };
1732
1733 static void gather_stats(struct page *page, void *private, int pte_dirty)
1734 {
1735         struct numa_maps *md = private;
1736         int count = page_mapcount(page);
1737
1738         md->pages++;
1739         if (pte_dirty || PageDirty(page))
1740                 md->dirty++;
1741
1742         if (PageSwapCache(page))
1743                 md->swapcache++;
1744
1745         if (PageActive(page))
1746                 md->active++;
1747
1748         if (PageWriteback(page))
1749                 md->writeback++;
1750
1751         if (PageAnon(page))
1752                 md->anon++;
1753
1754         if (count > md->mapcount_max)
1755                 md->mapcount_max = count;
1756
1757         md->node[page_to_nid(page)]++;
1758 }
1759
1760 #ifdef CONFIG_HUGETLB_PAGE
1761 static void check_huge_range(struct vm_area_struct *vma,
1762                 unsigned long start, unsigned long end,
1763                 struct numa_maps *md)
1764 {
1765         unsigned long addr;
1766         struct page *page;
1767
1768         for (addr = start; addr < end; addr += HPAGE_SIZE) {
1769                 pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
1770                 pte_t pte;
1771
1772                 if (!ptep)
1773                         continue;
1774
1775                 pte = *ptep;
1776                 if (pte_none(pte))
1777                         continue;
1778
1779                 page = pte_page(pte);
1780                 if (!page)
1781                         continue;
1782
1783                 gather_stats(page, md, pte_dirty(*ptep));
1784         }
1785 }
1786 #else
1787 static inline void check_huge_range(struct vm_area_struct *vma,
1788                 unsigned long start, unsigned long end,
1789                 struct numa_maps *md)
1790 {
1791 }
1792 #endif
1793
1794 int show_numa_map(struct seq_file *m, void *v)
1795 {
1796         struct task_struct *task = m->private;
1797         struct vm_area_struct *vma = v;
1798         struct numa_maps *md;
1799         struct file *file = vma->vm_file;
1800         struct mm_struct *mm = vma->vm_mm;
1801         int n;
1802         char buffer[50];
1803
1804         if (!mm)
1805                 return 0;
1806
1807         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1808         if (!md)
1809                 return 0;
1810
1811         mpol_to_str(buffer, sizeof(buffer),
1812                         get_vma_policy(task, vma, vma->vm_start));
1813
1814         seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1815
1816         if (file) {
1817                 seq_printf(m, " file=");
1818                 seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= ");
1819         } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1820                 seq_printf(m, " heap");
1821         } else if (vma->vm_start <= mm->start_stack &&
1822                         vma->vm_end >= mm->start_stack) {
1823                 seq_printf(m, " stack");
1824         }
1825
1826         if (is_vm_hugetlb_page(vma)) {
1827                 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
1828                 seq_printf(m, " huge");
1829         } else {
1830                 check_pgd_range(vma, vma->vm_start, vma->vm_end,
1831                                 &node_online_map, MPOL_MF_STATS, md);
1832         }
1833
1834         if (!md->pages)
1835                 goto out;
1836
1837         if (md->anon)
1838                 seq_printf(m," anon=%lu",md->anon);
1839
1840         if (md->dirty)
1841                 seq_printf(m," dirty=%lu",md->dirty);
1842
1843         if (md->pages != md->anon && md->pages != md->dirty)
1844                 seq_printf(m, " mapped=%lu", md->pages);
1845
1846         if (md->mapcount_max > 1)
1847                 seq_printf(m, " mapmax=%lu", md->mapcount_max);
1848
1849         if (md->swapcache)
1850                 seq_printf(m," swapcache=%lu", md->swapcache);
1851
1852         if (md->active < md->pages && !is_vm_hugetlb_page(vma))
1853                 seq_printf(m," active=%lu", md->active);
1854
1855         if (md->writeback)
1856                 seq_printf(m," writeback=%lu", md->writeback);
1857
1858         for_each_online_node(n)
1859                 if (md->node[n])
1860                         seq_printf(m, " N%d=%lu", n, md->node[n]);
1861 out:
1862         seq_putc(m, '\n');
1863         kfree(md);
1864
1865         if (m->count < m->size)
1866                 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
1867         return 0;
1868 }
1869