Revert "percpu: free percpu allocation info for uniprocessor system"

[platform/adaptation/renesas_rcar/renesas_kernel.git] / mm / page_alloc.c
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 5248fe0..ff0f6b1 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -69,6 +69,7 @@
  
  /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
  static DEFINE_MUTEX(pcp_batch_high_lock);
+#define MIN_PERCPU_PAGELIST_FRACTION   (8)
  
  #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
  DEFINE_PER_CPU(int, numa_node);
@@ -205,7 +206,7 @@ static char * const zone_names[MAX_NR_ZONES] = {
  };
  
  int min_free_kbytes = 1024;
-int user_min_free_kbytes;
+int user_min_free_kbytes = -1;
  
  static unsigned long __meminitdata nr_kernel_pages;
  static unsigned long __meminitdata nr_all_pages;
@@ -295,7 +296,7 @@ static inline int bad_range(struct zone *zone, struct page *page)
  }
  #endif
  
-static void bad_page(struct page *page)
+static void bad_page(struct page *page, char *reason, unsigned long bad_flags)
  {
         static unsigned long resume;
         static unsigned long nr_shown;
@@ -329,7 +330,7 @@ static void bad_page(struct page *page)
  
         printk(KERN_ALERT "BUG: Bad page state in process %s  pfn:%05lx\n",
                 current->comm, page_to_pfn(page));
-       dump_page(page);
+       dump_page_badflags(page, reason, bad_flags);
  
         print_modules();
         dump_stack();
@@ -369,9 +370,11 @@ void prep_compound_page(struct page *page, unsigned long order)
         __SetPageHead(page);
         for (i = 1; i < nr_pages; i++) {
                 struct page *p = page + i;
-               __SetPageTail(p);
                 set_page_count(p, 0);
                 p->first_page = page;
+               /* Make sure p->first_page is always valid for PageTail() */
+               smp_wmb();
+               __SetPageTail(p);
         }
  }
  
@@ -383,7 +386,7 @@ static int destroy_compound_page(struct page *page, unsigned long order)
         int bad = 0;
  
         if (unlikely(compound_order(page) != order)) {
-               bad_page(page);
+               bad_page(page, "wrong compound order", 0);
                 bad++;
         }
  
@@ -392,8 +395,11 @@ static int destroy_compound_page(struct page *page, unsigned long order)
         for (i = 1; i < nr_pages; i++) {
                 struct page *p = page + i;
  
-               if (unlikely(!PageTail(p) || (p->first_page != page))) {
-                       bad_page(page);
+               if (unlikely(!PageTail(p))) {
+                       bad_page(page, "PageTail not set", 0);
+                       bad++;
+               } else if (unlikely(p->first_page != page)) {
+                       bad_page(page, "first_page not consistent", 0);
                         bad++;
                 }
                 __ClearPageTail(p);
@@ -506,12 +512,12 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
                 return 0;
  
         if (page_is_guard(buddy) && page_order(buddy) == order) {
-               VM_BUG_ON(page_count(buddy) != 0);
+               VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
                 return 1;
         }
  
         if (PageBuddy(buddy) && page_order(buddy) == order) {
-               VM_BUG_ON(page_count(buddy) != 0);
+               VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
                 return 1;
         }
         return 0;
@@ -561,8 +567,8 @@ static inline void __free_one_page(struct page *page,
  
         page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
  
-       VM_BUG_ON(page_idx & ((1 << order) - 1));
-       VM_BUG_ON(bad_range(zone, page));
+       VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page);
+       VM_BUG_ON_PAGE(bad_range(zone, page), page);
  
         while (order < MAX_ORDER-1) {
                 buddy_idx = __find_buddy_index(page_idx, order);
@@ -618,12 +624,23 @@ out:
  
  static inline int free_pages_check(struct page *page)
  {
-       if (unlikely(page_mapcount(page) |
-               (page->mapping != NULL)  |
-               (atomic_read(&page->_count) != 0) |
-               (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
-               (mem_cgroup_bad_page_check(page)))) {
-               bad_page(page);
+       char *bad_reason = NULL;
+       unsigned long bad_flags = 0;
+
+       if (unlikely(page_mapcount(page)))
+               bad_reason = "nonzero mapcount";
+       if (unlikely(page->mapping != NULL))
+               bad_reason = "non-NULL mapping";
+       if (unlikely(atomic_read(&page->_count) != 0))
+               bad_reason = "nonzero _count";
+       if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
+               bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
+               bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
+       }
+       if (unlikely(mem_cgroup_bad_page_check(page)))
+               bad_reason = "cgroup check failed";
+       if (unlikely(bad_reason)) {
+               bad_page(page, bad_reason, bad_flags);
                 return 1;
         }
         page_cpupid_reset_last(page);
@@ -782,9 +799,21 @@ void __init init_cma_reserved_pageblock(struct page *page)
                 set_page_count(p, 0);
         } while (++p, --i);
  
-       set_page_refcounted(page);
         set_pageblock_migratetype(page, MIGRATE_CMA);
-       __free_pages(page, pageblock_order);
+
+       if (pageblock_order >= MAX_ORDER) {
+               i = pageblock_nr_pages;
+               p = page;
+               do {
+                       set_page_refcounted(p);
+                       __free_pages(p, MAX_ORDER - 1);
+                       p += MAX_ORDER_NR_PAGES;
+               } while (i -= MAX_ORDER_NR_PAGES);
+       } else {
+               set_page_refcounted(page);
+               __free_pages(page, pageblock_order);
+       }
+
         adjust_managed_page_count(page, pageblock_nr_pages);
  }
  #endif
@@ -813,7 +842,7 @@ static inline void expand(struct zone *zone, struct page *page,
                 area--;
                 high--;
                 size >>= 1;
-               VM_BUG_ON(bad_range(zone, &page[size]));
+               VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
  
  #ifdef CONFIG_DEBUG_PAGEALLOC
                 if (high < debug_guardpage_minorder()) {
@@ -843,12 +872,23 @@ static inline void expand(struct zone *zone, struct page *page,
   */
  static inline int check_new_page(struct page *page)
  {
-       if (unlikely(page_mapcount(page) |
-               (page->mapping != NULL)  |
-               (atomic_read(&page->_count) != 0)  |
-               (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
-               (mem_cgroup_bad_page_check(page)))) {
-               bad_page(page);
+       char *bad_reason = NULL;
+       unsigned long bad_flags = 0;
+
+       if (unlikely(page_mapcount(page)))
+               bad_reason = "nonzero mapcount";
+       if (unlikely(page->mapping != NULL))
+               bad_reason = "non-NULL mapping";
+       if (unlikely(atomic_read(&page->_count) != 0))
+               bad_reason = "nonzero _count";
+       if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
+               bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
+               bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
+       }
+       if (unlikely(mem_cgroup_bad_page_check(page)))
+               bad_reason = "cgroup check failed";
+       if (unlikely(bad_reason)) {
+               bad_page(page, bad_reason, bad_flags);
                 return 1;
         }
         return 0;
@@ -955,7 +995,7 @@ int move_freepages(struct zone *zone,
  
         for (page = start_page; page <= end_page;) {
                 /* Make sure we are not inadvertently changing nodes */
-               VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
+               VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
  
                 if (!pfn_valid_within(page_to_pfn(page))) {
                         page++;
@@ -1404,8 +1444,8 @@ void split_page(struct page *page, unsigned int order)
  {
         int i;
  
-       VM_BUG_ON(PageCompound(page));
-       VM_BUG_ON(!page_count(page));
+       VM_BUG_ON_PAGE(PageCompound(page), page);
+       VM_BUG_ON_PAGE(!page_count(page), page);
  
  #ifdef CONFIG_KMEMCHECK
         /*
@@ -1548,11 +1588,12 @@ again:
         }
  
         __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+
         __count_zone_vm_events(PGALLOC, zone, 1 << order);
         zone_statistics(preferred_zone, zone, gfp_flags);
         local_irq_restore(flags);
  
-       VM_BUG_ON(bad_range(zone, page));
+       VM_BUG_ON_PAGE(bad_range(zone, page), page);
         if (prep_new_page(page, order, gfp_flags))
                 goto again;
         return page;
@@ -1828,7 +1869,7 @@ static void __paginginit init_zone_allows_reclaim(int nid)
  {
         int i;
  
-       for_each_online_node(i)
+       for_each_node_state(i, N_MEMORY)
                 if (node_distance(nid, i) <= RECLAIM_DISTANCE)
                         node_set(i, NODE_DATA(nid)->reclaim_nodes);
                 else
@@ -1912,19 +1953,12 @@ zonelist_scan:
                  * zone size to ensure fair page aging.  The zone a
                  * page was allocated in should have no effect on the
                  * time the page has in memory before being reclaimed.
-                *
-                * Try to stay in local zones in the fastpath.  If
-                * that fails, the slowpath is entered, which will do
-                * another pass starting with the local zones, but
-                * ultimately fall back to remote zones that do not
-                * partake in the fairness round-robin cycle of this
-                * zonelist.
                  */
-               if (alloc_flags & ALLOC_WMARK_LOW) {
-                       if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
-                               continue;
+               if (alloc_flags & ALLOC_FAIR) {
                         if (!zone_local(preferred_zone, zone))
                                 continue;
+                       if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
+                               continue;
                 }
                 /*
                  * When allocating a page cache page for writing, we
@@ -2072,13 +2106,6 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
                 return;
  
         /*
-        * Walking all memory to count page types is very expensive and should
-        * be inhibited in non-blockable contexts.
-        */
-       if (!(gfp_mask & __GFP_WAIT))
-               filter |= SHOW_MEM_FILTER_PAGE_COUNT;
-
-       /*
          * This documents exceptions given to allocations in certain
          * contexts that are allowed to allocate outside current's set
          * of allowed nodes.
@@ -2242,10 +2269,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
                                 preferred_zone, migratetype);
                 if (page) {
                         preferred_zone->compact_blockskip_flush = false;
-                       preferred_zone->compact_considered = 0;
-                       preferred_zone->compact_defer_shift = 0;
-                       if (order >= preferred_zone->compact_order_failed)
-                               preferred_zone->compact_order_failed = order + 1;
+                       compaction_defer_reset(preferred_zone, order, true);
                         count_vm_event(COMPACTSUCCESS);
                         return page;
                 }
@@ -2372,37 +2396,45 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
         return page;
  }
  
-static void prepare_slowpath(gfp_t gfp_mask, unsigned int order,
-                            struct zonelist *zonelist,
-                            enum zone_type high_zoneidx,
-                            struct zone *preferred_zone)
+static void reset_alloc_batches(struct zonelist *zonelist,
+                               enum zone_type high_zoneidx,
+                               struct zone *preferred_zone)
  {
         struct zoneref *z;
         struct zone *zone;
  
         for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-               if (!(gfp_mask & __GFP_NO_KSWAPD))
-                       wakeup_kswapd(zone, order, zone_idx(preferred_zone));
                 /*
                  * Only reset the batches of zones that were actually
-                * considered in the fast path, we don't want to
-                * thrash fairness information for zones that are not
+                * considered in the fairness pass, we don't want to
+                * trash fairness information for zones that are not
                  * actually part of this zonelist's round-robin cycle.
                  */
                 if (!zone_local(preferred_zone, zone))
                         continue;
                 mod_zone_page_state(zone, NR_ALLOC_BATCH,
-                                   high_wmark_pages(zone) -
-                                   low_wmark_pages(zone) -
-                                   zone_page_state(zone, NR_ALLOC_BATCH));
+                       high_wmark_pages(zone) - low_wmark_pages(zone) -
+                       atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
         }
  }
  
+static void wake_all_kswapds(unsigned int order,
+                            struct zonelist *zonelist,
+                            enum zone_type high_zoneidx,
+                            struct zone *preferred_zone)
+{
+       struct zoneref *z;
+       struct zone *zone;
+
+       for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+               wakeup_kswapd(zone, order, zone_idx(preferred_zone));
+}
+
  static inline int
  gfp_to_alloc_flags(gfp_t gfp_mask)
  {
         int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
-       const gfp_t wait = gfp_mask & __GFP_WAIT;
+       const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));
  
         /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
         BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
@@ -2411,20 +2443,20 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
          * The caller may dip into page reserves a bit more if the caller
          * cannot run direct reclaim, or if the caller has realtime scheduling
          * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
-        * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
+        * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH).
          */
         alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
  
-       if (!wait) {
+       if (atomic) {
                 /*
-                * Not worth trying to allocate harder for
-                * __GFP_NOMEMALLOC even if it can't schedule.
+                * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
+                * if it can't schedule.
                  */
-               if  (!(gfp_mask & __GFP_NOMEMALLOC))
+               if (!(gfp_mask & __GFP_NOMEMALLOC))
                         alloc_flags |= ALLOC_HARDER;
                 /*
-                * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
-                * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+                * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
+                * comment for __cpuset_node_allowed_softwall().
                  */
                 alloc_flags &= ~ALLOC_CPUSET;
         } else if (unlikely(rt_task(current)) && !in_interrupt())
@@ -2487,12 +2519,12 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
          * over allocated.
          */
         if (IS_ENABLED(CONFIG_NUMA) &&
-                       (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
+           (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
                 goto nopage;
  
  restart:
-       prepare_slowpath(gfp_mask, order, zonelist,
-                        high_zoneidx, preferred_zone);
+       if (!(gfp_mask & __GFP_NO_KSWAPD))
+               wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
  
         /*
          * OK, we're below the kswapd watermark and have kicked background
@@ -2535,8 +2567,15 @@ rebalance:
         }
  
         /* Atomic allocations - we can't balance anything */
-       if (!wait)
+       if (!wait) {
+               /*
+                * All existing users of the deprecated __GFP_NOFAIL are
+                * blockable, so warn of any new users that actually allow this
+                * type of allocation to fail.
+                */
+               WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
                 goto nopage;
+       }
  
         /* Avoid recursion of direct reclaim */
         if (current->flags & PF_MEMALLOC)
@@ -2669,7 +2708,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
         struct page *page = NULL;
         int migratetype = allocflags_to_migratetype(gfp_mask);
         unsigned int cpuset_mems_cookie;
-       int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
+       int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
         struct mem_cgroup *memcg = NULL;
  
         gfp_mask &= gfp_allowed_mask;
@@ -2697,7 +2736,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
                 return NULL;
  
  retry_cpuset:
-       cpuset_mems_cookie = get_mems_allowed();
+       cpuset_mems_cookie = read_mems_allowed_begin();
  
         /* The preferred zone is used for statistics later */
         first_zones_zonelist(zonelist, high_zoneidx,
@@ -2710,12 +2749,29 @@ retry_cpuset:
         if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
                 alloc_flags |= ALLOC_CMA;
  #endif
+retry:
         /* First allocation attempt */
         page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
                         zonelist, high_zoneidx, alloc_flags,
                         preferred_zone, migratetype);
         if (unlikely(!page)) {
                 /*
+                * The first pass makes sure allocations are spread
+                * fairly within the local node.  However, the local
+                * node might have free pages left after the fairness
+                * batches are exhausted, and remote zones haven't
+                * even been considered yet.  Try once more without
+                * fairness, and include remote zones now, before
+                * entering the slowpath and waking kswapd: prefer
+                * spilling to a remote zone over swapping locally.
+                */
+               if (alloc_flags & ALLOC_FAIR) {
+                       reset_alloc_batches(zonelist, high_zoneidx,
+                                           preferred_zone);
+                       alloc_flags &= ~ALLOC_FAIR;
+                       goto retry;
+               }
+               /*
                  * Runtime PM, block IO and its error handling path
                  * can deadlock because I/O on the device might not
                  * complete.
@@ -2735,7 +2791,7 @@ out:
          * the mask is being updated. If a page allocation is about to fail,
          * check if the cpuset changed during allocation and if so, retry.
          */
-       if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+       if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
                 goto retry_cpuset;
  
         memcg_kmem_commit_charge(page, memcg, order);
@@ -3003,9 +3059,9 @@ bool skip_free_areas_node(unsigned int flags, int nid)
                 goto out;
  
         do {
-               cpuset_mems_cookie = get_mems_allowed();
+               cpuset_mems_cookie = read_mems_allowed_begin();
                 ret = !node_isset(nid, cpuset_current_mems_allowed);
-       } while (!put_mems_allowed(cpuset_mems_cookie));
+       } while (read_mems_allowed_retry(cpuset_mems_cookie));
  out:
         return ret;
  }
@@ -3901,6 +3957,7 @@ static void setup_zone_migrate_reserve(struct zone *zone)
         struct page *page;
         unsigned long block_migratetype;
         int reserve;
+       int old_reserve;
  
         /*
          * Get the start pfn, end pfn and the number of blocks to reserve
@@ -3922,6 +3979,12 @@ static void setup_zone_migrate_reserve(struct zone *zone)
          * future allocation of hugepages at runtime.
          */
         reserve = min(2, reserve);
+       old_reserve = zone->nr_migrate_reserve_block;
+
+       /* On memory hot-add, we almost always need to do nothing */
+       if (reserve == old_reserve)
+               return;
+       zone->nr_migrate_reserve_block = reserve;
  
         for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
                 if (!pfn_valid(pfn))
@@ -3959,6 +4022,12 @@ static void setup_zone_migrate_reserve(struct zone *zone)
                                 reserve--;
                                 continue;
                         }
+               } else if (!old_reserve) {
+                       /*
+                        * At boot time we don't need to scan the whole zone
+                        * for turning off MIGRATE_RESERVE.
+                        */
+                       break;
                 }
  
                 /*
@@ -4050,7 +4119,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
         memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
  #endif
  
-static int __meminit zone_batchsize(struct zone *zone)
+static int zone_batchsize(struct zone *zone)
  {
  #ifdef CONFIG_MMU
         int batch;
@@ -4166,8 +4235,8 @@ static void pageset_set_high(struct per_cpu_pageset *p,
         pageset_update(&p->pcp, high, batch);
  }
  
-static void __meminit pageset_set_high_and_batch(struct zone *zone,
-               struct per_cpu_pageset *pcp)
+static void pageset_set_high_and_batch(struct zone *zone,
+                                      struct per_cpu_pageset *pcp)
  {
         if (percpu_pagelist_fraction)
                 pageset_set_high(pcp,
@@ -4209,7 +4278,6 @@ static noinline __init_refok
  int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
  {
         int i;
-       struct pglist_data *pgdat = zone->zone_pgdat;
         size_t alloc_size;
  
         /*
@@ -4225,7 +4293,8 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
  
         if (!slab_is_available()) {
                 zone->wait_table = (wait_queue_head_t *)
-                       alloc_bootmem_node_nopanic(pgdat, alloc_size);
+                       memblock_virt_alloc_node_nopanic(
+                               alloc_size, zone->zone_pgdat->node_id);
         } else {
                 /*
                  * This case means that a zone whose size was 0 gets new memory
@@ -4345,13 +4414,14 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
  #endif
  
  /**
- * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
+ * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
   * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
- * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
+ * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
   *
   * If an architecture guarantees that all ranges registered with
   * add_active_ranges() contain no holes and may be freed, this
- * this function may be used instead of calling free_bootmem() manually.
+ * function may be used instead of calling memblock_free_early_nid()
+ * manually.
   */
  void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
  {
@@ -4363,9 +4433,9 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
                 end_pfn = min(end_pfn, max_low_pfn);
  
                 if (start_pfn < end_pfn)
-                       free_bootmem_node(NODE_DATA(this_nid),
-                                         PFN_PHYS(start_pfn),
-                                         (end_pfn - start_pfn) << PAGE_SHIFT);
+                       memblock_free_early_nid(PFN_PHYS(start_pfn),
+                                       (end_pfn - start_pfn) << PAGE_SHIFT,
+                                       this_nid);
         }
  }
  
@@ -4636,8 +4706,9 @@ static void __init setup_usemap(struct pglist_data *pgdat,
         unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
         zone->pageblock_flags = NULL;
         if (usemapsize)
-               zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
-                                                                  usemapsize);
+               zone->pageblock_flags =
+                       memblock_virt_alloc_node_nopanic(usemapsize,
+                                                        pgdat->node_id);
  }
  #else
  static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
@@ -4831,7 +4902,8 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
                 size =  (end - start) * sizeof(struct page);
                 map = alloc_remap(pgdat->node_id, size);
                 if (!map)
-                       map = alloc_bootmem_node_nopanic(pgdat, size);
+                       map = memblock_virt_alloc_node_nopanic(size,
+                                                              pgdat->node_id);
                 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
         }
  #ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -4861,7 +4933,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
  
         pgdat->node_id = nid;
         pgdat->node_start_pfn = node_start_pfn;
-       init_zone_allows_reclaim(nid);
+       if (node_state(nid, N_MEMORY))
+               init_zone_allows_reclaim(nid);
  #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
         get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
  #endif
@@ -5012,9 +5085,33 @@ static void __init find_zone_movable_pfns_for_nodes(void)
         nodemask_t saved_node_state = node_states[N_MEMORY];
         unsigned long totalpages = early_calculate_totalpages();
         int usable_nodes = nodes_weight(node_states[N_MEMORY]);
+       struct memblock_type *type = &memblock.memory;
+
+       /* Need to find movable_zone earlier when movable_node is specified. */
+       find_usable_zone_for_movable();
  
         /*
-        * If movablecore was specified, calculate what size of
+        * If movable_node is specified, ignore kernelcore and movablecore
+        * options.
+        */
+       if (movable_node_is_enabled()) {
+               for (i = 0; i < type->cnt; i++) {
+                       if (!memblock_is_hotpluggable(&type->regions[i]))
+                               continue;
+
+                       nid = type->regions[i].nid;
+
+                       usable_startpfn = PFN_DOWN(type->regions[i].base);
+                       zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
+                               min(usable_startpfn, zone_movable_pfn[nid]) :
+                               usable_startpfn;
+               }
+
+               goto out2;
+       }
+
+       /*
+        * If movablecore=nn[KMG] was specified, calculate what size of
          * kernelcore that corresponds so that memory usable for
          * any allocation type is evenly spread. If both kernelcore
          * and movablecore are specified, then the value of kernelcore
@@ -5040,7 +5137,6 @@ static void __init find_zone_movable_pfns_for_nodes(void)
                 goto out;
  
         /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
-       find_usable_zone_for_movable();
         usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
  
  restart:
@@ -5131,6 +5227,7 @@ restart:
         if (usable_nodes && required_kernelcore > usable_nodes)
                 goto restart;
  
+out2:
         /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
         for (nid = 0; nid < MAX_NUMNODES; nid++)
                 zone_movable_pfn[nid] =
@@ -5692,7 +5789,12 @@ module_init(init_per_zone_wmark_min)
  int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
         void __user *buffer, size_t *length, loff_t *ppos)
  {
-       proc_dointvec(table, write, buffer, length, ppos);
+       int rc;
+
+       rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+       if (rc)
+               return rc;
+
         if (write) {
                 user_min_free_kbytes = min_free_kbytes;
                 setup_per_zone_wmarks();
@@ -5760,23 +5862,38 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
         void __user *buffer, size_t *length, loff_t *ppos)
  {
         struct zone *zone;
-       unsigned int cpu;
+       int old_percpu_pagelist_fraction;
         int ret;
  
+       mutex_lock(&pcp_batch_high_lock);
+       old_percpu_pagelist_fraction = percpu_pagelist_fraction;
+
         ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
-       if (!write || (ret < 0))
-               return ret;
+       if (!write || ret < 0)
+               goto out;
+
+       /* Sanity checking to avoid pcp imbalance */
+       if (percpu_pagelist_fraction &&
+           percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
+               percpu_pagelist_fraction = old_percpu_pagelist_fraction;
+               ret = -EINVAL;
+               goto out;
+       }
+
+       /* No change? */
+       if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
+               goto out;
  
-       mutex_lock(&pcp_batch_high_lock);
         for_each_populated_zone(zone) {
-               unsigned long  high;
-               high = zone->managed_pages / percpu_pagelist_fraction;
+               unsigned int cpu;
+
                 for_each_possible_cpu(cpu)
-                       pageset_set_high(per_cpu_ptr(zone->pageset, cpu),
-                                        high);
+                       pageset_set_high_and_batch(zone,
+                                       per_cpu_ptr(zone->pageset, cpu));
         }
+out:
         mutex_unlock(&pcp_batch_high_lock);
-       return 0;
+       return ret;
  }
  
  int hashdist = HASHDIST_DEFAULT;
@@ -5857,7 +5974,7 @@ void *__init alloc_large_system_hash(const char *tablename,
         do {
                 size = bucketsize << log2qty;
                 if (flags & HASH_EARLY)
-                       table = alloc_bootmem_nopanic(size);
+                       table = memblock_virt_alloc_nopanic(size, 0);
                 else if (hashdist)
                         table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
                 else {
@@ -5919,53 +6036,65 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
   * @end_bitidx: The last bit of interest
   * returns pageblock_bits flags
   */
-unsigned long get_pageblock_flags_group(struct page *page,
-                                       int start_bitidx, int end_bitidx)
+unsigned long get_pageblock_flags_mask(struct page *page,
+                                       unsigned long end_bitidx,
+                                       unsigned long mask)
  {
         struct zone *zone;
         unsigned long *bitmap;
-       unsigned long pfn, bitidx;
-       unsigned long flags = 0;
-       unsigned long value = 1;
+       unsigned long pfn, bitidx, word_bitidx;
+       unsigned long word;
  
         zone = page_zone(page);
         pfn = page_to_pfn(page);
         bitmap = get_pageblock_bitmap(zone, pfn);
         bitidx = pfn_to_bitidx(zone, pfn);
+       word_bitidx = bitidx / BITS_PER_LONG;
+       bitidx &= (BITS_PER_LONG-1);
  
-       for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
-               if (test_bit(bitidx + start_bitidx, bitmap))
-                       flags |= value;
-
-       return flags;
+       word = bitmap[word_bitidx];
+       bitidx += end_bitidx;
+       return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
  }
  
  /**
- * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages
+ * set_pageblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
   * @page: The page within the block of interest
   * @start_bitidx: The first bit of interest
   * @end_bitidx: The last bit of interest
   * @flags: The flags to set
   */
-void set_pageblock_flags_group(struct page *page, unsigned long flags,
-                                       int start_bitidx, int end_bitidx)
+void set_pageblock_flags_mask(struct page *page, unsigned long flags,
+                                       unsigned long end_bitidx,
+                                       unsigned long mask)
  {
         struct zone *zone;
         unsigned long *bitmap;
-       unsigned long pfn, bitidx;
-       unsigned long value = 1;
+       unsigned long pfn, bitidx, word_bitidx;
+       unsigned long old_word, word;
+
+       BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
  
         zone = page_zone(page);
         pfn = page_to_pfn(page);
         bitmap = get_pageblock_bitmap(zone, pfn);
         bitidx = pfn_to_bitidx(zone, pfn);
-       VM_BUG_ON(!zone_spans_pfn(zone, pfn));
+       word_bitidx = bitidx / BITS_PER_LONG;
+       bitidx &= (BITS_PER_LONG-1);
  
-       for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
-               if (flags & value)
-                       __set_bit(bitidx + start_bitidx, bitmap);
-               else
-                       __clear_bit(bitidx + start_bitidx, bitmap);
+       VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page);
+
+       bitidx += end_bitidx;
+       mask <<= (BITS_PER_LONG - bitidx - 1);
+       flags <<= (BITS_PER_LONG - bitidx - 1);
+
+       word = ACCESS_ONCE(bitmap[word_bitidx]);
+       for (;;) {
+               old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
+               if (word == old_word)
+                       break;
+               word = old_word;
+       }
  }
  
  /*
@@ -6457,12 +6586,24 @@ static void dump_page_flags(unsigned long flags)
         printk(")\n");
  }
  
-void dump_page(struct page *page)
+void dump_page_badflags(struct page *page, char *reason, unsigned long badflags)
  {
         printk(KERN_ALERT
                "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
                 page, atomic_read(&page->_count), page_mapcount(page),
                 page->mapping, page->index);
         dump_page_flags(page->flags);
+       if (reason)
+               pr_alert("page dumped because: %s\n", reason);
+       if (page->flags & badflags) {
+               pr_alert("bad because of flags:\n");
+               dump_page_flags(page->flags & badflags);
+       }
         mem_cgroup_print_bad_page(page);
  }
+
+void dump_page(struct page *page, char *reason)
+{
+       dump_page_badflags(page, reason, 0);
+}
+EXPORT_SYMBOL_GPL(dump_page);