Merge branch 'audit' of git://git.linaro.org/people/rmk/linux-arm

[platform/kernel/linux-arm64.git] / mm / page_alloc.c
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index fba2a12..889532b 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -51,7 +51,6 @@
  #include <linux/page_cgroup.h>
  #include <linux/debugobjects.h>
  #include <linux/kmemleak.h>
-#include <linux/memory.h>
  #include <linux/compaction.h>
  #include <trace/events/kmem.h>
  #include <linux/ftrace_event.h>
@@ -219,7 +218,12 @@ EXPORT_SYMBOL(nr_online_nodes);
  
  int page_group_by_mobility_disabled __read_mostly;
  
-static void set_pageblock_migratetype(struct page *page, int migratetype)
+/*
+ * NOTE:
+ * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly.
+ * Instead, use {un}set_pageblock_isolate.
+ */
+void set_pageblock_migratetype(struct page *page, int migratetype)
  {
  
         if (unlikely(page_group_by_mobility_disabled))
@@ -954,7 +958,7 @@ static int move_freepages(struct zone *zone,
         return pages_moved;
  }
  
-static int move_freepages_block(struct zone *zone, struct page *page,
+int move_freepages_block(struct zone *zone, struct page *page,
                                 int migratetype)
  {
         unsigned long start_pfn, end_pfn;
@@ -1596,6 +1600,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
  {
         /* free_pages my go negative - that's OK */
         long min = mark;
+       long lowmem_reserve = z->lowmem_reserve[classzone_idx];
         int o;
  
         free_pages -= (1 << order) - 1;
@@ -1604,7 +1609,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
         if (alloc_flags & ALLOC_HARDER)
                 min -= min / 4;
  
-       if (free_pages <= min + z->lowmem_reserve[classzone_idx])
+       if (free_pages <= min + lowmem_reserve)
                 return false;
         for (o = 0; o < order; o++) {
                 /* At the next order, this order's pages become unavailable */
@@ -1619,6 +1624,20 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
         return true;
  }
  
+#ifdef CONFIG_MEMORY_ISOLATION
+static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
+{
+       if (unlikely(zone->nr_pageblock_isolate))
+               return zone->nr_pageblock_isolate * pageblock_nr_pages;
+       return 0;
+}
+#else
+static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
+{
+       return 0;
+}
+#endif
+
  bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
                       int classzone_idx, int alloc_flags)
  {
@@ -1634,6 +1653,14 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
         if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
                 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
  
+       /*
+        * If the zone has MIGRATE_ISOLATE type free pages, subtract them from
+        * the free count.  nr_zone_isolate_freepages() is only an estimate, so
+        * kswapd might not sleep although it could do so.  But this is more
+        * desirable for memory hotplug than sleeping, which can cause a
+        * livelock in the direct reclaim path.
+        */
+       free_pages -= nr_zone_isolate_freepages(z);
         return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
                                                                 free_pages);
  }
@@ -2089,8 +2116,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
  
                 page = get_page_from_freelist(gfp_mask, nodemask,
                                 order, zonelist, high_zoneidx,
-                               alloc_flags, preferred_zone,
-                               migratetype);
+                               alloc_flags & ~ALLOC_NO_WATERMARKS,
+                               preferred_zone, migratetype);
                 if (page) {
                         preferred_zone->compact_considered = 0;
                         preferred_zone->compact_defer_shift = 0;
@@ -2182,8 +2209,8 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
  retry:
         page = get_page_from_freelist(gfp_mask, nodemask, order,
                                         zonelist, high_zoneidx,
-                                       alloc_flags, preferred_zone,
-                                       migratetype);
+                                       alloc_flags & ~ALLOC_NO_WATERMARKS,
+                                       preferred_zone, migratetype);
  
         /*
          * If an allocation failed after direct reclaim, it could be because
@@ -2267,15 +2294,24 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
                 alloc_flags |= ALLOC_HARDER;
  
         if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
-               if (!in_interrupt() &&
-                   ((current->flags & PF_MEMALLOC) ||
-                    unlikely(test_thread_flag(TIF_MEMDIE))))
+               if (gfp_mask & __GFP_MEMALLOC)
+                       alloc_flags |= ALLOC_NO_WATERMARKS;
+               else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
+                       alloc_flags |= ALLOC_NO_WATERMARKS;
+               else if (!in_interrupt() &&
+                               ((current->flags & PF_MEMALLOC) ||
+                                unlikely(test_thread_flag(TIF_MEMDIE))))
                         alloc_flags |= ALLOC_NO_WATERMARKS;
         }
  
         return alloc_flags;
  }
  
+bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
+{
+       return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
+}
+
  static inline struct page *
  __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
         struct zonelist *zonelist, enum zone_type high_zoneidx,
@@ -2342,11 +2378,27 @@ rebalance:
  
         /* Allocate without watermarks if the context allows */
         if (alloc_flags & ALLOC_NO_WATERMARKS) {
+               /*
+                * Ignore mempolicies if ALLOC_NO_WATERMARKS is set, on the
+                * grounds that the allocation is high priority and these
+                * types of allocations are system- rather than user-oriented.
+                */
+               zonelist = node_zonelist(numa_node_id(), gfp_mask);
+
                 page = __alloc_pages_high_priority(gfp_mask, order,
                                 zonelist, high_zoneidx, nodemask,
                                 preferred_zone, migratetype);
-               if (page)
+               if (page) {
+                       /*
+                        * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
+                        * necessary to allocate the page. The expectation is
+                        * that the caller is taking steps that will free more
+                        * memory. The caller should avoid the page being used
+                        * for !PFMEMALLOC purposes.
+                        */
+                       page->pfmemalloc = true;
                         goto got_pg;
+               }
         }
  
         /* Atomic allocations - we can't balance anything */
@@ -2465,8 +2517,8 @@ nopage:
  got_pg:
         if (kmemcheck_enabled)
                 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
-       return page;
  
+       return page;
  }
  
  /*
@@ -2517,6 +2569,8 @@ retry_cpuset:
                 page = __alloc_pages_slowpath(gfp_mask, order,
                                 zonelist, high_zoneidx, nodemask,
                                 preferred_zone, migratetype);
+       else
+               page->pfmemalloc = false;
  
         trace_mm_page_alloc(page, order, gfp_mask, migratetype);
  
@@ -3032,7 +3086,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
                         user_zonelist_order = oldval;
                 } else if (oldval != user_zonelist_order) {
                         mutex_lock(&zonelists_mutex);
-                       build_all_zonelists(NULL);
+                       build_all_zonelists(NULL, NULL);
                         mutex_unlock(&zonelists_mutex);
                 }
         }
@@ -3411,14 +3465,21 @@ static void setup_zone_pageset(struct zone *zone);
  DEFINE_MUTEX(zonelists_mutex);
  
  /* return values int ....just for stop_machine() */
-static __init_refok int __build_all_zonelists(void *data)
+static int __build_all_zonelists(void *data)
  {
         int nid;
         int cpu;
+       pg_data_t *self = data;
  
  #ifdef CONFIG_NUMA
         memset(node_load, 0, sizeof(node_load));
  #endif
+
+       if (self && !node_online(self->node_id)) {
+               build_zonelists(self);
+               build_zonelist_cache(self);
+       }
+
         for_each_online_node(nid) {
                 pg_data_t *pgdat = NODE_DATA(nid);
  
@@ -3463,7 +3524,7 @@ static __init_refok int __build_all_zonelists(void *data)
   * Called with zonelists_mutex held always
   * unless system_state == SYSTEM_BOOTING.
   */
-void __ref build_all_zonelists(void *data)
+void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
  {
         set_zonelist_order();
  
@@ -3475,10 +3536,10 @@ void __ref build_all_zonelists(void *data)
                 /* we have to stop all cpus to guarantee there is no user
                    of zonelist */
  #ifdef CONFIG_MEMORY_HOTPLUG
-               if (data)
-                       setup_zone_pageset((struct zone *)data);
+               if (zone)
+                       setup_zone_pageset(zone);
  #endif
-               stop_machine(__build_all_zonelists, NULL, NULL);
+               stop_machine(__build_all_zonelists, pgdat, NULL);
                 /* cpuset refresh routine should be here */
         }
         vm_total_pages = nr_free_pagecache_pages();
@@ -3748,7 +3809,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
         memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
  #endif
  
-static int zone_batchsize(struct zone *zone)
+static int __meminit zone_batchsize(struct zone *zone)
  {
  #ifdef CONFIG_MMU
         int batch;
@@ -3830,7 +3891,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
                 pcp->batch = PAGE_SHIFT * 8;
  }
  
-static void setup_zone_pageset(struct zone *zone)
+static void __meminit setup_zone_pageset(struct zone *zone)
  {
         int cpu;
  
@@ -3903,33 +3964,6 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
         return 0;
  }
  
-static int __zone_pcp_update(void *data)
-{
-       struct zone *zone = data;
-       int cpu;
-       unsigned long batch = zone_batchsize(zone), flags;
-
-       for_each_possible_cpu(cpu) {
-               struct per_cpu_pageset *pset;
-               struct per_cpu_pages *pcp;
-
-               pset = per_cpu_ptr(zone->pageset, cpu);
-               pcp = &pset->pcp;
-
-               local_irq_save(flags);
-               if (pcp->count > 0)
-                       free_pcppages_bulk(zone, pcp->count, pcp);
-               setup_pageset(pset, batch);
-               local_irq_restore(flags);
-       }
-       return 0;
-}
-
-void zone_pcp_update(struct zone *zone)
-{
-       stop_machine(__zone_pcp_update, zone, NULL);
-}
-
  static __meminit void zone_pcp_init(struct zone *zone)
  {
         /*
@@ -3945,7 +3979,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
                                          zone_batchsize(zone));
  }
  
-__meminit int init_currently_empty_zone(struct zone *zone,
+int __meminit init_currently_empty_zone(struct zone *zone,
                                         unsigned long zone_start_pfn,
                                         unsigned long size,
                                         enum memmap_context context)
@@ -4304,7 +4338,7 @@ static inline void setup_usemap(struct pglist_data *pgdat,
  #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
  
  /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
-static inline void __init set_pageblock_order(void)
+void __init set_pageblock_order(void)
  {
         unsigned int order;
  
@@ -4332,7 +4366,7 @@ static inline void __init set_pageblock_order(void)
   * include/linux/pageblock-flags.h for the values of pageblock_order based on
   * the kernel config
   */
-static inline void set_pageblock_order(void)
+void __init set_pageblock_order(void)
  {
  }
  
@@ -4343,6 +4377,8 @@ static inline void set_pageblock_order(void)
   *   - mark all pages reserved
   *   - mark all memory queues empty
   *   - clear the memory bitmaps
+ *
+ * NOTE: pgdat must be zeroed by the caller.
   */
  static void __paginginit free_area_init_core(struct pglist_data *pgdat,
                 unsigned long *zones_size, unsigned long *zholes_size)
@@ -4353,9 +4389,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
         int ret;
  
         pgdat_resize_init(pgdat);
-       pgdat->nr_zones = 0;
         init_waitqueue_head(&pgdat->kswapd_wait);
-       pgdat->kswapd_max_order = 0;
+       init_waitqueue_head(&pgdat->pfmemalloc_wait);
         pgdat_page_cgroup_init(pgdat);
  
         for (j = 0; j < MAX_NR_ZONES; j++) {
@@ -4397,6 +4432,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
  
                 zone->spanned_pages = size;
                 zone->present_pages = realsize;
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+               zone->compact_cached_free_pfn = zone->zone_start_pfn +
+                                               zone->spanned_pages;
+               zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1);
+#endif
  #ifdef CONFIG_NUMA
                 zone->node = nid;
                 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
@@ -4411,8 +4451,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
  
                 zone_pcp_init(zone);
                 lruvec_init(&zone->lruvec, zone);
-               zap_zone_vm_stats(zone);
-               zone->flags = 0;
                 if (!size)
                         continue;
  
@@ -4472,6 +4510,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
  {
         pg_data_t *pgdat = NODE_DATA(nid);
  
+       /* pg_data_t should be reset to zero when it's allocated */
+       WARN_ON(pgdat->nr_zones || pgdat->node_start_pfn || pgdat->classzone_idx);
+
         pgdat->node_id = nid;
         pgdat->node_start_pfn = node_start_pfn;
         calculate_node_totalpages(pgdat, zones_size, zholes_size);
@@ -4753,7 +4794,7 @@ out:
  }
  
  /* Any regular memory on that node ? */
-static void check_for_regular_memory(pg_data_t *pgdat)
+static void __init check_for_regular_memory(pg_data_t *pgdat)
  {
  #ifdef CONFIG_HIGHMEM
         enum zone_type zone_type;
@@ -5478,8 +5519,7 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
   * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
   * expect this function should be exact.
   */
-static bool
-__has_unmovable_pages(struct zone *zone, struct page *page, int count)
+bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
  {
         unsigned long pfn, iter, found;
         int mt;
@@ -5556,77 +5596,7 @@ bool is_pageblock_removable_nolock(struct page *page)
                         zone->zone_start_pfn + zone->spanned_pages <= pfn)
                 return false;
  
-       return !__has_unmovable_pages(zone, page, 0);
-}
-
-int set_migratetype_isolate(struct page *page)
-{
-       struct zone *zone;
-       unsigned long flags, pfn;
-       struct memory_isolate_notify arg;
-       int notifier_ret;
-       int ret = -EBUSY;
-
-       zone = page_zone(page);
-
-       spin_lock_irqsave(&zone->lock, flags);
-
-       pfn = page_to_pfn(page);
-       arg.start_pfn = pfn;
-       arg.nr_pages = pageblock_nr_pages;
-       arg.pages_found = 0;
-
-       /*
-        * It may be possible to isolate a pageblock even if the
-        * migratetype is not MIGRATE_MOVABLE. The memory isolation
-        * notifier chain is used by balloon drivers to return the
-        * number of pages in a range that are held by the balloon
-        * driver to shrink memory. If all the pages are accounted for
-        * by balloons, are free, or on the LRU, isolation can continue.
-        * Later, for example, when memory hotplug notifier runs, these
-        * pages reported as "can be isolated" should be isolated(freed)
-        * by the balloon driver through the memory notifier chain.
-        */
-       notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
-       notifier_ret = notifier_to_errno(notifier_ret);
-       if (notifier_ret)
-               goto out;
-       /*
-        * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
-        * We just check MOVABLE pages.
-        */
-       if (!__has_unmovable_pages(zone, page, arg.pages_found))
-               ret = 0;
-       /*
-        * Unmovable means "not-on-lru" pages. If Unmovable pages are
-        * larger than removable-by-driver pages reported by notifier,
-        * we'll fail.
-        */
-
-out:
-       if (!ret) {
-               set_pageblock_migratetype(page, MIGRATE_ISOLATE);
-               move_freepages_block(zone, page, MIGRATE_ISOLATE);
-       }
-
-       spin_unlock_irqrestore(&zone->lock, flags);
-       if (!ret)
-               drain_all_pages();
-       return ret;
-}
-
-void unset_migratetype_isolate(struct page *page, unsigned migratetype)
-{
-       struct zone *zone;
-       unsigned long flags;
-       zone = page_zone(page);
-       spin_lock_irqsave(&zone->lock, flags);
-       if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
-               goto out;
-       set_pageblock_migratetype(page, migratetype);
-       move_freepages_block(zone, page, migratetype);
-out:
-       spin_unlock_irqrestore(&zone->lock, flags);
+       return !has_unmovable_pages(zone, page, 0);
  }
  
  #ifdef CONFIG_CMA
@@ -5881,7 +5851,49 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
  }
  #endif
  
+#ifdef CONFIG_MEMORY_HOTPLUG
+static int __meminit __zone_pcp_update(void *data)
+{
+       struct zone *zone = data;
+       int cpu;
+       unsigned long batch = zone_batchsize(zone), flags;
+
+       for_each_possible_cpu(cpu) {
+               struct per_cpu_pageset *pset;
+               struct per_cpu_pages *pcp;
+
+               pset = per_cpu_ptr(zone->pageset, cpu);
+               pcp = &pset->pcp;
+
+               local_irq_save(flags);
+               if (pcp->count > 0)
+                       free_pcppages_bulk(zone, pcp->count, pcp);
+               setup_pageset(pset, batch);
+               local_irq_restore(flags);
+       }
+       return 0;
+}
+
+void __meminit zone_pcp_update(struct zone *zone)
+{
+       stop_machine(__zone_pcp_update, zone, NULL);
+}
+#endif
+
  #ifdef CONFIG_MEMORY_HOTREMOVE
+void zone_pcp_reset(struct zone *zone)
+{
+       unsigned long flags;
+
+       /* Disable IRQs to avoid racing with drain_pages(). */
+       local_irq_save(flags);
+       if (zone->pageset != &boot_pageset) {
+               free_percpu(zone->pageset);
+               zone->pageset = &boot_pageset;
+       }
+       local_irq_restore(flags);
+}
+
  /*
   * All pages in the range must be isolated before calling this.
   */