mm: vmscan: respect NUMA policy mask when shrinking slab on direct reclaim

[platform/adaptation/renesas_rcar/renesas_kernel.git] / mm / page_alloc.c
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 3bac76a..ff0f6b1 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -69,6 +69,7 @@
  
  /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
  static DEFINE_MUTEX(pcp_batch_high_lock);
+#define MIN_PERCPU_PAGELIST_FRACTION   (8)
  
  #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
  DEFINE_PER_CPU(int, numa_node);
@@ -798,9 +799,21 @@ void __init init_cma_reserved_pageblock(struct page *page)
                 set_page_count(p, 0);
         } while (++p, --i);
  
-       set_page_refcounted(page);
         set_pageblock_migratetype(page, MIGRATE_CMA);
-       __free_pages(page, pageblock_order);
+
+       if (pageblock_order >= MAX_ORDER) {
+               i = pageblock_nr_pages;
+               p = page;
+               do {
+                       set_page_refcounted(p);
+                       __free_pages(p, MAX_ORDER - 1);
+                       p += MAX_ORDER_NR_PAGES;
+               } while (i -= MAX_ORDER_NR_PAGES);
+       } else {
+               set_page_refcounted(page);
+               __free_pages(page, pageblock_order);
+       }
+
         adjust_managed_page_count(page, pageblock_nr_pages);
  }
  #endif
@@ -1238,15 +1251,6 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
         }
         local_irq_restore(flags);
  }
-static bool gfp_thisnode_allocation(gfp_t gfp_mask)
-{
-       return (gfp_mask & GFP_THISNODE) == GFP_THISNODE;
-}
-#else
-static bool gfp_thisnode_allocation(gfp_t gfp_mask)
-{
-       return false;
-}
  #endif
  
  /*
@@ -1583,12 +1587,7 @@ again:
                                           get_pageblock_migratetype(page));
         }
  
-       /*
-        * NOTE: GFP_THISNODE allocations do not partake in the kswapd
-        * aging protocol, so they can't be fair.
-        */
-       if (!gfp_thisnode_allocation(gfp_flags))
-               __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+       __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
  
         __count_zone_vm_events(PGALLOC, zone, 1 << order);
         zone_statistics(preferred_zone, zone, gfp_flags);
@@ -1870,7 +1869,7 @@ static void __paginginit init_zone_allows_reclaim(int nid)
  {
         int i;
  
-       for_each_online_node(i)
+       for_each_node_state(i, N_MEMORY)
                 if (node_distance(nid, i) <= RECLAIM_DISTANCE)
                         node_set(i, NODE_DATA(nid)->reclaim_nodes);
                 else
@@ -1954,23 +1953,12 @@ zonelist_scan:
                  * zone size to ensure fair page aging.  The zone a
                  * page was allocated in should have no effect on the
                  * time the page has in memory before being reclaimed.
-                *
-                * Try to stay in local zones in the fastpath.  If
-                * that fails, the slowpath is entered, which will do
-                * another pass starting with the local zones, but
-                * ultimately fall back to remote zones that do not
-                * partake in the fairness round-robin cycle of this
-                * zonelist.
-                *
-                * NOTE: GFP_THISNODE allocations do not partake in
-                * the kswapd aging protocol, so they can't be fair.
                  */
-               if ((alloc_flags & ALLOC_WMARK_LOW) &&
-                   !gfp_thisnode_allocation(gfp_mask)) {
-                       if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
-                               continue;
+               if (alloc_flags & ALLOC_FAIR) {
                         if (!zone_local(preferred_zone, zone))
                                 continue;
+                       if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
+                               continue;
                 }
                 /*
                  * When allocating a page cache page for writing, we
@@ -2408,37 +2396,45 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
         return page;
  }
  
-static void prepare_slowpath(gfp_t gfp_mask, unsigned int order,
-                            struct zonelist *zonelist,
-                            enum zone_type high_zoneidx,
-                            struct zone *preferred_zone)
+static void reset_alloc_batches(struct zonelist *zonelist,
+                               enum zone_type high_zoneidx,
+                               struct zone *preferred_zone)
  {
         struct zoneref *z;
         struct zone *zone;
  
         for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-               if (!(gfp_mask & __GFP_NO_KSWAPD))
-                       wakeup_kswapd(zone, order, zone_idx(preferred_zone));
                 /*
                  * Only reset the batches of zones that were actually
-                * considered in the fast path, we don't want to
-                * thrash fairness information for zones that are not
+                * considered in the fairness pass, we don't want to
+                * trash fairness information for zones that are not
                  * actually part of this zonelist's round-robin cycle.
                  */
                 if (!zone_local(preferred_zone, zone))
                         continue;
                 mod_zone_page_state(zone, NR_ALLOC_BATCH,
-                                   high_wmark_pages(zone) -
-                                   low_wmark_pages(zone) -
-                                   zone_page_state(zone, NR_ALLOC_BATCH));
+                       high_wmark_pages(zone) - low_wmark_pages(zone) -
+                       atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
         }
  }
  
+static void wake_all_kswapds(unsigned int order,
+                            struct zonelist *zonelist,
+                            enum zone_type high_zoneidx,
+                            struct zone *preferred_zone)
+{
+       struct zoneref *z;
+       struct zone *zone;
+
+       for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+               wakeup_kswapd(zone, order, zone_idx(preferred_zone));
+}
+
  static inline int
  gfp_to_alloc_flags(gfp_t gfp_mask)
  {
         int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
-       const gfp_t wait = gfp_mask & __GFP_WAIT;
+       const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));
  
         /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
         BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
@@ -2447,20 +2443,20 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
          * The caller may dip into page reserves a bit more if the caller
          * cannot run direct reclaim, or if the caller has realtime scheduling
          * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
-        * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
+        * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH).
          */
         alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
  
-       if (!wait) {
+       if (atomic) {
                 /*
-                * Not worth trying to allocate harder for
-                * __GFP_NOMEMALLOC even if it can't schedule.
+                * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
+                * if it can't schedule.
                  */
-               if  (!(gfp_mask & __GFP_NOMEMALLOC))
+               if (!(gfp_mask & __GFP_NOMEMALLOC))
                         alloc_flags |= ALLOC_HARDER;
                 /*
-                * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
-                * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+                * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
+                * comment for __cpuset_node_allowed_softwall().
                  */
                 alloc_flags &= ~ALLOC_CPUSET;
         } else if (unlikely(rt_task(current)) && !in_interrupt())
@@ -2522,12 +2518,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
          * allowed per node queues are empty and that nodes are
          * over allocated.
          */
-       if (gfp_thisnode_allocation(gfp_mask))
+       if (IS_ENABLED(CONFIG_NUMA) &&
+           (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
                 goto nopage;
  
  restart:
-       prepare_slowpath(gfp_mask, order, zonelist,
-                        high_zoneidx, preferred_zone);
+       if (!(gfp_mask & __GFP_NO_KSWAPD))
+               wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
  
         /*
          * OK, we're below the kswapd watermark and have kicked background
@@ -2711,7 +2708,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
         struct page *page = NULL;
         int migratetype = allocflags_to_migratetype(gfp_mask);
         unsigned int cpuset_mems_cookie;
-       int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
+       int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
         struct mem_cgroup *memcg = NULL;
  
         gfp_mask &= gfp_allowed_mask;
@@ -2739,7 +2736,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
                 return NULL;
  
  retry_cpuset:
-       cpuset_mems_cookie = get_mems_allowed();
+       cpuset_mems_cookie = read_mems_allowed_begin();
  
         /* The preferred zone is used for statistics later */
         first_zones_zonelist(zonelist, high_zoneidx,
@@ -2752,12 +2749,29 @@ retry_cpuset:
         if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
                 alloc_flags |= ALLOC_CMA;
  #endif
+retry:
         /* First allocation attempt */
         page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
                         zonelist, high_zoneidx, alloc_flags,
                         preferred_zone, migratetype);
         if (unlikely(!page)) {
                 /*
+                * The first pass makes sure allocations are spread
+                * fairly within the local node.  However, the local
+                * node might have free pages left after the fairness
+                * batches are exhausted, and remote zones haven't
+                * even been considered yet.  Try once more without
+                * fairness, and include remote zones now, before
+                * entering the slowpath and waking kswapd: prefer
+                * spilling to a remote zone over swapping locally.
+                */
+               if (alloc_flags & ALLOC_FAIR) {
+                       reset_alloc_batches(zonelist, high_zoneidx,
+                                           preferred_zone);
+                       alloc_flags &= ~ALLOC_FAIR;
+                       goto retry;
+               }
+               /*
                  * Runtime PM, block IO and its error handling path
                  * can deadlock because I/O on the device might not
                  * complete.
@@ -2777,7 +2791,7 @@ out:
          * the mask is being updated. If a page allocation is about to fail,
          * check if the cpuset changed during allocation and if so, retry.
          */
-       if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+       if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
                 goto retry_cpuset;
  
         memcg_kmem_commit_charge(page, memcg, order);
@@ -3045,9 +3059,9 @@ bool skip_free_areas_node(unsigned int flags, int nid)
                 goto out;
  
         do {
-               cpuset_mems_cookie = get_mems_allowed();
+               cpuset_mems_cookie = read_mems_allowed_begin();
                 ret = !node_isset(nid, cpuset_current_mems_allowed);
-       } while (!put_mems_allowed(cpuset_mems_cookie));
+       } while (read_mems_allowed_retry(cpuset_mems_cookie));
  out:
         return ret;
  }
@@ -4105,7 +4119,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
         memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
  #endif
  
-static int __meminit zone_batchsize(struct zone *zone)
+static int zone_batchsize(struct zone *zone)
  {
  #ifdef CONFIG_MMU
         int batch;
@@ -4221,8 +4235,8 @@ static void pageset_set_high(struct per_cpu_pageset *p,
         pageset_update(&p->pcp, high, batch);
  }
  
-static void __meminit pageset_set_high_and_batch(struct zone *zone,
-               struct per_cpu_pageset *pcp)
+static void pageset_set_high_and_batch(struct zone *zone,
+                                      struct per_cpu_pageset *pcp)
  {
         if (percpu_pagelist_fraction)
                 pageset_set_high(pcp,
@@ -4919,7 +4933,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
  
         pgdat->node_id = nid;
         pgdat->node_start_pfn = node_start_pfn;
-       init_zone_allows_reclaim(nid);
+       if (node_state(nid, N_MEMORY))
+               init_zone_allows_reclaim(nid);
  #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
         get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
  #endif
@@ -5847,23 +5862,38 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
         void __user *buffer, size_t *length, loff_t *ppos)
  {
         struct zone *zone;
-       unsigned int cpu;
+       int old_percpu_pagelist_fraction;
         int ret;
  
+       mutex_lock(&pcp_batch_high_lock);
+       old_percpu_pagelist_fraction = percpu_pagelist_fraction;
+
         ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
-       if (!write || (ret < 0))
-               return ret;
+       if (!write || ret < 0)
+               goto out;
+
+       /* Sanity checking to avoid pcp imbalance */
+       if (percpu_pagelist_fraction &&
+           percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
+               percpu_pagelist_fraction = old_percpu_pagelist_fraction;
+               ret = -EINVAL;
+               goto out;
+       }
+
+       /* No change? */
+       if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
+               goto out;
  
-       mutex_lock(&pcp_batch_high_lock);
         for_each_populated_zone(zone) {
-               unsigned long  high;
-               high = zone->managed_pages / percpu_pagelist_fraction;
+               unsigned int cpu;
+
                 for_each_possible_cpu(cpu)
-                       pageset_set_high(per_cpu_ptr(zone->pageset, cpu),
-                                        high);
+                       pageset_set_high_and_batch(zone,
+                                       per_cpu_ptr(zone->pageset, cpu));
         }
+out:
         mutex_unlock(&pcp_batch_high_lock);
-       return 0;
+       return ret;
  }
  
  int hashdist = HASHDIST_DEFAULT;
@@ -6006,53 +6036,65 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
   * @end_bitidx: The last bit of interest
   * returns pageblock_bits flags
   */
-unsigned long get_pageblock_flags_group(struct page *page,
-                                       int start_bitidx, int end_bitidx)
+unsigned long get_pageblock_flags_mask(struct page *page,
+                                       unsigned long end_bitidx,
+                                       unsigned long mask)
  {
         struct zone *zone;
         unsigned long *bitmap;
-       unsigned long pfn, bitidx;
-       unsigned long flags = 0;
-       unsigned long value = 1;
+       unsigned long pfn, bitidx, word_bitidx;
+       unsigned long word;
  
         zone = page_zone(page);
         pfn = page_to_pfn(page);
         bitmap = get_pageblock_bitmap(zone, pfn);
         bitidx = pfn_to_bitidx(zone, pfn);
+       word_bitidx = bitidx / BITS_PER_LONG;
+       bitidx &= (BITS_PER_LONG-1);
  
-       for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
-               if (test_bit(bitidx + start_bitidx, bitmap))
-                       flags |= value;
-
-       return flags;
+       word = bitmap[word_bitidx];
+       bitidx += end_bitidx;
+       return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
  }
  
  /**
- * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages
+ * set_pageblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
   * @page: The page within the block of interest
   * @start_bitidx: The first bit of interest
   * @end_bitidx: The last bit of interest
   * @flags: The flags to set
   */
-void set_pageblock_flags_group(struct page *page, unsigned long flags,
-                                       int start_bitidx, int end_bitidx)
+void set_pageblock_flags_mask(struct page *page, unsigned long flags,
+                                       unsigned long end_bitidx,
+                                       unsigned long mask)
  {
         struct zone *zone;
         unsigned long *bitmap;
-       unsigned long pfn, bitidx;
-       unsigned long value = 1;
+       unsigned long pfn, bitidx, word_bitidx;
+       unsigned long old_word, word;
+
+       BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
  
         zone = page_zone(page);
         pfn = page_to_pfn(page);
         bitmap = get_pageblock_bitmap(zone, pfn);
         bitidx = pfn_to_bitidx(zone, pfn);
+       word_bitidx = bitidx / BITS_PER_LONG;
+       bitidx &= (BITS_PER_LONG-1);
+
         VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page);
  
-       for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
-               if (flags & value)
-                       __set_bit(bitidx + start_bitidx, bitmap);
-               else
-                       __clear_bit(bitidx + start_bitidx, bitmap);
+       bitidx += end_bitidx;
+       mask <<= (BITS_PER_LONG - bitidx - 1);
+       flags <<= (BITS_PER_LONG - bitidx - 1);
+
+       word = ACCESS_ONCE(bitmap[word_bitidx]);
+       for (;;) {
+               old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
+               if (word == old_word)
+                       break;
+               word = old_word;
+       }
  }
  
  /*