mm: vmscan: shrink_slab: rename max_pass -> freeable

[platform/adaptation/renesas_rcar/renesas_kernel.git] / mm / vmscan.c
diff --git a/mm/vmscan.c b/mm/vmscan.c

index 90c4075..303a304 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -147,7 +147,7 @@ static bool global_reclaim(struct scan_control *sc)
  }
  #endif
  
-unsigned long zone_reclaimable_pages(struct zone *zone)
+static unsigned long zone_reclaimable_pages(struct zone *zone)
  {
         int nr;
  
@@ -224,15 +224,15 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
         unsigned long freed = 0;
         unsigned long long delta;
         long total_scan;
-       long max_pass;
+       long freeable;
         long nr;
         long new_nr;
         int nid = shrinkctl->nid;
         long batch_size = shrinker->batch ? shrinker->batch
                                           : SHRINK_BATCH;
  
-       max_pass = shrinker->count_objects(shrinker, shrinkctl);
-       if (max_pass == 0)
+       freeable = shrinker->count_objects(shrinker, shrinkctl);
+       if (freeable == 0)
                 return 0;
  
         /*
@@ -244,14 +244,14 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
  
         total_scan = nr;
         delta = (4 * nr_pages_scanned) / shrinker->seeks;
-       delta *= max_pass;
+       delta *= freeable;
         do_div(delta, lru_pages + 1);
         total_scan += delta;
         if (total_scan < 0) {
                 printk(KERN_ERR
                 "shrink_slab: %pF negative objects to delete nr=%ld\n",
                        shrinker->scan_objects, total_scan);
-               total_scan = max_pass;
+               total_scan = freeable;
         }
  
         /*
@@ -260,26 +260,26 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
          * shrinkers to return -1 all the time. This results in a large
          * nr being built up so when a shrink that can do some work
          * comes along it empties the entire cache due to nr >>>
-        * max_pass.  This is bad for sustaining a working set in
+        * freeable. This is bad for sustaining a working set in
          * memory.
          *
          * Hence only allow the shrinker to scan the entire cache when
          * a large delta change is calculated directly.
          */
-       if (delta < max_pass / 4)
-               total_scan = min(total_scan, max_pass / 2);
+       if (delta < freeable / 4)
+               total_scan = min(total_scan, freeable / 2);
  
         /*
          * Avoid risking looping forever due to too large nr value:
          * never try to free more than twice the estimate number of
          * freeable entries.
          */
-       if (total_scan > max_pass * 2)
-               total_scan = max_pass * 2;
+       if (total_scan > freeable * 2)
+               total_scan = freeable * 2;
  
         trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
                                 nr_pages_scanned, lru_pages,
-                               max_pass, delta, total_scan);
+                               freeable, delta, total_scan);
  
         /*
          * Normally, we should not scan less than batch_size objects in one
@@ -292,12 +292,12 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
          *
          * We detect the "tight on memory" situations by looking at the total
          * number of objects we want to scan (total_scan). If it is greater
-        * than the total number of objects on slab (max_pass), we must be
+        * than the total number of objects on slab (freeable), we must be
          * scanning at high prio and therefore should try to reclaim as much as
          * possible.
          */
         while (total_scan >= batch_size ||
-              total_scan >= max_pass) {
+              total_scan >= freeable) {
                 unsigned long ret;
                 unsigned long nr_to_scan = min(batch_size, total_scan);
  
@@ -1540,19 +1540,18 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
                  * If dirty pages are scanned that are not queued for IO, it
                  * implies that flushers are not keeping up. In this case, flag
                  * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing
-                * pages from reclaim context. It will forcibly stall in the
-                * next check.
+                * pages from reclaim context.
                  */
                 if (nr_unqueued_dirty == nr_taken)
                         zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
  
                 /*
-                * In addition, if kswapd scans pages marked marked for
-                * immediate reclaim and under writeback (nr_immediate), it
-                * implies that pages are cycling through the LRU faster than
+                * If kswapd scans pages marked for immediate
+                * reclaim and under writeback (nr_immediate), it implies
+                * that pages are cycling through the LRU faster than
                  * they are written so also forcibly stall.
                  */
-               if (nr_unqueued_dirty == nr_taken || nr_immediate)
+               if (nr_immediate)
                         congestion_wait(BLK_RW_ASYNC, HZ/10);
         }
  
@@ -2425,8 +2424,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                         unsigned long lru_pages = 0;
  
                         nodes_clear(shrink->nodes_to_scan);
-                       for_each_zone_zonelist(zone, z, zonelist,
-                                       gfp_zone(sc->gfp_mask)) {
+                       for_each_zone_zonelist_nodemask(zone, z, zonelist,
+                                       gfp_zone(sc->gfp_mask), sc->nodemask) {
                                 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                                         continue;
  
@@ -2502,10 +2501,17 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
  
         for (i = 0; i <= ZONE_NORMAL; i++) {
                 zone = &pgdat->node_zones[i];
+               if (!populated_zone(zone))
+                       continue;
+
                 pfmemalloc_reserve += min_wmark_pages(zone);
                 free_pages += zone_page_state(zone, NR_FREE_PAGES);
         }
  
+       /* If there are no reserves (unexpected config) then do not throttle */
+       if (!pfmemalloc_reserve)
+               return true;
+
         wmark_ok = free_pages > pfmemalloc_reserve / 2;
  
         /* kswapd must be awake if processes are being throttled */
@@ -2530,9 +2536,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
  static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
                                         nodemask_t *nodemask)
  {
+       struct zoneref *z;
         struct zone *zone;
-       int high_zoneidx = gfp_zone(gfp_mask);
-       pg_data_t *pgdat;
+       pg_data_t *pgdat = NULL;
  
         /*
          * Kernel threads should not be throttled as they may be indirectly
@@ -2551,10 +2557,34 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
         if (fatal_signal_pending(current))
                 goto out;
  
-       /* Check if the pfmemalloc reserves are ok */
-       first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
-       pgdat = zone->zone_pgdat;
-       if (pfmemalloc_watermark_ok(pgdat))
+       /*
+        * Check if the pfmemalloc reserves are ok by finding the first node
+        * with a usable ZONE_NORMAL or lower zone. The expectation is that
+        * GFP_KERNEL will be required for allocating network buffers when
+        * swapping over the network so ZONE_HIGHMEM is unusable.
+        *
+        * Throttling is based on the first usable node and throttled processes
+        * wait on a queue until kswapd makes progress and wakes them. There
+        * is an affinity then between processes waking up and where reclaim
+        * progress has been made assuming the process wakes on the same node.
+        * More importantly, processes running on remote nodes will not compete
+        * for remote pfmemalloc reserves and processes on different nodes
+        * should make reasonable progress.
+        */
+       for_each_zone_zonelist_nodemask(zone, z, zonelist,
+                                       gfp_mask, nodemask) {
+               if (zone_idx(zone) > ZONE_NORMAL)
+                       continue;
+
+               /* Throttle based on the first usable node */
+               pgdat = zone->zone_pgdat;
+               if (pfmemalloc_watermark_ok(pgdat))
+                       goto out;
+               break;
+       }
+
+       /* If no zone was usable by the allocation flags then do not throttle */
+       if (!pgdat)
                 goto out;
  
         /* Account for the throttling */
@@ -3285,7 +3315,10 @@ static int kswapd(void *p)
                 }
         }
  
+       tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
         current->reclaim_state = NULL;
+       lockdep_clear_current_reclaim_state();
+
         return 0;
  }
  
@@ -3315,27 +3348,6 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
         wake_up_interruptible(&pgdat->kswapd_wait);
  }
  
-/*
- * The reclaimable count would be mostly accurate.
- * The less reclaimable pages may be
- * - mlocked pages, which will be moved to unevictable list when encountered
- * - mapped pages, which may require several travels to be reclaimed
- * - dirty pages, which is not "instantly" reclaimable
- */
-unsigned long global_reclaimable_pages(void)
-{
-       int nr;
-
-       nr = global_page_state(NR_ACTIVE_FILE) +
-            global_page_state(NR_INACTIVE_FILE);
-
-       if (get_nr_swap_pages() > 0)
-               nr += global_page_state(NR_ACTIVE_ANON) +
-                     global_page_state(NR_INACTIVE_ANON);
-
-       return nr;
-}
-
  #ifdef CONFIG_HIBERNATION
  /*
   * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of