staging: zram: drop zram_stat_dec/inc functions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 48550c6..16b42af 100644
@@ -1177,7 +1177,11 @@ int isolate_lru_page(struct page *page)
 }
 
 /*
- * Are there way too many processes in the direct reclaim path already?
+ * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
+ * then get rescheduled. When there are massive numbers of tasks doing page
+ * allocation, such sleeping direct reclaimers may keep piling up on each CPU;
+ * the LRU list will shrink and be scanned faster than necessary, leading to
+ * unnecessary swapping, thrashing and OOM.
  */
 static int too_many_isolated(struct zone *zone, int file,
                struct scan_control *sc)
@@ -1198,6 +1202,14 @@ static int too_many_isolated(struct zone *zone, int file,
                isolated = zone_page_state(zone, NR_ISOLATED_ANON);
        }
 
+       /*
+        * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so that
+        * they won't get blocked behind normal direct reclaimers and end up in
+        * a circular deadlock.
+        */
+       if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
+               inactive >>= 3;
+
        return isolated > inactive;
 }
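
For illustration only, a standalone userspace sketch of the throttling rule
above. The struct and numbers are hypothetical, not the kernel's API; the
point is that callers holding both __GFP_IO and __GFP_FS are throttled once
isolated pages exceed an eighth of the inactive list, while GFP_NOIO/GFP_NOFS
callers keep the original, looser limit.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the per-zone counters read by too_many_isolated(). */
struct zone_counts {
        unsigned long inactive;         /* NR_INACTIVE_ANON or NR_INACTIVE_FILE */
        unsigned long isolated;         /* NR_ISOLATED_ANON or NR_ISOLATED_FILE */
};

static bool too_many_isolated_sketch(struct zone_counts c, bool has_gfp_iofs)
{
        unsigned long inactive = c.inactive;

        /* Normal direct reclaimers (__GFP_IO | __GFP_FS) hit the limit 8x
         * sooner, so NOIO/NOFS callers are never stuck behind them. */
        if (has_gfp_iofs)
                inactive >>= 3;

        return c.isolated > inactive;
}

int main(void)
{
        struct zone_counts c = { .inactive = 800, .isolated = 150 };

        /* A normal reclaimer is throttled (150 > 800/8), while a NOFS/NOIO
         * caller is not (150 <= 800). */
        printf("normal: %d, nofs/noio: %d\n",
               too_many_isolated_sketch(c, true),
               too_many_isolated_sketch(c, false));
        return 0;
}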
 
@@ -1679,13 +1691,24 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 
        if (global_reclaim(sc)) {
                free  = zone_page_state(zone, NR_FREE_PAGES);
-               /* If we have very few page cache pages,
-                  force-scan anon pages. */
                if (unlikely(file + free <= high_wmark_pages(zone))) {
+                       /*
+                        * If we have very few page cache pages, force-scan
+                        * anon pages.
+                        */
                        fraction[0] = 1;
                        fraction[1] = 0;
                        denominator = 1;
                        goto out;
+               } else if (!inactive_file_is_low_global(zone)) {
+                       /*
+                        * There is enough inactive page cache, so do not
+                        * reclaim anything from the working set right now.
+                        */
+                       fraction[0] = 0;
+                       fraction[1] = 1;
+                       denominator = 1;
+                       goto out;
                }
        }
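
A rough sketch of the scan-target decision this hunk implements (illustrative
names, not the kernel's get_scan_count() signature): fraction[0]/fraction[1]
weight anon vs. file scanning, and the two early exits above pin that weight
entirely to one side.

#include <stdbool.h>

enum scan_target { SCAN_ANON_ONLY, SCAN_FILE_ONLY, SCAN_PROPORTIONAL };

static enum scan_target pick_scan_target(unsigned long file_pages,
                                         unsigned long free_pages,
                                         unsigned long high_wmark,
                                         bool inactive_file_low)
{
        /* Nearly out of page cache: force-scan anon (fraction = {1, 0}). */
        if (file_pages + free_pages <= high_wmark)
                return SCAN_ANON_ONLY;

        /* Plenty of inactive page cache: leave the anon working set alone
         * (fraction = {0, 1}). */
        if (!inactive_file_low)
                return SCAN_FILE_ONLY;

        /* Otherwise fall through to the usual recent_rotated/recent_scanned
         * proportional split. */
        return SCAN_PROPORTIONAL;
}

int main(void)
{
        /* Plenty of inactive file pages: anon is left alone. */
        return pick_scan_target(1000, 500, 100, false) == SCAN_FILE_ONLY ? 0 : 1;
}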
 
@@ -1752,7 +1775,7 @@ out:
 /* Use reclaim/compaction for costly allocs or under memory pressure */
 static bool in_reclaim_compaction(struct scan_control *sc)
 {
-       if (COMPACTION_BUILD && sc->order &&
+       if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
                        (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
                         sc->priority < DEF_PRIORITY - 2))
                return true;
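
The COMPACTION_BUILD to IS_ENABLED(CONFIG_COMPACTION) conversions in this
patch are mechanical: IS_ENABLED() expands to a constant 1 when the option is
built in (or modular) and 0 otherwise, so the compaction branch is still
parsed and type-checked in !CONFIG_COMPACTION builds and then discarded as
dead code. Below is a standalone sketch of the same pattern, using a
hypothetical 0/1 macro rather than the real <linux/kconfig.h> helper.

/* Pretend CONFIG_COMPACTION=n; flip to 1 to model a =y build. */
#define COMPACTION_ENABLED 0

#define PAGE_ALLOC_COSTLY_ORDER 3
#define DEF_PRIORITY 12

static int in_reclaim_compaction_sketch(int order, int priority)
{
        /* Constant-folds away when COMPACTION_ENABLED is 0, but the branch
         * still has to compile, unlike an #ifdef'd-out block. */
        if (COMPACTION_ENABLED && order &&
            (order > PAGE_ALLOC_COSTLY_ORDER || priority < DEF_PRIORITY - 2))
                return 1;
        return 0;
}

int main(void)
{
        return in_reclaim_compaction_sketch(4, DEF_PRIORITY);  /* 0 with =n */
}
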
@@ -2005,7 +2028,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
                        if (zone->all_unreclaimable &&
                                        sc->priority != DEF_PRIORITY)
                                continue;       /* Let kswapd poll it */
-                       if (COMPACTION_BUILD) {
+                       if (IS_ENABLED(CONFIG_COMPACTION)) {
                                /*
                                 * If we already have plenty of memory free for
                                 * compaction in this zone, don't free any more.
@@ -2207,9 +2230,12 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
  * Throttle direct reclaimers if backing storage is backed by the network
  * and the PFMEMALLOC reserve for the preferred node is getting dangerously
  * depleted. kswapd will continue to make progress and wake the processes
- * when the low watermark is reached
+ * when the low watermark is reached.
+ *
+ * Returns true if a fatal signal was delivered during throttling. If this
+ * happens, the page allocator should not consider triggering the OOM killer.
  */
-static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
+static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
                                        nodemask_t *nodemask)
 {
        struct zone *zone;
@@ -2224,13 +2250,20 @@ static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
         * processes to block on log_wait_commit().
         */
        if (current->flags & PF_KTHREAD)
-               return;
+               goto out;
+
+       /*
+        * If a fatal signal is pending, this process should not throttle.
+        * It should return quickly so it can exit and free its memory.
+        */
+       if (fatal_signal_pending(current))
+               goto out;
 
        /* Check if the pfmemalloc reserves are ok */
        first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
        pgdat = zone->zone_pgdat;
        if (pfmemalloc_watermark_ok(pgdat))
-               return;
+               goto out;
 
        /* Account for the throttling */
        count_vm_event(PGSCAN_DIRECT_THROTTLE);
@@ -2246,12 +2279,20 @@ static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
        if (!(gfp_mask & __GFP_FS)) {
                wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
                        pfmemalloc_watermark_ok(pgdat), HZ);
-               return;
+
+               goto check_pending;
        }
 
        /* Throttle until kswapd wakes the process */
        wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
                pfmemalloc_watermark_ok(pgdat));
+
+check_pending:
+       if (fatal_signal_pending(current))
+               return true;
+
+out:
+       return false;
 }
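
In other words, the throttling path now has a single exit that tells the
caller whether the task was killed while it slept. A much-simplified model of
the new contract follows (hypothetical names, with booleans standing in for
fatal_signal_pending()/pfmemalloc_watermark_ok(), not the kernel API):

#include <stdbool.h>

static bool throttle_direct_reclaim_sketch(bool fatal_signal_on_entry,
                                           bool reserves_ok,
                                           bool fatal_signal_after_sleep)
{
        if (fatal_signal_on_entry)      /* never delay a task that is exiting */
                return false;
        if (reserves_ok)                /* nothing to throttle on */
                return false;

        /* ...sleep on pfmemalloc_wait until kswapd refills the reserves... */

        /* Report only a fatal signal that arrived while throttled. */
        return fatal_signal_after_sleep;
}

static unsigned long try_to_free_pages_sketch(bool throttled_to_death)
{
        if (throttled_to_death)
                return 1;       /* non-zero: the allocator skips the OOM killer */

        /* ...run the normal direct reclaim pass... */
        return 0;
}

int main(void)
{
        return try_to_free_pages_sketch(
                throttle_direct_reclaim_sketch(false, false, true)) == 1 ? 0 : 1;
}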
 
 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
@@ -2273,13 +2314,12 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
                .gfp_mask = sc.gfp_mask,
        };
 
-       throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
-
        /*
-        * Do not enter reclaim if fatal signal is pending. 1 is returned so
-        * that the page allocator does not consider triggering OOM
+        * Do not enter reclaim if a fatal signal was delivered while
+        * throttled. 1 is returned so that the page allocator does not OOM
+        * kill at this point.
         */
-       if (fatal_signal_pending(current))
+       if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask))
                return 1;
 
        trace_mm_vmscan_direct_reclaim_begin(order,
@@ -2397,13 +2437,31 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
        } while (memcg);
 }
 
+static bool zone_balanced(struct zone *zone, int order,
+                         unsigned long balance_gap, int classzone_idx)
+{
+       if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
+                                   balance_gap, classzone_idx, 0))
+               return false;
+
+       if (IS_ENABLED(CONFIG_COMPACTION) && order &&
+           !compaction_suitable(zone, order))
+               return false;
+
+       return true;
+}
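
Put differently, zone_balanced() folds the old zone_watermark_ok_safe() test
and the compaction_suitable() check into one predicate. A simplified,
self-contained model (the parameters are stand-ins for the kernel's zone
state, not its types):

#include <stdbool.h>

static bool zone_balanced_sketch(unsigned long free_pages,
                                 unsigned long high_wmark,
                                 unsigned long balance_gap,
                                 int order,
                                 bool compaction_enabled,
                                 bool compaction_suitable)
{
        /* Must clear the high watermark plus any balance gap... */
        if (free_pages < high_wmark + balance_gap)
                return false;

        /* ...and, for high-order requests with compaction built in, leave
         * enough free memory for compaction to have a chance. */
        if (compaction_enabled && order && !compaction_suitable)
                return false;

        return true;
}

int main(void)
{
        /* Enough free pages for the watermark, but not for compaction of a
         * high-order request: the zone no longer counts as balanced. */
        return zone_balanced_sketch(2048, 1024, 0, 4, true, false) ? 1 : 0;
}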
+
 /*
- * pgdat_balanced is used when checking if a node is balanced for high-order
- * allocations. Only zones that meet watermarks and are in a zone allowed
- * by the callers classzone_idx are added to balanced_pages. The total of
- * balanced pages must be at least 25% of the zones allowed by classzone_idx
- * for the node to be considered balanced. Forcing all zones to be balanced
- * for high orders can cause excessive reclaim when there are imbalanced zones.
+ * pgdat_balanced() is used when checking if a node is balanced.
+ *
+ * For order-0, all zones must be balanced!
+ *
+ * For high-order allocations, only zones that meet watermarks and are allowed
+ * by the caller's classzone_idx are added to balanced_pages. The total of
+ * balanced pages must be at least 25% of the pages in the zones allowed by
+ * classzone_idx for the node to be considered balanced. Forcing all zones to
+ * be balanced for high orders can cause excessive reclaim when there are
+ * imbalanced zones.
  * The choice of 25% is due to
  *   o a 16M DMA zone that is balanced will not balance a zone on any
  *     reasonable sized machine
@@ -2413,17 +2471,43 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
  *     Similarly, on x86-64 the Normal zone would need to be at least 1G
  *     to balance a node on its own. These seemed like reasonable ratios.
  */
-static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
-                                               int classzone_idx)
+static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
 {
        unsigned long present_pages = 0;
+       unsigned long balanced_pages = 0;
        int i;
 
-       for (i = 0; i <= classzone_idx; i++)
-               present_pages += pgdat->node_zones[i].present_pages;
+       /* Check the watermark levels */
+       for (i = 0; i <= classzone_idx; i++) {
+               struct zone *zone = pgdat->node_zones + i;
+
+               if (!populated_zone(zone))
+                       continue;
+
+               present_pages += zone->present_pages;
 
-       /* A special case here: if zone has no page, we think it's balanced */
-       return balanced_pages >= (present_pages >> 2);
+               /*
+                * A special case here:
+                *
+                * balance_pgdat() skips over all_unreclaimable after
+                * DEF_PRIORITY. Effectively, it considers them balanced so
+                * they must be considered balanced here as well!
+                */
+               if (zone->all_unreclaimable) {
+                       balanced_pages += zone->present_pages;
+                       continue;
+               }
+
+               if (zone_balanced(zone, order, 0, i))
+                       balanced_pages += zone->present_pages;
+               else if (!order)
+                       return false;
+       }
+
+       if (order)
+               return balanced_pages >= (present_pages >> 2);
+       else
+               return true;
 }
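
A worked example of the 25% rule, as a toy standalone model (the struct below
is illustrative, not the kernel's pg_data_t, and "balanced" stands in for the
result of zone_balanced()): for order-0 every populated zone must pass, while
for high-order requests it is enough that balanced zones hold a quarter of the
node's pages, with all_unreclaimable zones counted as balanced just as
balance_pgdat() treats them.

#include <stdbool.h>
#include <stdio.h>

struct toy_zone {
        unsigned long present_pages;
        bool populated;
        bool all_unreclaimable;
        bool balanced;          /* outcome of zone_balanced() */
};

static bool pgdat_balanced_sketch(const struct toy_zone *zones, int nr,
                                  int order)
{
        unsigned long present = 0, balanced = 0;
        int i;

        for (i = 0; i < nr; i++) {
                if (!zones[i].populated)
                        continue;
                present += zones[i].present_pages;

                if (zones[i].all_unreclaimable || zones[i].balanced)
                        balanced += zones[i].present_pages;
                else if (!order)
                        return false;   /* order-0: one bad zone fails the node */
        }

        return order ? balanced >= (present >> 2) : true;
}

int main(void)
{
        /* A balanced 16M DMA zone cannot balance a 4G Normal zone on its
         * own: 4096 balanced pages are well under 25% of the node. */
        struct toy_zone node[] = {
                { .present_pages = 4096,    .populated = true, .balanced = true  },
                { .present_pages = 1048576, .populated = true, .balanced = false },
        };

        printf("order-9 balanced? %d\n",
               pgdat_balanced_sketch(node, 2, 9));      /* prints 0 */
        return 0;
}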
 
 /*
@@ -2435,10 +2519,6 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
 static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
                                        int classzone_idx)
 {
-       int i;
-       unsigned long balanced = 0;
-       bool all_zones_ok = true;
-
        /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
        if (remaining)
                return false;
@@ -2457,40 +2537,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
                return false;
        }
 
-       /* Check the watermark levels */
-       for (i = 0; i <= classzone_idx; i++) {
-               struct zone *zone = pgdat->node_zones + i;
-
-               if (!populated_zone(zone))
-                       continue;
-
-               /*
-                * balance_pgdat() skips over all_unreclaimable after
-                * DEF_PRIORITY. Effectively, it considers them balanced so
-                * they must be considered balanced here as well if kswapd
-                * is to sleep
-                */
-               if (zone->all_unreclaimable) {
-                       balanced += zone->present_pages;
-                       continue;
-               }
-
-               if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
-                                                       i, 0))
-                       all_zones_ok = false;
-               else
-                       balanced += zone->present_pages;
-       }
-
-       /*
-        * For high-order requests, the balanced zones must contain at least
-        * 25% of the nodes pages for kswapd to sleep. For order-0, all zones
-        * must be balanced
-        */
-       if (order)
-               return pgdat_balanced(pgdat, balanced, classzone_idx);
-       else
-               return all_zones_ok;
+       return pgdat_balanced(pgdat, order, classzone_idx);
 }
 
 /*
@@ -2517,8 +2564,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
 static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
                                                        int *classzone_idx)
 {
-       int all_zones_ok;
-       unsigned long balanced;
+       struct zone *unbalanced_zone;
        int i;
        int end_zone = 0;       /* Inclusive.  0 = ZONE_DMA */
        unsigned long total_scanned;
@@ -2551,8 +2597,7 @@ loop_again:
                unsigned long lru_pages = 0;
                int has_under_min_watermark_zone = 0;
 
-               all_zones_ok = 1;
-               balanced = 0;
+               unbalanced_zone = NULL;
 
                /*
                 * Scan in the highmem->dma direction for the highest
@@ -2585,8 +2630,7 @@ loop_again:
                                break;
                        }
 
-                       if (!zone_watermark_ok_safe(zone, order,
-                                       high_wmark_pages(zone), 0, 0)) {
+                       if (!zone_balanced(zone, order, 0, 0)) {
                                end_zone = i;
                                break;
                        } else {
@@ -2656,15 +2700,14 @@ loop_again:
                         * Do not reclaim more than needed for compaction.
                         */
                        testorder = order;
-                       if (COMPACTION_BUILD && order &&
+                       if (IS_ENABLED(CONFIG_COMPACTION) && order &&
                                        compaction_suitable(zone, order) !=
                                                COMPACT_SKIPPED)
                                testorder = 0;
 
                        if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
-                                   !zone_watermark_ok_safe(zone, testorder,
-                                       high_wmark_pages(zone) + balance_gap,
-                                       end_zone, 0)) {
+                           !zone_balanced(zone, testorder,
+                                          balance_gap, end_zone)) {
                                shrink_zone(zone, &sc);
 
                                reclaim_state->reclaimed_slab = 0;
@@ -2691,9 +2734,8 @@ loop_again:
                                continue;
                        }
 
-                       if (!zone_watermark_ok_safe(zone, testorder,
-                                       high_wmark_pages(zone), end_zone, 0)) {
-                               all_zones_ok = 0;
+                       if (!zone_balanced(zone, testorder, 0, end_zone)) {
+                               unbalanced_zone = zone;
                                /*
                                 * We are still under min water mark.  This
                                 * means that we have a GFP_ATOMIC allocation
@@ -2711,8 +2753,6 @@ loop_again:
                                 * speculatively avoid congestion waits
                                 */
                                zone_clear_flag(zone, ZONE_CONGESTED);
-                               if (i <= *classzone_idx)
-                                       balanced += zone->present_pages;
                        }
 
                }
@@ -2726,7 +2766,7 @@ loop_again:
                                pfmemalloc_watermark_ok(pgdat))
                        wake_up(&pgdat->pfmemalloc_wait);
 
-               if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
+               if (pgdat_balanced(pgdat, order, *classzone_idx))
                        break;          /* kswapd: all done */
                /*
                 * OK, kswapd is getting into trouble.  Take a nap, then take
@@ -2735,8 +2775,8 @@ loop_again:
                if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) {
                        if (has_under_min_watermark_zone)
                                count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
-                       else
-                               congestion_wait(BLK_RW_ASYNC, HZ/10);
+                       else if (unbalanced_zone)
+                               wait_iff_congested(unbalanced_zone, BLK_RW_ASYNC, HZ/10);
                }
 
                /*
@@ -2750,12 +2790,7 @@ loop_again:
        } while (--sc.priority >= 0);
 out:
 
-       /*
-        * order-0: All zones must meet high watermark for a balanced node
-        * high-order: Balanced zones must make up at least 25% of the node
-        *             for the node to be balanced
-        */
-       if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
+       if (!pgdat_balanced(pgdat, order, *classzone_idx)) {
                cond_resched();
 
                try_to_freeze();
@@ -2797,29 +2832,10 @@ out:
                        if (!populated_zone(zone))
                                continue;
 
-                       if (zone->all_unreclaimable &&
-                           sc.priority != DEF_PRIORITY)
-                               continue;
-
-                       /* Would compaction fail due to lack of free memory? */
-                       if (COMPACTION_BUILD &&
-                           compaction_suitable(zone, order) == COMPACT_SKIPPED)
-                               goto loop_again;
-
-                       /* Confirm the zone is balanced for order-0 */
-                       if (!zone_watermark_ok(zone, 0,
-                                       high_wmark_pages(zone), 0, 0)) {
-                               order = sc.order = 0;
-                               goto loop_again;
-                       }
-
                        /* Check if the memory needs to be defragmented. */
                        if (zone_watermark_ok(zone, order,
                                    low_wmark_pages(zone), *classzone_idx, 0))
                                zones_need_compaction = 0;
-
-                       /* If balanced, clear the congested flag */
-                       zone_clear_flag(zone, ZONE_CONGESTED);
                }
 
                if (zones_need_compaction)
@@ -2944,7 +2960,7 @@ static int kswapd(void *p)
        classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
        balanced_classzone_idx = classzone_idx;
        for ( ; ; ) {
-               int ret;
+               bool ret;
 
                /*
                 * If the last balance_pgdat was unsuccessful it's unlikely a
@@ -3112,7 +3128,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
        int nid;
 
        if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
-               for_each_node_state(nid, N_HIGH_MEMORY) {
+               for_each_node_state(nid, N_MEMORY) {
                        pg_data_t *pgdat = NODE_DATA(nid);
                        const struct cpumask *mask;
 
@@ -3168,7 +3184,7 @@ static int __init kswapd_init(void)
        int nid;
 
        swap_setup();
-       for_each_node_state(nid, N_HIGH_MEMORY)
+       for_each_node_state(nid, N_MEMORY)
                kswapd_run(nid);
        hotcpu_notifier(cpu_callback, 0);
        return 0;