memcg,vmscan: do not break out targeted reclaim without reclaimed pages
[platform/adaptation/renesas_rcar/renesas_kernel.git] / mm / vmscan.c
index 196709f..4639909 100644 (file)
@@ -1638,6 +1638,13 @@ static int vmscan_swappiness(struct scan_control *sc)
        return mem_cgroup_swappiness(sc->target_mem_cgroup);
 }
 
+enum scan_balance {
+       SCAN_EQUAL,
+       SCAN_FRACT,
+       SCAN_ANON,
+       SCAN_FILE,
+};
+
 /*
  * Determine how aggressively the anon and file LRU lists should be
  * scanned.  The relative value of each set of LRU lists is determined
@@ -1650,15 +1657,16 @@ static int vmscan_swappiness(struct scan_control *sc)
 static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
                           unsigned long *nr)
 {
-       unsigned long anon, file, free;
+       struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
+       u64 fraction[2];
+       u64 denominator = 0;    /* gcc */
+       struct zone *zone = lruvec_zone(lruvec);
        unsigned long anon_prio, file_prio;
+       enum scan_balance scan_balance;
+       unsigned long anon, file, free;
+       bool force_scan = false;
        unsigned long ap, fp;
-       struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
-       u64 fraction[2], denominator;
        enum lru_list lru;
-       int noswap = 0;
-       bool force_scan = false;
-       struct zone *zone = lruvec_zone(lruvec);
 
        /*
         * If the zone or memcg is small, nr[l] can be 0.  This
@@ -1677,10 +1685,29 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 
        /* If we have no swap space, do not bother scanning anon pages. */
        if (!sc->may_swap || (nr_swap_pages <= 0)) {
-               noswap = 1;
-               fraction[0] = 0;
-               fraction[1] = 1;
-               denominator = 1;
+               scan_balance = SCAN_FILE;
+               goto out;
+       }
+
+       /*
+        * Global reclaim will swap to prevent OOM even with no
+        * swappiness, but memcg users want to use this knob to
+        * disable swapping for individual groups completely when
+        * using the memory controller's swap limit feature would be
+        * too expensive.
+        */
+       if (!global_reclaim(sc) && !vmscan_swappiness(sc)) {
+               scan_balance = SCAN_FILE;
+               goto out;
+       }
+
+       /*
+        * Do not apply any pressure balancing cleverness when the
+        * system is close to OOM, scan both anon and file equally
+        * (unless the swappiness setting disagrees with swapping).
+        */
+       if (!sc->priority && vmscan_swappiness(sc)) {
+               scan_balance = SCAN_EQUAL;
                goto out;
        }
 
@@ -1689,30 +1716,32 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
        file  = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
                get_lru_size(lruvec, LRU_INACTIVE_FILE);
 
+       /*
+        * If it's foreseeable that reclaiming the file cache won't be
+        * enough to get the zone back into a desirable shape, we have
+        * to swap.  Better start now and leave the - probably heavily
+        * thrashing - remaining file pages alone.
+        */
        if (global_reclaim(sc)) {
-               free  = zone_page_state(zone, NR_FREE_PAGES);
+               free = zone_page_state(zone, NR_FREE_PAGES);
                if (unlikely(file + free <= high_wmark_pages(zone))) {
-                       /*
-                        * If we have very few page cache pages, force-scan
-                        * anon pages.
-                        */
-                       fraction[0] = 1;
-                       fraction[1] = 0;
-                       denominator = 1;
-                       goto out;
-               } else if (!inactive_file_is_low_global(zone)) {
-                       /*
-                        * There is enough inactive page cache, do not
-                        * reclaim anything from the working set right now.
-                        */
-                       fraction[0] = 0;
-                       fraction[1] = 1;
-                       denominator = 1;
+                       scan_balance = SCAN_ANON;
                        goto out;
                }
        }
 
        /*
+        * There is enough inactive page cache, do not reclaim
+        * anything from the anonymous working set right now.
+        */
+       if (!inactive_file_is_low(lruvec)) {
+               scan_balance = SCAN_FILE;
+               goto out;
+       }
+
+       scan_balance = SCAN_FRACT;
+
+       /*
         * With swappiness at 100, anonymous and file have the same priority.
         * This scanning priority is essentially the inverse of IO cost.
         */
@@ -1759,19 +1788,92 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 out:
        for_each_evictable_lru(lru) {
                int file = is_file_lru(lru);
+               unsigned long size;
                unsigned long scan;
 
-               scan = get_lru_size(lruvec, lru);
-               if (sc->priority || noswap || !vmscan_swappiness(sc)) {
-                       scan >>= sc->priority;
-                       if (!scan && force_scan)
-                               scan = SWAP_CLUSTER_MAX;
+               size = get_lru_size(lruvec, lru);
+               scan = size >> sc->priority;
+
+               if (!scan && force_scan)
+                       scan = min(size, SWAP_CLUSTER_MAX);
+
+               switch (scan_balance) {
+               case SCAN_EQUAL:
+                       /* Scan lists relative to size */
+                       break;
+               case SCAN_FRACT:
+                       /*
+                        * Scan types proportional to swappiness and
+                        * their relative recent reclaim efficiency.
+                        */
                        scan = div64_u64(scan * fraction[file], denominator);
+                       break;
+               case SCAN_FILE:
+               case SCAN_ANON:
+                       /* Scan one type exclusively */
+                       if ((scan_balance == SCAN_FILE) != file)
+                               scan = 0;
+                       break;
+               default:
+                       /* Look ma, no brain */
+                       BUG();
                }
                nr[lru] = scan;
        }
 }
 
+/*
+ * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
+ */
+static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+       unsigned long nr[NR_LRU_LISTS];
+       unsigned long nr_to_scan;
+       enum lru_list lru;
+       unsigned long nr_reclaimed = 0;
+       unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+       struct blk_plug plug;
+
+       get_scan_count(lruvec, sc, nr);
+
+       blk_start_plug(&plug);
+       while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
+                                       nr[LRU_INACTIVE_FILE]) {
+               for_each_evictable_lru(lru) {
+                       if (nr[lru]) {
+                               nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
+                               nr[lru] -= nr_to_scan;
+
+                               nr_reclaimed += shrink_list(lru, nr_to_scan,
+                                                           lruvec, sc);
+                       }
+               }
+               /*
+                * On large memory systems, scan >> priority can become
+                * really large. This is fine for the starting priority;
+                * we want to put equal scanning pressure on each zone.
+                * However, if the VM has a harder time of freeing pages,
+                * with multiple processes reclaiming pages, the total
+                * freeing target can get unreasonably large.
+                */
+               if (nr_reclaimed >= nr_to_reclaim &&
+                   sc->priority < DEF_PRIORITY)
+                       break;
+       }
+       blk_finish_plug(&plug);
+       sc->nr_reclaimed += nr_reclaimed;
+
+       /*
+        * Even if we did not try to evict anon pages at all, we want to
+        * rebalance the anon lru active/inactive ratio.
+        */
+       if (inactive_anon_is_low(lruvec))
+               shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
+                                  sc, LRU_ACTIVE_ANON);
+
+       throttle_vm_writeout(sc->gfp_mask);
+}
+
 /* Use reclaim/compaction for costly allocs or under memory pressure */
 static bool in_reclaim_compaction(struct scan_control *sc)
 {
@@ -1790,7 +1892,7 @@ static bool in_reclaim_compaction(struct scan_control *sc)
  * calls try_to_compact_zone() that it will have enough free pages to succeed.
  * It will give up earlier than that if there is difficulty reclaiming pages.
  */
-static inline bool should_continue_reclaim(struct lruvec *lruvec,
+static inline bool should_continue_reclaim(struct zone *zone,
                                        unsigned long nr_reclaimed,
                                        unsigned long nr_scanned,
                                        struct scan_control *sc)
@@ -1830,15 +1932,15 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec,
         * inactive lists are large enough, continue reclaiming
         */
        pages_for_compaction = (2UL << sc->order);
-       inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE);
+       inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE);
        if (nr_swap_pages > 0)
-               inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON);
+               inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON);
        if (sc->nr_reclaimed < pages_for_compaction &&
                        inactive_lru_pages > pages_for_compaction)
                return true;
 
        /* If compaction would go ahead or the allocation would succeed, stop */
-       switch (compaction_suitable(lruvec_zone(lruvec), sc->order)) {
+       switch (compaction_suitable(zone, sc->order)) {
        case COMPACT_PARTIAL:
        case COMPACT_CONTINUE:
                return false;
@@ -1847,98 +1949,48 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec,
        }
 }
 
-/*
- * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
- */
-static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+static void shrink_zone(struct zone *zone, struct scan_control *sc)
 {
-       unsigned long nr[NR_LRU_LISTS];
-       unsigned long nr_to_scan;
-       enum lru_list lru;
        unsigned long nr_reclaimed, nr_scanned;
-       unsigned long nr_to_reclaim = sc->nr_to_reclaim;
-       struct blk_plug plug;
-
-restart:
-       nr_reclaimed = 0;
-       nr_scanned = sc->nr_scanned;
-       get_scan_count(lruvec, sc, nr);
-
-       blk_start_plug(&plug);
-       while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
-                                       nr[LRU_INACTIVE_FILE]) {
-               for_each_evictable_lru(lru) {
-                       if (nr[lru]) {
-                               nr_to_scan = min_t(unsigned long,
-                                                  nr[lru], SWAP_CLUSTER_MAX);
-                               nr[lru] -= nr_to_scan;
-
-                               nr_reclaimed += shrink_list(lru, nr_to_scan,
-                                                           lruvec, sc);
-                       }
-               }
-               /*
-                * On large memory systems, scan >> priority can become
-                * really large. This is fine for the starting priority;
-                * we want to put equal scanning pressure on each zone.
-                * However, if the VM has a harder time of freeing pages,
-                * with multiple processes reclaiming pages, the total
-                * freeing target can get unreasonably large.
-                */
-               if (nr_reclaimed >= nr_to_reclaim &&
-                   sc->priority < DEF_PRIORITY)
-                       break;
-       }
-       blk_finish_plug(&plug);
-       sc->nr_reclaimed += nr_reclaimed;
 
-       /*
-        * Even if we did not try to evict anon pages at all, we want to
-        * rebalance the anon lru active/inactive ratio.
-        */
-       if (inactive_anon_is_low(lruvec))
-               shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
-                                  sc, LRU_ACTIVE_ANON);
-
-       /* reclaim/compaction might need reclaim to continue */
-       if (should_continue_reclaim(lruvec, nr_reclaimed,
-                                   sc->nr_scanned - nr_scanned, sc))
-               goto restart;
+       do {
+               struct mem_cgroup *root = sc->target_mem_cgroup;
+               struct mem_cgroup_reclaim_cookie reclaim = {
+                       .zone = zone,
+                       .priority = sc->priority,
+               };
+               struct mem_cgroup *memcg;
 
-       throttle_vm_writeout(sc->gfp_mask);
-}
+               nr_reclaimed = sc->nr_reclaimed;
+               nr_scanned = sc->nr_scanned;
 
-static void shrink_zone(struct zone *zone, struct scan_control *sc)
-{
-       struct mem_cgroup *root = sc->target_mem_cgroup;
-       struct mem_cgroup_reclaim_cookie reclaim = {
-               .zone = zone,
-               .priority = sc->priority,
-       };
-       struct mem_cgroup *memcg;
+               memcg = mem_cgroup_iter(root, NULL, &reclaim);
+               do {
+                       struct lruvec *lruvec;
 
-       memcg = mem_cgroup_iter(root, NULL, &reclaim);
-       do {
-               struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+                       lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 
-               shrink_lruvec(lruvec, sc);
+                       shrink_lruvec(lruvec, sc);
 
-               /*
-                * Limit reclaim has historically picked one memcg and
-                * scanned it with decreasing priority levels until
-                * nr_to_reclaim had been reclaimed.  This priority
-                * cycle is thus over after a single memcg.
-                *
-                * Direct reclaim and kswapd, on the other hand, have
-                * to scan all memory cgroups to fulfill the overall
-                * scan target for the zone.
-                */
-               if (!global_reclaim(sc)) {
-                       mem_cgroup_iter_break(root, memcg);
-                       break;
-               }
-               memcg = mem_cgroup_iter(root, memcg, &reclaim);
-       } while (memcg);
+                       /*
+                        * Direct reclaim and kswapd have to scan all memory
+                        * cgroups to fulfill the overall scan target for the
+                        * zone.
+                        *
+                        * Limit reclaim, on the other hand, only cares about
+                        * nr_to_reclaim pages to be reclaimed and it will
+                        * retry with decreasing priority if one round over the
+                        * whole hierarchy is not sufficient.
+                        */
+                       if (!global_reclaim(sc) &&
+                                       sc->nr_reclaimed >= sc->nr_to_reclaim) {
+                               mem_cgroup_iter_break(root, memcg);
+                               break;
+                       }
+                       memcg = mem_cgroup_iter(root, memcg, &reclaim);
+               } while (memcg);
+       } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
+                                        sc->nr_scanned - nr_scanned, sc));
 }
 
 /* Returns true if compaction should go ahead for a high-order request */
@@ -3280,8 +3332,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
                .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
                .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
                .may_swap = 1,
-               .nr_to_reclaim = max_t(unsigned long, nr_pages,
-                                      SWAP_CLUSTER_MAX),
+               .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
                .gfp_mask = gfp_mask,
                .order = order,
                .priority = ZONE_RECLAIM_PRIORITY,