mm: vmscan: check if reclaim should really abort even if compaction_ready() is true...
profile/ivi/kernel-adaptation-intel-automotive.git: mm/vmscan.c
index 130fa32..f8c96c7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -250,33 +250,61 @@ unsigned long shrink_slab(struct shrink_control *shrink,
                unsigned long long delta;
                unsigned long total_scan;
                unsigned long max_pass;
+               int shrink_ret = 0;
+               long nr;
+               long new_nr;
 
+               /*
+                * copy the current shrinker scan count into a local variable
+                * and zero it so that other concurrent shrinker invocations
+                * don't also do this scanning work.
+                */
+               do {
+                       nr = shrinker->nr;
+               } while (cmpxchg(&shrinker->nr, nr, 0) != nr);
+
+               total_scan = nr;
                max_pass = do_shrinker_shrink(shrinker, shrink, 0);
                delta = (4 * nr_pages_scanned) / shrinker->seeks;
                delta *= max_pass;
                do_div(delta, lru_pages + 1);
-               shrinker->nr += delta;
-               if (shrinker->nr < 0) {
+               total_scan += delta;
+               if (total_scan < 0) {
                        printk(KERN_ERR "shrink_slab: %pF negative objects to "
                               "delete nr=%ld\n",
-                              shrinker->shrink, shrinker->nr);
-                       shrinker->nr = max_pass;
+                              shrinker->shrink, total_scan);
+                       total_scan = max_pass;
                }
 
                /*
+                * We need to avoid excessive windup on filesystem shrinkers
+                * due to large numbers of GFP_NOFS allocations causing the
+                * shrinkers to return -1 all the time. This results in a large
+                * nr being built up so when a shrink that can do some work
+                * comes along it empties the entire cache due to nr >>>
+                * max_pass.  This is bad for sustaining a working set in
+                * memory.
+                *
+                * Hence only allow the shrinker to scan the entire cache when
+                * a large delta change is calculated directly.
+                */
+               if (delta < max_pass / 4)
+                       total_scan = min(total_scan, max_pass / 2);
+
+               /*
                 * Avoid risking looping forever due to too large nr value:
                 * never try to free more than twice the estimate number of
                 * freeable entries.
                 */
-               if (shrinker->nr > max_pass * 2)
-                       shrinker->nr = max_pass * 2;
+               if (total_scan > max_pass * 2)
+                       total_scan = max_pass * 2;
 
-               total_scan = shrinker->nr;
-               shrinker->nr = 0;
+               trace_mm_shrink_slab_start(shrinker, shrink, nr,
+                                       nr_pages_scanned, lru_pages,
+                                       max_pass, delta, total_scan);
 
                while (total_scan >= SHRINK_BATCH) {
                        long this_scan = SHRINK_BATCH;
-                       int shrink_ret;
                        int nr_before;
 
                        nr_before = do_shrinker_shrink(shrinker, shrink, 0);
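The proportional scan pressure computed above can be checked with plain arithmetic. Below is a minimal userspace sketch of the same calculation; the input values are invented for illustration and do_div() is replaced by ordinary division, so this is not kernel code:

#include <stdio.h>

/* Hypothetical inputs, chosen only to make the arithmetic visible. */
#define NR_PAGES_SCANNED        1024UL  /* pages the caller scanned on the LRUs */
#define LRU_PAGES               65536UL /* LRU pages in the zones scanned */
#define SEEKS                   2UL     /* shrinker->seeks (DEFAULT_SEEKS) */
#define MAX_PASS                8192UL  /* objects the shrinker reports as freeable */

int main(void)
{
        unsigned long long delta;
        unsigned long total_scan = 30000;       /* pretend this much work was deferred */

        /* Same proportional-pressure formula as shrink_slab() above. */
        delta = (4 * NR_PAGES_SCANNED) / SEEKS;
        delta *= MAX_PASS;
        delta /= LRU_PAGES + 1;
        total_scan += delta;

        /* Windup guard: a small delta must not empty the whole cache. */
        if (delta < MAX_PASS / 4 && total_scan > MAX_PASS / 2)
                total_scan = MAX_PASS / 2;

        /* Never try to free more than twice the freeable estimate. */
        if (total_scan > MAX_PASS * 2)
                total_scan = MAX_PASS * 2;

        printf("delta=%llu total_scan=%lu\n", delta, total_scan);
        return 0;
}

With these inputs the windup guard fires: delta is only 255, so the 30000 objects of deferred work are capped at max_pass / 2 = 4096 rather than being allowed to wipe out the cache in one pass.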
@@ -292,7 +320,19 @@ unsigned long shrink_slab(struct shrink_control *shrink,
                        cond_resched();
                }
 
-               shrinker->nr += total_scan;
+               /*
+                * move the unused scan count back into the shrinker in a
+                * manner that handles concurrent updates. If we exhausted the
+                * scan, there is no need to do an update.
+                */
+               do {
+                       nr = shrinker->nr;
+                       new_nr = total_scan + nr;
+                       if (total_scan <= 0)
+                               break;
+               } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
+
+               trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
        }
        up_read(&shrinker_rwsem);
 out:
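The two cmpxchg() loops above implement a claim/return protocol on shrinker->nr: a caller atomically takes all currently deferred work (leaving zero behind for concurrent callers) and, after scanning, atomically adds back whatever it did not get through. A self-contained sketch of the same pattern using C11 atomics; atomic_compare_exchange_weak() stands in for the kernel's cmpxchg(), and the numbers are made up:

#include <stdatomic.h>
#include <stdio.h>

/* Stand-in for shrinker->nr: deferred scan work shared between callers. */
static _Atomic long deferred_work = 500;

/* Claim all currently deferred work, leaving zero behind for others. */
static long claim_deferred(void)
{
        long nr = atomic_load(&deferred_work);

        /* Retry until we swap the value we read for 0. */
        while (!atomic_compare_exchange_weak(&deferred_work, &nr, 0))
                ;       /* nr was reloaded by the failed exchange */
        return nr;
}

/* Give back whatever we did not get around to scanning. */
static void return_unused(long unused)
{
        long nr, new_nr;

        if (unused <= 0)
                return;         /* exhausted the scan, nothing to return */
        nr = atomic_load(&deferred_work);
        do {
                new_nr = nr + unused;
        } while (!atomic_compare_exchange_weak(&deferred_work, &nr, new_nr));
}

int main(void)
{
        long total_scan = claim_deferred() + 300;       /* claimed + new delta */

        total_scan -= 600;              /* pretend we scanned 600 objects */
        return_unused(total_scan);
        printf("deferred now %ld\n", atomic_load(&deferred_work));
        return 0;
}

Here 500 deferred objects are claimed, 300 of new pressure are added, 600 are "scanned", and the remaining 200 are handed back for the next caller, mirroring how shrink_slab() never loses deferred work to a concurrent invocation.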
@@ -972,23 +1012,27 @@ keep_lumpy:
  *
  * returns 0 on success, -ve errno on failure.
  */
-int __isolate_lru_page(struct page *page, int mode, int file)
+int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
 {
+       bool all_lru_mode;
        int ret = -EINVAL;
 
        /* Only take pages on the LRU. */
        if (!PageLRU(page))
                return ret;
 
+       all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) ==
+               (ISOLATE_ACTIVE|ISOLATE_INACTIVE);
+
        /*
         * When checking the active state, we need to be sure we are
         * dealing with comparible boolean values.  Take the logical not
         * of each.
         */
-       if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
+       if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))
                return ret;
 
-       if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file)
+       if (!all_lru_mode && !!page_is_file_cache(page) != file)
                return ret;
 
        /*
@@ -1001,6 +1045,43 @@ int __isolate_lru_page(struct page *page, int mode, int file)
 
        ret = -EBUSY;
 
+       /*
+        * To minimise LRU disruption, the caller can indicate that it only
+        * wants to isolate pages it will be able to operate on without
+        * blocking - clean pages for the most part.
+        *
+        * ISOLATE_CLEAN means that only clean pages should be isolated. This
+        * is used by reclaim when it cannot write to backing storage.
+        *
+        * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants pages
+        * that can be migrated without blocking.
+        */
+       if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
+               /* All the caller can do on PageWriteback is block */
+               if (PageWriteback(page))
+                       return ret;
+
+               if (PageDirty(page)) {
+                       struct address_space *mapping;
+
+                       /* ISOLATE_CLEAN means only clean pages */
+                       if (mode & ISOLATE_CLEAN)
+                               return ret;
+
+                       /*
+                        * Only pages without mappings or that have a
+                        * ->migratepage callback are possible to migrate
+                        * without blocking
+                        */
+                       mapping = page_mapping(page);
+                       if (mapping && !mapping->a_ops->migratepage)
+                               return ret;
+               }
+       }
+
+       if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
+               return ret;
+
        if (likely(get_page_unless_zero(page))) {
                /*
                 * Be careful not to clear PageLRU until after we're
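The new isolate_mode_t is a plain bitmask, so the LRU selection (active, inactive, or both) can be combined freely with the behavioural hints added above (unmapped-only, clean-only, migratable-without-blocking). The sketch below mirrors the all_lru_mode test with stand-alone flag values; the bit assignments are illustrative placeholders, not the kernel's real definitions:

#include <stdio.h>
#include <stdbool.h>

/* Illustrative stand-ins for the kernel's isolate_mode_t flag bits. */
typedef unsigned int isolate_mode_t;
#define ISOLATE_INACTIVE        ((isolate_mode_t)0x1)   /* isolate inactive pages */
#define ISOLATE_ACTIVE          ((isolate_mode_t)0x2)   /* isolate active pages */
#define ISOLATE_UNMAPPED        ((isolate_mode_t)0x4)   /* only unmapped pages */
#define ISOLATE_CLEAN           ((isolate_mode_t)0x8)   /* only clean pages */
#define ISOLATE_ASYNC_MIGRATE   ((isolate_mode_t)0x10)  /* non-blocking migration only */

/* Mirrors the all_lru_mode test in __isolate_lru_page(). */
static bool all_lru_mode(isolate_mode_t mode)
{
        return (mode & (ISOLATE_ACTIVE | ISOLATE_INACTIVE)) ==
               (ISOLATE_ACTIVE | ISOLATE_INACTIVE);
}

int main(void)
{
        /* Lumpy reclaim of an inactive list with writeback disallowed. */
        isolate_mode_t mode = ISOLATE_INACTIVE | ISOLATE_ACTIVE | ISOLATE_CLEAN;

        printf("both LRUs: %d, clean only: %d, unmapped only: %d\n",
               all_lru_mode(mode),
               !!(mode & ISOLATE_CLEAN),
               !!(mode & ISOLATE_UNMAPPED));
        return 0;
}

Later hunks in this patch build exactly such masks from scan_control: lumpy reclaim sets both LRU bits, !may_unmap adds ISOLATE_UNMAPPED, and !may_writepage adds ISOLATE_CLEAN.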
@@ -1036,7 +1117,8 @@ int __isolate_lru_page(struct page *page, int mode, int file)
  */
 static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                struct list_head *src, struct list_head *dst,
-               unsigned long *scanned, int order, int mode, int file)
+               unsigned long *scanned, int order, isolate_mode_t mode,
+               int file)
 {
        unsigned long nr_taken = 0;
        unsigned long nr_lumpy_taken = 0;
@@ -1161,8 +1243,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 static unsigned long isolate_pages_global(unsigned long nr,
                                        struct list_head *dst,
                                        unsigned long *scanned, int order,
-                                       int mode, struct zone *z,
-                                       int active, int file)
+                                       isolate_mode_t mode,
+                                       struct zone *z, int active, int file)
 {
        int lru = LRU_BASE;
        if (active)
@@ -1408,6 +1490,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
        unsigned long nr_taken;
        unsigned long nr_anon;
        unsigned long nr_file;
+       isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;
 
        while (unlikely(too_many_isolated(zone, file, sc))) {
                congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1418,15 +1501,21 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
        }
 
        set_reclaim_mode(priority, sc, false);
+       if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
+               reclaim_mode |= ISOLATE_ACTIVE;
+
        lru_add_drain();
+
+       if (!sc->may_unmap)
+               reclaim_mode |= ISOLATE_UNMAPPED;
+       if (!sc->may_writepage)
+               reclaim_mode |= ISOLATE_CLEAN;
+
        spin_lock_irq(&zone->lru_lock);
 
        if (scanning_global_lru(sc)) {
-               nr_taken = isolate_pages_global(nr_to_scan,
-                       &page_list, &nr_scanned, sc->order,
-                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
-                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
-                       zone, 0, file);
+               nr_taken = isolate_pages_global(nr_to_scan, &page_list,
+                       &nr_scanned, sc->order, reclaim_mode, zone, 0, file);
                zone->pages_scanned += nr_scanned;
                if (current_is_kswapd())
                        __count_zone_vm_events(PGSCAN_KSWAPD, zone,
@@ -1435,12 +1524,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
                        __count_zone_vm_events(PGSCAN_DIRECT, zone,
                                               nr_scanned);
        } else {
-               nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
-                       &page_list, &nr_scanned, sc->order,
-                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
-                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
-                       zone, sc->mem_cgroup,
-                       0, file);
+               nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list,
+                       &nr_scanned, sc->order, reclaim_mode, zone,
+                       sc->mem_cgroup, 0, file);
                /*
                 * mem_cgroup_isolate_pages() keeps track of
                 * scanned pages on its own.
@@ -1542,19 +1628,26 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
        struct page *page;
        struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
        unsigned long nr_rotated = 0;
+       isolate_mode_t reclaim_mode = ISOLATE_ACTIVE;
 
        lru_add_drain();
+
+       if (!sc->may_unmap)
+               reclaim_mode |= ISOLATE_UNMAPPED;
+       if (!sc->may_writepage)
+               reclaim_mode |= ISOLATE_CLEAN;
+
        spin_lock_irq(&zone->lru_lock);
        if (scanning_global_lru(sc)) {
                nr_taken = isolate_pages_global(nr_pages, &l_hold,
                                                &pgscanned, sc->order,
-                                               ISOLATE_ACTIVE, zone,
+                                               reclaim_mode, zone,
                                                1, file);
                zone->pages_scanned += pgscanned;
        } else {
                nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
                                                &pgscanned, sc->order,
-                                               ISOLATE_ACTIVE, zone,
+                                               reclaim_mode, zone,
                                                sc->mem_cgroup, 1, file);
                /*
                 * mem_cgroup_isolate_pages() keeps track of
@@ -1747,23 +1840,15 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
        u64 fraction[2], denominator;
        enum lru_list l;
        int noswap = 0;
-       int force_scan = 0;
+       bool force_scan = false;
        unsigned long nr_force_scan[2];
 
-
-       anon  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
-               zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
-       file  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
-               zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
-
-       if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) {
-               /* kswapd does zone balancing and need to scan this zone */
-               if (scanning_global_lru(sc) && current_is_kswapd())
-                       force_scan = 1;
-               /* memcg may have small limit and need to avoid priority drop */
-               if (!scanning_global_lru(sc))
-                       force_scan = 1;
-       }
+       /* kswapd does zone balancing and needs to scan this zone */
+       if (scanning_global_lru(sc) && current_is_kswapd())
+               force_scan = true;
+       /* memcg may have small limit and need to avoid priority drop */
+       if (!scanning_global_lru(sc))
+               force_scan = true;
 
        /* If we have no swap space, do not bother scanning anon pages. */
        if (!sc->may_swap || (nr_swap_pages <= 0)) {
@@ -1776,6 +1861,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
                goto out;
        }
 
+       anon  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
+               zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
+       file  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
+               zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+
        if (scanning_global_lru(sc)) {
                free  = zone_page_state(zone, NR_FREE_PAGES);
                /* If we have very few page cache pages,
@@ -1985,6 +2075,42 @@ restart:
        throttle_vm_writeout(sc->gfp_mask);
 }
 
+/* Returns true if compaction should go ahead for a high-order request */
+static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
+{
+       unsigned long balance_gap, watermark;
+       bool watermark_ok;
+
+       /* Do not consider compaction for orders reclaim is meant to satisfy */
+       if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
+               return false;
+
+       /*
+        * Compaction takes time to run and there are potentially other
+        * callers using the pages just freed. Continue reclaiming until
+        * there is a buffer of free pages available to give compaction
+        * a reasonable chance of completing and allocating the page
+        */
+       balance_gap = min(low_wmark_pages(zone),
+               (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
+                       KSWAPD_ZONE_BALANCE_GAP_RATIO);
+       watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
+       watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
+
+       /*
+        * If compaction is deferred, reclaim up to a point where
+        * compaction will have a chance of success when re-enabled
+        */
+       if (compaction_deferred(zone))
+               return watermark_ok;
+
+       /* If compaction is not ready to start, keep reclaiming */
+       if (!compaction_suitable(zone, sc->order))
+               return false;
+
+       return watermark_ok;
+}
+
 /*
  * This is the direct reclaim path, for page-allocating processes.  We only
  * try to reclaim pages from zones which will satisfy the caller's allocation
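The watermark compaction_ready() tests against is the zone's high watermark plus a balance gap (at most 1% of the zone, assuming the KSWAPD_ZONE_BALANCE_GAP_RATIO of 100 used by kswapd) plus 2^(order+1) pages of slack for the allocation itself. A toy calculation with invented zone figures:

#include <stdio.h>

/* Assumed value of KSWAPD_ZONE_BALANCE_GAP_RATIO (1% of the zone). */
#define BALANCE_GAP_RATIO       100UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        /* Invented zone figures, in 4KB pages. */
        unsigned long present_pages = 262144;   /* ~1GB zone */
        unsigned long low_wmark = 2048;
        unsigned long high_wmark = 3072;
        int order = 9;                          /* a THP-sized request */

        /* Same formula as compaction_ready(). */
        unsigned long balance_gap = min_ul(low_wmark,
                (present_pages + BALANCE_GAP_RATIO - 1) / BALANCE_GAP_RATIO);
        unsigned long watermark = high_wmark + balance_gap + (2UL << order);

        printf("balance_gap=%lu watermark=%lu pages (%lu KB)\n",
               balance_gap, watermark, watermark * 4);
        return 0;
}

With these numbers an order-9 request keeps reclaiming until roughly 6144 free pages (24 MB) are available in the zone before reclaim is handed over to compaction.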
@@ -2000,14 +2126,20 @@ restart:
  *
  * If a zone is deemed to be full of pinned pages then just give it a light
  * scan then give up on it.
+ *
+ * This function returns true if a zone is being reclaimed for a costly
+ * high-order allocation and compaction is ready to begin. This indicates to
+ * the caller that it should consider retrying the allocation instead of
+ * further reclaim.
  */
-static void shrink_zones(int priority, struct zonelist *zonelist,
+static bool shrink_zones(int priority, struct zonelist *zonelist,
                                        struct scan_control *sc)
 {
        struct zoneref *z;
        struct zone *zone;
        unsigned long nr_soft_reclaimed;
        unsigned long nr_soft_scanned;
+       bool aborted_reclaim = false;
 
        for_each_zone_zonelist_nodemask(zone, z, zonelist,
                                        gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -2022,6 +2154,21 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
                                continue;
                        if (zone->all_unreclaimable && priority != DEF_PRIORITY)
                                continue;       /* Let kswapd poll it */
+                       if (COMPACTION_BUILD) {
+                               /*
+                                * If we already have plenty of memory free for
+                                * compaction in this zone, don't free any more.
+                                * Even though compaction is invoked for any
+                                * non-zero order, only frequent costly order
+                                * reclamation is disruptive enough to become a
+                                * noticeable problem, like transparent huge page
+                                * allocations.
+                                */
+                               if (compaction_ready(zone, sc)) {
+                                       aborted_reclaim = true;
+                                       continue;
+                               }
+                       }
                        /*
                         * This steals pages from memory cgroups over softlimit
                         * and returns the number of reclaimed pages and
@@ -2039,6 +2186,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
 
                shrink_zone(priority, zone, sc);
        }
+
+       return aborted_reclaim;
 }
 
 static bool zone_reclaimable(struct zone *zone)
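The control flow added to shrink_zones() is "skip, but remember": a zone that already has enough free memory for compaction is not reclaimed further, yet the walk over the zonelist continues and the skip is reported to the caller. A condensed sketch of that shape; zone_ready_for_compaction() and reclaim_zone() are invented stand-ins, not kernel functions:

#include <stdbool.h>
#include <stdio.h>

#define NR_ZONES 3

/* Hypothetical per-zone state: true if compaction could already run. */
static bool zone_ready_for_compaction(int zone)
{
        return zone == 1;       /* pretend only zone 1 has enough free pages */
}

static void reclaim_zone(int zone)
{
        printf("reclaiming zone %d\n", zone);
}

/* Same shape as shrink_zones(): skip ready zones, remember that we did. */
static bool shrink_zones_sketch(void)
{
        bool aborted_reclaim = false;
        int zone;

        for (zone = 0; zone < NR_ZONES; zone++) {
                if (zone_ready_for_compaction(zone)) {
                        aborted_reclaim = true;
                        continue;       /* don't free more than compaction needs */
                }
                reclaim_zone(zone);
        }
        return aborted_reclaim;
}

int main(void)
{
        printf("aborted_reclaim=%d\n", shrink_zones_sketch());
        return 0;
}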
@@ -2092,6 +2241,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
        struct zoneref *z;
        struct zone *zone;
        unsigned long writeback_threshold;
+       bool aborted_reclaim;
 
        get_mems_allowed();
        delayacct_freepages_start();
@@ -2103,7 +2253,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                sc->nr_scanned = 0;
                if (!priority)
                        disable_swap_token(sc->mem_cgroup);
-               shrink_zones(priority, zonelist, sc);
+               aborted_reclaim = shrink_zones(priority, zonelist, sc);
+
                /*
                 * Don't shrink slabs when reclaiming memory from
                 * over limit cgroups
@@ -2168,6 +2319,10 @@ out:
        if (oom_killer_disabled)
                return 0;
 
+       /* Aborted reclaim to try compaction? don't OOM, then */
+       if (aborted_reclaim)
+               return 1;
+
        /* top priority shrink_zones still had more to do? don't OOM, then */
        if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
                return 1;
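The tail of do_try_to_free_pages() now has one more reason to report progress even when nothing was reclaimed, and the order of the checks matters because returning 0 is what lets the allocator head towards the OOM killer. A compressed sketch of just that decision; the memcg-specific condition on the all_unreclaimable check is omitted, and struct reclaim_result is an invented container, not scan_control:

#include <stdbool.h>
#include <stdio.h>

struct reclaim_result {
        unsigned long nr_reclaimed;
        bool oom_killer_disabled;
        bool aborted_reclaim;   /* a zone was left to compaction */
        bool all_unreclaimable;
};

/* Mirrors the tail of do_try_to_free_pages(): 0 means "no progress". */
static unsigned long reclaim_verdict(const struct reclaim_result *r)
{
        if (r->nr_reclaimed)
                return r->nr_reclaimed;
        if (r->oom_killer_disabled)
                return 0;
        if (r->aborted_reclaim)         /* try compaction before OOM */
                return 1;
        if (!r->all_unreclaimable)      /* still more work to do */
                return 1;
        return 0;
}

int main(void)
{
        struct reclaim_result r = { 0, false, true, true };

        printf("verdict=%lu\n", reclaim_verdict(&r));   /* 1: retry, no OOM */
        return 0;
}

With aborted_reclaim set, the verdict is 1 even though every zone looked unreclaimable, so the caller retries the allocation with compaction instead of invoking the OOM killer.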
@@ -2459,6 +2614,9 @@ loop_again:
                                        high_wmark_pages(zone), 0, 0)) {
                                end_zone = i;
                                break;
+                       } else {
+                               /* If balanced, clear the congested flag */
+                               zone_clear_flag(zone, ZONE_CONGESTED);
                        }
                }
                if (i < 0)
@@ -2725,7 +2883,9 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 static int kswapd(void *p)
 {
        unsigned long order, new_order;
+       unsigned balanced_order;
        int classzone_idx, new_classzone_idx;
+       int balanced_classzone_idx;
        pg_data_t *pgdat = (pg_data_t*)p;
        struct task_struct *tsk = current;
 
@@ -2756,7 +2916,9 @@ static int kswapd(void *p)
        set_freezable();
 
        order = new_order = 0;
+       balanced_order = 0;
        classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
+       balanced_classzone_idx = classzone_idx;
        for ( ; ; ) {
                int ret;
 
@@ -2765,7 +2927,8 @@ static int kswapd(void *p)
                 * new request of a similar or harder type will succeed soon
                 * so consider going to sleep on the basis we reclaimed at
                 */
-               if (classzone_idx >= new_classzone_idx && order == new_order) {
+               if (balanced_classzone_idx >= new_classzone_idx &&
+                                       balanced_order == new_order) {
                        new_order = pgdat->kswapd_max_order;
                        new_classzone_idx = pgdat->classzone_idx;
                        pgdat->kswapd_max_order =  0;
@@ -2780,9 +2943,12 @@ static int kswapd(void *p)
                        order = new_order;
                        classzone_idx = new_classzone_idx;
                } else {
-                       kswapd_try_to_sleep(pgdat, order, classzone_idx);
+                       kswapd_try_to_sleep(pgdat, balanced_order,
+                                               balanced_classzone_idx);
                        order = pgdat->kswapd_max_order;
                        classzone_idx = pgdat->classzone_idx;
+                       new_order = order;
+                       new_classzone_idx = classzone_idx;
                        pgdat->kswapd_max_order = 0;
                        pgdat->classzone_idx = pgdat->nr_zones - 1;
                }
@@ -2797,7 +2963,9 @@ static int kswapd(void *p)
                 */
                if (!ret) {
                        trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
-                       order = balance_pgdat(pgdat, order, &classzone_idx);
+                       balanced_classzone_idx = classzone_idx;
+                       balanced_order = balance_pgdat(pgdat, order,
+                                               &balanced_classzone_idx);
                }
        }
        return 0;
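The kswapd change separates the order a waker asked for (order/new_order) from the order balance_pgdat() actually achieved (balanced_order), and only sleeps on the latter. A stripped-down sketch of that bookkeeping, ignoring classzone_idx; take_pending_order() and balance() are placeholders for reading pgdat->kswapd_max_order and calling balance_pgdat():

#include <stdio.h>

/* Simulated pgdat->kswapd_max_order: the order wakers asked for. */
static unsigned long pending_order;

/* Read and clear the pending request, as kswapd does in its loop. */
static unsigned long take_pending_order(void)
{
        unsigned long order = pending_order;

        pending_order = 0;
        return order;
}

/* Placeholder for balance_pgdat(): pretend high orders cannot be met. */
static unsigned long balance(unsigned long order)
{
        return order > 2 ? 2 : order;
}

int main(void)
{
        unsigned long order = 0, new_order = 0, balanced_order = 0;
        int pass;

        pending_order = 3;              /* a waker wants an order-3 page */
        for (pass = 0; pass < 3; pass++) {
                /* Only accept a new request if the last one was satisfied. */
                if (balanced_order == new_order)
                        new_order = take_pending_order();

                if (order < new_order) {
                        /* Harder request pending: rebalance without sleeping. */
                        order = new_order;
                } else {
                        /* Sleep on what was actually achieved, then re-read. */
                        printf("sleep at balanced order %lu\n", balanced_order);
                        order = new_order = take_pending_order();
                }
                balanced_order = balance(order);
                printf("pass %d: requested %lu, balanced %lu\n",
                       pass, order, balanced_order);
        }
        return 0;
}

In this trace the order-3 request cannot be met, so the second pass goes to sleep on the order that was actually balanced (2) instead of immediately rebalancing at order 3.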