mm: vmscan: check if reclaim should really abort even if compaction_ready() is true...
profile/ivi/kernel-adaptation-intel-automotive.git: mm/vmscan.c
index 130fa32..f8c96c7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -250,33 +250,61 @@ unsigned long shrink_slab(struct shrink_control *shrink,
                unsigned long long delta;
                unsigned long total_scan;
                unsigned long max_pass;
+               int shrink_ret = 0;
+               long nr;
+               long new_nr;
 
+               /*
+                * copy the current shrinker scan count into a local variable
+                * and zero it so that other concurrent shrinker invocations
+                * don't also do this scanning work.
+                */
+               do {
+                       nr = shrinker->nr;
+               } while (cmpxchg(&shrinker->nr, nr, 0) != nr);
+
+               total_scan = nr;
                max_pass = do_shrinker_shrink(shrinker, shrink, 0);
                delta = (4 * nr_pages_scanned) / shrinker->seeks;
                delta *= max_pass;
                do_div(delta, lru_pages + 1);
-               shrinker->nr += delta;
-               if (shrinker->nr < 0) {
+               total_scan += delta;
+               if (total_scan < 0) {
                        printk(KERN_ERR "shrink_slab: %pF negative objects to "
                               "delete nr=%ld\n",
-                              shrinker->shrink, shrinker->nr);
-                       shrinker->nr = max_pass;
+                              shrinker->shrink, total_scan);
+                       total_scan = max_pass;
                }
 
                /*
+                * We need to avoid excessive windup on filesystem shrinkers
+                * due to large numbers of GFP_NOFS allocations causing the
+                * shrinkers to return -1 all the time. This results in a large
+                * nr being built up so when a shrink that can do some work
+                * comes along it empties the entire cache due to nr >>>
+                * max_pass.  This is bad for sustaining a working set in
+                * memory.
+                *
+                * Hence only allow the shrinker to scan the entire cache when
+                * a large delta change is calculated directly.
+                */
+               if (delta < max_pass / 4)
+                       total_scan = min(total_scan, max_pass / 2);
+
+               /*
                 * Avoid risking looping forever due to too large nr value:
                 * never try to free more than twice the estimate number of
                 * freeable entries.
                 */
-               if (shrinker->nr > max_pass * 2)
-                       shrinker->nr = max_pass * 2;
+               if (total_scan > max_pass * 2)
+                       total_scan = max_pass * 2;
 
-               total_scan = shrinker->nr;
-               shrinker->nr = 0;
+               trace_mm_shrink_slab_start(shrinker, shrink, nr,
+                                       nr_pages_scanned, lru_pages,
+                                       max_pass, delta, total_scan);
 
                while (total_scan >= SHRINK_BATCH) {
                        long this_scan = SHRINK_BATCH;
-                       int shrink_ret;
                        int nr_before;
 
                        nr_before = do_shrinker_shrink(shrinker, shrink, 0);
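The proportional scan pressure computed above can be checked with plain arithmetic. Below is a minimal userspace sketch of the same calculation; the input values are invented for illustration and do_div() is replaced by ordinary division, so this is not kernel code:

#include <stdio.h>

/* Hypothetical inputs, chosen only to make the arithmetic visible. */
#define NR_PAGES_SCANNED        1024UL  /* pages the caller scanned on the LRUs */
#define LRU_PAGES               65536UL /* LRU pages in the zones scanned */
#define SEEKS                   2UL     /* shrinker->seeks (DEFAULT_SEEKS) */
#define MAX_PASS                8192UL  /* objects the shrinker reports as freeable */

int main(void)
{
        unsigned long long delta;
        unsigned long total_scan = 30000;       /* pretend this much work was deferred */

        /* Same proportional-pressure formula as shrink_slab() above. */
        delta = (4 * NR_PAGES_SCANNED) / SEEKS;
        delta *= MAX_PASS;
        delta /= LRU_PAGES + 1;
        total_scan += delta;

        /* Windup guard: a small delta must not empty the whole cache. */
        if (delta < MAX_PASS / 4 && total_scan > MAX_PASS / 2)
                total_scan = MAX_PASS / 2;

        /* Never try to free more than twice the freeable estimate. */
        if (total_scan > MAX_PASS * 2)
                total_scan = MAX_PASS * 2;

        printf("delta=%llu total_scan=%lu\n", delta, total_scan);
        return 0;
}

With these inputs the windup guard fires: delta is only 255, so the 30000 objects of deferred work are capped at max_pass / 2 = 4096 rather than being allowed to wipe out the cache in one pass.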
@@ -292,7 +320,19 @@ unsigned long shrink_slab(struct shrink_control *shrink,
                        cond_resched();
                }
 
-               shrinker->nr += total_scan;
+               /*
+                * move the unused scan count back into the shrinker in a
+                * manner that handles concurrent updates. If we exhausted the
+                * scan, there is no need to do an update.
+                */
+               do {
+                       nr = shrinker->nr;
+                       new_nr = total_scan + nr;
+                       if (total_scan <= 0)
+                               break;
+               } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
+
+               trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
        }
        up_read(&shrinker_rwsem);
 out:
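The two cmpxchg() loops above implement a claim/return protocol on shrinker->nr: a caller atomically takes all currently deferred work (leaving zero behind for concurrent callers) and, after scanning, atomically adds back whatever it did not get through. A self-contained sketch of the same pattern using C11 atomics; atomic_compare_exchange_weak() stands in for the kernel's cmpxchg(), and the numbers are made up:

#include <stdatomic.h>
#include <stdio.h>

/* Stand-in for shrinker->nr: deferred scan work shared between callers. */
static _Atomic long deferred_work = 500;

/* Claim all currently deferred work, leaving zero behind for others. */
static long claim_deferred(void)
{
        long nr = atomic_load(&deferred_work);

        /* Retry until we swap the value we read for 0. */
        while (!atomic_compare_exchange_weak(&deferred_work, &nr, 0))
                ;       /* nr was reloaded by the failed exchange */
        return nr;
}

/* Give back whatever we did not get around to scanning. */
static void return_unused(long unused)
{
        long nr, new_nr;

        if (unused <= 0)
                return;         /* exhausted the scan, nothing to return */
        nr = atomic_load(&deferred_work);
        do {
                new_nr = nr + unused;
        } while (!atomic_compare_exchange_weak(&deferred_work, &nr, new_nr));
}

int main(void)
{
        long total_scan = claim_deferred() + 300;       /* claimed + new delta */

        total_scan -= 600;              /* pretend we scanned 600 objects */
        return_unused(total_scan);
        printf("deferred now %ld\n", atomic_load(&deferred_work));
        return 0;
}

Here 500 deferred objects are claimed, 300 of new pressure are added, 600 are "scanned", and the remaining 200 are handed back for the next caller, mirroring how shrink_slab() never loses deferred work to a concurrent invocation.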
@@ -972,23 +1012,27 @@ keep_lumpy:
  *
  * returns 0 on success, -ve errno on failure.
  */
-int __isolate_lru_page(struct page *page, int mode, int file)
+int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
 {
+       bool all_lru_mode;
        int ret = -EINVAL;
 
        /* Only take pages on the LRU. */
        if (!PageLRU(page))
                return ret;
 
+       all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) ==
+               (ISOLATE_ACTIVE|ISOLATE_INACTIVE);
+
        /*
         * When checking the active state, we need to be sure we are
         * dealing with comparible boolean values.  Take the logical not
         * of each.
         */
-       if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
+       if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))
                return ret;
 
-       if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file)
+       if (!all_lru_mode && !!page_is_file_cache(page) != file)
                return ret;
 
        /*
@@ -1001,6 +1045,43 @@ int __isolate_lru_page(struct page *page, int mode, int file)
 
        ret = -EBUSY;
 
+       /*
+        * To minimise LRU disruption, the caller can indicate that it only
+        * wants to isolate pages it will be able to operate on without
+        * blocking - clean pages for the most part.
+        *
+        * ISOLATE_CLEAN means that only clean pages should be isolated. This
+        * is used by reclaim when it cannot write to backing storage.
+        *
+        * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants pages
+        * that can be migrated without blocking.
+        */
+       if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
+               /* All the caller can do on PageWriteback is block */
+               if (PageWriteback(page))
+                       return ret;
+
+               if (PageDirty(page)) {
+                       struct address_space *mapping;
+
+                       /* ISOLATE_CLEAN means only clean pages */
+                       if (mode & ISOLATE_CLEAN)
+                               return ret;
+
+                       /*
+                        * Only pages without mappings or that have a
+                        * ->migratepage callback are possible to migrate
+                        * without blocking
+                        */
+                       mapping = page_mapping(page);
+                       if (mapping && !mapping->a_ops->migratepage)
+                               return ret;
+               }
+       }
+
+       if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
+               return ret;
+
        if (likely(get_page_unless_zero(page))) {
                /*
                 * Be careful not to clear PageLRU until after we're
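The new isolate_mode_t is a plain bitmask, so the LRU selection (active, inactive, or both) can be combined freely with the behavioural hints added above (unmapped-only, clean-only, migratable-without-blocking). The sketch below mirrors the all_lru_mode test with stand-alone flag values; the bit assignments are illustrative placeholders, not the kernel's real definitions:

#include <stdio.h>
#include <stdbool.h>

/* Illustrative stand-ins for the kernel's isolate_mode_t flag bits. */
typedef unsigned int isolate_mode_t;
#define ISOLATE_INACTIVE        ((isolate_mode_t)0x1)   /* isolate inactive pages */
#define ISOLATE_ACTIVE          ((isolate_mode_t)0x2)   /* isolate active pages */
#define ISOLATE_UNMAPPED        ((isolate_mode_t)0x4)   /* only unmapped pages */
#define ISOLATE_CLEAN           ((isolate_mode_t)0x8)   /* only clean pages */
#define ISOLATE_ASYNC_MIGRATE   ((isolate_mode_t)0x10)  /* non-blocking migration only */

/* Mirrors the all_lru_mode test in __isolate_lru_page(). */
static bool all_lru_mode(isolate_mode_t mode)
{
        return (mode & (ISOLATE_ACTIVE | ISOLATE_INACTIVE)) ==
               (ISOLATE_ACTIVE | ISOLATE_INACTIVE);
}

int main(void)
{
        /* Lumpy reclaim of an inactive list with writeback disallowed. */
        isolate_mode_t mode = ISOLATE_INACTIVE | ISOLATE_ACTIVE | ISOLATE_CLEAN;

        printf("both LRUs: %d, clean only: %d, unmapped only: %d\n",
               all_lru_mode(mode),
               !!(mode & ISOLATE_CLEAN),
               !!(mode & ISOLATE_UNMAPPED));
        return 0;
}

Later hunks in this patch build exactly such masks from scan_control: lumpy reclaim sets both LRU bits, !may_unmap adds ISOLATE_UNMAPPED, and !may_writepage adds ISOLATE_CLEAN.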
@@ -1036,7 +1117,8 @@ int __isolate_lru_page(struct page *page, int mode, int file)
  */
 static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                struct list_head *src, struct list_head *dst,
-               unsigned long *scanned, int order, int mode, int file)
+               unsigned long *scanned, int order, isolate_mode_t mode,
+               int file)
 {
        unsigned long nr_taken = 0;
        unsigned long nr_lumpy_taken = 0;
@@ -1161,8 +1243,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 static unsigned long isolate_pages_global(unsigned long nr,
                                        struct list_head *dst,
                                        unsigned long *scanned, int order,
-                                       int mode, struct zone *z,
-                                       int active, int file)
+                                       isolate_mode_t mode,
+                                       struct zone *z, int active, int file)
 {
        int lru = LRU_BASE;
        if (active)
@@ -1408,6 +1490,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
        unsigned long nr_taken;
        unsigned long nr_anon;
        unsigned long nr_file;
+       isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;
 
        while (unlikely(too_many_isolated(zone, file, sc))) {
                congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1418,15 +1501,21 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
        }
 
        set_reclaim_mode(priority, sc, false);
+       if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
+               reclaim_mode |= ISOLATE_ACTIVE;
+
        lru_add_drain();
+
+       if (!sc->may_unmap)
+               reclaim_mode |= ISOLATE_UNMAPPED;
+       if (!sc->may_writepage)
+               reclaim_mode |= ISOLATE_CLEAN;
+
        spin_lock_irq(&zone->lru_lock);
 
        if (scanning_global_lru(sc)) {
-               nr_taken = isolate_pages_global(nr_to_scan,
-                       &page_list, &nr_scanned, sc->order,
-                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
-                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
-                       zone, 0, file);
+               nr_taken = isolate_pages_global(nr_to_scan, &page_list,
+                       &nr_scanned, sc->order, reclaim_mode, zone, 0, file);
                zone->pages_scanned += nr_scanned;
                if (current_is_kswapd())
                        __count_zone_vm_events(PGSCAN_KSWAPD, zone,
@@ -1435,12 +1524,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
                        __count_zone_vm_events(PGSCAN_DIRECT, zone,
                                               nr_scanned);
        } else {
-               nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
-                       &page_list, &nr_scanned, sc->order,
-                       sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
-                                       ISOLATE_BOTH : ISOLATE_INACTIVE,
-                       zone, sc->mem_cgroup,
-                       0, file);
+               nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list,
+                       &nr_scanned, sc->order, reclaim_mode, zone,
+                       sc->mem_cgroup, 0, file);
                /*
                 * mem_cgroup_isolate_pages() keeps track of
                 * scanned pages on its own.
@@ -1542,19 +1628,26 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
        struct page *page;
        struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
        unsigned long nr_rotated = 0;
+       isolate_mode_t reclaim_mode = ISOLATE_ACTIVE;
 
        lru_add_drain();
+
+       if (!sc->may_unmap)
+               reclaim_mode |= ISOLATE_UNMAPPED;
+       if (!sc->may_writepage)
+               reclaim_mode |= ISOLATE_CLEAN;
+
        spin_lock_irq(&zone->lru_lock);
        if (scanning_global_lru(sc)) {
                nr_taken = isolate_pages_global(nr_pages, &l_hold,
                                                &pgscanned, sc->order,
-                                               ISOLATE_ACTIVE, zone,
+                                               reclaim_mode, zone,
                                                1, file);
                zone->pages_scanned += pgscanned;
        } else {
                nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
                                                &pgscanned, sc->order,
-                                               ISOLATE_ACTIVE, zone,
+                                               reclaim_mode, zone,
                                                sc->mem_cgroup, 1, file);
                /*
                 * mem_cgroup_isolate_pages() keeps track of
@@ -1747,23 +1840,15 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
        u64 fraction[2], denominator;
        enum lru_list l;
        int noswap = 0;
-       int force_scan = 0;
+       bool force_scan = false;
        unsigned long nr_force_scan[2];
 
-
-       anon  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
-               zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
-       file  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
-               zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
-
-       if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) {
-               /* kswapd does zone balancing and need to scan this zone */
-               if (scanning_global_lru(sc) && current_is_kswapd())
-                       force_scan = 1;
-               /* memcg may have small limit and need to avoid priority drop */
-               if (!scanning_global_lru(sc))
-                       force_scan = 1;
-       }
+       /* kswapd does zone balancing and needs to scan this zone */
+       if (scanning_global_lru(sc) && current_is_kswapd())
+               force_scan = true;
+       /* memcg may have small limit and need to avoid priority drop */
+       if (!scanning_global_lru(sc))
+               force_scan = true;
 
        /* If we have no swap space, do not bother scanning anon pages. */
        if (!sc->may_swap || (nr_swap_pages <= 0)) {
@@ -1776,6 +1861,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
                goto out;
        }
 
+       anon  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
+               zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
+       file  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
+               zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+
        if (scanning_global_lru(sc)) {
                free  = zone_page_state(zone, NR_FREE_PAGES);
                /* If we have very few page cache pages,
@@ -1985,6 +2075,42 @@ restart:
        throttle_vm_writeout(sc->gfp_mask);
 }
 
+/* Returns true if compaction should go ahead for a high-order request */
+static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
+{
+       unsigned long balance_gap, watermark;
+       bool watermark_ok;
+
+       /* Do not consider compaction for orders reclaim is meant to satisfy */
+       if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
+               return false;
+
+       /*
+        * Compaction takes time to run and there are potentially other
+        * callers using the pages just freed. Continue reclaiming until
+        * there is a buffer of free pages available to give compaction
+        * a reasonable chance of completing and allocating the page
+        */
+       balance_gap = min(low_wmark_pages(zone),
+               (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
+                       KSWAPD_ZONE_BALANCE_GAP_RATIO);
+       watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
+       watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
+
+       /*
+        * If compaction is deferred, reclaim up to a point where
+        * compaction will have a chance of success when re-enabled
+        */
+       if (compaction_deferred(zone))
+               return watermark_ok;
+
+       /* If compaction is not ready to start, keep reclaiming */
+       if (!compaction_suitable(zone, sc->order))
+               return false;
+
+       return watermark_ok;
+}
+
 /*
  * This is the direct reclaim path, for page-allocating processes.  We only
  * try to reclaim pages from zones which will satisfy the caller's allocation
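The watermark compaction_ready() tests against is the zone's high watermark plus a balance gap (at most 1% of the zone, assuming the KSWAPD_ZONE_BALANCE_GAP_RATIO of 100 used by kswapd) plus 2^(order+1) pages of slack for the allocation itself. A toy calculation with invented zone figures:

#include <stdio.h>

/* Assumed value of KSWAPD_ZONE_BALANCE_GAP_RATIO (1% of the zone). */
#define BALANCE_GAP_RATIO       100UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        /* Invented zone figures, in 4KB pages. */
        unsigned long present_pages = 262144;   /* ~1GB zone */
        unsigned long low_wmark = 2048;
        unsigned long high_wmark = 3072;
        int order = 9;                          /* a THP-sized request */

        /* Same formula as compaction_ready(). */
        unsigned long balance_gap = min_ul(low_wmark,
                (present_pages + BALANCE_GAP_RATIO - 1) / BALANCE_GAP_RATIO);
        unsigned long watermark = high_wmark + balance_gap + (2UL << order);

        printf("balance_gap=%lu watermark=%lu pages (%lu KB)\n",
               balance_gap, watermark, watermark * 4);
        return 0;
}

With these numbers an order-9 request keeps reclaiming until roughly 6144 free pages (24 MB) are available in the zone before reclaim is handed over to compaction.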
@@ -2000,14 +2126,20 @@ restart:
  *
  * If a zone is deemed to be full of pinned pages then just give it a light
  * scan then give up on it.
+ *
+ * This function returns true if a zone is being reclaimed for a costly
+ * high-order allocation and compaction is ready to begin. This indicates to
+ * the caller that it should consider retrying the allocation instead of
+ * further reclaim.
  */
-static void shrink_zones(int priority, struct zonelist *zonelist,
+static bool shrink_zones(int priority, struct zonelist *zonelist,
                                        struct scan_control *sc)
 {
        struct zoneref *z;
        struct zone *zone;
        unsigned long nr_soft_reclaimed;
        unsigned long nr_soft_scanned;
+       bool aborted_reclaim = false;
 
        for_each_zone_zonelist_nodemask(zone, z, zonelist,
                                        gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -2022,6 +2154,21 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
                                continue;
                        if (zone->all_unreclaimable && priority != DEF_PRIORITY)
                                continue;       /* Let kswapd poll it */
+                       if (COMPACTION_BUILD) {
+                               /*
+                                * If we already have plenty of memory free for
+                                * compaction in this zone, don't free any more.
+                                * Even though compaction is invoked for any
+                                * non-zero order, only frequent costly order
+                                * reclamation is disruptive enough to become a
+                                * noticeable problem, like transparent huge page
+                                * allocations.
+                                */
+                               if (compaction_ready(zone, sc)) {
+                                       aborted_reclaim = true;
+                                       continue;
+                               }
+                       }
                        /*
                         * This steals pages from memory cgroups over softlimit
                         * and returns the number of reclaimed pages and
@@ -2039,6 +2186,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
 
                shrink_zone(priority, zone, sc);
        }
+
+       return aborted_reclaim;
 }
 
 static bool zone_reclaimable(struct zone *zone)
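The control flow added to shrink_zones() is "skip, but remember": a zone that already has enough free memory for compaction is not reclaimed further, yet the walk over the zonelist continues and the skip is reported to the caller. A condensed sketch of that shape; zone_ready_for_compaction() and reclaim_zone() are invented stand-ins, not kernel functions:

#include <stdbool.h>
#include <stdio.h>

#define NR_ZONES 3

/* Hypothetical per-zone state: true if compaction could already run. */
static bool zone_ready_for_compaction(int zone)
{
        return zone == 1;       /* pretend only zone 1 has enough free pages */
}

static void reclaim_zone(int zone)
{
        printf("reclaiming zone %d\n", zone);
}

/* Same shape as shrink_zones(): skip ready zones, remember that we did. */
static bool shrink_zones_sketch(void)
{
        bool aborted_reclaim = false;
        int zone;

        for (zone = 0; zone < NR_ZONES; zone++) {
                if (zone_ready_for_compaction(zone)) {
                        aborted_reclaim = true;
                        continue;       /* don't free more than compaction needs */
                }
                reclaim_zone(zone);
        }
        return aborted_reclaim;
}

int main(void)
{
        printf("aborted_reclaim=%d\n", shrink_zones_sketch());
        return 0;
}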
@@ -2092,6 +2241,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
        struct zoneref *z;
        struct zone *zone;
        unsigned long writeback_threshold;
+       bool aborted_reclaim;
 
        get_mems_allowed();
        delayacct_freepages_start();
@@ -2103,7 +2253,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                sc->nr_scanned = 0;
                if (!priority)
                        disable_swap_token(sc->mem_cgroup);
-               shrink_zones(priority, zonelist, sc);
+               aborted_reclaim = shrink_zones(priority, zonelist, sc);
+
                /*
                 * Don't shrink slabs when reclaiming memory from
                 * over limit cgroups
@@ -2168,6 +2319,10 @@ out:
        if (oom_killer_disabled)
                return 0;
 
+       /* Aborted reclaim to try compaction? don't OOM, then */
+       if (aborted_reclaim)
+               return 1;
+
        /* top priority shrink_zones still had more to do? don't OOM, then */
        if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
                return 1;
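The tail of do_try_to_free_pages() now has one more reason to report progress even when nothing was reclaimed, and the order of the checks matters because returning 0 is what lets the allocator head towards the OOM killer. A compressed sketch of just that decision; the memcg-specific condition on the all_unreclaimable check is omitted, and struct reclaim_result is an invented container, not scan_control:

#include <stdbool.h>
#include <stdio.h>

struct reclaim_result {
        unsigned long nr_reclaimed;
        bool oom_killer_disabled;
        bool aborted_reclaim;   /* a zone was left to compaction */
        bool all_unreclaimable;
};

/* Mirrors the tail of do_try_to_free_pages(): 0 means "no progress". */
static unsigned long reclaim_verdict(const struct reclaim_result *r)
{
        if (r->nr_reclaimed)
                return r->nr_reclaimed;
        if (r->oom_killer_disabled)
                return 0;
        if (r->aborted_reclaim)         /* try compaction before OOM */
                return 1;
        if (!r->all_unreclaimable)      /* still more work to do */
                return 1;
        return 0;
}

int main(void)
{
        struct reclaim_result r = { 0, false, true, true };

        printf("verdict=%lu\n", reclaim_verdict(&r));   /* 1: retry, no OOM */
        return 0;
}

With aborted_reclaim set, the verdict is 1 even though every zone looked unreclaimable, so the caller retries the allocation with compaction instead of invoking the OOM killer.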
@@ -2459,6 +2614,9 @@ loop_again:
                                        high_wmark_pages(zone), 0, 0)) {
                                end_zone = i;
                                break;
+                       } else {
+                               /* If balanced, clear the congested flag */
+                               zone_clear_flag(zone, ZONE_CONGESTED);
                        }
                }
                if (i < 0)
@@ -2725,7 +2883,9 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 static int kswapd(void *p)
 {
        unsigned long order, new_order;
+       unsigned balanced_order;
        int classzone_idx, new_classzone_idx;
+       int balanced_classzone_idx;
        pg_data_t *pgdat = (pg_data_t*)p;
        struct task_struct *tsk = current;
 
@@ -2756,7 +2916,9 @@ static int kswapd(void *p)
        set_freezable();
 
        order = new_order = 0;
+       balanced_order = 0;
        classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
+       balanced_classzone_idx = classzone_idx;
        for ( ; ; ) {
                int ret;
 
@@ -2765,7 +2927,8 @@ static int kswapd(void *p)
                 * new request of a similar or harder type will succeed soon
                 * so consider going to sleep on the basis we reclaimed at
                 */
-               if (classzone_idx >= new_classzone_idx && order == new_order) {
+               if (balanced_classzone_idx >= new_classzone_idx &&
+                                       balanced_order == new_order) {
                        new_order = pgdat->kswapd_max_order;
                        new_classzone_idx = pgdat->classzone_idx;
                        pgdat->kswapd_max_order =  0;
@@ -2780,9 +2943,12 @@ static int kswapd(void *p)
                        order = new_order;
                        classzone_idx = new_classzone_idx;
                } else {
-                       kswapd_try_to_sleep(pgdat, order, classzone_idx);
+                       kswapd_try_to_sleep(pgdat, balanced_order,
+                                               balanced_classzone_idx);
                        order = pgdat->kswapd_max_order;
                        classzone_idx = pgdat->classzone_idx;
+                       new_order = order;
+                       new_classzone_idx = classzone_idx;
                        pgdat->kswapd_max_order = 0;
                        pgdat->classzone_idx = pgdat->nr_zones - 1;
                }
@@ -2797,7 +2963,9 @@ static int kswapd(void *p)
                 */
                if (!ret) {
                        trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
-                       order = balance_pgdat(pgdat, order, &classzone_idx);
+                       balanced_classzone_idx = classzone_idx;
+                       balanced_order = balance_pgdat(pgdat, order,
+                                               &balanced_classzone_idx);
                }
        }
        return 0;
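The kswapd change separates the order a waker asked for (order/new_order) from the order balance_pgdat() actually achieved (balanced_order), and only sleeps on the latter. A stripped-down sketch of that bookkeeping, ignoring classzone_idx; take_pending_order() and balance() are placeholders for reading pgdat->kswapd_max_order and calling balance_pgdat():

#include <stdio.h>

/* Simulated pgdat->kswapd_max_order: the order wakers asked for. */
static unsigned long pending_order;

/* Read and clear the pending request, as kswapd does in its loop. */
static unsigned long take_pending_order(void)
{
        unsigned long order = pending_order;

        pending_order = 0;
        return order;
}

/* Placeholder for balance_pgdat(): pretend high orders cannot be met. */
static unsigned long balance(unsigned long order)
{
        return order > 2 ? 2 : order;
}

int main(void)
{
        unsigned long order = 0, new_order = 0, balanced_order = 0;
        int pass;

        pending_order = 3;              /* a waker wants an order-3 page */
        for (pass = 0; pass < 3; pass++) {
                /* Only accept a new request if the last one was satisfied. */
                if (balanced_order == new_order)
                        new_order = take_pending_order();

                if (order < new_order) {
                        /* Harder request pending: rebalance without sleeping. */
                        order = new_order;
                } else {
                        /* Sleep on what was actually achieved, then re-read. */
                        printf("sleep at balanced order %lu\n", balanced_order);
                        order = new_order = take_pending_order();
                }
                balanced_order = balance(order);
                printf("pass %d: requested %lu, balanced %lu\n",
                       pass, order, balanced_order);
        }
        return 0;
}

In this trace the order-3 request cannot be met, so the second pass goes to sleep on the order that was actually balanced (2) instead of immediately rebalancing at order 3.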