mm: vmscan: stall page reclaim and writeback pages based on dirty/writepage pages...
diff --git a/mm/vmscan.c b/mm/vmscan.c
index cd09803..999ef0b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -669,6 +669,25 @@ static enum page_references page_check_references(struct page *page,
        return PAGEREF_RECLAIM;
 }
 
+/* Check if a page is dirty or under writeback */
+static void page_check_dirty_writeback(struct page *page,
+                                      bool *dirty, bool *writeback)
+{
+       /*
+        * Anonymous pages are not handled by flushers and must be written
+        * from reclaim context. Do not stall reclaim based on them
+        */
+       if (!page_is_file_cache(page)) {
+               *dirty = false;
+               *writeback = false;
+               return;
+       }
+
+       /* By default assume that the page flags are accurate */
+       *dirty = PageDirty(page);
+       *writeback = PageWriteback(page);
+}
+
 /*
  * shrink_page_list() returns the number of reclaimed pages
  */
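
page_check_dirty_writeback() deliberately reports anonymous pages as neither dirty nor under writeback: the flusher threads never clean them, so stalling reclaim on their account would mean waiting for IO that nobody is going to issue. Its two outputs then feed two separate counters in shrink_page_list(): a file page that is dirty or under writeback bumps nr_dirty, while only dirty pages not yet queued for IO bump nr_unqueued_dirty, which later decides whether kswapd must start writing pages itself. A rough, self-contained model of that accounting (the toy_* names and struct are invented for illustration, not kernel APIs):

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-in for struct page: only the bits the helper inspects. */
struct toy_page {
        bool file_backed;       /* page_is_file_cache() */
        bool dirty;             /* PageDirty()          */
        bool writeback;         /* PageWriteback()      */
};

/* Mirrors page_check_dirty_writeback(): anon pages never count. */
static void toy_check_dirty_writeback(const struct toy_page *page,
                                      bool *dirty, bool *writeback)
{
        if (!page->file_backed) {
                *dirty = false;
                *writeback = false;
                return;
        }
        *dirty = page->dirty;
        *writeback = page->writeback;
}

int main(void)
{
        /* A dirty anon page, a dirty file page, and a file page under IO. */
        struct toy_page pages[] = {
                { .file_backed = false, .dirty = true,  .writeback = false },
                { .file_backed = true,  .dirty = true,  .writeback = false },
                { .file_backed = true,  .dirty = true,  .writeback = true  },
        };
        unsigned long nr_dirty = 0, nr_unqueued_dirty = 0;
        unsigned int i;

        for (i = 0; i < sizeof(pages) / sizeof(pages[0]); i++) {
                bool dirty, writeback;

                toy_check_dirty_writeback(&pages[i], &dirty, &writeback);
                if (dirty || writeback)
                        nr_dirty++;             /* feeds the congestion logic  */
                if (dirty && !writeback)
                        nr_unqueued_dirty++;    /* feeds the kswapd stall test */
        }
        /* Prints: nr_dirty=2 nr_unqueued_dirty=1 -- the anon page is ignored. */
        printf("nr_dirty=%lu nr_unqueued_dirty=%lu\n", nr_dirty, nr_unqueued_dirty);
        return 0;
}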
@@ -676,13 +695,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                                      struct zone *zone,
                                      struct scan_control *sc,
                                      enum ttu_flags ttu_flags,
-                                     unsigned long *ret_nr_dirty,
+                                     unsigned long *ret_nr_unqueued_dirty,
                                      unsigned long *ret_nr_writeback,
                                      bool force_reclaim)
 {
        LIST_HEAD(ret_pages);
        LIST_HEAD(free_pages);
        int pgactivate = 0;
+       unsigned long nr_unqueued_dirty = 0;
        unsigned long nr_dirty = 0;
        unsigned long nr_congested = 0;
        unsigned long nr_reclaimed = 0;
@@ -696,6 +716,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                struct page *page;
                int may_enter_fs;
                enum page_references references = PAGEREF_RECLAIM_CLEAN;
+               bool dirty, writeback;
 
                cond_resched();
 
@@ -723,25 +744,73 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
                        (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
 
+               /*
+                * The number of dirty pages determines if a zone is marked
+                * reclaim_congested which affects wait_iff_congested. kswapd
+                * will stall and start writing pages if the tail of the LRU
+                * is all dirty unqueued pages.
+                */
+               page_check_dirty_writeback(page, &dirty, &writeback);
+               if (dirty || writeback)
+                       nr_dirty++;
+
+               if (dirty && !writeback)
+                       nr_unqueued_dirty++;
+
+               /* Treat this page as congested if underlying BDI is */
+               mapping = page_mapping(page);
+               if (mapping && bdi_write_congested(mapping->backing_dev_info))
+                       nr_congested++;
+
+               /*
+                * If a page at the tail of the LRU is under writeback, there
+                * are three cases to consider.
+                *
+                * 1) If reclaim is encountering an excessive number of pages
+                *    under writeback and this page is both under writeback and
+                *    PageReclaim then it indicates that pages are being queued
+                *    for IO but are being recycled through the LRU before the
+                *    IO can complete. Waiting on the page itself risks an
+                *    indefinite stall if it is impossible to writeback the
+                *    page due to IO error or disconnected storage so instead
+                *    block for HZ/10 or until some IO completes then clear the
+                *    ZONE_WRITEBACK flag to recheck if the condition exists.
+                *
+                * 2) Global reclaim encounters a page, memcg encounters a
+                *    page that is not marked for immediate reclaim or
+                *    the caller does not have __GFP_IO. In this case mark
+                *    the page for immediate reclaim and continue scanning.
+                *
+                *    __GFP_IO is checked because a loop driver thread might
+                *    enter reclaim, and deadlock if it waits on a page for
+                *    which it is needed to do the write (loop masks off
+                *    __GFP_IO|__GFP_FS for this reason); but more thought
+                *    would probably show more reasons.
+                *
+                *    Don't require __GFP_FS, since we're not going into the
+                *    FS, just waiting on its writeback completion. Worryingly,
+                *    ext4 gfs2 and xfs allocate pages with
+                *    grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
+                *    may_enter_fs here is liable to OOM on them.
+                *
+                * 3) memcg encounters a page that is not already marked
+                *    PageReclaim. memcg does not have any dirty pages
+                *    throttling so we could easily OOM just because too many
+                *    pages are in writeback and there is nothing else to
+                *    reclaim. Wait for the writeback to complete.
+                */
                if (PageWriteback(page)) {
-                       /*
-                        * memcg doesn't have any dirty pages throttling so we
-                        * could easily OOM just because too many pages are in
-                        * writeback and there is nothing else to reclaim.
-                        *
-                        * Check __GFP_IO, certainly because a loop driver
-                        * thread might enter reclaim, and deadlock if it waits
-                        * on a page for which it is needed to do the write
-                        * (loop masks off __GFP_IO|__GFP_FS for this reason);
-                        * but more thought would probably show more reasons.
-                        *
-                        * Don't require __GFP_FS, since we're not going into
-                        * the FS, just waiting on its writeback completion.
-                        * Worryingly, ext4 gfs2 and xfs allocate pages with
-                        * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
-                        * testing may_enter_fs here is liable to OOM on them.
-                        */
-                       if (global_reclaim(sc) ||
+                       /* Case 1 above */
+                       if (current_is_kswapd() &&
+                           PageReclaim(page) &&
+                           zone_is_reclaim_writeback(zone)) {
+                               unlock_page(page);
+                               congestion_wait(BLK_RW_ASYNC, HZ/10);
+                               zone_clear_flag(zone, ZONE_WRITEBACK);
+                               goto keep;
+
+                       /* Case 2 above */
+                       } else if (global_reclaim(sc) ||
                            !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
                                /*
                                 * This is slightly racy - end_page_writeback()
@@ -756,9 +825,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                                 */
                                SetPageReclaim(page);
                                nr_writeback++;
+
                                goto keep_locked;
+
+                       /* Case 3 above */
+                       } else {
+                               wait_on_page_writeback(page);
                        }
-                       wait_on_page_writeback(page);
                }
 
                if (!force_reclaim)
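
Taken together, the three numbered cases above reduce to one small decision: kswapd stalls briefly and rescans when pages marked PageReclaim keep reappearing under writeback (IO cannot keep up with LRU cycling); when reclaim is global, the page is not yet marked PageReclaim, or the caller lacks __GFP_IO, the page is tagged for immediate reclaim and skipped; only memcg reclaim, which has no dirty throttling of its own, blocks until the writeback completes. A hedged userspace model of that decision, with invented toy_* names standing in for the kernel predicates:

#include <stdbool.h>

enum toy_writeback_action {
        TOY_STALL_THEN_KEEP,    /* case 1: wait HZ/10, clear ZONE_WRITEBACK, keep page */
        TOY_MARK_AND_SKIP,      /* case 2: SetPageReclaim(), count it, keep scanning   */
        TOY_WAIT_FOR_IO,        /* case 3: memcg blocks in wait_on_page_writeback()    */
};

struct toy_reclaim_ctx {
        bool is_kswapd;         /* current_is_kswapd()     */
        bool global_reclaim;    /* global_reclaim(sc)      */
        bool gfp_io;            /* sc->gfp_mask & __GFP_IO */
};

/* Decide what to do with a page found under writeback at the LRU tail. */
enum toy_writeback_action
toy_writeback_decision(const struct toy_reclaim_ctx *ctx,
                       bool page_reclaim, bool zone_reclaim_writeback)
{
        /* Case 1: IO is queued but pages recycle through the LRU too fast. */
        if (ctx->is_kswapd && page_reclaim && zone_reclaim_writeback)
                return TOY_STALL_THEN_KEEP;

        /* Case 2: tag for immediate reclaim once the IO completes, move on. */
        if (ctx->global_reclaim || !page_reclaim || !ctx->gfp_io)
                return TOY_MARK_AND_SKIP;

        /* Case 3: memcg has no dirty throttling, so wait here to avoid OOM. */
        return TOY_WAIT_FOR_IO;
}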
@@ -784,9 +857,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                        if (!add_to_swap(page, page_list))
                                goto activate_locked;
                        may_enter_fs = 1;
-               }
 
-               mapping = page_mapping(page);
+                       /* Adding to swap updated mapping */
+                       mapping = page_mapping(page);
+               }
 
                /*
                 * The page is mapped into the page tables of one or more
@@ -806,16 +880,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                }
 
                if (PageDirty(page)) {
-                       nr_dirty++;
-
                        /*
                         * Only kswapd can writeback filesystem pages to
-                        * avoid risk of stack overflow but do not writeback
-                        * unless under significant pressure.
+                        * avoid risk of stack overflow but only writeback
+                        * if many dirty pages have been encountered.
                         */
                        if (page_is_file_cache(page) &&
                                        (!current_is_kswapd() ||
-                                        sc->priority >= DEF_PRIORITY - 2)) {
+                                        !zone_is_reclaim_dirty(zone))) {
                                /*
                                 * Immediately reclaim when written back.
                                 * Similar in principal to deactivate_page()
@@ -838,7 +910,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                        /* Page is dirty, try to write it out here */
                        switch (pageout(page, mapping, sc)) {
                        case PAGE_KEEP:
-                               nr_congested++;
                                goto keep_locked;
                        case PAGE_ACTIVATE:
                                goto activate_locked;
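
The PageDirty handling above tightens the old rule: direct reclaim still never writes file pages (to bound stack usage), and kswapd now only falls through to pageout() once the zone has been flagged as having a dirty LRU tail via zone_is_reclaim_dirty(), instead of at any priority below DEF_PRIORITY - 2. Pages that fail the test are marked PageReclaim and kept on the LRU, so they are reclaimed immediately once the flushers write them back. A minimal sketch of that predicate, using invented toy_* names:

#include <stdbool.h>

/*
 * Should reclaim call ->writepage() on this dirty page itself, or defer
 * it to the flusher threads?  Only kswapd may write file pages, and only
 * once the zone was flagged as having a dirty LRU tail.
 */
bool toy_reclaim_may_writepage(bool file_backed, bool is_kswapd,
                               bool zone_tail_lru_dirty)
{
        if (file_backed && (!is_kswapd || !zone_tail_lru_dirty))
                return false;   /* SetPageReclaim() and keep_locked instead */
        return true;            /* anon/swap pages, or kswapd under dirty pressure */
}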
@@ -960,7 +1031,7 @@ keep:
        list_splice(&ret_pages, page_list);
        count_vm_events(PGACTIVATE, pgactivate);
        mem_cgroup_uncharge_end();
-       *ret_nr_dirty += nr_dirty;
+       *ret_nr_unqueued_dirty += nr_unqueued_dirty;
        *ret_nr_writeback += nr_writeback;
        return nr_reclaimed;
 }
@@ -1280,7 +1351,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
        unsigned long nr_scanned;
        unsigned long nr_reclaimed = 0;
        unsigned long nr_taken;
-       unsigned long nr_dirty = 0;
+       unsigned long nr_unqueued_dirty = 0;
        unsigned long nr_writeback = 0;
        isolate_mode_t isolate_mode = 0;
        int file = is_file_lru(lru);
@@ -1323,7 +1394,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
                return 0;
 
        nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
-                                       &nr_dirty, &nr_writeback, false);
+                               &nr_unqueued_dirty, &nr_writeback, false);
 
        spin_lock_irq(&zone->lru_lock);
 
@@ -1370,8 +1441,21 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
         *                     isolated page is PageWriteback
         */
        if (nr_writeback && nr_writeback >=
-                       (nr_taken >> (DEF_PRIORITY - sc->priority)))
+                       (nr_taken >> (DEF_PRIORITY - sc->priority))) {
                wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
+               zone_set_flag(zone, ZONE_WRITEBACK);
+       }
+
+       /*
+        * Similarly, if many dirty pages are encountered that are not
+        * currently being written then flag that kswapd should start
+        * writing back pages and stall to give a chance for flushers
+        * to catch up.
+        */
+       if (global_reclaim(sc) && nr_unqueued_dirty == nr_taken) {
+               congestion_wait(BLK_RW_ASYNC, HZ/10);
+               zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
+       }
 
        trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
                zone_idx(zone),
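
shrink_inactive_list() now stalls in two distinct situations. If the pages already under writeback reach nr_taken >> (DEF_PRIORITY - sc->priority), it throttles in wait_iff_congested() and flags the zone ZONE_WRITEBACK; with DEF_PRIORITY at 12 that threshold is all of nr_taken at the initial priority, half at priority 11, a quarter at priority 10, and so on. If every isolated page turned out to be dirty but not yet queued for IO, global reclaim sleeps for HZ/10 and flags the zone ZONE_TAIL_LRU_DIRTY so kswapd will begin writing pages itself. A hedged restatement of both tests (toy_* names are invented):

#include <stdbool.h>

#define TOY_DEF_PRIORITY 12     /* DEF_PRIORITY in the kernel */

/* Writeback stall: a priority-scaled share of the isolated pages suffices. */
bool toy_stall_on_writeback(unsigned long nr_writeback,
                            unsigned long nr_taken, int priority)
{
        /* e.g. nr_taken = 32 at priority 10: threshold = 32 >> 2 = 8 pages */
        unsigned long threshold = nr_taken >> (TOY_DEF_PRIORITY - priority);

        return nr_writeback && nr_writeback >= threshold;
}

/* Dirty stall: only when *every* isolated page is dirty but unqueued. */
bool toy_stall_on_unqueued_dirty(unsigned long nr_unqueued_dirty,
                                 unsigned long nr_taken, bool global_reclaim)
{
        return global_reclaim && nr_unqueued_dirty == nr_taken;
}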
@@ -2656,22 +2740,57 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
  * the high watermark.
  *
  * Returns true if kswapd scanned at least the requested number of pages to
- * reclaim. This is used to determine if the scanning priority needs to be
- * raised.
+ * reclaim or if the lack of progress was due to pages under writeback.
+ * This is used to determine if the scanning priority needs to be raised.
  */
 static bool kswapd_shrink_zone(struct zone *zone,
+                              int classzone_idx,
                               struct scan_control *sc,
                               unsigned long lru_pages,
                               unsigned long *nr_attempted)
 {
        unsigned long nr_slab;
+       int testorder = sc->order;
+       unsigned long balance_gap;
        struct reclaim_state *reclaim_state = current->reclaim_state;
        struct shrink_control shrink = {
                .gfp_mask = sc->gfp_mask,
        };
+       bool lowmem_pressure;
 
        /* Reclaim above the high watermark. */
        sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
+
+       /*
+        * Kswapd reclaims only single pages with compaction enabled. Trying
+        * too hard to reclaim until contiguous free pages have become
+        * available can hurt performance by evicting too much useful data
+        * from memory. Do not reclaim more than needed for compaction.
+        */
+       if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
+                       compaction_suitable(zone, sc->order) !=
+                               COMPACT_SKIPPED)
+               testorder = 0;
+
+       /*
+        * We put equal pressure on every zone, unless one zone has way too
+        * many pages free already. The "too many pages" is defined as the
+        * high wmark plus a "gap" where the gap is either the low
+        * watermark or 1% of the zone, whichever is smaller.
+        */
+       balance_gap = min(low_wmark_pages(zone),
+               (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
+               KSWAPD_ZONE_BALANCE_GAP_RATIO);
+
+       /*
+        * If there is no low memory pressure or the zone is balanced then no
+        * reclaim is necessary
+        */
+       lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
+       if (!lowmem_pressure && zone_balanced(zone, testorder,
+                                               balance_gap, classzone_idx))
+               return true;
+
        shrink_zone(zone, sc);
 
        reclaim_state->reclaimed_slab = 0;
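
kswapd_shrink_zone() now absorbs the balance_gap and compaction testorder logic that previously lived in the balance_pgdat() zone loop. The gap above the high watermark is the smaller of the low watermark and roughly 1% of the zone (KSWAPD_ZONE_BALANCE_GAP_RATIO is 100): for example, a zone with 1,048,576 managed pages and a low watermark of 8,192 pages gets min(8192, 10486) = 8192 pages of gap. A small userspace model of that computation (toy names, plain arithmetic only):

#define TOY_KSWAPD_ZONE_BALANCE_GAP_RATIO 100   /* gap capped at ~1% of the zone */

static unsigned long toy_min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

/* Extra headroom above the high watermark before a zone is left alone. */
unsigned long toy_balance_gap(unsigned long low_wmark_pages,
                              unsigned long managed_pages)
{
        return toy_min_ul(low_wmark_pages,
                          (managed_pages + TOY_KSWAPD_ZONE_BALANCE_GAP_RATIO - 1) /
                          TOY_KSWAPD_ZONE_BALANCE_GAP_RATIO);
}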
@@ -2684,6 +2803,20 @@ static bool kswapd_shrink_zone(struct zone *zone,
        if (nr_slab == 0 && !zone_reclaimable(zone))
                zone->all_unreclaimable = 1;
 
+       zone_clear_flag(zone, ZONE_WRITEBACK);
+
+       /*
+        * If a zone reaches its high watermark, consider it to be no longer
+        * congested. It's possible there are dirty pages backed by congested
+        * BDIs but as pressure is relieved, speculatively avoid congestion
+        * waits.
+        */
+       if (!zone->all_unreclaimable &&
+           zone_balanced(zone, testorder, 0, classzone_idx)) {
+               zone_clear_flag(zone, ZONE_CONGESTED);
+               zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
+       }
+
        return sc->nr_scanned >= sc->nr_to_reclaim;
 }
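
The flag handling at the end of kswapd_shrink_zone() encodes two rules: ZONE_WRITEBACK is dropped unconditionally after every pass so the writeback-stall condition is re-detected from scratch, while ZONE_CONGESTED and ZONE_TAIL_LRU_DIRTY persist until the zone is balanced and still reclaimable. A compact model of that per-flag decision (the toy enum and names are invented):

#include <stdbool.h>

enum toy_zone_flag {
        TOY_ZONE_CONGESTED,
        TOY_ZONE_TAIL_LRU_DIRTY,
        TOY_ZONE_WRITEBACK,
};

/* Which flags does kswapd drop after shrinking a zone? */
bool toy_clear_flag_after_pass(enum toy_zone_flag flag,
                               bool zone_balanced, bool all_unreclaimable)
{
        if (flag == TOY_ZONE_WRITEBACK)
                return true;    /* always re-detect the writeback stall */
        return zone_balanced && !all_unreclaimable;
}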
 
@@ -2769,8 +2902,12 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
                                end_zone = i;
                                break;
                        } else {
-                               /* If balanced, clear the congested flag */
+                               /*
+                                * If balanced, clear the dirty and congested
+                                * flags
+                                */
                                zone_clear_flag(zone, ZONE_CONGESTED);
+                               zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
                        }
                }
 
@@ -2798,6 +2935,13 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
                }
 
                /*
+                * If we're getting trouble reclaiming, start doing writepage
+                * even in laptop mode.
+                */
+               if (sc.priority < DEF_PRIORITY - 2)
+                       sc.may_writepage = 1;
+
+               /*
                 * Now scan the zone in the dma->highmem direction, stopping
                 * at the last zone which needs scanning.
                 *
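
Hoisting the may_writepage check out of the zone loop preserves the old behaviour: with DEF_PRIORITY at 12, kswapd only starts issuing writepage itself, even in laptop mode, once the priority has dropped to 9 or below, i.e. after the first few passes have given the flusher threads a chance to catch up. A trivial sketch of that threshold (toy name, invented):

#include <stdbool.h>

#define TOY_DEF_PRIORITY 12

/* kswapd gives the flushers a few passes before writing pages itself. */
bool toy_kswapd_may_writepage(int priority)
{
        return priority < TOY_DEF_PRIORITY - 2; /* true once priority <= 9 */
}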
@@ -2808,8 +2952,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
                 */
                for (i = 0; i <= end_zone; i++) {
                        struct zone *zone = pgdat->node_zones + i;
-                       int testorder;
-                       unsigned long balance_gap;
 
                        if (!populated_zone(zone))
                                continue;
@@ -2830,66 +2972,14 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
                        sc.nr_reclaimed += nr_soft_reclaimed;
 
                        /*
-                        * We put equal pressure on every zone, unless
-                        * one zone has way too many pages free
-                        * already. The "too many pages" is defined
-                        * as the high wmark plus a "gap" where the
-                        * gap is either the low watermark or 1%
-                        * of the zone, whichever is smaller.
-                        */
-                       balance_gap = min(low_wmark_pages(zone),
-                               (zone->managed_pages +
-                                       KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
-                               KSWAPD_ZONE_BALANCE_GAP_RATIO);
-                       /*
-                        * Kswapd reclaims only single pages with compaction
-                        * enabled. Trying too hard to reclaim until contiguous
-                        * free pages have become available can hurt performance
-                        * by evicting too much useful data from memory.
-                        * Do not reclaim more than needed for compaction.
-                        */
-                       testorder = order;
-                       if (IS_ENABLED(CONFIG_COMPACTION) && order &&
-                                       compaction_suitable(zone, order) !=
-                                               COMPACT_SKIPPED)
-                               testorder = 0;
-
-                       if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
-                           !zone_balanced(zone, testorder,
-                                          balance_gap, end_zone)) {
-                               /*
-                                * There should be no need to raise the
-                                * scanning priority if enough pages are
-                                * already being scanned that high
-                                * watermark would be met at 100% efficiency.
-                                */
-                               if (kswapd_shrink_zone(zone, &sc, lru_pages,
-                                                      &nr_attempted))
-                                       raise_priority = false;
-                       }
-
-                       /*
-                        * If we're getting trouble reclaiming, start doing
-                        * writepage even in laptop mode.
+                        * There should be no need to raise the scanning
+                        * priority if enough pages are already being scanned
+                        * that the high watermark would be met at 100%
+                        * efficiency.
                         */
-                       if (sc.priority < DEF_PRIORITY - 2)
-                               sc.may_writepage = 1;
-
-                       if (zone->all_unreclaimable) {
-                               if (end_zone && end_zone == i)
-                                       end_zone--;
-                               continue;
-                       }
-
-                       if (zone_balanced(zone, testorder, 0, end_zone))
-                               /*
-                                * If a zone reaches its high watermark,
-                                * consider it to be no longer congested. It's
-                                * possible there are dirty pages backed by
-                                * congested BDIs but as pressure is relieved,
-                                * speculatively avoid congestion waits
-                                */
-                               zone_clear_flag(zone, ZONE_CONGESTED);
+                       if (kswapd_shrink_zone(zone, end_zone, &sc,
+                                       lru_pages, &nr_attempted))
+                               raise_priority = false;
                }
 
                /*
@@ -2929,7 +3019,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
                 */
                if (raise_priority || !sc.nr_reclaimed)
                        sc.priority--;
-       } while (sc.priority >= 0 &&
+       } while (sc.priority >= 1 &&
                 !pgdat_balanced(pgdat, order, *classzone_idx));
 
 out: