return PAGEREF_RECLAIM;
}
+/* Check if a page is dirty or under writeback */
+static void page_check_dirty_writeback(struct page *page,
+ bool *dirty, bool *writeback)
+{
+ /*
+ * Anonymous pages are not handled by flushers and must be written
+ * from reclaim context. Do not stall reclaim based on them
+ */
+ if (!page_is_file_cache(page)) {
+ *dirty = false;
+ *writeback = false;
+ return;
+ }
+
+ /* By default assume that the page flags are accurate */
+ *dirty = PageDirty(page);
+ *writeback = PageWriteback(page);
+}
+
/*
* shrink_page_list() returns the number of reclaimed pages
*/
struct zone *zone,
struct scan_control *sc,
enum ttu_flags ttu_flags,
- unsigned long *ret_nr_dirty,
+ unsigned long *ret_nr_unqueued_dirty,
unsigned long *ret_nr_writeback,
bool force_reclaim)
{
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
int pgactivate = 0;
+ unsigned long nr_unqueued_dirty = 0;
unsigned long nr_dirty = 0;
unsigned long nr_congested = 0;
unsigned long nr_reclaimed = 0;
struct page *page;
int may_enter_fs;
enum page_references references = PAGEREF_RECLAIM_CLEAN;
+ bool dirty, writeback;
cond_resched();
may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
+ /*
+ * The number of dirty pages determines if a zone is marked
+ * reclaim_congested which affects wait_iff_congested. kswapd
+ * will stall and start writing pages if the tail of the LRU
+ * is all dirty unqueued pages.
+ */
+ page_check_dirty_writeback(page, &dirty, &writeback);
+ if (dirty || writeback)
+ nr_dirty++;
+
+ if (dirty && !writeback)
+ nr_unqueued_dirty++;
+
+ /* Treat this page as congested if underlying BDI is */
+ mapping = page_mapping(page);
+ if (mapping && bdi_write_congested(mapping->backing_dev_info))
+ nr_congested++;
+
+ /*
+ * If a page at the tail of the LRU is under writeback, there
+ * are three cases to consider.
+ *
+ * 1) If reclaim is encountering an excessive number of pages
+ * under writeback and this page is both under writeback and
+ * PageReclaim then it indicates that pages are being queued
+ * for IO but are being recycled through the LRU before the
+ * IO can complete. Waiting on the page itself risks an
+ * indefinite stall if it is impossible to writeback the
+ * page due to IO error or disconnected storage so instead
+ * block for HZ/10 or until some IO completes then clear the
+ * ZONE_WRITEBACK flag to recheck if the condition exists.
+ *
+ * 2) Global reclaim encounters a page, memcg encounters a
+ * page that is not marked for immediate reclaim or
+ * the caller does not have __GFP_IO. In this case mark
+ * the page for immediate reclaim and continue scanning.
+ *
+ * __GFP_IO is checked because a loop driver thread might
+ * enter reclaim, and deadlock if it waits on a page for
+ * which it is needed to do the write (loop masks off
+ * __GFP_IO|__GFP_FS for this reason); but more thought
+ * would probably show more reasons.
+ *
+ * Don't require __GFP_FS, since we're not going into the
+ * FS, just waiting on its writeback completion. Worryingly,
+ * ext4 gfs2 and xfs allocate pages with
+ * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
+ * may_enter_fs here is liable to OOM on them.
+ *
+ * 3) memcg encounters a page that is not already marked
+ * PageReclaim. memcg does not have any dirty pages
+ * throttling so we could easily OOM just because too many
+ * pages are in writeback and there is nothing else to
+ * reclaim. Wait for the writeback to complete.
+ */
if (PageWriteback(page)) {
- /*
- * memcg doesn't have any dirty pages throttling so we
- * could easily OOM just because too many pages are in
- * writeback and there is nothing else to reclaim.
- *
- * Check __GFP_IO, certainly because a loop driver
- * thread might enter reclaim, and deadlock if it waits
- * on a page for which it is needed to do the write
- * (loop masks off __GFP_IO|__GFP_FS for this reason);
- * but more thought would probably show more reasons.
- *
- * Don't require __GFP_FS, since we're not going into
- * the FS, just waiting on its writeback completion.
- * Worryingly, ext4 gfs2 and xfs allocate pages with
- * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
- * testing may_enter_fs here is liable to OOM on them.
- */
- if (global_reclaim(sc) ||
+ /* Case 1 above */
+ if (current_is_kswapd() &&
+ PageReclaim(page) &&
+ zone_is_reclaim_writeback(zone)) {
+ unlock_page(page);
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
+ zone_clear_flag(zone, ZONE_WRITEBACK);
+ goto keep;
+
+ /* Case 2 above */
+ } else if (global_reclaim(sc) ||
!PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
/*
* This is slightly racy - end_page_writeback()
*/
SetPageReclaim(page);
nr_writeback++;
+
goto keep_locked;
+
+ /* Case 3 above */
+ } else {
+ wait_on_page_writeback(page);
}
- wait_on_page_writeback(page);
}
if (!force_reclaim)
if (!add_to_swap(page, page_list))
goto activate_locked;
may_enter_fs = 1;
- }
- mapping = page_mapping(page);
+ /* Adding to swap updated mapping */
+ mapping = page_mapping(page);
+ }
/*
* The page is mapped into the page tables of one or more
}
if (PageDirty(page)) {
- nr_dirty++;
-
/*
* Only kswapd can writeback filesystem pages to
- * avoid risk of stack overflow but do not writeback
- * unless under significant pressure.
+ * avoid risk of stack overflow but only writeback
+ * if many dirty pages have been encountered.
*/
if (page_is_file_cache(page) &&
(!current_is_kswapd() ||
- sc->priority >= DEF_PRIORITY - 2)) {
+ !zone_is_reclaim_dirty(zone))) {
/*
* Immediately reclaim when written back.
* Similar in principal to deactivate_page()
/* Page is dirty, try to write it out here */
switch (pageout(page, mapping, sc)) {
case PAGE_KEEP:
- nr_congested++;
goto keep_locked;
case PAGE_ACTIVATE:
goto activate_locked;
list_splice(&ret_pages, page_list);
count_vm_events(PGACTIVATE, pgactivate);
mem_cgroup_uncharge_end();
- *ret_nr_dirty += nr_dirty;
+ *ret_nr_unqueued_dirty += nr_unqueued_dirty;
*ret_nr_writeback += nr_writeback;
return nr_reclaimed;
}
unsigned long nr_scanned;
unsigned long nr_reclaimed = 0;
unsigned long nr_taken;
- unsigned long nr_dirty = 0;
+ unsigned long nr_unqueued_dirty = 0;
unsigned long nr_writeback = 0;
isolate_mode_t isolate_mode = 0;
int file = is_file_lru(lru);
return 0;
nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
- &nr_dirty, &nr_writeback, false);
+ &nr_unqueued_dirty, &nr_writeback, false);
spin_lock_irq(&zone->lru_lock);
* isolated page is PageWriteback
*/
if (nr_writeback && nr_writeback >=
- (nr_taken >> (DEF_PRIORITY - sc->priority)))
+ (nr_taken >> (DEF_PRIORITY - sc->priority))) {
wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
+ zone_set_flag(zone, ZONE_WRITEBACK);
+ }
+
+ /*
+ * Similarly, if many dirty pages are encountered that are not
+ * currently being written then flag that kswapd should start
+ * writing back pages and stall to give a chance for flushers
+ * to catch up.
+ */
+ if (global_reclaim(sc) && nr_unqueued_dirty == nr_taken) {
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
+ zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
+ }
trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
zone_idx(zone),
* the high watermark.
*
* Returns true if kswapd scanned at least the requested number of pages to
- * reclaim. This is used to determine if the scanning priority needs to be
- * raised.
+ * reclaim or if the lack of progress was due to pages under writeback.
+ * This is used to determine if the scanning priority needs to be raised.
*/
static bool kswapd_shrink_zone(struct zone *zone,
+ int classzone_idx,
struct scan_control *sc,
unsigned long lru_pages,
unsigned long *nr_attempted)
{
unsigned long nr_slab;
+ int testorder = sc->order;
+ unsigned long balance_gap;
struct reclaim_state *reclaim_state = current->reclaim_state;
struct shrink_control shrink = {
.gfp_mask = sc->gfp_mask,
};
+ bool lowmem_pressure;
/* Reclaim above the high watermark. */
sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
+
+ /*
+ * Kswapd reclaims only single pages with compaction enabled. Trying
+ * too hard to reclaim until contiguous free pages have become
+ * available can hurt performance by evicting too much useful data
+ * from memory. Do not reclaim more than needed for compaction.
+ */
+ if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
+ compaction_suitable(zone, sc->order) !=
+ COMPACT_SKIPPED)
+ testorder = 0;
+
+ /*
+ * We put equal pressure on every zone, unless one zone has way too
+ * many pages free already. The "too many pages" is defined as the
+ * high wmark plus a "gap" where the gap is either the low
+ * watermark or 1% of the zone, whichever is smaller.
+ */
+ balance_gap = min(low_wmark_pages(zone),
+ (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
+ KSWAPD_ZONE_BALANCE_GAP_RATIO);
+
+ /*
+ * If there is no low memory pressure or the zone is balanced then no
+ * reclaim is necessary
+ */
+ lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
+ if (!lowmem_pressure && zone_balanced(zone, testorder,
+ balance_gap, classzone_idx))
+ return true;
+
shrink_zone(zone, sc);
reclaim_state->reclaimed_slab = 0;
if (nr_slab == 0 && !zone_reclaimable(zone))
zone->all_unreclaimable = 1;
+ zone_clear_flag(zone, ZONE_WRITEBACK);
+
+ /*
+ * If a zone reaches its high watermark, consider it to be no longer
+ * congested. It's possible there are dirty pages backed by congested
+ * BDIs but as pressure is relieved, speculatively avoid congestion
+ * waits.
+ */
+ if (!zone->all_unreclaimable &&
+ zone_balanced(zone, testorder, 0, classzone_idx)) {
+ zone_clear_flag(zone, ZONE_CONGESTED);
+ zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
+ }
+
return sc->nr_scanned >= sc->nr_to_reclaim;
}
end_zone = i;
break;
} else {
- /* If balanced, clear the congested flag */
+ /*
+ * If balanced, clear the dirty and congested
+ * flags
+ */
zone_clear_flag(zone, ZONE_CONGESTED);
+ zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
}
}
}
/*
+ * If we're getting trouble reclaiming, start doing writepage
+ * even in laptop mode.
+ */
+ if (sc.priority < DEF_PRIORITY - 2)
+ sc.may_writepage = 1;
+
+ /*
* Now scan the zone in the dma->highmem direction, stopping
* at the last zone which needs scanning.
*
*/
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
- int testorder;
- unsigned long balance_gap;
if (!populated_zone(zone))
continue;
sc.nr_reclaimed += nr_soft_reclaimed;
/*
- * We put equal pressure on every zone, unless
- * one zone has way too many pages free
- * already. The "too many pages" is defined
- * as the high wmark plus a "gap" where the
- * gap is either the low watermark or 1%
- * of the zone, whichever is smaller.
- */
- balance_gap = min(low_wmark_pages(zone),
- (zone->managed_pages +
- KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
- KSWAPD_ZONE_BALANCE_GAP_RATIO);
- /*
- * Kswapd reclaims only single pages with compaction
- * enabled. Trying too hard to reclaim until contiguous
- * free pages have become available can hurt performance
- * by evicting too much useful data from memory.
- * Do not reclaim more than needed for compaction.
- */
- testorder = order;
- if (IS_ENABLED(CONFIG_COMPACTION) && order &&
- compaction_suitable(zone, order) !=
- COMPACT_SKIPPED)
- testorder = 0;
-
- if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
- !zone_balanced(zone, testorder,
- balance_gap, end_zone)) {
- /*
- * There should be no need to raise the
- * scanning priority if enough pages are
- * already being scanned that high
- * watermark would be met at 100% efficiency.
- */
- if (kswapd_shrink_zone(zone, &sc, lru_pages,
- &nr_attempted))
- raise_priority = false;
- }
-
- /*
- * If we're getting trouble reclaiming, start doing
- * writepage even in laptop mode.
+ * There should be no need to raise the scanning
+ * priority if enough pages are already being scanned
+ * that that high watermark would be met at 100%
+ * efficiency.
*/
- if (sc.priority < DEF_PRIORITY - 2)
- sc.may_writepage = 1;
-
- if (zone->all_unreclaimable) {
- if (end_zone && end_zone == i)
- end_zone--;
- continue;
- }
-
- if (zone_balanced(zone, testorder, 0, end_zone))
- /*
- * If a zone reaches its high watermark,
- * consider it to be no longer congested. It's
- * possible there are dirty pages backed by
- * congested BDIs but as pressure is relieved,
- * speculatively avoid congestion waits
- */
- zone_clear_flag(zone, ZONE_CONGESTED);
+ if (kswapd_shrink_zone(zone, end_zone, &sc,
+ lru_pages, &nr_attempted))
+ raise_priority = false;
}
/*