X-Git-Url: http://review.tizen.org/git/?a=blobdiff_plain;f=mm%2Fvmscan.c;h=a7ccb96895c3799e63651c4517c1a4b0cb4ff917;hb=268c1f46b2aaf95fa8501716864366b9796f0e7b;hp=adacd65104d75c92e1ce70f42e8c2930b3148e89;hpb=5bb230aa94f79d302a2c46ba3911f099ede626ec;p=platform%2Fkernel%2Flinux-rpi.git diff --git a/mm/vmscan.c b/mm/vmscan.c index adacd65..a7ccb96 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -51,6 +51,10 @@ #include #include #include +#include +#include +#include +#include #include #include @@ -79,6 +83,13 @@ struct scan_control { */ struct mem_cgroup *target_mem_cgroup; + /* Can active pages be deactivated as part of reclaim? */ +#define DEACTIVATE_ANON 1 +#define DEACTIVATE_FILE 2 + unsigned int may_deactivate:2; + unsigned int force_deactivate:1; + unsigned int skipped_deactivate:1; + /* Writepage batching in laptop mode; RECLAIM_WRITE */ unsigned int may_writepage:1; @@ -101,6 +112,18 @@ struct scan_control { /* One of the zones is ready for compaction */ unsigned int compaction_ready:1; + /* There is easily reclaimable cold cache in the current node */ + unsigned int cache_trim_mode:1; + + /* The file pages on the current node are dangerously low */ + unsigned int file_is_tiny:1; + +#ifdef CONFIG_LRU_GEN + /* help kswapd make better choices among multiple memcgs */ + unsigned int memcgs_need_aging:1; + unsigned long last_reclaimed; +#endif + /* Allocation order */ s8 order; @@ -239,13 +262,13 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker) up_write(&shrinker_rwsem); } -static bool global_reclaim(struct scan_control *sc) +static bool cgroup_reclaim(struct scan_control *sc) { - return !sc->target_mem_cgroup; + return sc->target_mem_cgroup; } /** - * sane_reclaim - is the usual dirty throttling mechanism operational? + * writeback_throttling_sane - is the usual dirty throttling mechanism available? * @sc: scan_control in question * * The normal page dirty throttling mechanism in balance_dirty_pages() is @@ -257,11 +280,9 @@ static bool global_reclaim(struct scan_control *sc) * This function tests whether the vmscan currently in progress can assume * that the normal dirty throttling mechanism is operational. */ -static bool sane_reclaim(struct scan_control *sc) +static bool writeback_throttling_sane(struct scan_control *sc) { - struct mem_cgroup *memcg = sc->target_mem_cgroup; - - if (!memcg) + if (!cgroup_reclaim(sc)) return true; #ifdef CONFIG_CGROUP_WRITEBACK if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) @@ -269,29 +290,6 @@ static bool sane_reclaim(struct scan_control *sc) #endif return false; } - -static void set_memcg_congestion(pg_data_t *pgdat, - struct mem_cgroup *memcg, - bool congested) -{ - struct mem_cgroup_per_node *mn; - - if (!memcg) - return; - - mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id); - WRITE_ONCE(mn->congested, congested); -} - -static bool memcg_congested(pg_data_t *pgdat, - struct mem_cgroup *memcg) -{ - struct mem_cgroup_per_node *mn; - - mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id); - return READ_ONCE(mn->congested); - -} #else static int prealloc_memcg_shrinker(struct shrinker *shrinker) { @@ -302,27 +300,15 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker) { } -static bool global_reclaim(struct scan_control *sc) +static bool cgroup_reclaim(struct scan_control *sc) { - return true; + return false; } -static bool sane_reclaim(struct scan_control *sc) +static bool writeback_throttling_sane(struct scan_control *sc) { return true; } - -static inline void set_memcg_congestion(struct pglist_data *pgdat, - struct mem_cgroup *memcg, bool congested) -{ -} - -static inline bool memcg_congested(struct pglist_data *pgdat, - struct mem_cgroup *memcg) -{ - return false; - -} #endif /* @@ -351,32 +337,21 @@ unsigned long zone_reclaimable_pages(struct zone *zone) */ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) { - unsigned long lru_size = 0; + unsigned long size = 0; int zid; - if (!mem_cgroup_disabled()) { - for (zid = 0; zid < MAX_NR_ZONES; zid++) - lru_size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid); - } else - lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru); - - for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) { + for (zid = 0; zid <= zone_idx && zid < MAX_NR_ZONES; zid++) { struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid]; - unsigned long size; if (!managed_zone(zone)) continue; if (!mem_cgroup_disabled()) - size = mem_cgroup_get_zone_lru_size(lruvec, lru, zid); + size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid); else - size = zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zid], - NR_ZONE_LRU_BASE + lru); - lru_size -= min(size, lru_size); + size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru); } - - return lru_size; - + return size; } /* @@ -899,7 +874,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * gets returned with a refcount of 0. */ static int __remove_mapping(struct address_space *mapping, struct page *page, - bool reclaimed) + bool reclaimed, struct mem_cgroup *target_memcg) { unsigned long flags; int refcount; @@ -971,7 +946,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, */ if (reclaimed && page_is_file_cache(page) && !mapping_exiting(mapping) && !dax_mapping(mapping)) - shadow = workingset_eviction(page); + shadow = workingset_eviction(page, target_memcg); __delete_from_page_cache(page, shadow); xa_unlock_irqrestore(&mapping->i_pages, flags); @@ -994,7 +969,7 @@ cannot_free: */ int remove_mapping(struct address_space *mapping, struct page *page) { - if (__remove_mapping(mapping, page, false)) { + if (__remove_mapping(mapping, page, false, NULL)) { /* * Unfreezing the refcount with 1 rather than 2 effectively * drops the pagecache ref for us without requiring another @@ -1119,7 +1094,6 @@ static void page_check_dirty_writeback(struct page *page, static unsigned long shrink_page_list(struct list_head *page_list, struct pglist_data *pgdat, struct scan_control *sc, - enum ttu_flags ttu_flags, struct reclaim_stat *stat, bool ignore_references) { @@ -1160,6 +1134,11 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (!sc->may_unmap && page_mapped(page)) goto keep_locked; + /* page_update_gen() tried to promote this page? */ + if (lru_gen_enabled() && !ignore_references && + page_mapped(page) && PageReferenced(page)) + goto keep_locked; + may_enter_fs = (sc->gfp_mask & __GFP_FS) || (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); @@ -1239,7 +1218,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto activate_locked; /* Case 2 above */ - } else if (sane_reclaim(sc) || + } else if (writeback_throttling_sane(sc) || !PageReclaim(page) || !may_enter_fs) { /* * This is slightly racy - end_page_writeback() @@ -1345,7 +1324,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * processes. Try to unmap it here. */ if (page_mapped(page)) { - enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH; + enum ttu_flags flags = TTU_BATCH_FLUSH; if (unlikely(PageTransHuge(page))) flags |= TTU_SPLIT_HUGE_PMD; @@ -1472,7 +1451,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, count_vm_event(PGLAZYFREED); count_memcg_page_event(page, PGLAZYFREED); - } else if (!mapping || !__remove_mapping(mapping, page, true)) + } else if (!mapping || !__remove_mapping(mapping, page, true, + sc->target_mem_cgroup)) goto keep_locked; unlock_page(page); @@ -1555,7 +1535,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, } ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, - TTU_IGNORE_ACCESS, &dummy_stat, true); + &dummy_stat, true); list_splice(&clean_pages, page_list); mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret); return ret; @@ -1573,7 +1553,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, */ int __isolate_lru_page(struct page *page, isolate_mode_t mode) { - int ret = -EINVAL; + int ret = -EBUSY; /* Only take pages on the LRU. */ if (!PageLRU(page)) @@ -1583,8 +1563,6 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE)) return ret; - ret = -EBUSY; - /* * To minimise LRU disruption, the caller can indicate that it only * wants to isolate pages it will be able to operate on without @@ -1631,8 +1609,10 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) * sure the page is not being freed elsewhere -- the * page release code relies on it. */ - ClearPageLRU(page); - ret = 0; + if (TestClearPageLRU(page)) + ret = 0; + else + put_page(page); } return ret; @@ -1702,8 +1682,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); - VM_BUG_ON_PAGE(!PageLRU(page), page); - nr_pages = compound_nr(page); total_scan += nr_pages; @@ -1800,21 +1778,18 @@ int isolate_lru_page(struct page *page) VM_BUG_ON_PAGE(!page_count(page), page); WARN_RATELIMIT(PageTail(page), "trying to isolate tail page"); - if (PageLRU(page)) { + if (TestClearPageLRU(page)) { pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; - spin_lock_irq(&pgdat->lru_lock); + get_page(page); lruvec = mem_cgroup_page_lruvec(page, pgdat); - if (PageLRU(page)) { - int lru = page_lru(page); - get_page(page); - ClearPageLRU(page); - del_page_from_lru_list(page, lruvec, lru); - ret = 0; - } + spin_lock_irq(&pgdat->lru_lock); + del_page_from_lru_list(page, lruvec, page_lru(page)); spin_unlock_irq(&pgdat->lru_lock); + ret = 0; } + return ret; } @@ -1833,7 +1808,7 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, if (current_is_kswapd()) return 0; - if (!sane_reclaim(sc)) + if (!writeback_throttling_sane(sc)) return 0; if (file) { @@ -1983,7 +1958,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, reclaim_stat->recent_scanned[file] += nr_taken; item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; - if (global_reclaim(sc)) + if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_scanned); __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); spin_unlock_irq(&pgdat->lru_lock); @@ -1991,13 +1966,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (nr_taken == 0) return 0; - nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0, - &stat, false); + nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, &stat, false); spin_lock_irq(&pgdat->lru_lock); item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; - if (global_reclaim(sc)) + if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_reclaimed); __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); reclaim_stat->recent_rotated[0] += stat.nr_activate[0]; @@ -2173,8 +2147,7 @@ unsigned long reclaim_pages(struct list_head *page_list) nr_reclaimed += shrink_page_list(&node_page_list, NODE_DATA(nid), - &sc, 0, - &dummy_stat, false); + &sc, &dummy_stat, false); while (!list_empty(&node_page_list)) { page = lru_to_page(&node_page_list); list_del(&page->lru); @@ -2187,8 +2160,7 @@ unsigned long reclaim_pages(struct list_head *page_list) if (!list_empty(&node_page_list)) { nr_reclaimed += shrink_page_list(&node_page_list, NODE_DATA(nid), - &sc, 0, - &dummy_stat, false); + &sc, &dummy_stat, false); while (!list_empty(&node_page_list)) { page = lru_to_page(&node_page_list); list_del(&page->lru); @@ -2199,6 +2171,20 @@ unsigned long reclaim_pages(struct list_head *page_list) return nr_reclaimed; } +static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, + struct lruvec *lruvec, struct scan_control *sc) +{ + if (is_active_lru(lru)) { + if (sc->may_deactivate & (1 << is_file_lru(lru))) + shrink_active_list(nr_to_scan, lruvec, sc, lru); + else + sc->skipped_deactivate = 1; + return 0; + } + + return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); +} + /* * The inactive anon list should be small enough that the VM never has * to do too much work. @@ -2227,64 +2213,25 @@ unsigned long reclaim_pages(struct list_head *page_list) * 1TB 101 10GB * 10TB 320 32GB */ -static bool inactive_list_is_low(struct lruvec *lruvec, bool file, - struct scan_control *sc, bool trace) +static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru) { - enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE; - struct pglist_data *pgdat = lruvec_pgdat(lruvec); - enum lru_list inactive_lru = file * LRU_FILE; + enum lru_list active_lru = inactive_lru + LRU_ACTIVE; unsigned long inactive, active; unsigned long inactive_ratio; - unsigned long refaults; unsigned long gb; - /* - * If we don't have swap space, anonymous page deactivation - * is pointless. - */ - if (!file && !total_swap_pages) - return false; - - inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx); - active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx); - - /* - * When refaults are being observed, it means a new workingset - * is being established. Disable active list protection to get - * rid of the stale workingset quickly. - */ - refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE); - if (file && lruvec->refaults != refaults) { - inactive_ratio = 0; - } else { - gb = (inactive + active) >> (30 - PAGE_SHIFT); - if (gb) - inactive_ratio = int_sqrt(10 * gb); - else - inactive_ratio = 1; - } + inactive = lruvec_page_state(lruvec, NR_LRU_BASE + inactive_lru); + active = lruvec_page_state(lruvec, NR_LRU_BASE + active_lru); - if (trace) - trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx, - lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive, - lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active, - inactive_ratio, file); + gb = (inactive + active) >> (30 - PAGE_SHIFT); + if (gb) + inactive_ratio = int_sqrt(10 * gb); + else + inactive_ratio = 1; return inactive * inactive_ratio < active; } -static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, - struct lruvec *lruvec, struct scan_control *sc) -{ - if (is_active_lru(lru)) { - if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true)) - shrink_active_list(nr_to_scan, lruvec, sc, lru); - return 0; - } - - return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); -} - enum scan_balance { SCAN_EQUAL, SCAN_FRACT, @@ -2292,6 +2239,95 @@ enum scan_balance { SCAN_FILE, }; +static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) +{ + unsigned long file; + struct lruvec *target_lruvec; + + if (lru_gen_enabled()) + return; + + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); + + /* + * Target desirable inactive:active list ratios for the anon + * and file LRU lists. + */ + if (!sc->force_deactivate) { + unsigned long refaults; + + if (inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) + sc->may_deactivate |= DEACTIVATE_ANON; + else + sc->may_deactivate &= ~DEACTIVATE_ANON; + + /* + * When refaults are being observed, it means a new + * workingset is being established. Deactivate to get + * rid of any stale active pages quickly. + */ + refaults = lruvec_page_state(target_lruvec, + WORKINGSET_ACTIVATE); + if (refaults != target_lruvec->refaults || + inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) + sc->may_deactivate |= DEACTIVATE_FILE; + else + sc->may_deactivate &= ~DEACTIVATE_FILE; + } else + sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; + + /* + * If we have plenty of inactive file pages that aren't + * thrashing, try to reclaim those first before touching + * anonymous pages. + */ + file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); + if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) + sc->cache_trim_mode = 1; + else + sc->cache_trim_mode = 0; + + /* + * Prevent the reclaimer from falling into the cache trap: as + * cache pages start out inactive, every cache fault will tip + * the scan balance towards the file LRU. And as the file LRU + * shrinks, so does the window for rotation from references. + * This means we have a runaway feedback loop where a tiny + * thrashing file LRU becomes infinitely more attractive than + * anon pages. Try to detect this based on file LRU size. + */ + if (!cgroup_reclaim(sc)) { + unsigned long total_high_wmark = 0; + unsigned long free, anon; + int z; + + free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); + file = node_page_state(pgdat, NR_ACTIVE_FILE) + + node_page_state(pgdat, NR_INACTIVE_FILE); + + for (z = 0; z < MAX_NR_ZONES; z++) { + struct zone *zone = &pgdat->node_zones[z]; + + if (!managed_zone(zone)) + continue; + + total_high_wmark += high_wmark_pages(zone); + } + + /* + * Consider anon: if that's low too, this isn't a + * runaway file reclaim problem, but rather just + * extreme pressure. Reclaim as per usual then. + */ + anon = node_page_state(pgdat, NR_INACTIVE_ANON); + + sc->file_is_tiny = + file + free <= total_high_wmark && + !(sc->may_deactivate & DEACTIVATE_ANON) && + anon >> sc->priority; + } +} + /* * Determine how aggressively the anon and file LRU lists should be * scanned. The relative value of each set of LRU lists is determined @@ -2301,12 +2337,13 @@ enum scan_balance { * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan */ -static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, - struct scan_control *sc, unsigned long *nr) +static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, + unsigned long *nr) { + struct mem_cgroup *memcg = lruvec_memcg(lruvec); int swappiness = mem_cgroup_swappiness(memcg); struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; - u64 fraction[2]; + u64 fraction[ANON_AND_FILE]; u64 denominator = 0; /* gcc */ struct pglist_data *pgdat = lruvec_pgdat(lruvec); unsigned long anon_prio, file_prio; @@ -2328,7 +2365,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * using the memory controller's swap limit feature would be * too expensive. */ - if (!global_reclaim(sc) && !swappiness) { + if (cgroup_reclaim(sc) && !swappiness) { scan_balance = SCAN_FILE; goto out; } @@ -2344,58 +2381,18 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, } /* - * Prevent the reclaimer from falling into the cache trap: as - * cache pages start out inactive, every cache fault will tip - * the scan balance towards the file LRU. And as the file LRU - * shrinks, so does the window for rotation from references. - * This means we have a runaway feedback loop where a tiny - * thrashing file LRU becomes infinitely more attractive than - * anon pages. Try to detect this based on file LRU size. + * If the system is almost out of file pages, force-scan anon. */ - if (global_reclaim(sc)) { - unsigned long pgdatfile; - unsigned long pgdatfree; - int z; - unsigned long total_high_wmark = 0; - - pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); - pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) + - node_page_state(pgdat, NR_INACTIVE_FILE); - - for (z = 0; z < MAX_NR_ZONES; z++) { - struct zone *zone = &pgdat->node_zones[z]; - if (!managed_zone(zone)) - continue; - - total_high_wmark += high_wmark_pages(zone); - } - - if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) { - /* - * Force SCAN_ANON if there are enough inactive - * anonymous pages on the LRU in eligible zones. - * Otherwise, the small LRU gets thrashed. - */ - if (!inactive_list_is_low(lruvec, false, sc, false) && - lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx) - >> sc->priority) { - scan_balance = SCAN_ANON; - goto out; - } - } + if (sc->file_is_tiny) { + scan_balance = SCAN_ANON; + goto out; } /* - * If there is enough inactive page cache, i.e. if the size of the - * inactive list is greater than that of the active list *and* the - * inactive list actually has some pages to scan on this priority, we - * do not reclaim anything from the anonymous working set right now. - * Without the second condition we could end up never scanning an - * lruvec even if it has plenty of old anonymous pages unless the - * system is under heavy pressure. + * If there is enough inactive page cache, we do not reclaim + * anything from the anonymous working right now. */ - if (!inactive_list_is_low(lruvec, true, sc, false) && - lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) { + if (sc->cache_trim_mode) { scan_balance = SCAN_FILE; goto out; } @@ -2540,10 +2537,8 @@ out: case SCAN_FILE: case SCAN_ANON: /* Scan one type exclusively */ - if ((scan_balance == SCAN_FILE) != file) { - lruvec_size = 0; + if ((scan_balance == SCAN_FILE) != file) scan = 0; - } break; default: /* Look ma, no brain */ @@ -2554,121 +2549,2839 @@ out: } } -/* - * This is a basic per-node page freer. Used by both kswapd and direct reclaim. - */ -static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg, - struct scan_control *sc) -{ - struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); - unsigned long nr[NR_LRU_LISTS]; - unsigned long targets[NR_LRU_LISTS]; - unsigned long nr_to_scan; - enum lru_list lru; - unsigned long nr_reclaimed = 0; - unsigned long nr_to_reclaim = sc->nr_to_reclaim; - struct blk_plug plug; - bool scan_adjusted; +#ifdef CONFIG_LRU_GEN - get_scan_count(lruvec, memcg, sc, nr); +#ifdef CONFIG_LRU_GEN_ENABLED +DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS); +#define get_cap(cap) static_branch_likely(&lru_gen_caps[cap]) +#else +DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS); +#define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap]) +#endif - /* Record the original scan target for proportional adjustments later */ - memcpy(targets, nr, sizeof(nr)); +/****************************************************************************** + * shorthand helpers + ******************************************************************************/ - /* - * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal - * event that can occur when there is little memory pressure e.g. - * multiple streaming readers/writers. Hence, we do not abort scanning - * when the requested number of pages are reclaimed when scanning at - * DEF_PRIORITY on the assumption that the fact we are direct - * reclaiming implies that kswapd is not keeping up and it is best to - * do a batch of work at once. For memcg reclaim one check is made to - * abort proportional reclaim if either the file or anon lru has already - * dropped to zero at the first pass. - */ - scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && - sc->priority == DEF_PRIORITY); +#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) - blk_start_plug(&plug); - while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || - nr[LRU_INACTIVE_FILE]) { - unsigned long nr_anon, nr_file, percentage; - unsigned long nr_scanned; +#define DEFINE_MAX_SEQ(lruvec) \ + unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq) - for_each_evictable_lru(lru) { - if (nr[lru]) { - nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); - nr[lru] -= nr_to_scan; +#define DEFINE_MIN_SEQ(lruvec) \ + unsigned long min_seq[ANON_AND_FILE] = { \ + READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]), \ + READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \ + } - nr_reclaimed += shrink_list(lru, nr_to_scan, - lruvec, sc); - } - } +#define for_each_gen_type_zone(gen, type, zone) \ + for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ + for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ + for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) - cond_resched(); +static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid) +{ + struct pglist_data *pgdat = NODE_DATA(nid); - if (nr_reclaimed < nr_to_reclaim || scan_adjusted) - continue; +#ifdef CONFIG_MEMCG + if (memcg) { + struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; - /* - * For kswapd and memcg, reclaim at least the number of pages - * requested. Ensure that the anon and file LRUs are scanned - * proportionally what was requested by get_scan_count(). We - * stop reclaiming one LRU and reduce the amount scanning - * proportional to the original scan target. - */ - nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; - nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; + /* for hotadd_new_pgdat() */ + if (!lruvec->pgdat) + lruvec->pgdat = pgdat; - /* - * It's just vindictive to attack the larger once the smaller - * has gone to zero. And given the way we stop scanning the - * smaller below, this makes sure that we only make one nudge - * towards proportionality once we've got nr_to_reclaim. - */ - if (!nr_file || !nr_anon) - break; + return lruvec; + } +#endif + VM_WARN_ON_ONCE(!mem_cgroup_disabled()); - if (nr_file > nr_anon) { - unsigned long scan_target = targets[LRU_INACTIVE_ANON] + - targets[LRU_ACTIVE_ANON] + 1; - lru = LRU_BASE; - percentage = nr_anon * 100 / scan_target; - } else { - unsigned long scan_target = targets[LRU_INACTIVE_FILE] + - targets[LRU_ACTIVE_FILE] + 1; - lru = LRU_FILE; - percentage = nr_file * 100 / scan_target; - } + return pgdat ? &pgdat->__lruvec : NULL; +} - /* Stop scanning the smaller of the LRU */ - nr[lru] = 0; - nr[lru + LRU_ACTIVE] = 0; +static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) +{ + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + /* struct pglist_data *pgdat = lruvec_pgdat(lruvec); */ - /* - * Recalculate the other LRU scan count based on its original - * scan target and the percentage scanning already complete - */ - lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE; - nr_scanned = targets[lru] - nr[lru]; - nr[lru] = targets[lru] * (100 - percentage) / 100; - nr[lru] -= min(nr[lru], nr_scanned); + /* FIXME: see a2a36488a61c + 26aa2d199d6f */ + if (/* !can_demote(pgdat->node_id, sc) && */ + mem_cgroup_get_nr_swap_pages(memcg) <= 0) + return 0; - lru += LRU_ACTIVE; - nr_scanned = targets[lru] - nr[lru]; - nr[lru] = targets[lru] * (100 - percentage) / 100; - nr[lru] -= min(nr[lru], nr_scanned); + return mem_cgroup_swappiness(memcg); +} - scan_adjusted = true; - } - blk_finish_plug(&plug); +static int get_nr_gens(struct lruvec *lruvec, int type) +{ + return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1; +} + +static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) +{ + /* see the comment on lru_gen_struct */ + return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS && + get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) && + get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS; +} + +/****************************************************************************** + * mm_struct list + ******************************************************************************/ + +static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) +{ + static struct lru_gen_mm_list mm_list = { + .fifo = LIST_HEAD_INIT(mm_list.fifo), + .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock), + }; + +#ifdef CONFIG_MEMCG + if (memcg) + return &memcg->mm_list; +#endif + VM_WARN_ON_ONCE(!mem_cgroup_disabled()); + + return &mm_list; +} + +void lru_gen_add_mm(struct mm_struct *mm) +{ + int nid; + struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); + + VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list)); +#ifdef CONFIG_MEMCG + VM_WARN_ON_ONCE(mm->lru_gen.memcg); + mm->lru_gen.memcg = memcg; +#endif + spin_lock(&mm_list->lock); + + for_each_node_state(nid, N_MEMORY) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + + if (!lruvec) + continue; + + /* the first addition since the last iteration */ + if (lruvec->mm_state.tail == &mm_list->fifo) + lruvec->mm_state.tail = &mm->lru_gen.list; + } + + list_add_tail(&mm->lru_gen.list, &mm_list->fifo); + + spin_unlock(&mm_list->lock); +} + +void lru_gen_del_mm(struct mm_struct *mm) +{ + int nid; + struct lru_gen_mm_list *mm_list; + struct mem_cgroup *memcg = NULL; + + if (list_empty(&mm->lru_gen.list)) + return; + +#ifdef CONFIG_MEMCG + memcg = mm->lru_gen.memcg; +#endif + mm_list = get_mm_list(memcg); + + spin_lock(&mm_list->lock); + + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + + if (!lruvec) + continue; + + /* where the current iteration continues after */ + if (lruvec->mm_state.head == &mm->lru_gen.list) + lruvec->mm_state.head = lruvec->mm_state.head->prev; + + /* where the last iteration ended before */ + if (lruvec->mm_state.tail == &mm->lru_gen.list) + lruvec->mm_state.tail = lruvec->mm_state.tail->next; + } + + list_del_init(&mm->lru_gen.list); + + spin_unlock(&mm_list->lock); + +#ifdef CONFIG_MEMCG + mem_cgroup_put(mm->lru_gen.memcg); + mm->lru_gen.memcg = NULL; +#endif +} + +#ifdef CONFIG_MEMCG +void lru_gen_migrate_mm(struct mm_struct *mm) +{ + struct mem_cgroup *memcg; + struct task_struct *task = rcu_dereference_protected(mm->owner, true); + + VM_WARN_ON_ONCE(task->mm != mm); + lockdep_assert_held(&task->alloc_lock); + + /* for mm_update_next_owner() */ + if (mem_cgroup_disabled()) + return; + + rcu_read_lock(); + memcg = mem_cgroup_from_task(task); + rcu_read_unlock(); + if (memcg == mm->lru_gen.memcg) + return; + + VM_WARN_ON_ONCE(!mm->lru_gen.memcg); + VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list)); + + lru_gen_del_mm(mm); + lru_gen_add_mm(mm); +} +#endif + +/* + * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when + * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of + * bits in a bitmap, k is the number of hash functions and n is the number of + * inserted items. + * + * Page table walkers use one of the two filters to reduce their search space. + * To get rid of non-leaf entries that no longer have enough leaf entries, the + * aging uses the double-buffering technique to flip to the other filter each + * time it produces a new generation. For non-leaf entries that have enough + * leaf entries, the aging carries them over to the next generation in + * walk_pmd_range(); the eviction also report them when walking the rmap + * in lru_gen_look_around(). + * + * For future optimizations: + * 1. It's not necessary to keep both filters all the time. The spare one can be + * freed after the RCU grace period and reallocated if needed again. + * 2. And when reallocating, it's worth scaling its size according to the number + * of inserted entries in the other filter, to reduce the memory overhead on + * small systems and false positives on large systems. + * 3. Jenkins' hash function is an alternative to Knuth's. + */ +#define BLOOM_FILTER_SHIFT 15 + +static inline int filter_gen_from_seq(unsigned long seq) +{ + return seq % NR_BLOOM_FILTERS; +} + +static void get_item_key(void *item, int *key) +{ + u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); + + BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); + + key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); + key[1] = hash >> BLOOM_FILTER_SHIFT; +} + +static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) +{ + unsigned long *filter; + int gen = filter_gen_from_seq(seq); + + filter = lruvec->mm_state.filters[gen]; + if (filter) { + bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); + return; + } + + filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), + __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); + WRITE_ONCE(lruvec->mm_state.filters[gen], filter); +} + +static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) +{ + int key[2]; + unsigned long *filter; + int gen = filter_gen_from_seq(seq); + + filter = READ_ONCE(lruvec->mm_state.filters[gen]); + if (!filter) + return; + + get_item_key(item, key); + + if (!test_bit(key[0], filter)) + set_bit(key[0], filter); + if (!test_bit(key[1], filter)) + set_bit(key[1], filter); +} + +static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) +{ + int key[2]; + unsigned long *filter; + int gen = filter_gen_from_seq(seq); + + filter = READ_ONCE(lruvec->mm_state.filters[gen]); + if (!filter) + return true; + + get_item_key(item, key); + + return test_bit(key[0], filter) && test_bit(key[1], filter); +} + +static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last) +{ + int i; + int hist; + + lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); + + if (walk) { + hist = lru_hist_from_seq(walk->max_seq); + + for (i = 0; i < NR_MM_STATS; i++) { + WRITE_ONCE(lruvec->mm_state.stats[hist][i], + lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]); + walk->mm_stats[i] = 0; + } + } + + if (NR_HIST_GENS > 1 && last) { + hist = lru_hist_from_seq(lruvec->mm_state.seq + 1); + + for (i = 0; i < NR_MM_STATS; i++) + WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0); + } +} + +static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) +{ + int type; + unsigned long size = 0; + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); + int key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap); + + if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap)) + return true; + + clear_bit(key, &mm->lru_gen.bitmap); + + for (type = !walk->can_swap; type < ANON_AND_FILE; type++) { + size += type ? get_mm_counter(mm, MM_FILEPAGES) : + get_mm_counter(mm, MM_ANONPAGES) + + get_mm_counter(mm, MM_SHMEMPAGES); + } + + if (size < MIN_LRU_BATCH) + return true; + + return !mmget_not_zero(mm); +} + +static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, + struct mm_struct **iter) +{ + bool first = false; + bool last = false; + struct mm_struct *mm = NULL; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); + struct lru_gen_mm_state *mm_state = &lruvec->mm_state; + + /* + * mm_state->seq is incremented after each iteration of mm_list. There + * are three interesting cases for this page table walker: + * 1. It tries to start a new iteration with a stale max_seq: there is + * nothing left to do. + * 2. It started the next iteration: it needs to reset the Bloom filter + * so that a fresh set of PTE tables can be recorded. + * 3. It ended the current iteration: it needs to reset the mm stats + * counters and tell its caller to increment max_seq. + */ + spin_lock(&mm_list->lock); + + VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq); + + if (walk->max_seq <= mm_state->seq) + goto done; + + if (!mm_state->head) + mm_state->head = &mm_list->fifo; + + if (mm_state->head == &mm_list->fifo) + first = true; + + do { + mm_state->head = mm_state->head->next; + if (mm_state->head == &mm_list->fifo) { + WRITE_ONCE(mm_state->seq, mm_state->seq + 1); + last = true; + break; + } + + /* force scan for those added after the last iteration */ + if (!mm_state->tail || mm_state->tail == mm_state->head) { + mm_state->tail = mm_state->head->next; + walk->force_scan = true; + } + + mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); + if (should_skip_mm(mm, walk)) + mm = NULL; + } while (!mm); +done: + if (*iter || last) + reset_mm_stats(lruvec, walk, last); + + spin_unlock(&mm_list->lock); + + if (mm && first) + reset_bloom_filter(lruvec, walk->max_seq + 1); + + if (*iter) + mmput_async(*iter); + + *iter = mm; + + return last; +} + +static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq) +{ + bool success = false; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); + struct lru_gen_mm_state *mm_state = &lruvec->mm_state; + + spin_lock(&mm_list->lock); + + VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq); + + if (max_seq > mm_state->seq) { + mm_state->head = NULL; + mm_state->tail = NULL; + WRITE_ONCE(mm_state->seq, mm_state->seq + 1); + reset_mm_stats(lruvec, NULL, true); + success = true; + } + + spin_unlock(&mm_list->lock); + + return success; +} + +/****************************************************************************** + * refault feedback loop + ******************************************************************************/ + +/* + * A feedback loop based on Proportional-Integral-Derivative (PID) controller. + * + * The P term is refaulted/(evicted+protected) from a tier in the generation + * currently being evicted; the I term is the exponential moving average of the + * P term over the generations previously evicted, using the smoothing factor + * 1/2; the D term isn't supported. + * + * The setpoint (SP) is always the first tier of one type; the process variable + * (PV) is either any tier of the other type or any other tier of the same + * type. + * + * The error is the difference between the SP and the PV; the correction is to + * turn off protection when SP>PV or turn on protection when SPlrugen; + int hist = lru_hist_from_seq(lrugen->min_seq[type]); + + pos->refaulted = lrugen->avg_refaulted[type][tier] + + atomic_long_read(&lrugen->refaulted[hist][type][tier]); + pos->total = lrugen->avg_total[type][tier] + + atomic_long_read(&lrugen->evicted[hist][type][tier]); + if (tier) + pos->total += lrugen->protected[hist][type][tier - 1]; + pos->gain = gain; +} + +static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) +{ + int hist, tier; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1; + unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1; + + lockdep_assert_held(&lruvec_pgdat(lruvec)->lru_lock); + + if (!carryover && !clear) + return; + + hist = lru_hist_from_seq(seq); + + for (tier = 0; tier < MAX_NR_TIERS; tier++) { + if (carryover) { + unsigned long sum; + + sum = lrugen->avg_refaulted[type][tier] + + atomic_long_read(&lrugen->refaulted[hist][type][tier]); + WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); + + sum = lrugen->avg_total[type][tier] + + atomic_long_read(&lrugen->evicted[hist][type][tier]); + if (tier) + sum += lrugen->protected[hist][type][tier - 1]; + WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); + } + + if (clear) { + atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); + atomic_long_set(&lrugen->evicted[hist][type][tier], 0); + if (tier) + WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0); + } + } +} + +static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) +{ + /* + * Return true if the PV has a limited number of refaults or a lower + * refaulted/total than the SP. + */ + return pv->refaulted < MIN_LRU_BATCH || + pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <= + (sp->refaulted + 1) * pv->total * pv->gain; +} + +/****************************************************************************** + * the aging + ******************************************************************************/ + +/* promote pages accessed through page tables */ +static int page_update_gen(struct page *page, int gen) +{ + unsigned long new_flags, old_flags = READ_ONCE(page->flags); + + VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); + VM_WARN_ON_ONCE(!rcu_read_lock_held()); + + do { + /* lru_gen_del_page() has isolated this page? */ + if (!(old_flags & LRU_GEN_MASK)) { + /* for shrink_page_list() */ + new_flags = old_flags | BIT(PG_referenced); + continue; + } + + new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); + new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; + } while (!try_cmpxchg(&page->flags, &old_flags, new_flags)); + + return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; +} + +/* protect pages accessed multiple times through file descriptors */ +static int page_inc_gen(struct lruvec *lruvec, struct page *page, bool reclaiming) +{ + int type = page_is_file_cache(page); + struct lru_gen_struct *lrugen = &lruvec->lrugen; + int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); + unsigned long new_flags, old_flags = READ_ONCE(page->flags); + + VM_WARN_ON_ONCE_PAGE(!(old_flags & LRU_GEN_MASK), page); + + do { + new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; + /* page_update_gen() has promoted this page? */ + if (new_gen >= 0 && new_gen != old_gen) + return new_gen; + + new_gen = (old_gen + 1) % MAX_NR_GENS; + + new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); + new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; + /* for end_page_writeback() */ + if (reclaiming) + new_flags |= BIT(PG_reclaim); + } while (!try_cmpxchg(&page->flags, &old_flags, new_flags)); + + lru_gen_update_size(lruvec, page, old_gen, new_gen); + + return new_gen; +} + +static void update_batch_size(struct lru_gen_mm_walk *walk, struct page *page, + int old_gen, int new_gen) +{ + int type = page_is_file_cache(page); + int zone = page_zonenum(page); + int delta = hpage_nr_pages(page); + + VM_WARN_ON_ONCE(old_gen >= MAX_NR_GENS); + VM_WARN_ON_ONCE(new_gen >= MAX_NR_GENS); + + walk->batched++; + + walk->nr_pages[old_gen][type][zone] -= delta; + walk->nr_pages[new_gen][type][zone] += delta; +} + +static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk) +{ + int gen, type, zone; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + walk->batched = 0; + + for_each_gen_type_zone(gen, type, zone) { + enum lru_list lru = type * LRU_INACTIVE_FILE; + int delta = walk->nr_pages[gen][type][zone]; + + if (!delta) + continue; + + walk->nr_pages[gen][type][zone] = 0; + WRITE_ONCE(lrugen->nr_pages[gen][type][zone], + lrugen->nr_pages[gen][type][zone] + delta); + + if (lru_gen_is_active(lruvec, gen)) + lru += LRU_ACTIVE; + __update_lru_size(lruvec, lru, zone, delta); + } +} + +static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args) +{ + struct address_space *mapping; + struct vm_area_struct *vma = args->vma; + struct lru_gen_mm_walk *walk = args->private; + + if (!vma_is_accessible(vma)) + return true; + + if (is_vm_hugetlb_page(vma)) + return true; + + if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ)) + return true; + + if (vma == get_gate_vma(vma->vm_mm)) + return true; + + if (vma_is_anonymous(vma)) + return !walk->can_swap; + + if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping)) + return true; + + mapping = vma->vm_file->f_mapping; + if (mapping_unevictable(mapping)) + return true; + + if (shmem_mapping(mapping)) + return !walk->can_swap; + + /* to exclude special mappings like dax, etc. */ + return !mapping->a_ops->readpage; +} + +/* + * Some userspace memory allocators map many single-page VMAs. Instead of + * returning back to the PGD table for each of such VMAs, finish an entire PMD + * table to reduce zigzags and improve cache performance. + */ +static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk *args, + unsigned long *vm_start, unsigned long *vm_end) +{ + unsigned long start = round_up(*vm_end, size); + unsigned long end = (start | ~mask) + 1; + + VM_WARN_ON_ONCE(mask & size); + VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask)); + + while (args->vma) { + if (start >= args->vma->vm_end) { + args->vma = args->vma->vm_next; + continue; + } + + if (end && end <= args->vma->vm_start) + return false; + + if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args)) { + args->vma = args->vma->vm_next; + continue; + } + + *vm_start = max(start, args->vma->vm_start); + *vm_end = min(end - 1, args->vma->vm_end - 1) + 1; + + return true; + } + + return false; +} + +static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr) +{ + unsigned long pfn = pte_pfn(pte); + + VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); + + if (!pte_present(pte) || is_zero_pfn(pfn)) + return -1; + + if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte))) + return -1; + + if (WARN_ON_ONCE(!pfn_valid(pfn))) + return -1; + + return pfn; +} + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) +static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr) +{ + unsigned long pfn = pmd_pfn(pmd); + + VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); + + if (!pmd_present(pmd) || is_huge_zero_pmd(pmd)) + return -1; + + if (WARN_ON_ONCE(pmd_devmap(pmd))) + return -1; + + if (WARN_ON_ONCE(!pfn_valid(pfn))) + return -1; + + return pfn; +} +#endif + +static struct page *get_pfn_page(unsigned long pfn, struct mem_cgroup *memcg, + struct pglist_data *pgdat, bool can_swap) +{ + struct page *page; + + /* try to avoid unnecessary memory loads */ + if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) + return NULL; + + page = compound_head(pfn_to_page(pfn)); + if (page_to_nid(page) != pgdat->node_id) + return NULL; + + if (page_memcg_rcu(page) != memcg) + return NULL; + + /* file VMAs can contain anon pages from COW */ + if (!page_is_file_cache(page) && !can_swap) + return NULL; + + return page; +} + +static bool suitable_to_scan(int total, int young) +{ + int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8); + + /* suitable if the average number of young PTEs per cacheline is >=1 */ + return young * n >= total; +} + +static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, + struct mm_walk *args) +{ + int i; + pte_t *pte; + spinlock_t *ptl; + unsigned long addr; + int total = 0; + int young = 0; + struct lru_gen_mm_walk *walk = args->private; + struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); + int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); + + VM_WARN_ON_ONCE(pmd_leaf(*pmd)); + + ptl = pte_lockptr(args->mm, pmd); + if (!spin_trylock(ptl)) + return false; + + arch_enter_lazy_mmu_mode(); + + pte = pte_offset_map(pmd, start & PMD_MASK); +restart: + for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { + unsigned long pfn; + struct page *page; + + total++; + walk->mm_stats[MM_LEAF_TOTAL]++; + + pfn = get_pte_pfn(pte[i], args->vma, addr); + if (pfn == -1) + continue; + + if (!pte_young(pte[i])) { + walk->mm_stats[MM_LEAF_OLD]++; + continue; + } + + page = get_pfn_page(pfn, memcg, pgdat, walk->can_swap); + if (!page) + continue; + + if (!ptep_test_and_clear_young(args->vma, addr, pte + i)) + VM_WARN_ON_ONCE(true); + + young++; + walk->mm_stats[MM_LEAF_YOUNG]++; + + if (pte_dirty(pte[i]) && !PageDirty(page) && + !(PageAnon(page) && PageSwapBacked(page) && + !PageSwapCache(page))) + set_page_dirty(page); + + old_gen = page_update_gen(page, new_gen); + if (old_gen >= 0 && old_gen != new_gen) + update_batch_size(walk, page, old_gen, new_gen); + } + + if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end)) + goto restart; + + pte_unmap(pte); + + arch_leave_lazy_mmu_mode(); + spin_unlock(ptl); + + return suitable_to_scan(total, young); +} + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) +static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, + struct mm_walk *args, unsigned long *bitmap, unsigned long *start) +{ + int i; + pmd_t *pmd; + spinlock_t *ptl; + struct lru_gen_mm_walk *walk = args->private; + struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); + int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); + + VM_WARN_ON_ONCE(pud_leaf(*pud)); + + /* try to batch at most 1+MIN_LRU_BATCH+1 entries */ + if (*start == -1) { + *start = next; + return; + } + + i = next == -1 ? 0 : pmd_index(next) - pmd_index(*start); + if (i && i <= MIN_LRU_BATCH) { + __set_bit(i - 1, bitmap); + return; + } + + pmd = pmd_offset(pud, *start); + + ptl = pmd_lockptr(args->mm, pmd); + if (!spin_trylock(ptl)) + goto done; + + arch_enter_lazy_mmu_mode(); + + do { + unsigned long pfn; + struct page *page; + unsigned long addr = i ? (*start & PMD_MASK) + i * PMD_SIZE : *start; + + pfn = get_pmd_pfn(pmd[i], vma, addr); + if (pfn == -1) + goto next; + + if (!pmd_trans_huge(pmd[i])) { + if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && + get_cap(LRU_GEN_NONLEAF_YOUNG)) + pmdp_test_and_clear_young(vma, addr, pmd + i); + goto next; + } + + page = get_pfn_page(pfn, memcg, pgdat, walk->can_swap); + if (!page) + goto next; + + if (!pmdp_test_and_clear_young(vma, addr, pmd + i)) + goto next; + + walk->mm_stats[MM_LEAF_YOUNG]++; + + if (pmd_dirty(pmd[i]) && !PageDirty(page) && + !(PageAnon(page) && PageSwapBacked(page) && + !PageSwapCache(page))) + set_page_dirty(page); + + old_gen = page_update_gen(page, new_gen); + if (old_gen >= 0 && old_gen != new_gen) + update_batch_size(walk, page, old_gen, new_gen); +next: + i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1; + } while (i <= MIN_LRU_BATCH); + + arch_leave_lazy_mmu_mode(); + spin_unlock(ptl); +done: + *start = -1; + bitmap_zero(bitmap, MIN_LRU_BATCH); +} +#else +static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, + struct mm_walk *args, unsigned long *bitmap, unsigned long *start) +{ +} +#endif + +static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, + struct mm_walk *args) +{ + int i; + pmd_t *pmd; + unsigned long next; + unsigned long addr; + struct vm_area_struct *vma; + unsigned long pos = -1; + struct lru_gen_mm_walk *walk = args->private; + unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; + + VM_WARN_ON_ONCE(pud_leaf(*pud)); + + /* + * Finish an entire PMD in two passes: the first only reaches to PTE + * tables to avoid taking the PMD lock; the second, if necessary, takes + * the PMD lock to clear the accessed bit in PMD entries. + */ + pmd = pmd_offset(pud, start & PUD_MASK); +restart: + /* walk_pte_range() may call get_next_vma() */ + vma = args->vma; + for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) { + pmd_t val = pmd_read_atomic(pmd + i); + + /* for pmd_read_atomic() */ + barrier(); + + next = pmd_addr_end(addr, end); + + if (!pmd_present(val) || is_huge_zero_pmd(val)) { + walk->mm_stats[MM_LEAF_TOTAL]++; + continue; + } + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (pmd_trans_huge(val)) { + unsigned long pfn = pmd_pfn(val); + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); + + walk->mm_stats[MM_LEAF_TOTAL]++; + + if (!pmd_young(val)) { + walk->mm_stats[MM_LEAF_OLD]++; + continue; + } + + /* try to avoid unnecessary memory loads */ + if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) + continue; + + walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); + continue; + } +#endif + walk->mm_stats[MM_NONLEAF_TOTAL]++; + +#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG + if (get_cap(LRU_GEN_NONLEAF_YOUNG)) { + if (!pmd_young(val)) + continue; + + walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); + } +#endif + if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i)) + continue; + + walk->mm_stats[MM_NONLEAF_FOUND]++; + + if (!walk_pte_range(&val, addr, next, args)) + continue; + + walk->mm_stats[MM_NONLEAF_ADDED]++; + + /* carry over to the next generation */ + update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i); + } + + walk_pmd_range_locked(pud, -1, vma, args, bitmap, &pos); + + if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end)) + goto restart; +} + +static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end, + struct mm_walk *args) +{ + int i; + pud_t *pud; + unsigned long addr; + unsigned long next; + struct lru_gen_mm_walk *walk = args->private; + + VM_WARN_ON_ONCE(p4d_leaf(*p4d)); + + pud = pud_offset(p4d, start & P4D_MASK); +restart: + for (i = pud_index(start), addr = start; addr != end; i++, addr = next) { + pud_t val = READ_ONCE(pud[i]); + + next = pud_addr_end(addr, end); + + if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val))) + continue; + + walk_pmd_range(&val, addr, next, args); + + if (need_resched() || walk->batched >= MAX_LRU_BATCH) { + end = (addr | ~PUD_MASK) + 1; + goto done; + } + } + + if (i < PTRS_PER_PUD && get_next_vma(P4D_MASK, PUD_SIZE, args, &start, &end)) + goto restart; + + end = round_up(end, P4D_SIZE); +done: + if (!end || !args->vma) + return 1; + + walk->next_addr = max(end, args->vma->vm_start); + + return -EAGAIN; +} + +static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_mm_walk *walk) +{ + static const struct mm_walk_ops mm_walk_ops = { + .test_walk = should_skip_vma, + .p4d_entry = walk_pud_range, + }; + + int err; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + + walk->next_addr = FIRST_USER_ADDRESS; + + do { + DEFINE_MAX_SEQ(lruvec); + + err = -EBUSY; + + /* another thread might have called inc_max_seq() */ + if (walk->max_seq != max_seq) + break; + + /* page_update_gen() requires stable page_memcg() */ + if (!mem_cgroup_trylock_pages(memcg)) + break; + + /* the caller might be holding the lock for write */ + if (down_read_trylock(&mm->mmap_sem)) { + err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk); + + up_write(&mm->mmap_sem); + } + + mem_cgroup_unlock_pages(); + + if (walk->batched) { + spin_lock_irq(&lruvec_pgdat(lruvec)->lru_lock); + reset_batch_size(lruvec, walk); + spin_unlock_irq(&lruvec_pgdat(lruvec)->lru_lock); + } + + cond_resched(); + } while (err == -EAGAIN); +} + +static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat) +{ + struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; + + if (pgdat && current_is_kswapd()) { + VM_WARN_ON_ONCE(walk); + + walk = &pgdat->mm_walk; + } else if (!pgdat && !walk) { + VM_WARN_ON_ONCE(current_is_kswapd()); + + walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); + } + + current->reclaim_state->mm_walk = walk; + + return walk; +} + +static void clear_mm_walk(void) +{ + struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; + + VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, 0, sizeof(walk->nr_pages))); + VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, 0, sizeof(walk->mm_stats))); + + current->reclaim_state->mm_walk = NULL; + + if (!current_is_kswapd()) + kfree(walk); +} + +static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) +{ + int zone; + int remaining = MAX_LRU_BATCH; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); + + if (type == LRU_GEN_ANON && !can_swap) + goto done; + + /* prevent cold/hot inversion if force_scan is true */ + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + struct list_head *head = &lrugen->lists[old_gen][type][zone]; + + while (!list_empty(head)) { + struct page *page = lru_to_page(head); + + VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page); + VM_WARN_ON_ONCE_PAGE(PageActive(page), page); + VM_WARN_ON_ONCE_PAGE(page_is_file_cache(page) != type, page); + VM_WARN_ON_ONCE_PAGE(page_zonenum(page) != zone, page); + + new_gen = page_inc_gen(lruvec, page, false); + list_move_tail(&page->lru, &lrugen->lists[new_gen][type][zone]); + + if (!--remaining) + return false; + } + } +done: + reset_ctrl_pos(lruvec, type, true); + WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); + + return true; +} + +static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) +{ + int gen, type, zone; + bool success = false; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + DEFINE_MIN_SEQ(lruvec); + + VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); + + /* find the oldest populated generation */ + for (type = !can_swap; type < ANON_AND_FILE; type++) { + while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) { + gen = lru_gen_from_seq(min_seq[type]); + + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + if (!list_empty(&lrugen->lists[gen][type][zone])) + goto next; + } + + min_seq[type]++; + } +next: + ; + } + + /* see the comment on lru_gen_struct */ + if (can_swap) { + min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]); + min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]); + } + + for (type = !can_swap; type < ANON_AND_FILE; type++) { + if (min_seq[type] == lrugen->min_seq[type]) + continue; + + reset_ctrl_pos(lruvec, type, true); + WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); + success = true; + } + + return success; +} + +static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan) +{ + int prev, next; + int type, zone; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + spin_lock_irq(&lruvec_pgdat(lruvec)->lru_lock); + + VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); + + for (type = ANON_AND_FILE - 1; type >= 0; type--) { + if (get_nr_gens(lruvec, type) != MAX_NR_GENS) + continue; + + VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap)); + + while (!inc_min_seq(lruvec, type, can_swap)) { + spin_unlock_irq(&lruvec_pgdat(lruvec)->lru_lock); + cond_resched(); + spin_lock_irq(&lruvec_pgdat(lruvec)->lru_lock); + } + } + + /* + * Update the active/inactive LRU sizes for compatibility. Both sides of + * the current max_seq need to be covered, since max_seq+1 can overlap + * with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do + * overlap, cold/hot inversion happens. + */ + prev = lru_gen_from_seq(lrugen->max_seq - 1); + next = lru_gen_from_seq(lrugen->max_seq + 1); + + for (type = 0; type < ANON_AND_FILE; type++) { + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + enum lru_list lru = type * LRU_INACTIVE_FILE; + long delta = lrugen->nr_pages[prev][type][zone] - + lrugen->nr_pages[next][type][zone]; + + if (!delta) + continue; + + __update_lru_size(lruvec, lru, zone, delta); + __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta); + } + } + + for (type = 0; type < ANON_AND_FILE; type++) + reset_ctrl_pos(lruvec, type, false); + + WRITE_ONCE(lrugen->timestamps[next], jiffies); + /* make sure preceding modifications appear */ + smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); + + spin_unlock_irq(&lruvec_pgdat(lruvec)->lru_lock); +} + +static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, + struct scan_control *sc, bool can_swap, bool force_scan) +{ + bool success; + struct lru_gen_mm_walk *walk; + struct mm_struct *mm = NULL; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq)); + + /* see the comment in iterate_mm_list() */ + if (max_seq <= READ_ONCE(lruvec->mm_state.seq)) { + success = false; + goto done; + } + + /* + * If the hardware doesn't automatically set the accessed bit, fallback + * to lru_gen_look_around(), which only clears the accessed bit in a + * handful of PTEs. Spreading the work out over a period of time usually + * is less efficient, but it avoids bursty page faults. + */ + if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) { + success = iterate_mm_list_nowalk(lruvec, max_seq); + goto done; + } + + walk = set_mm_walk(NULL); + if (!walk) { + success = iterate_mm_list_nowalk(lruvec, max_seq); + goto done; + } + + walk->lruvec = lruvec; + walk->max_seq = max_seq; + walk->can_swap = can_swap; + walk->force_scan = force_scan; + + do { + success = iterate_mm_list(lruvec, walk, &mm); + if (mm) + walk_mm(lruvec, mm, walk); + } while (mm); +done: + if (success) + inc_max_seq(lruvec, can_swap, force_scan); + + return success; +} + +static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq, + struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) +{ + int gen, type, zone; + unsigned long old = 0; + unsigned long young = 0; + unsigned long total = 0; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + + for (type = !can_swap; type < ANON_AND_FILE; type++) { + unsigned long seq; + + for (seq = min_seq[type]; seq <= max_seq; seq++) { + unsigned long size = 0; + + gen = lru_gen_from_seq(seq); + + for (zone = 0; zone < MAX_NR_ZONES; zone++) + size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); + + total += size; + if (seq == max_seq) + young += size; + else if (seq + MIN_NR_GENS == max_seq) + old += size; + } + } + + /* try to scrape all its memory if this memcg was deleted */ + *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total; + + /* + * The aging tries to be lazy to reduce the overhead, while the eviction + * stalls when the number of generations reaches MIN_NR_GENS. Hence, the + * ideal number of generations is MIN_NR_GENS+1. + */ + if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) + return true; + if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) + return false; + + /* + * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) + * of the total number of pages for each generation. A reasonable range + * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The + * aging cares about the upper bound of hot pages, while the eviction + * cares about the lower bound of cold pages. + */ + if (young * MIN_NR_GENS > total) + return true; + if (old * (MIN_NR_GENS + 2) < total) + return true; + + return false; +} + +static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl) +{ + bool need_aging; + unsigned long nr_to_scan; + int swappiness = get_swappiness(lruvec, sc); + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MAX_SEQ(lruvec); + DEFINE_MIN_SEQ(lruvec); + + VM_WARN_ON_ONCE(sc->memcg_low_reclaim); + + mem_cgroup_calculate_protection(NULL, memcg); + + if (mem_cgroup_below_min(memcg)) + return false; + + need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan); + + if (min_ttl) { + int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); + unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); + + if (time_is_after_jiffies(birth + min_ttl)) + return false; + + /* the size is likely too small to be helpful */ + if (!nr_to_scan && sc->priority != DEF_PRIORITY) + return false; + } + + if (need_aging) + try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false); + + return true; +} + +/* to protect the working set of the last N jiffies */ +static unsigned long lru_gen_min_ttl __read_mostly; + +static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) +{ + struct mem_cgroup *memcg; + bool success = false; + unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl); + + VM_WARN_ON_ONCE(!current_is_kswapd()); + + sc->last_reclaimed = sc->nr_reclaimed; + + /* + * To reduce the chance of going into the aging path, which can be + * costly, optimistically skip it if the flag below was cleared in the + * eviction path. This improves the overall performance when multiple + * memcgs are available. + */ + if (!sc->memcgs_need_aging) { + sc->memcgs_need_aging = true; + return; + } + + set_mm_walk(pgdat); + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + + if (age_lruvec(lruvec, sc, min_ttl)) + success = true; + + cond_resched(); + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); + + clear_mm_walk(); + + /* check the order to exclude compaction-induced reclaim */ + if (success || !min_ttl || sc->order) + return; + + /* + * The main goal is to OOM kill if every generation from all memcgs is + * younger than min_ttl. However, another possibility is all memcgs are + * either below min or empty. + */ + if (mutex_trylock(&oom_lock)) { + struct oom_control oc = { + .gfp_mask = sc->gfp_mask, + }; + + out_of_memory(&oc); + + mutex_unlock(&oom_lock); + } +} + +/* + * This function exploits spatial locality when shrink_page_list() walks the + * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If + * the scan was done cacheline efficiently, it adds the PMD entry pointing to + * the PTE table to the Bloom filter. This forms a feedback loop between the + * eviction and the aging. + */ +void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) +{ + int i; + pte_t *pte; + unsigned long start; + unsigned long end; + unsigned long addr; + struct lru_gen_mm_walk *walk; + int young = 0; + unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; + struct page *page = pvmw->page; + struct mem_cgroup *memcg = page_memcg(page); + struct pglist_data *pgdat = page_pgdat(page); + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + DEFINE_MAX_SEQ(lruvec); + int old_gen, new_gen = lru_gen_from_seq(max_seq); + + lockdep_assert_held(pvmw->ptl); + VM_WARN_ON_ONCE_PAGE(PageLRU(page), page); + + if (spin_is_contended(pvmw->ptl)) + return; + + /* avoid taking the LRU lock under the PTL when possible */ + walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL; + + start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); + end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; + + if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { + if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2) + end = start + MIN_LRU_BATCH * PAGE_SIZE; + else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2) + start = end - MIN_LRU_BATCH * PAGE_SIZE; + else { + start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2; + end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2; + } + } + + pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE; + + rcu_read_lock(); + arch_enter_lazy_mmu_mode(); + + for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { + unsigned long pfn; + + pfn = get_pte_pfn(pte[i], pvmw->vma, addr); + if (pfn == -1) + continue; + + if (!pte_young(pte[i])) + continue; + + page = get_pfn_page(pfn, memcg, pgdat, !walk || walk->can_swap); + if (!page) + continue; + + if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i)) + VM_WARN_ON_ONCE(true); + + young++; + + if (pte_dirty(pte[i]) && !PageDirty(page) && + !(PageAnon(page) && PageSwapBacked(page) && + !PageSwapCache(page))) + set_page_dirty(page); + + old_gen = page_lru_gen(page); + if (old_gen < 0) + SetPageReferenced(page); + else if (old_gen != new_gen) + __set_bit(i, bitmap); + } + + arch_leave_lazy_mmu_mode(); + rcu_read_unlock(); + + /* feedback from rmap walkers to page table walkers */ + if (suitable_to_scan(i, young)) + update_bloom_filter(lruvec, max_seq, pvmw->pmd); + + if (!walk && bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) { + for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { + page = pte_page(pte[i]); + activate_page(page); + } + return; + } + + /* page_update_gen() requires stable page_memcg() */ + if (!mem_cgroup_trylock_pages(memcg)) + return; + + if (!walk) { + spin_lock_irq(&lruvec_pgdat(lruvec)->lru_lock); + new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq); + } + + for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { + page = compound_head(pte_page(pte[i])); + if (page_memcg_rcu(page) != memcg) + continue; + + old_gen = page_update_gen(page, new_gen); + if (old_gen < 0 || old_gen == new_gen) + continue; + + if (walk) + update_batch_size(walk, page, old_gen, new_gen); + else + lru_gen_update_size(lruvec, page, old_gen, new_gen); + } + + if (!walk) + spin_unlock_irq(&lruvec_pgdat(lruvec)->lru_lock); + + mem_cgroup_unlock_pages(); +} + +/****************************************************************************** + * the eviction + ******************************************************************************/ + +static bool sort_page(struct lruvec *lruvec, struct page *page, int tier_idx) +{ + bool success; + int gen = page_lru_gen(page); + int type = page_is_file_cache(page); + int zone = page_zonenum(page); + int delta = hpage_nr_pages(page); + int refs = page_lru_refs(page); + int tier = lru_tier_from_refs(refs); + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + VM_WARN_ON_ONCE_PAGE(gen >= MAX_NR_GENS, page); + + /* unevictable */ + if (!page_evictable(page)) { + success = lru_gen_del_page(lruvec, page, true); + VM_WARN_ON_ONCE_PAGE(!success, page); + SetPageUnevictable(page); + add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE); + __count_vm_events(UNEVICTABLE_PGCULLED, delta); + return true; + } + + /* dirty lazyfree */ + if (type == LRU_GEN_FILE && PageAnon(page) && PageDirty(page)) { + enum lru_list lru = page_lru_base_type(page); + + success = lru_gen_del_page(lruvec, page, true); + VM_WARN_ON_ONCE_PAGE(!success, page); + SetPageSwapBacked(page); + add_page_to_lru_list_tail(page, lruvec, lru); + return true; + } + + /* promoted */ + if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { + list_move(&page->lru, &lrugen->lists[gen][type][zone]); + return true; + } + + /* protected */ + if (tier > tier_idx) { + int hist = lru_hist_from_seq(lrugen->min_seq[type]); + + gen = page_inc_gen(lruvec, page, false); + list_move_tail(&page->lru, &lrugen->lists[gen][type][zone]); + + WRITE_ONCE(lrugen->protected[hist][type][tier - 1], + lrugen->protected[hist][type][tier - 1] + delta); + __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE, delta); + return true; + } + + /* waiting for writeback */ + if (PageLocked(page) || PageWriteback(page) || + (type == LRU_GEN_FILE && PageDirty(page))) { + gen = page_inc_gen(lruvec, page, true); + list_move(&page->lru, &lrugen->lists[gen][type][zone]); + return true; + } + + return false; +} + +static bool isolate_page(struct lruvec *lruvec, struct page *page, struct scan_control *sc) +{ + bool success; + + /* unmapping inhibited */ + if (!sc->may_unmap && page_mapped(page)) + return false; + + /* swapping inhibited */ + if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) && + (PageDirty(page) || + (PageAnon(page) && !PageSwapCache(page)))) + return false; + + /* raced with release_pages() */ + if (!get_page_unless_zero(page)) + return false; + + /* raced with another isolation */ + if (!TestClearPageLRU(page)) { + put_page(page); + return false; + } + + /* see the comment on MAX_NR_TIERS */ + if (!PageReferenced(page)) + set_mask_bits(&page->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); + + /* for shrink_page_list() */ + ClearPageReclaim(page); + ClearPageReferenced(page); + + success = lru_gen_del_page(lruvec, page, true); + VM_WARN_ON_ONCE_PAGE(!success, page); + + return true; +} + +static int scan_pages(struct lruvec *lruvec, struct scan_control *sc, + int type, int tier, struct list_head *list) +{ + int gen, zone; + enum vm_event_item item; + int sorted = 0; + int scanned = 0; + int isolated = 0; + int remaining = MAX_LRU_BATCH; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + + VM_WARN_ON_ONCE(!list_empty(list)); + + if (get_nr_gens(lruvec, type) == MIN_NR_GENS) + return 0; + + gen = lru_gen_from_seq(lrugen->min_seq[type]); + + for (zone = sc->reclaim_idx; zone >= 0; zone--) { + LIST_HEAD(moved); + int skipped = 0; + struct list_head *head = &lrugen->lists[gen][type][zone]; + + while (!list_empty(head)) { + struct page *page = lru_to_page(head); + int delta = hpage_nr_pages(page); + + VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page); + VM_WARN_ON_ONCE_PAGE(PageActive(page), page); + VM_WARN_ON_ONCE_PAGE(page_is_file_cache(page) != type, page); + VM_WARN_ON_ONCE_PAGE(page_zonenum(page) != zone, page); + + scanned += delta; + + if (sort_page(lruvec, page, tier)) + sorted += delta; + else if (isolate_page(lruvec, page, sc)) { + list_add(&page->lru, list); + isolated += delta; + } else { + list_move(&page->lru, &moved); + skipped += delta; + } + + if (!--remaining || max(isolated, skipped) >= MIN_LRU_BATCH) + break; + } + + if (skipped) { + list_splice(&moved, head); + __count_zid_vm_events(PGSCAN_SKIP, zone, skipped); + } + + if (!remaining || isolated >= MIN_LRU_BATCH) + break; + } + + item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; + if (!cgroup_reclaim(sc)) { + __count_vm_events(item, isolated); + __count_vm_events(PGREFILL, sorted); + } + __count_memcg_events(memcg, item, isolated); + __count_memcg_events(memcg, PGREFILL, sorted); + + /* + * There might not be eligible pages due to reclaim_idx, may_unmap and + * may_writepage. Check the remaining to prevent livelock if it's not + * making progress. + */ + return isolated || !remaining ? scanned : 0; +} + +static int get_tier_idx(struct lruvec *lruvec, int type) +{ + int tier; + struct ctrl_pos sp, pv; + + /* + * To leave a margin for fluctuations, use a larger gain factor (1:2). + * This value is chosen because any other tier would have at least twice + * as many refaults as the first tier. + */ + read_ctrl_pos(lruvec, type, 0, 1, &sp); + for (tier = 1; tier < MAX_NR_TIERS; tier++) { + read_ctrl_pos(lruvec, type, tier, 2, &pv); + if (!positive_ctrl_err(&sp, &pv)) + break; + } + + return tier - 1; +} + +static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx) +{ + int type, tier; + struct ctrl_pos sp, pv; + int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness }; + + /* + * Compare the first tier of anon with that of file to determine which + * type to scan. Also need to compare other tiers of the selected type + * with the first tier of the other type to determine the last tier (of + * the selected type) to evict. + */ + read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp); + read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv); + type = positive_ctrl_err(&sp, &pv); + + read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp); + for (tier = 1; tier < MAX_NR_TIERS; tier++) { + read_ctrl_pos(lruvec, type, tier, gain[type], &pv); + if (!positive_ctrl_err(&sp, &pv)) + break; + } + + *tier_idx = tier - 1; + + return type; +} + +static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness, + int *type_scanned, struct list_head *list) +{ + int i; + int type; + int scanned; + int tier = -1; + DEFINE_MIN_SEQ(lruvec); + + /* + * Try to make the obvious choice first. When anon and file are both + * available from the same generation, interpret swappiness 1 as file + * first and 200 as anon first. + */ + if (!swappiness) + type = LRU_GEN_FILE; + else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE]) + type = LRU_GEN_ANON; + else if (swappiness == 1) + type = LRU_GEN_FILE; + else if (swappiness == 200) + type = LRU_GEN_ANON; + else + type = get_type_to_scan(lruvec, swappiness, &tier); + + for (i = !swappiness; i < ANON_AND_FILE; i++) { + if (tier < 0) + tier = get_tier_idx(lruvec, type); + + scanned = scan_pages(lruvec, sc, type, tier, list); + if (scanned) + break; + + type = !type; + tier = -1; + } + + *type_scanned = type; + + return scanned; +} + +static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness, + bool *need_swapping) +{ + int type; + int scanned; + int reclaimed; + LIST_HEAD(list); + struct page *page; + enum vm_event_item item; + struct reclaim_stat stat; + struct lru_gen_mm_walk *walk; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + spin_lock_irq(&lruvec_pgdat(lruvec)->lru_lock); + + scanned = isolate_pages(lruvec, sc, swappiness, &type, &list); + + scanned += try_to_inc_min_seq(lruvec, swappiness); + + if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS) + scanned = 0; + + spin_unlock_irq(&lruvec_pgdat(lruvec)->lru_lock); + + if (list_empty(&list)) + return scanned; + + reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false); + + list_for_each_entry(page, &list, lru) { + /* restore LRU_REFS_FLAGS cleared by isolate_page() */ + if (PageWorkingset(page)) + SetPageReferenced(page); + + /* don't add rejected pages to the oldest generation */ + if (PageReclaim(page) && + (PageDirty(page) || PageWriteback(page))) + ClearPageActive(page); + else + SetPageActive(page); + } + + spin_lock_irq(&lruvec_pgdat(lruvec)->lru_lock); + + move_pages_to_lru(lruvec, &list); + + walk = current->reclaim_state->mm_walk; + if (walk && walk->batched) + reset_batch_size(lruvec, walk); + + item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; + if (!cgroup_reclaim(sc)) + __count_vm_events(item, reclaimed); + __count_memcg_events(memcg, item, reclaimed); + + spin_unlock_irq(&lruvec_pgdat(lruvec)->lru_lock); + + mem_cgroup_uncharge_list(&list); + free_unref_page_list(&list); + + sc->nr_reclaimed += reclaimed; + + if (need_swapping && type == LRU_GEN_ANON) + *need_swapping = true; + + return scanned; +} + +/* + * For future optimizations: + * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg + * reclaim. + */ +static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, + bool can_swap, bool *need_aging) +{ + unsigned long nr_to_scan; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MAX_SEQ(lruvec); + DEFINE_MIN_SEQ(lruvec); + + if (mem_cgroup_below_min(memcg) || + (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim)) + return 0; + + *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan); + if (!*need_aging) + return nr_to_scan; + + /* skip the aging path at the default priority */ + if (sc->priority == DEF_PRIORITY) + goto done; + + /* leave the work to lru_gen_age_node() */ + if (current_is_kswapd()) + return 0; + + if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false)) + return nr_to_scan; +done: + return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0; +} + +static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq, + struct scan_control *sc, bool need_swapping) +{ + int i; + DEFINE_MAX_SEQ(lruvec); + + if (!current_is_kswapd()) { + /* age each memcg at most once to ensure fairness */ + if (max_seq - seq > 1) + return true; + + /* over-swapping can increase allocation latency */ + if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping) + return true; + + /* give this thread a chance to exit and free its memory */ + if (fatal_signal_pending(current)) { + sc->nr_reclaimed += MIN_LRU_BATCH; + return true; + } + + if (cgroup_reclaim(sc)) + return false; + } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim) + return false; + + /* keep scanning at low priorities to ensure fairness */ + if (sc->priority > DEF_PRIORITY - 2) + return false; + + /* + * A minimum amount of work was done under global memory pressure. For + * kswapd, it may be overshooting. For direct reclaim, the allocation + * may succeed if all suitable zones are somewhat safe. In either case, + * it's better to stop now, and restart later if necessary. + */ + for (i = 0; i <= sc->reclaim_idx; i++) { + unsigned long wmark; + struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i; + + if (!managed_zone(zone)) + continue; + + wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone); + if (wmark > zone_page_state(zone, NR_FREE_PAGES)) + return false; + } + + sc->nr_reclaimed += MIN_LRU_BATCH; + + return true; +} + +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +{ + struct blk_plug plug; + bool need_aging = false; + bool need_swapping = false; + unsigned long scanned = 0; + unsigned long reclaimed = sc->nr_reclaimed; + DEFINE_MAX_SEQ(lruvec); + + lru_add_drain(); + + blk_start_plug(&plug); + + set_mm_walk(lruvec_pgdat(lruvec)); + + while (true) { + int delta; + int swappiness; + unsigned long nr_to_scan; + + if (sc->may_swap) + swappiness = get_swappiness(lruvec, sc); + else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc)) + swappiness = 1; + else + swappiness = 0; + + nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging); + if (!nr_to_scan) + goto done; + + delta = evict_pages(lruvec, sc, swappiness, &need_swapping); + if (!delta) + goto done; + + scanned += delta; + if (scanned >= nr_to_scan) + break; + + if (should_abort_scan(lruvec, max_seq, sc, need_swapping)) + break; + + cond_resched(); + } + + /* see the comment in lru_gen_age_node() */ + if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging) + sc->memcgs_need_aging = false; +done: + clear_mm_walk(); + + blk_finish_plug(&plug); +} + +/****************************************************************************** + * state change + ******************************************************************************/ + +static bool __maybe_unused state_is_valid(struct lruvec *lruvec) +{ + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + if (lrugen->enabled) { + enum lru_list lru; + + for_each_evictable_lru(lru) { + if (!list_empty(&lruvec->lists[lru])) + return false; + } + } else { + int gen, type, zone; + + for_each_gen_type_zone(gen, type, zone) { + if (!list_empty(&lrugen->lists[gen][type][zone])) + return false; + } + } + + return true; +} + +static bool fill_evictable(struct lruvec *lruvec) +{ + enum lru_list lru; + int remaining = MAX_LRU_BATCH; + + for_each_evictable_lru(lru) { + int type = is_file_lru(lru); + bool active = is_active_lru(lru); + struct list_head *head = &lruvec->lists[lru]; + + while (!list_empty(head)) { + bool success; + struct page *page = lru_to_page(head); + enum lru_list lru = page_lru_base_type(page); + + VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page); + VM_WARN_ON_ONCE_PAGE(PageActive(page) != active, page); + VM_WARN_ON_ONCE_PAGE(page_is_file_cache(page) != type, page); + VM_WARN_ON_ONCE_PAGE(page_lru_gen(page) != -1, page); + + del_page_from_lru_list(page, lruvec, lru); + success = lru_gen_add_page(lruvec, page, false); + VM_WARN_ON_ONCE(!success); + + if (!--remaining) + return false; + } + } + + return true; +} + +static bool drain_evictable(struct lruvec *lruvec) +{ + int gen, type, zone; + int remaining = MAX_LRU_BATCH; + + for_each_gen_type_zone(gen, type, zone) { + struct list_head *head = &lruvec->lrugen.lists[gen][type][zone]; + + while (!list_empty(head)) { + bool success; + struct page *page = lru_to_page(head); + enum lru_list lru = page_lru_base_type(page); + + VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page); + VM_WARN_ON_ONCE_PAGE(PageActive(page), page); + VM_WARN_ON_ONCE_PAGE(page_is_file_cache(page) != type, page); + VM_WARN_ON_ONCE_PAGE(page_zonenum(page) != zone, page); + + success = lru_gen_del_page(lruvec, page, false); + VM_WARN_ON_ONCE(!success); + add_page_to_lru_list(page, lruvec, lru); + + if (!--remaining) + return false; + } + } + + return true; +} + +static void lru_gen_change_state(bool enabled) +{ + static DEFINE_MUTEX(state_mutex); + + struct mem_cgroup *memcg; + + cgroup_lock(); + cpus_read_lock(); + get_online_mems(); + mutex_lock(&state_mutex); + + if (enabled == lru_gen_enabled()) + goto unlock; + + if (enabled) + static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); + else + static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + int nid; + + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + + if (!lruvec) + continue; + + spin_lock_irq(&lruvec_pgdat(lruvec)->lru_lock); + + VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); + VM_WARN_ON_ONCE(!state_is_valid(lruvec)); + + lruvec->lrugen.enabled = enabled; + + while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) { + spin_unlock_irq(&lruvec_pgdat(lruvec)->lru_lock); + cond_resched(); + spin_lock_irq(&lruvec_pgdat(lruvec)->lru_lock); + } + + spin_unlock_irq(&lruvec_pgdat(lruvec)->lru_lock); + } + + cond_resched(); + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); +unlock: + mutex_unlock(&state_mutex); + put_online_mems(); + cpus_read_unlock(); + cgroup_unlock(); +} + +/****************************************************************************** + * sysfs interface + ******************************************************************************/ + +static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl))); +} + +/* see Documentation/admin-guide/mm/multigen_lru.rst for details */ +static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t len) +{ + unsigned int msecs; + + if (kstrtouint(buf, 0, &msecs)) + return -EINVAL; + + WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs)); + + return len; +} + +static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR( + min_ttl_ms, 0644, show_min_ttl, store_min_ttl +); + +static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + unsigned int caps = 0; + + if (get_cap(LRU_GEN_CORE)) + caps |= BIT(LRU_GEN_CORE); + + if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK)) + caps |= BIT(LRU_GEN_MM_WALK); + + if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG)) + caps |= BIT(LRU_GEN_NONLEAF_YOUNG); + + return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps); +} + +/* see Documentation/admin-guide/mm/multigen_lru.rst for details */ +static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t len) +{ + int i; + unsigned int caps; + + if (tolower(*buf) == 'n') + caps = 0; + else if (tolower(*buf) == 'y') + caps = -1; + else if (kstrtouint(buf, 0, &caps)) + return -EINVAL; + + for (i = 0; i < NR_LRU_GEN_CAPS; i++) { + bool enabled = caps & BIT(i); + + if (i == LRU_GEN_CORE) + lru_gen_change_state(enabled); + else if (enabled) + static_branch_enable(&lru_gen_caps[i]); + else + static_branch_disable(&lru_gen_caps[i]); + } + + return len; +} + +static struct kobj_attribute lru_gen_enabled_attr = __ATTR( + enabled, 0644, show_enabled, store_enabled +); + +static struct attribute *lru_gen_attrs[] = { + &lru_gen_min_ttl_attr.attr, + &lru_gen_enabled_attr.attr, + NULL +}; + +static struct attribute_group lru_gen_attr_group = { + .name = "lru_gen", + .attrs = lru_gen_attrs, +}; + +/****************************************************************************** + * debugfs interface + ******************************************************************************/ + +static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos) +{ + struct mem_cgroup *memcg; + loff_t nr_to_skip = *pos; + + m->private = kvmalloc(PATH_MAX, GFP_KERNEL); + if (!m->private) + return ERR_PTR(-ENOMEM); + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + int nid; + + for_each_node_state(nid, N_MEMORY) { + if (!nr_to_skip--) + return get_lruvec(memcg, nid); + } + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); + + return NULL; +} + +static void lru_gen_seq_stop(struct seq_file *m, void *v) +{ + if (!IS_ERR_OR_NULL(v)) + mem_cgroup_iter_break(NULL, lruvec_memcg(v)); + + kvfree(m->private); + m->private = NULL; +} + +static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + int nid = lruvec_pgdat(v)->node_id; + struct mem_cgroup *memcg = lruvec_memcg(v); + + ++*pos; + + nid = next_memory_node(nid); + if (nid == MAX_NUMNODES) { + memcg = mem_cgroup_iter(NULL, memcg, NULL); + if (!memcg) + return NULL; + + nid = first_memory_node; + } + + return get_lruvec(memcg, nid); +} + +static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, + unsigned long max_seq, unsigned long *min_seq, + unsigned long seq) +{ + int i; + int type, tier; + int hist = lru_hist_from_seq(seq); + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + for (tier = 0; tier < MAX_NR_TIERS; tier++) { + seq_printf(m, " %10d", tier); + for (type = 0; type < ANON_AND_FILE; type++) { + const char *s = " "; + unsigned long n[3] = {}; + + if (seq == max_seq) { + s = "RT "; + n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]); + n[1] = READ_ONCE(lrugen->avg_total[type][tier]); + } else if (seq == min_seq[type] || NR_HIST_GENS > 1) { + s = "rep"; + n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]); + n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]); + if (tier) + n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]); + } + + for (i = 0; i < 3; i++) + seq_printf(m, " %10lu%c", n[i], s[i]); + } + seq_putc(m, '\n'); + } + + seq_puts(m, " "); + for (i = 0; i < NR_MM_STATS; i++) { + const char *s = " "; + unsigned long n = 0; + + if (seq == max_seq && NR_HIST_GENS == 1) { + s = "LOYNFA"; + n = READ_ONCE(lruvec->mm_state.stats[hist][i]); + } else if (seq != max_seq && NR_HIST_GENS > 1) { + s = "loynfa"; + n = READ_ONCE(lruvec->mm_state.stats[hist][i]); + } + + seq_printf(m, " %10lu%c", n, s[i]); + } + seq_putc(m, '\n'); +} + +/* see Documentation/admin-guide/mm/multigen_lru.rst for details */ +static int lru_gen_seq_show(struct seq_file *m, void *v) +{ + unsigned long seq; + bool full = !debugfs_real_fops(m->file)->write; + struct lruvec *lruvec = v; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + int nid = lruvec_pgdat(lruvec)->node_id; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MAX_SEQ(lruvec); + DEFINE_MIN_SEQ(lruvec); + + if (nid == first_memory_node) { + const char *path = memcg ? m->private : ""; + +#ifdef CONFIG_MEMCG + if (memcg) + cgroup_path(memcg->css.cgroup, m->private, PATH_MAX); +#endif + seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path); + } + + seq_printf(m, " node %5d\n", nid); + + if (!full) + seq = min_seq[LRU_GEN_ANON]; + else if (max_seq >= MAX_NR_GENS) + seq = max_seq - MAX_NR_GENS + 1; + else + seq = 0; + + for (; seq <= max_seq; seq++) { + int type, zone; + int gen = lru_gen_from_seq(seq); + unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); + + seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth)); + + for (type = 0; type < ANON_AND_FILE; type++) { + unsigned long size = 0; + char mark = full && seq < min_seq[type] ? 'x' : ' '; + + for (zone = 0; zone < MAX_NR_ZONES; zone++) + size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); + + seq_printf(m, " %10lu%c", size, mark); + } + + seq_putc(m, '\n'); + + if (full) + lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq); + } + + return 0; +} + +static const struct seq_operations lru_gen_seq_ops = { + .start = lru_gen_seq_start, + .stop = lru_gen_seq_stop, + .next = lru_gen_seq_next, + .show = lru_gen_seq_show, +}; + +static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, + bool can_swap, bool force_scan) +{ + DEFINE_MAX_SEQ(lruvec); + DEFINE_MIN_SEQ(lruvec); + + if (seq < max_seq) + return 0; + + if (seq > max_seq) + return -EINVAL; + + if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq) + return -ERANGE; + + try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan); + + return 0; +} + +static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, + int swappiness, unsigned long nr_to_reclaim) +{ + DEFINE_MAX_SEQ(lruvec); + + if (seq + MIN_NR_GENS > max_seq) + return -EINVAL; + + sc->nr_reclaimed = 0; + + while (!signal_pending(current)) { + DEFINE_MIN_SEQ(lruvec); + + if (seq < min_seq[!swappiness]) + return 0; + + if (sc->nr_reclaimed >= nr_to_reclaim) + return 0; + + if (!evict_pages(lruvec, sc, swappiness, NULL)) + return 0; + + cond_resched(); + } + + return -EINTR; +} + +static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, + struct scan_control *sc, int swappiness, unsigned long opt) +{ + struct lruvec *lruvec; + int err = -EINVAL; + struct mem_cgroup *memcg = NULL; + + if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY)) + return -EINVAL; + + if (!mem_cgroup_disabled()) { + rcu_read_lock(); + memcg = mem_cgroup_from_id(memcg_id); +#ifdef CONFIG_MEMCG + if (memcg && !css_tryget(&memcg->css)) + memcg = NULL; +#endif + rcu_read_unlock(); + + if (!memcg) + return -EINVAL; + } + + if (memcg_id != mem_cgroup_id(memcg)) + goto done; + + lruvec = get_lruvec(memcg, nid); + + if (swappiness < 0) + swappiness = get_swappiness(lruvec, sc); + else if (swappiness > 200) + goto done; + + switch (cmd) { + case '+': + err = run_aging(lruvec, seq, sc, swappiness, opt); + break; + case '-': + err = run_eviction(lruvec, seq, sc, swappiness, opt); + break; + } +done: + mem_cgroup_put(memcg); + + return err; +} + +/* see Documentation/admin-guide/mm/multigen_lru.rst for details */ +static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, + size_t len, loff_t *pos) +{ + void *buf; + char *cur, *next; + unsigned int flags; + struct blk_plug plug; + int err = -EINVAL; + struct scan_control sc = { + .may_writepage = true, + .may_unmap = true, + .may_swap = true, + .reclaim_idx = MAX_NR_ZONES - 1, + .gfp_mask = GFP_KERNEL, + }; + + buf = kvmalloc(len + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, src, len)) { + kvfree(buf); + return -EFAULT; + } + + set_task_reclaim_state(current, &sc.reclaim_state); + flags = memalloc_noreclaim_save(); + blk_start_plug(&plug); + if (!set_mm_walk(NULL)) { + err = -ENOMEM; + goto done; + } + + next = buf; + next[len] = '\0'; + + while ((cur = strsep(&next, ",;\n"))) { + int n; + int end; + char cmd; + unsigned int memcg_id; + unsigned int nid; + unsigned long seq; + unsigned int swappiness = -1; + unsigned long opt = -1; + + cur = skip_spaces(cur); + if (!*cur) + continue; + + n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid, + &seq, &end, &swappiness, &end, &opt, &end); + if (n < 4 || cur[end]) { + err = -EINVAL; + break; + } + + err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt); + if (err) + break; + } +done: + clear_mm_walk(); + blk_finish_plug(&plug); + memalloc_noreclaim_restore(flags); + set_task_reclaim_state(current, NULL); + + kvfree(buf); + + return err ? : len; +} + +static int lru_gen_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &lru_gen_seq_ops); +} + +static const struct file_operations lru_gen_rw_fops = { + .open = lru_gen_seq_open, + .read = seq_read, + .write = lru_gen_seq_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +static const struct file_operations lru_gen_ro_fops = { + .open = lru_gen_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/****************************************************************************** + * initialization + ******************************************************************************/ + +void lru_gen_init_lruvec(struct lruvec *lruvec) +{ + int i; + int gen, type, zone; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + lrugen->max_seq = MIN_NR_GENS + 1; + lrugen->enabled = lru_gen_enabled(); + + for (i = 0; i <= MIN_NR_GENS + 1; i++) + lrugen->timestamps[i] = jiffies; + + for_each_gen_type_zone(gen, type, zone) + INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); + + lruvec->mm_state.seq = MIN_NR_GENS; +} + +#ifdef CONFIG_MEMCG +void lru_gen_init_memcg(struct mem_cgroup *memcg) +{ + INIT_LIST_HEAD(&memcg->mm_list.fifo); + spin_lock_init(&memcg->mm_list.lock); +} + +void lru_gen_exit_memcg(struct mem_cgroup *memcg) +{ + int i; + int nid; + + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + + VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, + sizeof(lruvec->lrugen.nr_pages))); + + for (i = 0; i < NR_BLOOM_FILTERS; i++) { + bitmap_free(lruvec->mm_state.filters[i]); + lruvec->mm_state.filters[i] = NULL; + } + } +} +#endif + +static int __init init_lru_gen(void) +{ + BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); + BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); + + if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) + pr_err("lru_gen: failed to create sysfs group\n"); + + debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops); + debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops); + + return 0; +}; +late_initcall(init_lru_gen); + +#else /* !CONFIG_LRU_GEN */ + +static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) +{ +} + +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +{ +} + +#endif /* CONFIG_LRU_GEN */ + +static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +{ + unsigned long nr[NR_LRU_LISTS]; + unsigned long targets[NR_LRU_LISTS]; + unsigned long nr_to_scan; + enum lru_list lru; + unsigned long nr_reclaimed = 0; + unsigned long nr_to_reclaim = sc->nr_to_reclaim; + struct blk_plug plug; + bool scan_adjusted; + + if (lru_gen_enabled()) { + lru_gen_shrink_lruvec(lruvec, sc); + return; + } + + get_scan_count(lruvec, sc, nr); + + /* Record the original scan target for proportional adjustments later */ + memcpy(targets, nr, sizeof(nr)); + + /* + * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal + * event that can occur when there is little memory pressure e.g. + * multiple streaming readers/writers. Hence, we do not abort scanning + * when the requested number of pages are reclaimed when scanning at + * DEF_PRIORITY on the assumption that the fact we are direct + * reclaiming implies that kswapd is not keeping up and it is best to + * do a batch of work at once. For memcg reclaim one check is made to + * abort proportional reclaim if either the file or anon lru has already + * dropped to zero at the first pass. + */ + scan_adjusted = (!cgroup_reclaim(sc) && !current_is_kswapd() && + sc->priority == DEF_PRIORITY); + + blk_start_plug(&plug); + while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || + nr[LRU_INACTIVE_FILE]) { + unsigned long nr_anon, nr_file, percentage; + unsigned long nr_scanned; + + for_each_evictable_lru(lru) { + if (nr[lru]) { + nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); + nr[lru] -= nr_to_scan; + + nr_reclaimed += shrink_list(lru, nr_to_scan, + lruvec, sc); + } + } + + cond_resched(); + + if (nr_reclaimed < nr_to_reclaim || scan_adjusted) + continue; + + /* + * For kswapd and memcg, reclaim at least the number of pages + * requested. Ensure that the anon and file LRUs are scanned + * proportionally what was requested by get_scan_count(). We + * stop reclaiming one LRU and reduce the amount scanning + * proportional to the original scan target. + */ + nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; + nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; + + /* + * It's just vindictive to attack the larger once the smaller + * has gone to zero. And given the way we stop scanning the + * smaller below, this makes sure that we only make one nudge + * towards proportionality once we've got nr_to_reclaim. + */ + if (!nr_file || !nr_anon) + break; + + if (nr_file > nr_anon) { + unsigned long scan_target = targets[LRU_INACTIVE_ANON] + + targets[LRU_ACTIVE_ANON] + 1; + lru = LRU_BASE; + percentage = nr_anon * 100 / scan_target; + } else { + unsigned long scan_target = targets[LRU_INACTIVE_FILE] + + targets[LRU_ACTIVE_FILE] + 1; + lru = LRU_FILE; + percentage = nr_file * 100 / scan_target; + } + + /* Stop scanning the smaller of the LRU */ + nr[lru] = 0; + nr[lru + LRU_ACTIVE] = 0; + + /* + * Recalculate the other LRU scan count based on its original + * scan target and the percentage scanning already complete + */ + lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE; + nr_scanned = targets[lru] - nr[lru]; + nr[lru] = targets[lru] * (100 - percentage) / 100; + nr[lru] -= min(nr[lru], nr_scanned); + + lru += LRU_ACTIVE; + nr_scanned = targets[lru] - nr[lru]; + nr[lru] = targets[lru] * (100 - percentage) / 100; + nr[lru] -= min(nr[lru], nr_scanned); + + scan_adjusted = true; + } + blk_finish_plug(&plug); sc->nr_reclaimed += nr_reclaimed; /* * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (inactive_list_is_low(lruvec, false, sc, true)) + if (total_swap_pages && inactive_is_low(lruvec, LRU_INACTIVE_ANON)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); } @@ -2744,153 +5457,149 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, return inactive_lru_pages > pages_for_compaction; } -static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg) -{ - return test_bit(PGDAT_CONGESTED, &pgdat->flags) || - (memcg && memcg_congested(pgdat, memcg)); -} - -static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) +static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) { - struct reclaim_state *reclaim_state = current->reclaim_state; - unsigned long nr_reclaimed, nr_scanned; - bool reclaimable = false; + struct mem_cgroup *target_memcg = sc->target_mem_cgroup; + struct mem_cgroup *memcg; + memcg = mem_cgroup_iter(target_memcg, NULL, NULL); do { - struct mem_cgroup *root = sc->target_mem_cgroup; - struct mem_cgroup *memcg; - - memset(&sc->nr, 0, sizeof(sc->nr)); + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + unsigned long reclaimed; + unsigned long scanned; - nr_reclaimed = sc->nr_reclaimed; - nr_scanned = sc->nr_scanned; + mem_cgroup_calculate_protection(target_memcg, memcg); - memcg = mem_cgroup_iter(root, NULL, NULL); - do { - unsigned long reclaimed; - unsigned long scanned; - - switch (mem_cgroup_protected(root, memcg)) { - case MEMCG_PROT_MIN: - /* - * Hard protection. - * If there is no reclaimable memory, OOM. - */ + if (mem_cgroup_below_min(memcg)) { + /* + * Hard protection. + * If there is no reclaimable memory, OOM. + */ + continue; + } else if (mem_cgroup_below_low(memcg)) { + /* + * Soft protection. + * Respect the protection only as long as + * there is an unprotected supply + * of reclaimable memory from other cgroups. + */ + if (!sc->memcg_low_reclaim) { + sc->memcg_low_skipped = 1; continue; - case MEMCG_PROT_LOW: - /* - * Soft protection. - * Respect the protection only as long as - * there is an unprotected supply - * of reclaimable memory from other cgroups. - */ - if (!sc->memcg_low_reclaim) { - sc->memcg_low_skipped = 1; - continue; - } - memcg_memory_event(memcg, MEMCG_LOW); - break; - case MEMCG_PROT_NONE: - /* - * All protection thresholds breached. We may - * still choose to vary the scan pressure - * applied based on by how much the cgroup in - * question has exceeded its protection - * thresholds (see get_scan_count). - */ - break; } + memcg_memory_event(memcg, MEMCG_LOW); + } - reclaimed = sc->nr_reclaimed; - scanned = sc->nr_scanned; - shrink_node_memcg(pgdat, memcg, sc); + reclaimed = sc->nr_reclaimed; + scanned = sc->nr_scanned; - shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, - sc->priority); + shrink_lruvec(lruvec, sc); - /* Record the group's reclaim efficiency */ - vmpressure(sc->gfp_mask, memcg, false, - sc->nr_scanned - scanned, - sc->nr_reclaimed - reclaimed); + shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, + sc->priority); - } while ((memcg = mem_cgroup_iter(root, memcg, NULL))); + /* Record the group's reclaim efficiency */ + vmpressure(sc->gfp_mask, memcg, false, + sc->nr_scanned - scanned, + sc->nr_reclaimed - reclaimed); - if (reclaim_state) { - sc->nr_reclaimed += reclaim_state->reclaimed_slab; - reclaim_state->reclaimed_slab = 0; - } + } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL))); +} - /* Record the subtree's reclaim efficiency */ - vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, - sc->nr_scanned - nr_scanned, - sc->nr_reclaimed - nr_reclaimed); +static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) +{ + struct reclaim_state *reclaim_state = current->reclaim_state; + unsigned long nr_reclaimed, nr_scanned; + struct lruvec *target_lruvec; + bool reclaimable = false; - if (sc->nr_reclaimed - nr_reclaimed) - reclaimable = true; + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); - if (current_is_kswapd()) { - /* - * If reclaim is isolating dirty pages under writeback, - * it implies that the long-lived page allocation rate - * is exceeding the page laundering rate. Either the - * global limits are not being effective at throttling - * processes due to the page distribution throughout - * zones or there is heavy usage of a slow backing - * device. The only option is to throttle from reclaim - * context which is not ideal as there is no guarantee - * the dirtying process is throttled in the same way - * balance_dirty_pages() manages. - * - * Once a node is flagged PGDAT_WRITEBACK, kswapd will - * count the number of pages under pages flagged for - * immediate reclaim and stall if any are encountered - * in the nr_immediate check below. - */ - if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) - set_bit(PGDAT_WRITEBACK, &pgdat->flags); +again: + memset(&sc->nr, 0, sizeof(sc->nr)); - /* - * Tag a node as congested if all the dirty pages - * scanned were backed by a congested BDI and - * wait_iff_congested will stall. - */ - if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested) - set_bit(PGDAT_CONGESTED, &pgdat->flags); + nr_reclaimed = sc->nr_reclaimed; + nr_scanned = sc->nr_scanned; - /* Allow kswapd to start writing pages during reclaim.*/ - if (sc->nr.unqueued_dirty == sc->nr.file_taken) - set_bit(PGDAT_DIRTY, &pgdat->flags); + prepare_scan_count(pgdat, sc); - /* - * If kswapd scans pages marked marked for immediate - * reclaim and under writeback (nr_immediate), it - * implies that pages are cycling through the LRU - * faster than they are written so also forcibly stall. - */ - if (sc->nr.immediate) - congestion_wait(BLK_RW_ASYNC, HZ/10); - } + shrink_node_memcgs(pgdat, sc); + + if (reclaim_state) { + sc->nr_reclaimed += reclaim_state->reclaimed_slab; + reclaim_state->reclaimed_slab = 0; + } + + /* Record the subtree's reclaim efficiency */ + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, + sc->nr_scanned - nr_scanned, + sc->nr_reclaimed - nr_reclaimed); + if (sc->nr_reclaimed - nr_reclaimed) + reclaimable = true; + + if (current_is_kswapd()) { /* - * Legacy memcg will stall in page writeback so avoid forcibly - * stalling in wait_iff_congested(). + * If reclaim is isolating dirty pages under writeback, + * it implies that the long-lived page allocation rate + * is exceeding the page laundering rate. Either the + * global limits are not being effective at throttling + * processes due to the page distribution throughout + * zones or there is heavy usage of a slow backing + * device. The only option is to throttle from reclaim + * context which is not ideal as there is no guarantee + * the dirtying process is throttled in the same way + * balance_dirty_pages() manages. + * + * Once a node is flagged PGDAT_WRITEBACK, kswapd will + * count the number of pages under pages flagged for + * immediate reclaim and stall if any are encountered + * in the nr_immediate check below. */ - if (!global_reclaim(sc) && sane_reclaim(sc) && - sc->nr.dirty && sc->nr.dirty == sc->nr.congested) - set_memcg_congestion(pgdat, root, true); + if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) + set_bit(PGDAT_WRITEBACK, &pgdat->flags); + + /* Allow kswapd to start writing pages during reclaim.*/ + if (sc->nr.unqueued_dirty == sc->nr.file_taken) + set_bit(PGDAT_DIRTY, &pgdat->flags); /* - * Stall direct reclaim for IO completions if underlying BDIs - * and node is congested. Allow kswapd to continue until it - * starts encountering unqueued dirty pages or cycling through - * the LRU too quickly. + * If kswapd scans pages marked marked for immediate + * reclaim and under writeback (nr_immediate), it + * implies that pages are cycling through the LRU + * faster than they are written so also forcibly stall. */ - if (!sc->hibernation_mode && !current_is_kswapd() && - current_may_throttle() && pgdat_memcg_congested(pgdat, root)) - wait_iff_congested(BLK_RW_ASYNC, HZ/10); + if (sc->nr.immediate) + congestion_wait(BLK_RW_ASYNC, HZ/10); + } - } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, - sc)); + /* + * Tag a node/memcg as congested if all the dirty pages + * scanned were backed by a congested BDI and + * wait_iff_congested will stall. + * + * Legacy memcg will stall in page writeback so avoid forcibly + * stalling in wait_iff_congested(). + */ + if ((current_is_kswapd() || + (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) && + sc->nr.dirty && sc->nr.dirty == sc->nr.congested) + set_bit(LRUVEC_CONGESTED, &target_lruvec->flags); + + /* + * Stall direct reclaim for IO completions if underlying BDIs + * and node is congested. Allow kswapd to continue until it + * starts encountering unqueued dirty pages or cycling through + * the LRU too quickly. + */ + if (!current_is_kswapd() && current_may_throttle() && + !sc->hibernation_mode && + test_bit(LRUVEC_CONGESTED, &target_lruvec->flags)) + wait_iff_congested(BLK_RW_ASYNC, HZ/10); + + if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, + sc)) + goto again; /* * Kswapd gives up on balancing particular nodes after too @@ -2970,7 +5679,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) * Take care memory controller reclaiming has small influence * to global LRU. */ - if (global_reclaim(sc)) { + if (!cgroup_reclaim(sc)) { if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) continue; @@ -3029,19 +5738,17 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) sc->gfp_mask = orig_mask; } -static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat) +static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) { - struct mem_cgroup *memcg; + struct lruvec *target_lruvec; + unsigned long refaults; - memcg = mem_cgroup_iter(root_memcg, NULL, NULL); - do { - unsigned long refaults; - struct lruvec *lruvec; + if (lru_gen_enabled()) + return; - lruvec = mem_cgroup_lruvec(pgdat, memcg); - refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE); - lruvec->refaults = refaults; - } while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL))); + target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); + refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE); + target_lruvec->refaults = refaults; } /* @@ -3070,7 +5777,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, retry: delayacct_freepages_start(); - if (global_reclaim(sc)) + if (!cgroup_reclaim(sc)) __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1); do { @@ -3099,8 +5806,16 @@ retry: if (zone->zone_pgdat == last_pgdat) continue; last_pgdat = zone->zone_pgdat; + snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat); - set_memcg_congestion(last_pgdat, sc->target_mem_cgroup, false); + + if (cgroup_reclaim(sc)) { + struct lruvec *lruvec; + + lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, + zone->zone_pgdat); + clear_bit(LRUVEC_CONGESTED, &lruvec->flags); + } } delayacct_freepages_end(); @@ -3112,9 +5827,27 @@ retry: if (sc->compaction_ready) return 1; + /* + * We make inactive:active ratio decisions based on the node's + * composition of memory, but a restrictive reclaim_idx or a + * memory.low cgroup setting can exempt large amounts of + * memory from reclaim. Neither of which are very common, so + * instead of doing costly eligibility calculations of the + * entire cgroup subtree up front, we assume the estimates are + * good, and retry with forcible deactivation if that fails. + */ + if (sc->skipped_deactivate) { + sc->priority = initial_priority; + sc->force_deactivate = 1; + sc->skipped_deactivate = 0; + goto retry; + } + /* Untapped cgroup reserves? Don't OOM, retry. */ if (sc->memcg_low_skipped) { sc->priority = initial_priority; + sc->force_deactivate = 0; + sc->skipped_deactivate = 0; sc->memcg_low_reclaim = 1; sc->memcg_low_skipped = 0; goto retry; @@ -3306,6 +6039,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, pg_data_t *pgdat, unsigned long *nr_scanned) { + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, .target_mem_cgroup = memcg, @@ -3330,7 +6064,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. */ - shrink_node_memcg(pgdat, memcg, &sc); + shrink_lruvec(lruvec, &sc); trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); @@ -3388,22 +6122,29 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, } #endif -static void age_active_anon(struct pglist_data *pgdat, - struct scan_control *sc) +static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc) { struct mem_cgroup *memcg; + struct lruvec *lruvec; + if (lru_gen_enabled()) { + lru_gen_age_node(pgdat, sc); + return; + } + + /* FIXME? */ if (!total_swap_pages) return; + lruvec = mem_cgroup_lruvec(NULL, pgdat); + if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON)) + return; + memcg = mem_cgroup_iter(NULL, NULL, NULL); do { - struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); - - if (inactive_list_is_low(lruvec, false, sc, true)) - shrink_active_list(SWAP_CLUSTER_MAX, lruvec, - sc, LRU_ACTIVE_ANON); - + lruvec = mem_cgroup_lruvec(memcg, pgdat); + shrink_active_list(SWAP_CLUSTER_MAX, lruvec, + sc, LRU_ACTIVE_ANON); memcg = mem_cgroup_iter(NULL, memcg, NULL); } while (memcg); } @@ -3471,7 +6212,9 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) /* Clear pgdat state for congested, dirty or under writeback. */ static void clear_pgdat_congested(pg_data_t *pgdat) { - clear_bit(PGDAT_CONGESTED, &pgdat->flags); + struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat); + + clear_bit(LRUVEC_CONGESTED, &lruvec->flags); clear_bit(PGDAT_DIRTY, &pgdat->flags); clear_bit(PGDAT_WRITEBACK, &pgdat->flags); } @@ -3672,12 +6415,11 @@ restart: sc.may_swap = !nr_boost_reclaim; /* - * Do some background aging of the anon list, to give - * pages a chance to be referenced before reclaiming. All - * pages are rotated regardless of classzone as this is - * about consistent aging. + * Do some background aging, to give pages a chance to be + * referenced before reclaiming. All pages are rotated + * regardless of classzone as this is about consistent aging. */ - age_active_anon(pgdat, &sc); + kswapd_age_node(pgdat, &sc); /* * If we're getting trouble reclaiming, start doing writepage @@ -4341,6 +7083,10 @@ void check_move_unevictable_pages(struct pagevec *pvec) struct pglist_data *pagepgdat = page_pgdat(page); pgscanned++; + + if (!TestClearPageLRU(page)) + continue; + if (pagepgdat != pgdat) { if (pgdat) spin_unlock_irq(&pgdat->lru_lock); @@ -4349,10 +7095,7 @@ void check_move_unevictable_pages(struct pagevec *pvec) } lruvec = mem_cgroup_page_lruvec(page, pgdat); - if (!PageLRU(page) || !PageUnevictable(page)) - continue; - - if (page_evictable(page)) { + if (page_evictable(page) && PageUnevictable(page)) { enum lru_list lru = page_lru_base_type(page); VM_BUG_ON_PAGE(PageActive(page), page); @@ -4361,12 +7104,15 @@ void check_move_unevictable_pages(struct pagevec *pvec) add_page_to_lru_list(page, lruvec, lru); pgrescued++; } + SetPageLRU(page); } if (pgdat) { __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); spin_unlock_irq(&pgdat->lru_lock); + } else if (pgscanned) { + count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); } } EXPORT_SYMBOL_GPL(check_move_unevictable_pages);