#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
+#include <linux/migrate.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
#include <linux/oom.h>
/* The file pages on the current node are dangerously low */
unsigned int file_is_tiny:1;
+ /* Always discard instead of demoting to lower tier memory */
+ unsigned int no_demotion:1;
+
/* Allocation order */
s8 order;
return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
}
+static bool can_demote(int nid, struct scan_control *sc)
+{
+ if (!numa_demotion_enabled)
+ return false;
+ if (sc) {
+ if (sc->no_demotion)
+ return false;
+ /* It is pointless to do demotion in memcg reclaim */
+ if (cgroup_reclaim(sc))
+ return false;
+ }
+ if (next_demotion_node(nid) == NUMA_NO_NODE)
+ return false;
+
+ return true;
+}
+
+static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
+ int nid,
+ struct scan_control *sc)
+{
+ if (memcg == NULL) {
+ /*
+ * For non-memcg reclaim, is there
+ * space in any swap device?
+ */
+ if (get_nr_swap_pages() > 0)
+ return true;
+ } else {
+ /* Is the memcg below its swap limit? */
+ if (mem_cgroup_get_nr_swap_pages(memcg) > 0)
+ return true;
+ }
+
+ /*
+ * The page can not be swapped.
+ *
+ * Can it be reclaimed from this node via demotion?
+ */
+ return can_demote(nid, sc);
+}
+
/*
* This misses isolated pages which are not accounted for to save counters.
* As the data only determines if reclaim or compaction continues, it is
nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
- if (get_nr_swap_pages() > 0)
+ if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
void drop_slab_node(int nid)
{
unsigned long freed;
+ int shift = 0;
do {
struct mem_cgroup *memcg = NULL;
do {
freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
- } while (freed > 10);
+ } while ((freed >> shift++) > 1);
}
void drop_slab(void)
static int __remove_mapping(struct address_space *mapping, struct page *page,
bool reclaimed, struct mem_cgroup *target_memcg)
{
- unsigned long flags;
int refcount;
void *shadow = NULL;
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));
- xa_lock_irqsave(&mapping->i_pages, flags);
+ xa_lock_irq(&mapping->i_pages);
/*
* The non racy check for a busy page.
*
if (reclaimed && !mapping_exiting(mapping))
shadow = workingset_eviction(page, target_memcg);
__delete_from_swap_cache(page, swap, shadow);
- xa_unlock_irqrestore(&mapping->i_pages, flags);
+ xa_unlock_irq(&mapping->i_pages);
put_swap_page(page, swap);
} else {
void (*freepage)(struct page *);
!mapping_exiting(mapping) && !dax_mapping(mapping))
shadow = workingset_eviction(page, target_memcg);
__delete_from_page_cache(page, shadow);
- xa_unlock_irqrestore(&mapping->i_pages, flags);
+ xa_unlock_irq(&mapping->i_pages);
if (freepage != NULL)
freepage(page);
return 1;
cannot_free:
- xa_unlock_irqrestore(&mapping->i_pages, flags);
+ xa_unlock_irq(&mapping->i_pages);
return 0;
}
mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
}
+static struct page *alloc_demote_page(struct page *page, unsigned long node)
+{
+ struct migration_target_control mtc = {
+ /*
+ * Allocate from 'node', or fail quickly and quietly.
+ * When this happens, 'page' will likely just be discarded
+ * instead of migrated.
+ */
+ .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
+ __GFP_THISNODE | __GFP_NOWARN |
+ __GFP_NOMEMALLOC | GFP_NOWAIT,
+ .nid = node
+ };
+
+ return alloc_migration_target(page, (unsigned long)&mtc);
+}
+
+/*
+ * Take pages on @demote_list and attempt to demote them to
+ * another node. Pages which are not demoted are left on
+ * @demote_pages.
+ */
+static unsigned int demote_page_list(struct list_head *demote_pages,
+ struct pglist_data *pgdat)
+{
+ int target_nid = next_demotion_node(pgdat->node_id);
+ unsigned int nr_succeeded;
+ int err;
+
+ if (list_empty(demote_pages))
+ return 0;
+
+ if (target_nid == NUMA_NO_NODE)
+ return 0;
+
+ /* Demotion ignores all cpuset and mempolicy settings */
+ err = migrate_pages(demote_pages, alloc_demote_page, NULL,
+ target_nid, MIGRATE_ASYNC, MR_DEMOTION,
+ &nr_succeeded);
+
+ if (current_is_kswapd())
+ __count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded);
+ else
+ __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded);
+
+ return nr_succeeded;
+}
+
/*
* shrink_page_list() returns the number of reclaimed pages
*/
{
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
+ LIST_HEAD(demote_pages);
unsigned int nr_reclaimed = 0;
unsigned int pgactivate = 0;
+ bool do_demote_pass;
memset(stat, 0, sizeof(*stat));
cond_resched();
+ do_demote_pass = can_demote(pgdat->node_id, sc);
+retry:
while (!list_empty(page_list)) {
struct address_space *mapping;
struct page *page;
}
/*
+ * Before reclaiming the page, try to relocate
+ * its contents to another node.
+ */
+ if (do_demote_pass &&
+ (thp_migration_supported() || !PageTransHuge(page))) {
+ list_add(&page->lru, &demote_pages);
+ unlock_page(page);
+ continue;
+ }
+
+ /*
* Anonymous process memory has backing store?
* Try to allocate it some swap space here.
* Lazyfree page could be freed directly
/* follow __remove_mapping for reference */
if (!page_ref_freeze(page, 1))
goto keep_locked;
- if (PageDirty(page)) {
- page_ref_unfreeze(page, 1);
- goto keep_locked;
- }
-
+ /*
+ * The page has only one reference left, which is
+ * from the isolation. After the caller puts the
+ * page back on lru and drops the reference, the
+ * page will be freed anyway. It doesn't matter
+ * which lru it goes. So we don't bother checking
+ * PageDirty here.
+ */
count_vm_event(PGLAZYFREED);
count_memcg_page_event(page, PGLAZYFREED);
} else if (!mapping || !__remove_mapping(mapping, page, true,
list_add(&page->lru, &ret_pages);
VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
}
+ /* 'page_list' is always empty here */
+
+ /* Migrate pages selected for demotion */
+ nr_reclaimed += demote_page_list(&demote_pages, pgdat);
+ /* Pages that could not be demoted are still in @demote_pages */
+ if (!list_empty(&demote_pages)) {
+ /* Pages which failed to demoted go back on @page_list for retry: */
+ list_splice_init(&demote_pages, page_list);
+ do_demote_pass = false;
+ goto retry;
+ }
pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
{
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
- .priority = DEF_PRIORITY,
.may_unmap = 1,
};
struct reclaim_stat stat;
}
/*
- * Attempt to remove the specified page from its LRU. Only take this page
- * if it is of the appropriate PageActive status. Pages which are being
- * freed elsewhere are also ignored.
- *
- * page: page to consider
- * mode: one of the LRU isolation modes defined above
- *
- * returns true on success, false on failure.
- */
-bool __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode)
-{
- /* Only take pages on the LRU. */
- if (!PageLRU(page))
- return false;
-
- /* Compaction should not handle unevictable pages but CMA can do so */
- if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
- return false;
-
- /*
- * To minimise LRU disruption, the caller can indicate that it only
- * wants to isolate pages it will be able to operate on without
- * blocking - clean pages for the most part.
- *
- * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants to pages
- * that it is possible to migrate without blocking
- */
- if (mode & ISOLATE_ASYNC_MIGRATE) {
- /* All the caller can do on PageWriteback is block */
- if (PageWriteback(page))
- return false;
-
- if (PageDirty(page)) {
- struct address_space *mapping;
- bool migrate_dirty;
-
- /*
- * Only pages without mappings or that have a
- * ->migratepage callback are possible to migrate
- * without blocking. However, we can be racing with
- * truncation so it's necessary to lock the page
- * to stabilise the mapping as truncation holds
- * the page lock until after the page is removed
- * from the page cache.
- */
- if (!trylock_page(page))
- return false;
-
- mapping = page_mapping(page);
- migrate_dirty = !mapping || mapping->a_ops->migratepage;
- unlock_page(page);
- if (!migrate_dirty)
- return false;
- }
- }
-
- if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
- return false;
-
- return true;
-}
-
-/*
* Update LRU sizes after isolating pages. The LRU size updates must
* be complete before mem_cgroup_update_lru_size due to a sanity check.
*/
unsigned long skipped = 0;
unsigned long scan, total_scan, nr_pages;
LIST_HEAD(pages_skipped);
- isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
total_scan = 0;
scan = 0;
while (scan < nr_to_scan && !list_empty(src)) {
+ struct list_head *move_to = src;
struct page *page;
page = lru_to_page(src);
total_scan += nr_pages;
if (page_zonenum(page) > sc->reclaim_idx) {
- list_move(&page->lru, &pages_skipped);
nr_skipped[page_zonenum(page)] += nr_pages;
- continue;
+ move_to = &pages_skipped;
+ goto move;
}
/*
* return with no isolated pages if the LRU mostly contains
* ineligible pages. This causes the VM to not reclaim any
* pages, triggering a premature OOM.
- *
- * Account all tail pages of THP. This would not cause
- * premature OOM since __isolate_lru_page() returns -EBUSY
- * only when the page is being freed somewhere else.
+ * Account all tail pages of THP.
*/
scan += nr_pages;
- if (!__isolate_lru_page_prepare(page, mode)) {
- /* It is being freed elsewhere */
- list_move(&page->lru, src);
- continue;
- }
+
+ if (!PageLRU(page))
+ goto move;
+ if (!sc->may_unmap && page_mapped(page))
+ goto move;
+
/*
* Be careful not to clear PageLRU until after we're
* sure the page is not being freed elsewhere -- the
* page release code relies on it.
*/
- if (unlikely(!get_page_unless_zero(page))) {
- list_move(&page->lru, src);
- continue;
- }
+ if (unlikely(!get_page_unless_zero(page)))
+ goto move;
if (!TestClearPageLRU(page)) {
/* Another thread is already isolating this page */
put_page(page);
- list_move(&page->lru, src);
- continue;
+ goto move;
}
nr_taken += nr_pages;
nr_zone_taken[page_zonenum(page)] += nr_pages;
- list_move(&page->lru, dst);
+ move_to = dst;
+move:
+ list_move(&page->lru, move_to);
}
/*
}
*nr_scanned = total_scan;
trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
- total_scan, skipped, nr_taken, mode, lru);
+ total_scan, skipped, nr_taken,
+ sc->may_unmap ? 0 : ISOLATE_UNMAPPED, lru);
update_lru_sizes(lruvec, lru, nr_zone_taken);
return nr_taken;
}
unsigned int noreclaim_flag;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
- .priority = DEF_PRIORITY,
.may_writepage = 1,
.may_unmap = 1,
.may_swap = 1,
+ .no_demotion = 1,
};
noreclaim_flag = memalloc_noreclaim_save();
static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
unsigned long *nr)
{
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
unsigned long anon_cost, file_cost, total_cost;
int swappiness = mem_cgroup_swappiness(memcg);
enum lru_list lru;
/* If we have no swap space, do not bother scanning anon pages. */
- if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
+ if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) {
scan_balance = SCAN_FILE;
goto out;
}
cgroup_size = max(cgroup_size, protection);
scan = lruvec_size - lruvec_size * protection /
- cgroup_size;
+ (cgroup_size + 1);
/*
* Minimally target SWAP_CLUSTER_MAX pages to keep
}
}
+/*
+ * Anonymous LRU management is a waste if there is
+ * ultimately no way to reclaim the memory.
+ */
+static bool can_age_anon_pages(struct pglist_data *pgdat,
+ struct scan_control *sc)
+{
+ /* Aging the anon LRU is valuable if swap is present: */
+ if (total_swap_pages > 0)
+ return true;
+
+ /* Also valuable if anon pages can be demoted: */
+ return can_demote(pgdat->node_id, sc);
+}
+
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
unsigned long nr[NR_LRU_LISTS];
enum lru_list lru;
unsigned long nr_reclaimed = 0;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+ bool proportional_reclaim;
struct blk_plug plug;
- bool scan_adjusted;
get_scan_count(lruvec, sc, nr);
* abort proportional reclaim if either the file or anon lru has already
* dropped to zero at the first pass.
*/
- scan_adjusted = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
- sc->priority == DEF_PRIORITY);
+ proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
+ sc->priority == DEF_PRIORITY);
blk_start_plug(&plug);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
cond_resched();
- if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
+ if (nr_reclaimed < nr_to_reclaim || proportional_reclaim)
continue;
/*
nr_scanned = targets[lru] - nr[lru];
nr[lru] = targets[lru] * (100 - percentage) / 100;
nr[lru] -= min(nr[lru], nr_scanned);
-
- scan_adjusted = true;
}
blk_finish_plug(&plug);
sc->nr_reclaimed += nr_reclaimed;
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (total_swap_pages && inactive_is_low(lruvec, LRU_INACTIVE_ANON))
+ if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) &&
+ inactive_is_low(lruvec, LRU_INACTIVE_ANON))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
}
*/
pages_for_compaction = compact_gap(sc->order);
inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
- if (get_nr_swap_pages() > 0)
+ if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
return inactive_lru_pages > pages_for_compaction;
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
again:
+ /*
+ * Flush the memory cgroup stats, so that we read accurate per-memcg
+ * lruvec stats for heuristics.
+ */
+ mem_cgroup_flush_stats();
+
memset(&sc->nr, 0, sizeof(sc->nr));
nr_reclaimed = sc->nr_reclaimed;
* blocked waiting on the same lock. Instead, throttle for up to a
* second before continuing.
*/
- if (!(gfp_mask & __GFP_FS)) {
+ if (!(gfp_mask & __GFP_FS))
wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
allow_direct_reclaim(pgdat), HZ);
+ else
+ /* Throttle until kswapd wakes the process */
+ wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
+ allow_direct_reclaim(pgdat));
- goto check_pending;
- }
-
- /* Throttle until kswapd wakes the process */
- wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
- allow_direct_reclaim(pgdat));
-
-check_pending:
if (fatal_signal_pending(current))
return true;
struct mem_cgroup *memcg;
struct lruvec *lruvec;
- if (!total_swap_pages)
+ if (!can_age_anon_pages(pgdat, sc))
return;
lruvec = mem_cgroup_lruvec(NULL, pgdat);
set_task_reclaim_state(current, &sc.reclaim_state);
psi_memstall_enter(&pflags);
- __fs_reclaim_acquire();
+ __fs_reclaim_acquire(_THIS_IP_);
count_vm_event(PAGEOUTRUN);
wake_up_all(&pgdat->pfmemalloc_wait);
/* Check if kswapd should be suspending */
- __fs_reclaim_release();
+ __fs_reclaim_release(_THIS_IP_);
ret = try_to_freeze();
- __fs_reclaim_acquire();
+ __fs_reclaim_acquire(_THIS_IP_);
if (ret || kthread_should_stop())
break;
}
snapshot_refaults(NULL, pgdat);
- __fs_reclaim_release();
+ __fs_reclaim_release(_THIS_IP_);
psi_memstall_leave(&pflags);
set_task_reclaim_state(current, NULL);
* This kswapd start function will be called by init and node-hot-add.
* On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
*/
-int kswapd_run(int nid)
+void kswapd_run(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
- int ret = 0;
if (pgdat->kswapd)
- return 0;
+ return;
pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
if (IS_ERR(pgdat->kswapd)) {
/* failure at boot is fatal */
BUG_ON(system_state < SYSTEM_RUNNING);
pr_err("Failed to start kswapd on node %d\n", nid);
- ret = PTR_ERR(pgdat->kswapd);
pgdat->kswapd = NULL;
}
- return ret;
}
/*