BACKPORT: mm: multi-gen LRU: support page table walks

author Yu Zhao <yuzhao@google.com>

Sun, 18 Sep 2022 08:00:05 +0000 (02:00 -0600)

committer Marek Szyprowski <m.szyprowski@samsung.com>

Wed, 17 Jan 2024 17:15:54 +0000 (18:15 +0100)
author Yu Zhao <yuzhao@google.com>
Sun, 18 Sep 2022 08:00:05 +0000 (02:00 -0600)
committer Marek Szyprowski <m.szyprowski@samsung.com>
Wed, 17 Jan 2024 17:15:54 +0000 (18:15 +0100)
diff --git a/fs/exec.c b/fs/exec.c

index d62cd1d..9536a61 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1036,10 +1036,12 @@ static int exec_mmap(struct mm_struct *mm)
         membarrier_exec_mmap(mm);
         tsk->mm = mm;
         tsk->active_mm = mm;
+       lru_gen_add_mm(mm);
         activate_mm(active_mm, mm);
         tsk->mm->vmacache_seqnum = 0;
         vmacache_flush(tsk);
         task_unlock(tsk);
+       lru_gen_use_mm(mm);
         if (old_mm) {
                 up_read(&old_mm->mmap_sem);
                 BUG_ON(active_mm != old_mm);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h

index c00fb7e..5737ce8 100644 (file)
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -325,6 +325,11 @@ struct mem_cgroup {
         struct deferred_split deferred_split_queue;
  #endif
  
+#ifdef CONFIG_LRU_GEN
+       /* per-memcg mm_struct list */
+       struct lru_gen_mm_list mm_list;
+#endif
+
         struct mem_cgroup_per_node *nodeinfo[0];
         /* WARNING: nodeinfo must be the last member here */
  };
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h

index 270aa8f..ab14fd7 100644 (file)
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -524,6 +524,22 @@ struct mm_struct {
                 atomic_long_t hugetlb_usage;
  #endif
                 struct work_struct async_put_work;
+#ifdef CONFIG_LRU_GEN
+               struct {
+                       /* this mm_struct is on lru_gen_mm_list */
+                       struct list_head list;
+                       /*
+                        * Set when switching to this mm_struct, as a hint of
+                        * whether it has been used since the last time per-node
+                        * page table walkers cleared the corresponding bits.
+                        */
+                       unsigned long bitmap;
+#ifdef CONFIG_MEMCG
+                       /* points to the memcg of "owner" above */
+                       struct mem_cgroup *memcg;
+#endif
+               } lru_gen;
+#endif /* CONFIG_LRU_GEN */
         } __randomize_layout;
  
         /*
@@ -550,6 +566,66 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
         return (struct cpumask *)&mm->cpu_bitmap;
  }
  
+#ifdef CONFIG_LRU_GEN
+
+struct lru_gen_mm_list {
+       /* mm_struct list for page table walkers */
+       struct list_head fifo;
+       /* protects the list above */
+       spinlock_t lock;
+};
+
+void lru_gen_add_mm(struct mm_struct *mm);
+void lru_gen_del_mm(struct mm_struct *mm);
+#ifdef CONFIG_MEMCG
+void lru_gen_migrate_mm(struct mm_struct *mm);
+#endif
+
+static inline void lru_gen_init_mm(struct mm_struct *mm)
+{
+       INIT_LIST_HEAD(&mm->lru_gen.list);
+       mm->lru_gen.bitmap = 0;
+#ifdef CONFIG_MEMCG
+       mm->lru_gen.memcg = NULL;
+#endif
+}
+
+static inline void lru_gen_use_mm(struct mm_struct *mm)
+{
+       /*
+        * When the bitmap is set, page reclaim knows this mm_struct has been
+        * used since the last time it cleared the bitmap. So it might be worth
+        * walking the page tables of this mm_struct to clear the accessed bit.
+        */
+       WRITE_ONCE(mm->lru_gen.bitmap, -1);
+}
+
+#else /* !CONFIG_LRU_GEN */
+
+static inline void lru_gen_add_mm(struct mm_struct *mm)
+{
+}
+
+static inline void lru_gen_del_mm(struct mm_struct *mm)
+{
+}
+
+#ifdef CONFIG_MEMCG
+static inline void lru_gen_migrate_mm(struct mm_struct *mm)
+{
+}
+#endif
+
+static inline void lru_gen_init_mm(struct mm_struct *mm)
+{
+}
+
+static inline void lru_gen_use_mm(struct mm_struct *mm)
+{
+}
+
+#endif /* CONFIG_LRU_GEN */
+
  struct mmu_gather;
  extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
                                 unsigned long start, unsigned long end);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h

index 5c24469..4a5f97c 100644 (file)
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -394,7 +394,7 @@ enum {
   * min_seq behind.
   *
   * The number of pages in each generation is eventually consistent and therefore
- * can be transiently negative.
+ * can be transiently negative when reset_batch_size() is pending.
   */
  struct lru_gen_struct {
         /* the aging increments the youngest generation number */
@@ -416,6 +416,53 @@ struct lru_gen_struct {
         atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
  };
  
+enum {
+       MM_LEAF_TOTAL,          /* total leaf entries */
+       MM_LEAF_OLD,            /* old leaf entries */
+       MM_LEAF_YOUNG,          /* young leaf entries */
+       MM_NONLEAF_TOTAL,       /* total non-leaf entries */
+       MM_NONLEAF_FOUND,       /* non-leaf entries found in Bloom filters */
+       MM_NONLEAF_ADDED,       /* non-leaf entries added to Bloom filters */
+       NR_MM_STATS
+};
+
+/* double-buffering Bloom filters */
+#define NR_BLOOM_FILTERS       2
+
+struct lru_gen_mm_state {
+       /* set to max_seq after each iteration */
+       unsigned long seq;
+       /* where the current iteration continues (inclusive) */
+       struct list_head *head;
+       /* where the last iteration ended (exclusive) */
+       struct list_head *tail;
+       /* to wait for the last page table walker to finish */
+       struct wait_queue_head wait;
+       /* Bloom filters flip after each iteration */
+       unsigned long *filters[NR_BLOOM_FILTERS];
+       /* the mm stats for debugging */
+       unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
+       /* the number of concurrent page table walkers */
+       int nr_walkers;
+};
+
+struct lru_gen_mm_walk {
+       /* the lruvec under reclaim */
+       struct lruvec *lruvec;
+       /* unstable max_seq from lru_gen_struct */
+       unsigned long max_seq;
+       /* the next address within an mm to scan */
+       unsigned long next_addr;
+       /* to batch promoted pages */
+       int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+       /* to batch the mm stats */
+       int mm_stats[NR_MM_STATS];
+       /* total batched items */
+       int batched;
+       bool can_swap;          /* when false, anon and shmem are excluded from the walk */
+       bool force_scan;        /* walk an mm even if its lru_gen.bitmap bit is clear */
+};
+
  void lru_gen_init_lruvec(struct lruvec *lruvec);
  void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
  
@@ -458,6 +505,8 @@ struct lruvec {
  #ifdef CONFIG_LRU_GEN
         /* evictable pages divided into generations */
         struct lru_gen_struct           lrugen;
+       /* to concurrently iterate lru_gen_mm_list */
+       struct lru_gen_mm_state         mm_state;
  #endif
  #ifdef CONFIG_MEMCG
         struct pglist_data *pgdat;
@@ -944,6 +993,11 @@ typedef struct pglist_data {
  
         unsigned long           flags;
  
+#ifdef CONFIG_LRU_GEN
+       /* kswapd mm walk data */
+       struct lru_gen_mm_walk  mm_walk;
+#endif
+
         ZONE_PADDING(_pad2_)
  
         /* Per-node vmstats */
diff --git a/include/linux/swap.h b/include/linux/swap.h

index 1e99f7a..e78b759 100644 (file)
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -130,6 +130,10 @@ union swap_header {
   */
  struct reclaim_state {
         unsigned long reclaimed_slab;
+#ifdef CONFIG_LRU_GEN
+       /* per-thread mm walk data */
+       struct lru_gen_mm_walk *mm_walk;
+#endif
  };
  
  #ifdef __KERNEL__
diff --git a/kernel/exit.c b/kernel/exit.c

index fa46977..874d6a9 100644 (file)
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -423,6 +423,7 @@ assign_new_owner:
                 goto retry;
         }
         WRITE_ONCE(mm->owner, c);
+       lru_gen_migrate_mm(mm);
         task_unlock(c);
         put_task_struct(c);
  }
diff --git a/kernel/fork.c b/kernel/fork.c

index 9180f44..9a8abbd 100644 (file)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1044,6 +1044,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
                 goto fail_nocontext;
  
         mm->user_ns = get_user_ns(user_ns);
+       lru_gen_init_mm(mm);
         return mm;
  
  fail_nocontext:
@@ -1086,6 +1087,7 @@ static inline void __mmput(struct mm_struct *mm)
         }
         if (mm->binfmt)
                 module_put(mm->binfmt->module);
+       lru_gen_del_mm(mm);
         mmdrop(mm);
  }
  
@@ -2377,6 +2379,13 @@ long _do_fork(struct kernel_clone_args *args)
                 get_task_struct(p);
         }
  
+       if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
+               /* lock the task to synchronize with memcg migration */
+               task_lock(p);
+               lru_gen_add_mm(p->mm);
+               task_unlock(p);
+       }
+
         wake_up_new_task(p);
  
         /* forking complete and child started to run, tell ptracer */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 7238ef4..f9753ce 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3364,6 +3364,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
                  * finish_task_switch()'s mmdrop().
                  */
                 switch_mm_irqs_off(prev->active_mm, next->mm, next);
+               lru_gen_use_mm(next->mm);
  
                 if (!prev->mm) {                        // from kernel
                         /* will mmdrop() in finish_task_switch(). */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c

index 1aebb4b..b1812f8 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6075,6 +6075,30 @@ static void mem_cgroup_move_task(void)
  }
  #endif
  
+#ifdef CONFIG_LRU_GEN
+static void mem_cgroup_attach(struct cgroup_taskset *tset)
+{
+       struct task_struct *task;
+       struct cgroup_subsys_state *css;
+
+       /* find the first leader if there is any */
+       cgroup_taskset_for_each_leader(task, css, tset)
+               break;
+
+       if (!task)
+               return;
+
+       task_lock(task);
+       if (task->mm && READ_ONCE(task->mm->owner) == task)
+               lru_gen_migrate_mm(task->mm);
+       task_unlock(task);
+}
+#else
+static void mem_cgroup_attach(struct cgroup_taskset *tset)
+{
+}
+#endif /* CONFIG_LRU_GEN */
+
  /*
   * Cgroup retains root cgroups across [un]mount cycles making it necessary
   * to verify whether we're attached to the default hierarchy on each mount
@@ -6375,6 +6399,7 @@ struct cgroup_subsys memory_cgrp_subsys = {
         .css_free = mem_cgroup_css_free,
         .css_reset = mem_cgroup_css_reset,
         .can_attach = mem_cgroup_can_attach,
+       .attach = mem_cgroup_attach,
         .cancel_attach = mem_cgroup_cancel_attach,
         .post_attach = mem_cgroup_move_task,
         .bind = mem_cgroup_bind,
diff --git a/mm/vmscan.c b/mm/vmscan.c

index bac5931..0c186aa 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -51,6 +51,8 @@
  #include <linux/printk.h>
  #include <linux/dax.h>
  #include <linux/psi.h>
+#include <linux/pagewalk.h>
+#include <linux/shmem_fs.h>
  
  #include <asm/tlbflush.h>
  #include <asm/div64.h>
@@ -2561,7 +2563,7 @@ out:
                 for ((type) = 0; (type) < ANON_AND_FILE; (type)++)      \
                         for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
  
-static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int nid)
+static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
  {
         struct pglist_data *pgdat = NODE_DATA(nid);
  
@@ -2608,6 +2610,371 @@ static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
  }
  
  /******************************************************************************
+ *                          mm_struct list
+ ******************************************************************************/
+
+static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
+{
+       static struct lru_gen_mm_list mm_list = {
+               .fifo = LIST_HEAD_INIT(mm_list.fifo),
+               .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock),
+       };
+
+#ifdef CONFIG_MEMCG
+       if (memcg)
+               return &memcg->mm_list;
+#endif
+       VM_WARN_ON_ONCE(!mem_cgroup_disabled());
+
+       return &mm_list;
+}
+
+void lru_gen_add_mm(struct mm_struct *mm)
+{
+       int nid;
+       struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
+       struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
+
+       VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list));
+#ifdef CONFIG_MEMCG
+       VM_WARN_ON_ONCE(mm->lru_gen.memcg);
+       mm->lru_gen.memcg = memcg;
+#endif
+       spin_lock(&mm_list->lock);
+
+       for_each_node_state(nid, N_MEMORY) {
+               struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+               if (!lruvec)
+                       continue;
+
+               /* the first addition since the last iteration */
+               if (lruvec->mm_state.tail == &mm_list->fifo)
+                       lruvec->mm_state.tail = &mm->lru_gen.list;
+       }
+
+       list_add_tail(&mm->lru_gen.list, &mm_list->fifo);
+
+       spin_unlock(&mm_list->lock);
+}
+
+void lru_gen_del_mm(struct mm_struct *mm)
+{
+       int nid;
+       struct lru_gen_mm_list *mm_list;
+       struct mem_cgroup *memcg = NULL;
+
+       if (list_empty(&mm->lru_gen.list))
+               return;
+
+#ifdef CONFIG_MEMCG
+       memcg = mm->lru_gen.memcg;
+#endif
+       mm_list = get_mm_list(memcg);
+
+       spin_lock(&mm_list->lock);
+
+       for_each_node(nid) {
+               struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+               if (!lruvec)
+                       continue;
+
+               /* where the last iteration ended (exclusive) */
+               if (lruvec->mm_state.tail == &mm->lru_gen.list)
+                       lruvec->mm_state.tail = lruvec->mm_state.tail->next;
+
+               /* where the current iteration continues (inclusive) */
+               if (lruvec->mm_state.head != &mm->lru_gen.list)
+                       continue;
+
+               lruvec->mm_state.head = lruvec->mm_state.head->next;
+               /* the deletion ends the current iteration */
+               if (lruvec->mm_state.head == &mm_list->fifo)
+                       WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1);
+       }
+
+       list_del_init(&mm->lru_gen.list);
+
+       spin_unlock(&mm_list->lock);
+
+#ifdef CONFIG_MEMCG
+       mem_cgroup_put(mm->lru_gen.memcg);
+       mm->lru_gen.memcg = NULL;
+#endif
+}
+
+#ifdef CONFIG_MEMCG
+void lru_gen_migrate_mm(struct mm_struct *mm)
+{
+       struct mem_cgroup *memcg;
+       struct task_struct *task = rcu_dereference_protected(mm->owner, true);
+
+       VM_WARN_ON_ONCE(task->mm != mm);
+       lockdep_assert_held(&task->alloc_lock);
+
+       /* for mm_update_next_owner() */
+       if (mem_cgroup_disabled())
+               return;
+
+       rcu_read_lock();
+       memcg = mem_cgroup_from_task(task);
+       rcu_read_unlock();
+       if (memcg == mm->lru_gen.memcg)
+               return;
+
+       VM_WARN_ON_ONCE(!mm->lru_gen.memcg);
+       VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list));
+
+       lru_gen_del_mm(mm);
+       lru_gen_add_mm(mm);
+}
+#endif
+
+/*
+ * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when
+ * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of
+ * bits in a bitmap, k is the number of hash functions and n is the number of
+ * inserted items.
+ *
+ * Page table walkers use one of the two filters to reduce their search space.
+ * To get rid of non-leaf entries that no longer have enough leaf entries, the
+ * aging uses the double-buffering technique to flip to the other filter each
+ * time it produces a new generation. For non-leaf entries that have enough
+ * leaf entries, the aging carries them over to the next generation in
+ * walk_pmd_range(); the eviction also reports them when walking the rmap
+ * in lru_gen_look_around().
+ *
+ * For future optimizations:
+ * 1. It's not necessary to keep both filters all the time. The spare one can be
+ *    freed after the RCU grace period and reallocated if needed again.
+ * 2. And when reallocating, it's worth scaling its size according to the number
+ *    of inserted entries in the other filter, to reduce the memory overhead on
+ *    small systems and false positives on large systems.
+ * 3. Jenkins' hash function is an alternative to Knuth's.
+ */
+#define BLOOM_FILTER_SHIFT     15
+
+static inline int filter_gen_from_seq(unsigned long seq)
+{
+       return seq % NR_BLOOM_FILTERS;
+}
+
+static void get_item_key(void *item, int *key)
+{
+       u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);      /* one hash wide enough for two keys */
+
+       BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
+
+       key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);  /* low BLOOM_FILTER_SHIFT bits: first bit index */
+       key[1] = hash >> BLOOM_FILTER_SHIFT;            /* remaining high bits: second bit index */
+}
+
+static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
+{
+       unsigned long *filter;
+       int gen = filter_gen_from_seq(seq);
+
+       filter = lruvec->mm_state.filters[gen];
+       if (filter) {
+               bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
+               return;
+       }
+
+       filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
+                              __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
+       WRITE_ONCE(lruvec->mm_state.filters[gen], filter);
+}
+
+static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
+{
+       int key[2];
+       unsigned long *filter;
+       int gen = filter_gen_from_seq(seq);
+
+       filter = READ_ONCE(lruvec->mm_state.filters[gen]);
+       if (!filter)
+               return;
+
+       get_item_key(item, key);
+
+       if (!test_bit(key[0], filter))
+               set_bit(key[0], filter);
+       if (!test_bit(key[1], filter))
+               set_bit(key[1], filter);
+}
+
+static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
+{
+       int key[2];
+       unsigned long *filter;
+       int gen = filter_gen_from_seq(seq);
+
+       filter = READ_ONCE(lruvec->mm_state.filters[gen]);
+       if (!filter)
+               return true;
+
+       get_item_key(item, key);
+
+       return test_bit(key[0], filter) && test_bit(key[1], filter);
+}
+
+static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last)
+{
+       int i;
+       int hist;
+
+       lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
+
+       if (walk) {
+               hist = lru_hist_from_seq(walk->max_seq);
+
+               for (i = 0; i < NR_MM_STATS; i++) {
+                       WRITE_ONCE(lruvec->mm_state.stats[hist][i],
+                                  lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]);
+                       walk->mm_stats[i] = 0;
+               }
+       }
+
+       if (NR_HIST_GENS > 1 && last) {
+               hist = lru_hist_from_seq(lruvec->mm_state.seq + 1);
+
+               for (i = 0; i < NR_MM_STATS; i++)
+                       WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0);
+       }
+}
+
+static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
+{
+       int type;
+       unsigned long size = 0;
+       struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
+       int key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap);
+
+       if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap))
+               return true;
+
+       clear_bit(key, &mm->lru_gen.bitmap);
+
+       for (type = !walk->can_swap; type < ANON_AND_FILE; type++) {
+               size += type ? get_mm_counter(mm, MM_FILEPAGES) :
+                              get_mm_counter(mm, MM_ANONPAGES) +
+                              get_mm_counter(mm, MM_SHMEMPAGES);
+       }
+
+       if (size < MIN_LRU_BATCH)
+               return true;
+
+       return !mmget_not_zero(mm);
+}
+
+static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
+                           struct mm_struct **iter)
+{
+       bool first = false;
+       bool last = true;
+       struct mm_struct *mm = NULL;
+       struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+       struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
+       struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
+
+       /*
+        * There are four interesting cases for this page table walker:
+        * 1. It tries to start a new iteration of mm_list with a stale max_seq;
+        *    there is nothing left to do.
+        * 2. It's the first of the current generation, and it needs to reset
+        *    the Bloom filter for the next generation.
+        * 3. It reaches the end of mm_list, and it needs to increment
+        *    mm_state->seq; the iteration is done.
+        * 4. It's the last of the current generation, and it needs to reset the
+        *    mm stats counters for the next generation.
+        */
+       spin_lock(&mm_list->lock);
+
+       VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq);
+       VM_WARN_ON_ONCE(*iter && mm_state->seq > walk->max_seq);
+       VM_WARN_ON_ONCE(*iter && !mm_state->nr_walkers);
+
+       if (walk->max_seq <= mm_state->seq) {
+               if (!*iter)
+                       last = false;
+               goto done;
+       }
+
+       if (!mm_state->nr_walkers) {
+               VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
+
+               mm_state->head = mm_list->fifo.next;
+               first = true;
+       }
+
+       while (!mm && mm_state->head != &mm_list->fifo) {
+               mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
+
+               mm_state->head = mm_state->head->next;
+
+               /* force scan for those added after the last iteration */
+               if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) {
+                       mm_state->tail = mm_state->head;
+                       walk->force_scan = true;
+               }
+
+               if (should_skip_mm(mm, walk))
+                       mm = NULL;
+       }
+
+       if (mm_state->head == &mm_list->fifo)
+               WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
+done:
+       if (*iter && !mm)
+               mm_state->nr_walkers--;
+       if (!*iter && mm)
+               mm_state->nr_walkers++;
+
+       if (mm_state->nr_walkers)
+               last = false;
+
+       if (*iter || last)
+               reset_mm_stats(lruvec, walk, last);
+
+       spin_unlock(&mm_list->lock);
+
+       if (mm && first)
+               reset_bloom_filter(lruvec, walk->max_seq + 1);
+
+       if (*iter)
+               mmput_async(*iter);
+
+       *iter = mm;
+
+       return last;
+}
+
+static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq)
+{
+       bool success = false;
+       struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+       struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
+       struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
+
+       spin_lock(&mm_list->lock);
+
+       VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq);
+
+       if (max_seq > mm_state->seq && !mm_state->nr_walkers) {
+               VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
+
+               WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
+               reset_mm_stats(lruvec, NULL, true);
+               success = true;
+       }
+
+       spin_unlock(&mm_list->lock);
+
+       return success;
+}
+
+/******************************************************************************
   *                          refault feedback loop
   ******************************************************************************/
  
@@ -2757,6 +3124,118 @@ static int page_inc_gen(struct lruvec *lruvec, struct page *page, bool reclaimin
         return new_gen;
  }
  
+static void update_batch_size(struct lru_gen_mm_walk *walk, struct page *page,
+                             int old_gen, int new_gen)
+{
+       int type = page_is_file_cache(page);
+       int zone = page_zonenum(page);
+       int delta = hpage_nr_pages(page);
+
+       VM_WARN_ON_ONCE(old_gen >= MAX_NR_GENS);
+       VM_WARN_ON_ONCE(new_gen >= MAX_NR_GENS);
+
+       walk->batched++;
+
+       walk->nr_pages[old_gen][type][zone] -= delta;
+       walk->nr_pages[new_gen][type][zone] += delta;
+}
+
+static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
+{
+       int gen, type, zone;
+       struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+       walk->batched = 0;
+
+       for_each_gen_type_zone(gen, type, zone) {
+               enum lru_list lru = type * LRU_INACTIVE_FILE;
+               int delta = walk->nr_pages[gen][type][zone];
+
+               if (!delta)
+                       continue;
+
+               walk->nr_pages[gen][type][zone] = 0;
+               WRITE_ONCE(lrugen->nr_pages[gen][type][zone],
+                          lrugen->nr_pages[gen][type][zone] + delta);
+
+               if (lru_gen_is_active(lruvec, gen))
+                       lru += LRU_ACTIVE;
+               __update_lru_size(lruvec, lru, zone, delta);
+       }
+}
+
+static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args)
+{
+       struct address_space *mapping;
+       struct vm_area_struct *vma = args->vma;
+       struct lru_gen_mm_walk *walk = args->private;
+
+       if (!vma_is_accessible(vma))
+               return true;
+
+       if (is_vm_hugetlb_page(vma))
+               return true;
+
+       if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ))
+               return true;
+
+       if (vma == get_gate_vma(vma->vm_mm))
+               return true;
+
+       if (vma_is_anonymous(vma))
+               return !walk->can_swap;
+
+       if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping))
+               return true;
+
+       mapping = vma->vm_file->f_mapping;
+       if (mapping_unevictable(mapping))
+               return true;
+
+       if (shmem_mapping(mapping))
+               return !walk->can_swap;
+
+       /* to exclude special mappings like dax, etc. */
+       return !mapping->a_ops->readpage;
+}
+
+/*
+ * Some userspace memory allocators map many single-page VMAs. Instead of
+ * returning back to the PGD table for each of such VMAs, finish an entire PMD
+ * table to reduce zigzags and improve cache performance.
+ */
+static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk *args,
+                        unsigned long *vm_start, unsigned long *vm_end)
+{
+       unsigned long start = round_up(*vm_end, size);
+       unsigned long end = (start | ~mask) + 1;
+
+       VM_WARN_ON_ONCE(mask & size);
+       VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask));
+
+       while (args->vma) {
+               if (start >= args->vma->vm_end) {
+                       args->vma = args->vma->vm_next;
+                       continue;
+               }
+
+               if (end && end <= args->vma->vm_start)
+                       return false;
+
+               if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args)) {
+                       args->vma = args->vma->vm_next;
+                       continue;
+               }
+
+               *vm_start = max(start, args->vma->vm_start);
+               *vm_end = min(end - 1, args->vma->vm_end - 1) + 1;
+
+               return true;
+       }
+
+       return false;
+}
+
  static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
  {
         unsigned long pfn = pte_pfn(pte);
@@ -2775,8 +3254,28 @@ static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned
         return pfn;
  }
  
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
+static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr)
+{
+       unsigned long pfn = pmd_pfn(pmd);
+
+       VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
+
+       if (!pmd_present(pmd) || is_huge_zero_pmd(pmd))
+               return -1;
+
+       if (WARN_ON_ONCE(pmd_devmap(pmd)))
+               return -1;
+
+       if (WARN_ON_ONCE(!pfn_valid(pfn)))
+               return -1;
+
+       return pfn;
+}
+#endif
+
  static struct page *get_pfn_page(unsigned long pfn, struct mem_cgroup *memcg,
-                                struct pglist_data *pgdat)
+                                struct pglist_data *pgdat, bool can_swap)
  {
         struct page *page;
  
@@ -2791,9 +3290,375 @@ static struct page *get_pfn_page(unsigned long pfn, struct mem_cgroup *memcg,
         if (page_memcg_rcu(page) != memcg)
                 return NULL;
  
+       /* file VMAs can contain anon pages from COW */
+       if (!page_is_file_cache(page) && !can_swap)
+               return NULL;
+
         return page;
  }
  
+static bool suitable_to_scan(int total, int young)
+{
+       int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8);
+
+       /* n: PTEs per cacheline in [2, 8]; suitable if >=1 young PTE per cacheline on average */
+       return young * n >= total;
+}
+
+static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
+                          struct mm_walk *args)
+{
+       int i;
+       pte_t *pte;
+       spinlock_t *ptl;
+       unsigned long addr;
+       int total = 0;
+       int young = 0;
+       struct lru_gen_mm_walk *walk = args->private;
+       struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
+       struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
+       int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
+
+       VM_WARN_ON_ONCE(pmd_leaf(*pmd));
+
+       ptl = pte_lockptr(args->mm, pmd);
+       if (!spin_trylock(ptl))
+               return false; /* don't spin on a contended PTL; the caller skips this PMD */
+
+       arch_enter_lazy_mmu_mode();
+
+       pte = pte_offset_map(pmd, start & PMD_MASK);
+restart:
+       for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
+               unsigned long pfn;
+               struct page *page;
+
+               total++;
+               walk->mm_stats[MM_LEAF_TOTAL]++;
+
+               pfn = get_pte_pfn(pte[i], args->vma, addr);
+               if (pfn == -1)
+                       continue;
+
+               if (!pte_young(pte[i])) {
+                       walk->mm_stats[MM_LEAF_OLD]++;
+                       continue;
+               }
+
+               page = get_pfn_page(pfn, memcg, pgdat, walk->can_swap);
+               if (!page)
+                       continue;
+
+               if (!ptep_test_and_clear_young(args->vma, addr, pte + i))
+                       VM_WARN_ON_ONCE(true); /* pte_young() was seen above, under the PTL */
+
+               young++;
+               walk->mm_stats[MM_LEAF_YOUNG]++;
+
+               if (pte_dirty(pte[i]) && !PageDirty(page) &&
+                   !(PageAnon(page) && PageSwapBacked(page) &&
+                     !PageSwapCache(page)))
+                       set_page_dirty(page);
+
+               old_gen = page_update_gen(page, new_gen);
+               if (old_gen >= 0 && old_gen != new_gen)
+                       update_batch_size(walk, page, old_gen, new_gen);
+       }
+
+       if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end))
+               goto restart; /* another VMA covers part of the same PTE table */
+
+       pte_unmap(pte);
+
+       arch_leave_lazy_mmu_mode();
+       spin_unlock(ptl);
+
+       return suitable_to_scan(total, young);
+}
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
+static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma,
+                                 struct mm_walk *args, unsigned long *bitmap, unsigned long *start)
+{
+       int i;
+       pmd_t *pmd;
+       spinlock_t *ptl;
+       struct lru_gen_mm_walk *walk = args->private;
+       struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
+       struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
+       int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
+
+       VM_WARN_ON_ONCE(pud_leaf(*pud));
+
+       /* try to batch at most 1+MIN_LRU_BATCH+1 entries */
+       if (*start == -1) {
+               *start = next; /* first entry of a new batch */
+               return;
+       }
+
+       i = next == -1 ? 0 : pmd_index(next) - pmd_index(*start);
+       if (i && i <= MIN_LRU_BATCH) {
+               __set_bit(i - 1, bitmap); /* bit i-1 stands for the i-th PMD entry after *start */
+               return;
+       }
+
+       pmd = pmd_offset(pud, *start);
+
+       ptl = pmd_lockptr(args->mm, pmd);
+       if (!spin_trylock(ptl))
+               goto done; /* contended: give up on this batch */
+
+       arch_enter_lazy_mmu_mode();
+
+       do {
+               unsigned long pfn;
+               struct page *page;
+               unsigned long addr = i ? (*start & PMD_MASK) + i * PMD_SIZE : *start;
+
+               pfn = get_pmd_pfn(pmd[i], vma, addr);
+               if (pfn == -1)
+                       goto next;
+
+               if (!pmd_trans_huge(pmd[i])) {
+                       if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG))
+                               pmdp_test_and_clear_young(vma, addr, pmd + i);
+                       goto next;
+               }
+
+               page = get_pfn_page(pfn, memcg, pgdat, walk->can_swap);
+               if (!page)
+                       goto next;
+
+               if (!pmdp_test_and_clear_young(vma, addr, pmd + i))
+                       goto next;
+
+               walk->mm_stats[MM_LEAF_YOUNG]++;
+
+               if (pmd_dirty(pmd[i]) && !PageDirty(page) &&
+                   !(PageAnon(page) && PageSwapBacked(page) &&
+                     !PageSwapCache(page)))
+                       set_page_dirty(page);
+
+               old_gen = page_update_gen(page, new_gen);
+               if (old_gen >= 0 && old_gen != new_gen)
+                       update_batch_size(walk, page, old_gen, new_gen);
+next:
+               i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1;
+       } while (i <= MIN_LRU_BATCH);
+
+       arch_leave_lazy_mmu_mode();
+       spin_unlock(ptl);
+done:
+       *start = -1; /* no batch pending any more */
+       bitmap_zero(bitmap, MIN_LRU_BATCH);
+}
+#else /* !(CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) */
+static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma,
+                                 struct mm_walk *args, unsigned long *bitmap, unsigned long *start)
+{
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */
+
+static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
+                          struct mm_walk *args)
+{
+       int i;
+       pmd_t *pmd;
+       unsigned long next;
+       unsigned long addr;
+       struct vm_area_struct *vma;
+       unsigned long pos = -1; /* -1: no batch started for walk_pmd_range_locked() */
+       struct lru_gen_mm_walk *walk = args->private;
+       unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
+
+       VM_WARN_ON_ONCE(pud_leaf(*pud));
+
+       /*
+        * Finish an entire PMD in two passes: the first only reaches to PTE
+        * tables to avoid taking the PMD lock; the second, if necessary, takes
+        * the PMD lock to clear the accessed bit in PMD entries.
+        */
+       pmd = pmd_offset(pud, start & PUD_MASK);
+restart:
+       /* walk_pte_range() may call get_next_vma() */
+       vma = args->vma;
+       for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) {
+               pmd_t val = pmd_read_atomic(pmd + i);
+
+               /* for pmd_read_atomic() */
+               barrier();
+
+               next = pmd_addr_end(addr, end);
+
+               if (!pmd_present(val) || is_huge_zero_pmd(val)) {
+                       walk->mm_stats[MM_LEAF_TOTAL]++;
+                       continue;
+               }
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+               if (pmd_trans_huge(val)) {
+                       unsigned long pfn = pmd_pfn(val);
+                       struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
+
+                       walk->mm_stats[MM_LEAF_TOTAL]++;
+
+                       if (!pmd_young(val)) {
+                               walk->mm_stats[MM_LEAF_OLD]++;
+                               continue;
+                       }
+
+                       /* try to avoid unnecessary memory loads */
+                       if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
+                               continue;
+
+                       walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
+                       continue;
+               }
+#endif
+               walk->mm_stats[MM_NONLEAF_TOTAL]++;
+
+#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
+               if (!pmd_young(val))
+                       continue;
+
+               walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
+#endif
+               if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
+                       continue;
+
+               walk->mm_stats[MM_NONLEAF_FOUND]++;
+
+               if (!walk_pte_range(&val, addr, next, args))
+                       continue;
+
+               walk->mm_stats[MM_NONLEAF_ADDED]++;
+
+               /* carry over to the next generation */
+               update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i);
+       }
+
+       walk_pmd_range_locked(pud, -1, vma, args, bitmap, &pos); /* next == -1: flush any pending batch */
+
+       if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end))
+               goto restart;
+}
+
+static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
+                         struct mm_walk *args)
+{
+       int i;
+       pud_t *pud;
+       unsigned long addr;
+       unsigned long next;
+       struct lru_gen_mm_walk *walk = args->private;
+
+       VM_WARN_ON_ONCE(p4d_leaf(*p4d));
+
+       pud = pud_offset(p4d, start & P4D_MASK);
+restart:
+       for (i = pud_index(start), addr = start; addr != end; i++, addr = next) {
+               pud_t val = READ_ONCE(pud[i]);
+
+               next = pud_addr_end(addr, end);
+
+               if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val)))
+                       continue;
+
+               walk_pmd_range(&val, addr, next, args);
+
+               /* a racy check to curtail the waiting time */
+               if (wq_has_sleeper(&walk->lruvec->mm_state.wait))
+                       return 1; /* presumably propagated to walk_mm(), which only retries on -EAGAIN */
+
+               if (need_resched() || walk->batched >= MAX_LRU_BATCH) {
+                       end = (addr | ~PUD_MASK) + 1; /* resume right after this PUD entry */
+                       goto done;
+               }
+       }
+
+       if (i < PTRS_PER_PUD && get_next_vma(P4D_MASK, PUD_SIZE, args, &start, &end))
+               goto restart;
+
+       end = round_up(end, P4D_SIZE);
+done:
+       if (!end || !args->vma)
+               return 1; /* address wrapped to 0 or no VMA left: nothing to resume */
+
+       walk->next_addr = max(end, args->vma->vm_start); /* where walk_mm() restarts the walk */
+
+       return -EAGAIN;
+}
+
+static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_mm_walk *walk)
+{
+       static const struct mm_walk_ops mm_walk_ops = {
+               .test_walk = should_skip_vma,
+               .p4d_entry = walk_pud_range,
+       };
+
+       int err;
+       struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+
+       walk->next_addr = FIRST_USER_ADDRESS;
+
+       do {
+               err = -EBUSY; /* leaves the loop unless the walk below returns -EAGAIN */
+
+               /* page_update_gen() requires stable page_memcg() */
+               if (!mem_cgroup_trylock_pages(memcg))
+                       break;
+
+               /* the caller might be holding the lock for write, so only try it for read */
+               if (down_read_trylock(&mm->mmap_sem)) {
+                       err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk);
+
+                       up_read(&mm->mmap_sem); /* pairs with down_read_trylock(), not up_write() */
+               }
+
+               mem_cgroup_unlock_pages();
+
+               if (walk->batched) {
+                       spin_lock_irq(&lruvec_pgdat(lruvec)->lru_lock);
+                       reset_batch_size(lruvec, walk);
+                       spin_unlock_irq(&lruvec_pgdat(lruvec)->lru_lock);
+               }
+
+               cond_resched();
+       } while (err == -EAGAIN);
+}
+
+static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat)
+{
+       struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
+
+       if (pgdat && current_is_kswapd()) {
+               VM_WARN_ON_ONCE(walk);
+
+               walk = &pgdat->mm_walk; /* kswapd uses the walk embedded in its pglist_data */
+       } else if (!pgdat && !walk) {
+               VM_WARN_ON_ONCE(current_is_kswapd());
+
+               walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
+       }
+
+       current->reclaim_state->mm_walk = walk;
+
+       return walk; /* may be NULL, e.g. if kzalloc() failed; callers must check */
+}
+
+static void clear_mm_walk(void)
+{
+       struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
+
+       VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, 0, sizeof(walk->nr_pages)));
+       VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, 0, sizeof(walk->mm_stats)));
+
+       current->reclaim_state->mm_walk = NULL;
+
+       if (!current_is_kswapd())
+               kfree(walk); /* kswapd's walk is &pgdat->mm_walk (see set_mm_walk()), never kzalloc()ed */
+}
+
  static void inc_min_seq(struct lruvec *lruvec, int type)
  {
         struct lru_gen_struct *lrugen = &lruvec->lrugen;
@@ -2845,7 +3710,7 @@ next:
         return success;
  }
  
-static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool can_swap)
+static void inc_max_seq(struct lruvec *lruvec, bool can_swap)
  {
         int prev, next;
         int type, zone;
@@ -2855,9 +3720,6 @@ static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool can_s
  
         VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
  
-       if (max_seq != lrugen->max_seq)
-               goto unlock;
-
         for (type = ANON_AND_FILE - 1; type >= 0; type--) {
                 if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
                         continue;
@@ -2895,10 +3757,74 @@ static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool can_s
  
         /* make sure preceding modifications appear */
         smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
-unlock:
+
         spin_unlock_irq(&lruvec_pgdat(lruvec)->lru_lock);
  }
  
+static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
+                              struct scan_control *sc, bool can_swap)
+{
+       bool success;
+       struct lru_gen_mm_walk *walk;
+       struct mm_struct *mm = NULL;
+       struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+       VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
+
+       /* see the comment in iterate_mm_list() */
+       if (max_seq <= READ_ONCE(lruvec->mm_state.seq)) {
+               success = false;
+               goto done;
+       }
+
+       /*
+        * If the hardware doesn't automatically set the accessed bit, fallback
+        * to lru_gen_look_around(), which only clears the accessed bit in a
+        * handful of PTEs. Spreading the work out over a period of time usually
+        * is less efficient, but it avoids bursty page faults.
+        */
+       if (!arch_has_hw_pte_young()) {
+               success = iterate_mm_list_nowalk(lruvec, max_seq);
+               goto done;
+       }
+
+       walk = set_mm_walk(NULL);
+       if (!walk) {
+               success = iterate_mm_list_nowalk(lruvec, max_seq); /* no walk state: same fallback as above */
+               goto done;
+       }
+
+       walk->lruvec = lruvec;
+       walk->max_seq = max_seq;
+       walk->can_swap = can_swap;
+       walk->force_scan = false;
+
+       do {
+               success = iterate_mm_list(lruvec, walk, &mm);
+               if (mm)
+                       walk_mm(lruvec, mm, walk);
+
+               cond_resched();
+       } while (mm);
+done:
+       if (!success) {
+               if (sc->priority <= DEF_PRIORITY - 2)
+                       wait_event_killable(lruvec->mm_state.wait,
+                                           max_seq < READ_ONCE(lrugen->max_seq));
+
+               return max_seq < READ_ONCE(lrugen->max_seq); /* true if max_seq has advanced past ours */
+       }
+
+       VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
+
+       inc_max_seq(lruvec, can_swap);
+       /* either this sees any waiters or they will see updated max_seq */
+       if (wq_has_sleeper(&lruvec->mm_state.wait))
+               wake_up_all(&lruvec->mm_state.wait);
+
+       return true;
+}
+
  static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq,
                              struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
  {
@@ -2974,7 +3900,7 @@ static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc)
  
         need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
         if (need_aging)
-               inc_max_seq(lruvec, max_seq, swappiness);
+               try_to_inc_max_seq(lruvec, max_seq, sc, swappiness);
  }
  
  static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
@@ -2983,6 +3909,8 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
  
         VM_WARN_ON_ONCE(!current_is_kswapd());
  
+       set_mm_walk(pgdat);
+
         memcg = mem_cgroup_iter(NULL, NULL, NULL);
         do {
                 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
@@ -2991,11 +3919,16 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
  
                 cond_resched();
         } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+
+       clear_mm_walk();
  }
  
  /*
   * This function exploits spatial locality when shrink_page_list() walks the
- * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages.
+ * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If
+ * the scan was done cacheline efficiently, it adds the PMD entry pointing to
+ * the PTE table to the Bloom filter. This forms a feedback loop between the
+ * eviction and the aging.
   */
  void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
  {
@@ -3004,6 +3937,8 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
         unsigned long start;
         unsigned long end;
         unsigned long addr;
+       struct lru_gen_mm_walk *walk;
+       int young = 0;
         unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
         struct page *page = pvmw->page;
         struct mem_cgroup *memcg = page_memcg(page);
@@ -3018,6 +3953,9 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
         if (spin_is_contended(pvmw->ptl))
                 return;
  
+       /* avoid taking the LRU lock under the PTL when possible */
+       walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
+
         start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start);
         end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
  
@@ -3047,13 +3985,15 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
                 if (!pte_young(pte[i]))
                         continue;
  
-               page = get_pfn_page(pfn, memcg, pgdat);
+               page = get_pfn_page(pfn, memcg, pgdat, !walk || walk->can_swap);
                 if (!page)
                         continue;
  
                 if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
                         VM_WARN_ON_ONCE(true);
  
+               young++;
+
                 if (pte_dirty(pte[i]) && !PageDirty(page) &&
                     !(PageAnon(page) && PageSwapBacked(page) &&
                       !PageSwapCache(page)))
@@ -3069,7 +4009,11 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
         arch_leave_lazy_mmu_mode();
         rcu_read_unlock();
  
-       if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
+       /* feedback from rmap walkers to page table walkers */
+       if (suitable_to_scan(i, young))
+               update_bloom_filter(lruvec, max_seq, pvmw->pmd);
+
+       if (!walk && bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
                 for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
                         page = pte_page(pte[i]);
                         activate_page(page);
@@ -3081,8 +4025,10 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
         if (!mem_cgroup_trylock_pages(memcg))
                 return;
  
-       spin_lock_irq(&lruvec_pgdat(lruvec)->lru_lock);
-       new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
+       if (!walk) {
+               spin_lock_irq(&lruvec_pgdat(lruvec)->lru_lock);
+               new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
+       }
  
         for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
                 page = compound_head(pte_page(pte[i]));
@@ -3093,10 +4039,14 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
                 if (old_gen < 0 || old_gen == new_gen)
                         continue;
  
-               lru_gen_update_size(lruvec, page, old_gen, new_gen);
+               if (walk)
+                       update_batch_size(walk, page, old_gen, new_gen);
+               else
+                       lru_gen_update_size(lruvec, page, old_gen, new_gen);
         }
  
-       spin_unlock_irq(&lruvec_pgdat(lruvec)->lru_lock);
+       if (!walk)
+               spin_unlock_irq(&lruvec_pgdat(lruvec)->lru_lock);
  
         mem_cgroup_unlock_pages();
  }
@@ -3380,6 +4330,7 @@ static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swapp
         struct page *page;
         enum vm_event_item item;
         struct reclaim_stat stat;
+       struct lru_gen_mm_walk *walk;
         struct mem_cgroup *memcg = lruvec_memcg(lruvec);
         struct pglist_data *pgdat = lruvec_pgdat(lruvec);
  
@@ -3416,6 +4367,10 @@ static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swapp
  
         move_pages_to_lru(lruvec, &list);
  
+       walk = current->reclaim_state->mm_walk;
+       if (walk && walk->batched)
+               reset_batch_size(lruvec, walk);
+
         item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
         if (!cgroup_reclaim(sc))
                 __count_vm_events(item, reclaimed);
@@ -3431,6 +4386,11 @@ static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swapp
         return scanned;
  }
  
+/*
+ * For future optimizations:
+ * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
+ *    reclaim.
+ */
  static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
                                     bool can_swap)
  {
@@ -3456,7 +4416,8 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
         if (current_is_kswapd())
                 return 0;
  
-       inc_max_seq(lruvec, max_seq, can_swap);
+       if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap))
+               return nr_to_scan;
  done:
         return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
  }
@@ -3470,6 +4431,8 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
  
         blk_start_plug(&plug);
  
+       set_mm_walk(lruvec_pgdat(lruvec));
+
         while (true) {
                 int delta;
                 int swappiness;
@@ -3497,6 +4460,8 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
                 cond_resched();
         }
  
+       clear_mm_walk();
+
         blk_finish_plug(&plug);
  }
  
@@ -3513,15 +4478,21 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
  
         for_each_gen_type_zone(gen, type, zone)
                 INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
+
+       lruvec->mm_state.seq = MIN_NR_GENS;
+       init_waitqueue_head(&lruvec->mm_state.wait);
  }
  
  #ifdef CONFIG_MEMCG
  void lru_gen_init_memcg(struct mem_cgroup *memcg)
  {
+       INIT_LIST_HEAD(&memcg->mm_list.fifo);
+       spin_lock_init(&memcg->mm_list.lock);
  }
  
  void lru_gen_exit_memcg(struct mem_cgroup *memcg)
  {
+       int i;
         int nid;
  
         for_each_node(nid) {
@@ -3529,6 +4500,11 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg)
  
                 VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
                                            sizeof(lruvec->lrugen.nr_pages)));
+
+               for (i = 0; i < NR_BLOOM_FILTERS; i++) {
+                       bitmap_free(lruvec->mm_state.filters[i]);
+                       lruvec->mm_state.filters[i] = NULL;
+               }
         }
  }
  #endif
author	Yu Zhao <yuzhao@google.com>
	Sun, 18 Sep 2022 08:00:05 +0000 (02:00 -0600)
committer	Marek Szyprowski <m.szyprowski@samsung.com>
	Wed, 17 Jan 2024 17:15:54 +0000 (18:15 +0100)
fs/exec.c		patch \| blob \| history
include/linux/memcontrol.h		patch \| blob \| history
include/linux/mm_types.h		patch \| blob \| history
include/linux/mmzone.h		patch \| blob \| history
include/linux/swap.h		patch \| blob \| history
kernel/exit.c		patch \| blob \| history
kernel/fork.c		patch \| blob \| history
kernel/sched/core.c		patch \| blob \| history
mm/memcontrol.c		patch \| blob \| history
mm/vmscan.c		patch \| blob \| history