BACKPORT: mm: multi-gen LRU: minimal implementation

author Yu Zhao <yuzhao@google.com>

Sun, 18 Sep 2022 08:00:03 +0000 (02:00 -0600)

committer Marek Szyprowski <m.szyprowski@samsung.com>

Wed, 17 Jan 2024 17:15:54 +0000 (18:15 +0100)
author Yu Zhao <yuzhao@google.com>
Sun, 18 Sep 2022 08:00:03 +0000 (02:00 -0600)
committer Marek Szyprowski <m.szyprowski@samsung.com>
Wed, 17 Jan 2024 17:15:54 +0000 (18:15 +0100)
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h

index 59098de..9a7b7f1 100644 (file)
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -124,6 +124,33 @@ static inline int lru_gen_from_seq(unsigned long seq)
         return seq % MAX_NR_GENS;
  }
  
+static inline int lru_hist_from_seq(unsigned long seq)
+{
+       return seq % NR_HIST_GENS; /* slot index for the NR_HIST_GENS-sized stats arrays */
+}
+
+static inline int lru_tier_from_refs(int refs)
+{
+       VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH));
+
+       /* refs is N-1 for N>1 accesses (0 otherwise), giving tier order_base_2(N); see the comment in page_lru_refs() */
+       return order_base_2(refs + 1);
+}
+
+static inline int page_lru_refs(struct page *page)
+{
+       unsigned long flags = READ_ONCE(page->flags); /* one snapshot feeds both terms below */
+       bool workingset = flags & BIT(PG_workingset); /* contributes 0 or 1 to the result */
+
+       /*
+        * Return the number of accesses beyond PG_referenced, i.e., N-1 if the
+        * total number of accesses is N>1, since N=0,1 both map to the first
+        * tier. lru_tier_from_refs() will account for this off-by-one. Also see
+        * the comment on MAX_NR_TIERS.
+        */
+       return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset;
+}
+
  static inline int page_lru_gen(struct page *page)
  {
         unsigned long flags = READ_ONCE(page->flags);
@@ -176,6 +203,15 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct page *page,
                 __update_lru_size(lruvec, lru, zone, -delta);
                 return;
         }
+
+       /* promotion */
+       if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) {
+               __update_lru_size(lruvec, lru, zone, -delta);
+               __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta);
+       }
+
+       /* demotion requires isolation, e.g., lru_deactivate_fn() */
+       VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
  }
  
  static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h

index 2d3a8c9..a7cbcec 100644 (file)
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -336,6 +336,28 @@ enum lruvec_flags {
  #define MIN_NR_GENS            2U
  #define MAX_NR_GENS            4U
  
+/*
+ * Each generation is divided into multiple tiers. A page accessed N times
+ * through file descriptors is in tier order_base_2(N). A page in the first tier
+ * (N=0,1) is marked by PG_referenced unless it was faulted in through page
+ * tables or read ahead. A page in any other tier (N>1) is marked by
+ * PG_referenced and PG_workingset. This implies a minimum of two tiers is
+ * supported without using additional bits in page->flags.
+ *
+ * In contrast to moving across generations which requires the LRU lock, moving
+ * across tiers only involves atomic operations on page->flags and therefore
+ * has a negligible cost in the buffered access path. In the eviction path,
+ * comparisons of refaulted/(evicted+protected) from the first tier and the
+ * rest infer whether pages accessed multiple times through file descriptors
+ * are statistically hot and thus worth protecting.
+ *
+ * MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the
+ * number of categories of the active/inactive LRU when keeping track of
+ * accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in
+ * page->flags.
+ */
+#define MAX_NR_TIERS           4U
+
  #ifndef __GENERATING_BOUNDS_H
  
  struct lruvec;
@@ -350,6 +372,16 @@ enum {
         LRU_GEN_FILE,
  };
  
+#define MIN_LRU_BATCH          BITS_PER_LONG
+#define MAX_LRU_BATCH          (MIN_LRU_BATCH * 64)
+
+/* whether to keep historical stats from evicted generations */
+#ifdef CONFIG_LRU_GEN_STATS
+#define NR_HIST_GENS           MAX_NR_GENS
+#else
+#define NR_HIST_GENS           1U
+#endif
+
  /*
   * The youngest generation number is stored in max_seq for both anon and file
   * types as they are aged on an equal footing. The oldest generation numbers are
@@ -372,6 +404,15 @@ struct lru_gen_struct {
         struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
         /* the multi-gen LRU sizes, eventually consistent */
         long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+       /* the exponential moving average of refaulted */
+       unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS];
+       /* the exponential moving average of evicted+protected */
+       unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS];
+       /* the first tier doesn't need protection, hence the minus one */
+       unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1];
+       /* can be modified without holding the LRU lock */
+       atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
+       atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
  };
  
  void lru_gen_init_lruvec(struct lruvec *lruvec);
diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h

index 706c361..154f74a 100644 (file)
--- a/include/linux/page-flags-layout.h
+++ b/include/linux/page-flags-layout.h
@@ -96,7 +96,10 @@
  #error "Not enough bits in page flags"
  #endif
  
-#define LRU_REFS_WIDTH 0
+/* see the comment on MAX_NR_TIERS */
+#define LRU_REFS_WIDTH min(__LRU_REFS_WIDTH, BITS_PER_LONG - NR_PAGEFLAGS - \
+                           ZONES_WIDTH - LRU_GEN_WIDTH - SECTIONS_WIDTH - \
+                           NODES_WIDTH - KASAN_TAG_WIDTH - LAST_CPUPID_WIDTH)
  
  /*
   * We are going to use the flags for the page to node mapping if its in
diff --git a/kernel/bounds.c b/kernel/bounds.c

index 5ee6077..b529182 100644 (file)
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -24,8 +24,10 @@ int main(void)
         DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
  #ifdef CONFIG_LRU_GEN
         DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1));
+       DEFINE(__LRU_REFS_WIDTH, MAX_NR_TIERS - 2);
  #else
         DEFINE(LRU_GEN_WIDTH, 0);
+       DEFINE(__LRU_REFS_WIDTH, 0);
  #endif
         /* End of constants */
  
diff --git a/mm/Kconfig b/mm/Kconfig

index 6e5675a..f1db1f2 100644 (file)
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -761,6 +761,7 @@ config ARCH_HAS_PTE_SPECIAL
  config ARCH_HAS_HUGEPD
         bool
  
+# multi-gen LRU {
  config LRU_GEN
         bool "Multi-Gen LRU"
         depends on MMU
@@ -769,4 +770,14 @@ config LRU_GEN
         help
           A high performance LRU implementation to overcommit memory.
  
+config LRU_GEN_STATS
+       bool "Full stats for debugging"
+       depends on LRU_GEN
+       help
+         Do not enable this option unless you plan to look at historical stats
+         from evicted generations for debugging purpose.
+
+         This option has a per-memcg and per-node memory overhead.
+# }
+
  endmenu
diff --git a/mm/swap.c b/mm/swap.c

index 5974b5f..8d0c6ce 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -360,6 +360,40 @@ static void __lru_cache_activate_page(struct page *page)
         put_cpu_var(lru_add_pvec);
  }
  
+#ifdef CONFIG_LRU_GEN
+static void page_inc_refs(struct page *page)
+{
+       unsigned long new_flags, old_flags = READ_ONCE(page->flags);
+
+       if (PageUnevictable(page))
+               return; /* accesses to unevictable pages are not recorded */
+
+       if (!PageReferenced(page)) {
+               SetPageReferenced(page);
+               return; /* not referenced yet: this access only sets PG_referenced */
+       }
+
+       if (!PageWorkingset(page)) {
+               SetPageWorkingset(page);
+               return; /* already referenced: this access only sets PG_workingset */
+       }
+
+       /* both flags set: bump the LRU_REFS_MASK counter, saturating; see the comment on MAX_NR_TIERS */
+       do {
+               new_flags = old_flags & LRU_REFS_MASK;
+               if (new_flags == LRU_REFS_MASK)
+                       break; /* counter is already at its maximum */
+
+               new_flags += BIT(LRU_REFS_PGOFF);
+               new_flags |= old_flags & ~LRU_REFS_MASK;
+       } while (!try_cmpxchg(&page->flags, &old_flags, new_flags));
+}
+#else /* !CONFIG_LRU_GEN: no-op stub */
+static void page_inc_refs(struct page *page)
+{
+}
+#endif /* CONFIG_LRU_GEN */
+
  /*
   * Mark a page as having seen activity.
   *
@@ -373,9 +407,14 @@ static void __lru_cache_activate_page(struct page *page)
  void mark_page_accessed(struct page *page)
  {
         page = compound_head(page);
+
+       if (lru_gen_enabled()) {
+               page_inc_refs(page);
+               return;
+       }
+
         if (!PageActive(page) && !PageUnevictable(page) &&
                         PageReferenced(page)) {
-
                 /*
                  * If the page is on the LRU, queue it for activation via
                  * activate_page_pvecs. Otherwise, assume the page is on a
diff --git a/mm/vmscan.c b/mm/vmscan.c

index 12759c4..b10eda9 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2229,6 +2229,9 @@ static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
         unsigned long file;
         struct lruvec *target_lruvec;
  
+       if (lru_gen_enabled())
+               return;
+
         target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
  
         /*
@@ -2537,6 +2540,17 @@ out:
   *                          shorthand helpers
   ******************************************************************************/
  
+#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
+
+#define DEFINE_MAX_SEQ(lruvec)                                         \
+       unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq)
+
+#define DEFINE_MIN_SEQ(lruvec)                                         \
+       unsigned long min_seq[ANON_AND_FILE] = {                        \
+               READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]),      \
+               READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]),      \
+       }
+
  #define for_each_gen_type_zone(gen, type, zone)                                \
         for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++)                   \
                 for ((type) = 0; (type) < ANON_AND_FILE; (type)++)      \
@@ -2562,6 +2576,746 @@ static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int ni
         return pgdat ? &pgdat->__lruvec : NULL;
  }
  
+static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
+{
+       struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+       /* struct pglist_data *pgdat = lruvec_pgdat(lruvec); */
+
+       /* FIXME: the can_demote() check is commented out in this backport, leaving sc unused; see a2a36488a61c + 26aa2d199d6f */
+       if (/* !can_demote(pgdat->node_id, sc) && */
+           mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
+               return 0; /* fewer than MIN_LRU_BATCH swap pages left: report zero swappiness */
+
+       return mem_cgroup_swappiness(memcg);
+}
+
+static int get_nr_gens(struct lruvec *lruvec, int type)
+{
+       return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1; /* min_seq..max_seq inclusive */
+}
+
+static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
+{
+       /* MIN_NR_GENS <= file gens <= anon gens <= MAX_NR_GENS; see the comment on lru_gen_struct */
+       return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
+              get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
+              get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
+}
+
+/******************************************************************************
+ *                          refault feedback loop
+ ******************************************************************************/
+
+/*
+ * A feedback loop based on Proportional-Integral-Derivative (PID) controller.
+ *
+ * The P term is refaulted/(evicted+protected) from a tier in the generation
+ * currently being evicted; the I term is the exponential moving average of the
+ * P term over the generations previously evicted, using the smoothing factor
+ * 1/2; the D term isn't supported.
+ *
+ * The setpoint (SP) is always the first tier of one type; the process variable
+ * (PV) is either any tier of the other type or any other tier of the same
+ * type.
+ *
+ * The error is the difference between the SP and the PV; the correction is to
+ * turn off protection when SP>PV or turn on protection when SP<PV.
+ *
+ * For future optimizations:
+ * 1. The D term may discount the other two terms over time so that long-lived
+ *    generations can resist stale information.
+ */
+struct ctrl_pos {
+       unsigned long refaulted; /* moving average plus current refaults, see read_ctrl_pos() */
+       unsigned long total; /* moving average plus current evicted (+protected) */
+       int gain; /* weight applied to this position in positive_ctrl_err() */
+};
+
+static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
+                         struct ctrl_pos *pos)
+{
+       struct lru_gen_struct *lrugen = &lruvec->lrugen;
+       int hist = lru_hist_from_seq(lrugen->min_seq[type]); /* stats slot of the oldest generation of this type */
+
+       pos->refaulted = lrugen->avg_refaulted[type][tier] +
+                        atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+       pos->total = lrugen->avg_total[type][tier] +
+                    atomic_long_read(&lrugen->evicted[hist][type][tier]);
+       if (tier) /* the first tier has no protected[] slot, hence tier - 1 */
+               pos->total += lrugen->protected[hist][type][tier - 1];
+       pos->gain = gain;
+}
+
+static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
+{
+       int hist, tier;
+       struct lru_gen_struct *lrugen = &lruvec->lrugen;
+       bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;
+       unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1;
+
+       lockdep_assert_held(&lruvec_pgdat(lruvec)->lru_lock);
+
+       if (!carryover && !clear)
+               return; /* single history slot (NR_HIST_GENS == 1): nothing to clear for max_seq+1 */
+
+       hist = lru_hist_from_seq(seq);
+
+       for (tier = 0; tier < MAX_NR_TIERS; tier++) {
+               if (carryover) { /* fold slot hist into the moving averages with factor 1/2 */
+                       unsigned long sum;
+
+                       sum = lrugen->avg_refaulted[type][tier] +
+                             atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+                       WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
+
+                       sum = lrugen->avg_total[type][tier] +
+                             atomic_long_read(&lrugen->evicted[hist][type][tier]);
+                       if (tier)
+                               sum += lrugen->protected[hist][type][tier - 1];
+                       WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
+               }
+
+               if (clear) { /* zero the counters in slot hist */
+                       atomic_long_set(&lrugen->refaulted[hist][type][tier], 0);
+                       atomic_long_set(&lrugen->evicted[hist][type][tier], 0);
+                       if (tier)
+                               WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0);
+               }
+       }
+}
+
+static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
+{
+       /*
+        * Return true if the PV has fewer than MIN_LRU_BATCH refaults or a lower
+        * gain-weighted refaulted/total than the SP (compared by cross-multiplying).
+        */
+       return pv->refaulted < MIN_LRU_BATCH ||
+              pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <=
+              (sp->refaulted + 1) * pv->total * pv->gain;
+}
+
+/******************************************************************************
+ *                          the aging
+ ******************************************************************************/
+
+/* protect pages accessed multiple times through file descriptors */
+static int page_inc_gen(struct lruvec *lruvec, struct page *page, bool reclaiming)
+{
+       int type = page_is_file_cache(page);
+       struct lru_gen_struct *lrugen = &lruvec->lrugen;
+       int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); /* taken from min_seq, not from page->flags */
+       unsigned long new_flags, old_flags = READ_ONCE(page->flags);
+
+       VM_WARN_ON_ONCE_PAGE(!(old_flags & LRU_GEN_MASK), page);
+
+       do {
+               new_gen = (old_gen + 1) % MAX_NR_GENS;
+
+               new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); /* also drops the access history */
+               new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; /* the field holds gen+1 */
+               /* for end_page_writeback() */
+               if (reclaiming)
+                       new_flags |= BIT(PG_reclaim);
+       } while (!try_cmpxchg(&page->flags, &old_flags, new_flags));
+
+       lru_gen_update_size(lruvec, page, old_gen, new_gen);
+
+       return new_gen; /* the caller moves the page to the list of this generation */
+}
+
+static void inc_min_seq(struct lruvec *lruvec, int type)
+{
+       struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+       reset_ctrl_pos(lruvec, type, true); /* carry the stats over before min_seq moves on */
+       WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
+}
+
+static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
+{
+       int gen, type, zone;
+       bool success = false;
+       struct lru_gen_struct *lrugen = &lruvec->lrugen;
+       DEFINE_MIN_SEQ(lruvec);
+
+       VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
+
+       /* find the oldest populated generation; type 0 is skipped when !can_swap */
+       for (type = !can_swap; type < ANON_AND_FILE; type++) {
+               while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) { /* never go below MIN_NR_GENS generations */
+                       gen = lru_gen_from_seq(min_seq[type]);
+
+                       for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+                               if (!list_empty(&lrugen->lists[gen][type][zone]))
+                                       goto next;
+                       }
+
+                       min_seq[type]++;
+               }
+next:
+               ;
+       }
+
+       /* see the comment on lru_gen_struct */
+       if (can_swap) {
+               min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]);
+               min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]);
+       }
+
+       for (type = !can_swap; type < ANON_AND_FILE; type++) {
+               if (min_seq[type] == lrugen->min_seq[type])
+                       continue; /* unchanged for this type */
+
+               reset_ctrl_pos(lruvec, type, true);
+               WRITE_ONCE(lrugen->min_seq[type], min_seq[type]);
+               success = true;
+       }
+
+       return success; /* true if min_seq advanced for at least one type */
+}
+
+static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool can_swap)
+{
+       int prev, next;
+       int type, zone;
+       struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+       spin_lock_irq(&lruvec_pgdat(lruvec)->lru_lock);
+
+       VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
+
+       if (max_seq != lrugen->max_seq)
+               goto unlock; /* max_seq changed since the caller sampled it */
+
+       for (type = ANON_AND_FILE - 1; type >= 0; type--) {
+               if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
+                       continue;
+
+               VM_WARN_ON_ONCE(type == LRU_GEN_FILE || can_swap);
+
+               inc_min_seq(lruvec, type); /* already at MAX_NR_GENS: drop the oldest generation first */
+       }
+
+       /*
+        * Update the active/inactive LRU sizes for compatibility. Both sides of
+        * the current max_seq need to be covered, since max_seq+1 can overlap
+        * with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do
+        * overlap, cold/hot inversion happens.
+        */
+       prev = lru_gen_from_seq(lrugen->max_seq - 1);
+       next = lru_gen_from_seq(lrugen->max_seq + 1);
+
+       for (type = 0; type < ANON_AND_FILE; type++) {
+               for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+                       enum lru_list lru = type * LRU_INACTIVE_FILE;
+                       long delta = lrugen->nr_pages[prev][type][zone] -
+                                    lrugen->nr_pages[next][type][zone];
+
+                       if (!delta)
+                               continue;
+
+                       __update_lru_size(lruvec, lru, zone, delta);
+                       __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta);
+               }
+       }
+
+       for (type = 0; type < ANON_AND_FILE; type++)
+               reset_ctrl_pos(lruvec, type, false); /* clears the stats slot of max_seq+1 when NR_HIST_GENS > 1 */
+
+       /* make sure preceding modifications appear */
+       smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
+unlock:
+       spin_unlock_irq(&lruvec_pgdat(lruvec)->lru_lock);
+}
+
+static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq,
+                            struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
+{
+       int gen, type, zone;
+       unsigned long old = 0;
+       unsigned long young = 0;
+       unsigned long total = 0;
+       struct lru_gen_struct *lrugen = &lruvec->lrugen;
+       struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+
+       for (type = !can_swap; type < ANON_AND_FILE; type++) {
+               unsigned long seq;
+
+               for (seq = min_seq[type]; seq <= max_seq; seq++) {
+                       unsigned long size = 0;
+
+                       gen = lru_gen_from_seq(seq);
+
+                       for (zone = 0; zone < MAX_NR_ZONES; zone++)
+                               size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); /* clamp: nr_pages[] is only eventually consistent */
+
+                       total += size;
+                       if (seq == max_seq)
+                               young += size;
+                       else if (seq + MIN_NR_GENS == max_seq)
+                               old += size;
+               }
+       }
+
+       /* try to scrape all its memory if this memcg was deleted */
+       *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
+
+       /*
+        * The aging tries to be lazy to reduce the overhead, while the eviction
+        * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
+        * ideal number of generations is MIN_NR_GENS+1.
+        */
+       if (min_seq[!can_swap] + MIN_NR_GENS > max_seq)
+               return true; /* too few generations */
+       if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
+               return false; /* more than MIN_NR_GENS+1 generations already */
+
+       /*
+        * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1)
+        * of the total number of pages for each generation. A reasonable range
+        * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The
+        * aging cares about the upper bound of hot pages, while the eviction
+        * cares about the lower bound of cold pages.
+        */
+       if (young * MIN_NR_GENS > total)
+               return true; /* youngest generation holds more than 1/MIN_NR_GENS of the pages */
+       if (old * (MIN_NR_GENS + 2) < total)
+               return true; /* generation max_seq-MIN_NR_GENS holds less than 1/(MIN_NR_GENS+2) */
+
+       return false;
+}
+
+static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+       bool need_aging;
+       unsigned long nr_to_scan; /* filled in by should_run_aging(); not used in this function */
+       int swappiness = get_swappiness(lruvec, sc); /* also passed below as the can_swap flag */
+       struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+       DEFINE_MAX_SEQ(lruvec);
+       DEFINE_MIN_SEQ(lruvec);
+
+       VM_WARN_ON_ONCE(sc->memcg_low_reclaim);
+
+       mem_cgroup_calculate_protection(NULL, memcg);
+
+       if (mem_cgroup_below_min(memcg))
+               return; /* no aging for a memcg that is below its min */
+
+       need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
+       if (need_aging)
+               inc_max_seq(lruvec, max_seq, swappiness);
+}
+
+static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+       struct mem_cgroup *memcg;
+
+       VM_WARN_ON_ONCE(!current_is_kswapd());
+
+       memcg = mem_cgroup_iter(NULL, NULL, NULL);
+       do {
+               struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+
+               age_lruvec(lruvec, sc); /* age this memcg's lruvec on pgdat */
+
+               cond_resched();
+       } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); /* until the iterator returns NULL */
+}
+
+/******************************************************************************
+ *                          the eviction
+ ******************************************************************************/
+
+static bool sort_page(struct lruvec *lruvec, struct page *page, int tier_idx)
+{
+       bool success;
+       int gen = page_lru_gen(page);
+       int type = page_is_file_cache(page);
+       int zone = page_zonenum(page);
+       int delta = hpage_nr_pages(page);
+       int refs = page_lru_refs(page);
+       int tier = lru_tier_from_refs(refs);
+       struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+       VM_WARN_ON_ONCE_PAGE(gen >= MAX_NR_GENS, page);
+
+       /* unevictable: take it off the lrugen lists and put it on LRU_UNEVICTABLE */
+       if (!page_evictable(page)) {
+               success = lru_gen_del_page(lruvec, page, true);
+               VM_WARN_ON_ONCE_PAGE(!success, page);
+               SetPageUnevictable(page);
+               add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
+               __count_vm_events(UNEVICTABLE_PGCULLED, delta);
+               return true;
+       }
+
+       /* dirty lazyfree */
+       if (type == LRU_GEN_FILE && PageAnon(page) && PageDirty(page)) {
+               enum lru_list lru = page_lru_base_type(page); /* NOTE(review): computed before SetPageSwapBacked() below; confirm it should not reflect the new flag */
+
+               success = lru_gen_del_page(lruvec, page, true);
+               VM_WARN_ON_ONCE_PAGE(!success, page);
+               SetPageSwapBacked(page);
+               add_page_to_lru_list_tail(page, lruvec, lru);
+               return true;
+       }
+
+       /* protected: the tier is above the eviction cutoff, so move the page to the next generation */
+       if (tier > tier_idx) {
+               int hist = lru_hist_from_seq(lrugen->min_seq[type]);
+
+               gen = page_inc_gen(lruvec, page, false);
+               list_move_tail(&page->lru, &lrugen->lists[gen][type][zone]);
+
+               WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
+                          lrugen->protected[hist][type][tier - 1] + delta);
+               __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE, delta);
+               return true;
+       }
+
+       /* waiting for writeback */
+       if (PageLocked(page) || PageWriteback(page) ||
+           (type == LRU_GEN_FILE && PageDirty(page))) {
+               gen = page_inc_gen(lruvec, page, true);
+               list_move(&page->lru, &lrugen->lists[gen][type][zone]);
+               return true;
+       }
+
+       return false; /* not sorted: scan_pages() tries to isolate the page next */
+}
+
+static bool isolate_page(struct lruvec *lruvec, struct page *page, struct scan_control *sc)
+{
+       bool success;
+
+       /* unmapping inhibited */
+       if (!sc->may_unmap && page_mapped(page))
+               return false;
+
+       /* swapping inhibited */
+       if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) &&
+           (PageDirty(page) ||
+            (PageAnon(page) && !PageSwapCache(page))))
+               return false;
+
+       /* raced with release_pages() */
+       if (!get_page_unless_zero(page))
+               return false;
+
+       /* raced with another isolation */
+       if (!TestClearPageLRU(page)) {
+               put_page(page); /* drop the reference taken just above */
+               return false;
+       }
+
+       /* not referenced: clear the refs counter and both LRU_REFS_FLAGS; see the comment on MAX_NR_TIERS */
+       if (!PageReferenced(page))
+               set_mask_bits(&page->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0);
+
+       /* for shrink_page_list() */
+       ClearPageReclaim(page);
+       ClearPageReferenced(page);
+
+       success = lru_gen_del_page(lruvec, page, true);
+       VM_WARN_ON_ONCE_PAGE(!success, page);
+
+       return true; /* isolated, with the reference taken above still held */
+}
+
+static int scan_pages(struct lruvec *lruvec, struct scan_control *sc,
+                     int type, int tier, struct list_head *list)
+{
+       int gen, zone;
+       enum vm_event_item item;
+       int sorted = 0;
+       int scanned = 0;
+       int isolated = 0;
+       int remaining = MAX_LRU_BATCH; /* upper bound on the number of list entries visited */
+       struct lru_gen_struct *lrugen = &lruvec->lrugen;
+       struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+
+       VM_WARN_ON_ONCE(!list_empty(list));
+
+       if (get_nr_gens(lruvec, type) == MIN_NR_GENS)
+               return 0; /* do not evict from a type that is down to MIN_NR_GENS generations */
+
+       gen = lru_gen_from_seq(lrugen->min_seq[type]);
+
+       for (zone = sc->reclaim_idx; zone >= 0; zone--) { /* from reclaim_idx down to zone 0 */
+               LIST_HEAD(moved);
+               int skipped = 0;
+               struct list_head *head = &lrugen->lists[gen][type][zone];
+
+               while (!list_empty(head)) {
+                       struct page *page = lru_to_page(head);
+                       int delta = hpage_nr_pages(page);
+
+                       VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page);
+                       VM_WARN_ON_ONCE_PAGE(PageActive(page), page);
+                       VM_WARN_ON_ONCE_PAGE(page_is_file_cache(page) != type, page);
+                       VM_WARN_ON_ONCE_PAGE(page_zonenum(page) != zone, page);
+
+                       scanned += delta;
+
+                       if (sort_page(lruvec, page, tier))
+                               sorted += delta;
+                       else if (isolate_page(lruvec, page, sc)) {
+                               list_add(&page->lru, list);
+                               isolated += delta;
+                       } else {
+                               list_move(&page->lru, &moved); /* park it so the loop can make progress */
+                               skipped += delta;
+                       }
+
+                       if (!--remaining || max(isolated, skipped) >= MIN_LRU_BATCH)
+                               break;
+               }
+
+               if (skipped) {
+                       list_splice(&moved, head); /* put the skipped pages back on this list */
+                       __count_zid_vm_events(PGSCAN_SKIP, zone, skipped);
+               }
+
+               if (!remaining || isolated >= MIN_LRU_BATCH)
+                       break;
+       }
+
+       item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
+       if (!cgroup_reclaim(sc)) {
+               __count_vm_events(item, isolated);
+               __count_vm_events(PGREFILL, sorted);
+       }
+       __count_memcg_events(memcg, item, isolated);
+       __count_memcg_events(memcg, PGREFILL, sorted);
+
+       /*
+        * There might not be eligible pages due to reclaim_idx, may_unmap and
+        * may_writepage. Check the remaining to prevent livelock if it's not
+        * making progress.
+        */
+       return isolated || !remaining ? scanned : 0;
+}
+
+static int get_tier_idx(struct lruvec *lruvec, int type)
+{
+       int tier;
+       struct ctrl_pos sp, pv;
+
+       /*
+        * To leave a margin for fluctuations, use a larger gain factor (1:2).
+        * This value is chosen because any other tier would have at least twice
+        * as many refaults as the first tier.
+        */
+       read_ctrl_pos(lruvec, type, 0, 1, &sp);
+       for (tier = 1; tier < MAX_NR_TIERS; tier++) {
+               read_ctrl_pos(lruvec, type, tier, 2, &pv);
+               if (!positive_ctrl_err(&sp, &pv))
+                       break; /* first tier that fails the check against the SP */
+       }
+
+       return tier - 1; /* sort_page() protects pages whose tier is above this index */
+}
+
+static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx)
+{
+       int type, tier;
+       struct ctrl_pos sp, pv;
+       int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness };
+
+       /*
+        * Compare the first tier of anon with that of file to determine which
+        * type to scan. Also need to compare other tiers of the selected type
+        * with the first tier of the other type to determine the last tier (of
+        * the selected type) to evict.
+        */
+       read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp);
+       read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv);
+       type = positive_ctrl_err(&sp, &pv); /* the 0/1 result is used directly as the type index */
+
+       read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp); /* SP is now the first tier of the other type */
+       for (tier = 1; tier < MAX_NR_TIERS; tier++) {
+               read_ctrl_pos(lruvec, type, tier, gain[type], &pv);
+               if (!positive_ctrl_err(&sp, &pv))
+                       break;
+       }
+
+       *tier_idx = tier - 1; /* last tier of the selected type to evict */
+
+       return type;
+}
+
+static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+                        int *type_scanned, struct list_head *list)
+{
+       int i;
+       int type;
+       int scanned;
+       int tier = -1; /* -1: not yet computed for the current type */
+       DEFINE_MIN_SEQ(lruvec);
+
+       /*
+        * Try to make the obvious choice first. When anon and file are both
+        * available from the same generation, interpret swappiness 1 as file
+        * first and 200 as anon first.
+        */
+       if (!swappiness)
+               type = LRU_GEN_FILE;
+       else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE])
+               type = LRU_GEN_ANON;
+       else if (swappiness == 1)
+               type = LRU_GEN_FILE;
+       else if (swappiness == 200)
+               type = LRU_GEN_ANON;
+       else
+               type = get_type_to_scan(lruvec, swappiness, &tier);
+
+       for (i = !swappiness; i < ANON_AND_FILE; i++) { /* one attempt when swappiness is 0, otherwise up to two */
+               if (tier < 0)
+                       tier = get_tier_idx(lruvec, type);
+
+               scanned = scan_pages(lruvec, sc, type, tier, list);
+               if (scanned)
+                       break;
+
+               type = !type; /* nothing scanned: switch to the other type */
+               tier = -1;
+       }
+
+       *type_scanned = type;
+
+       return scanned;
+}
+
+/*
+ * Isolate a batch of pages from @lruvec, run them through shrink_page_list()
+ * and hand whatever remains on the list back to move_pages_to_lru().
+ *
+ * Returns the number of pages scanned by isolate_pages() plus the return value
+ * of try_to_inc_min_seq(), or 0 if get_nr_gens() reports MIN_NR_GENS for the
+ * type indexed by !swappiness. The number of reclaimed pages is added to
+ * sc->nr_reclaimed.
+ */
+static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
+{
+       int type;
+       int scanned;
+       int reclaimed;
+       LIST_HEAD(list);
+       struct page *page;
+       enum vm_event_item item;
+       struct reclaim_stat stat;
+       struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+       /* isolation and the min_seq update both run under the node's lru_lock */
+       spin_lock_irq(&lruvec_pgdat(lruvec)->lru_lock);
+
+       scanned = isolate_pages(lruvec, sc, swappiness, &type, &list);
+
+       scanned += try_to_inc_min_seq(lruvec, swappiness);
+
+       /* report no progress to the caller in this case, even if pages were isolated */
+       if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS)
+               scanned = 0;
+
+       spin_unlock_irq(&lruvec_pgdat(lruvec)->lru_lock);
+
+       if (list_empty(&list))
+               return scanned;
+
+       /* reclaim is attempted with lru_lock dropped */
+       reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false);
+
+       /* fix up the flags of the pages still on the list before they are moved back */
+       list_for_each_entry(page, &list, lru) {
+               /* restore LRU_REFS_FLAGS cleared by isolate_page() */
+               if (PageWorkingset(page))
+                       SetPageReferenced(page);
+
+               /* don't add rejected pages to the oldest generation */
+               if (PageReclaim(page) &&
+                   (PageDirty(page) || PageWriteback(page)))
+                       ClearPageActive(page);
+               else
+                       SetPageActive(page);
+       }
+
+       spin_lock_irq(&lruvec_pgdat(lruvec)->lru_lock);
+
+       move_pages_to_lru(lruvec, &list);
+
+       /* the node-wide event is only counted when this is not cgroup reclaim */
+       item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
+       if (!cgroup_reclaim(sc))
+               __count_vm_events(item, reclaimed);
+       __count_memcg_events(memcg, item, reclaimed);
+
+       spin_unlock_irq(&lruvec_pgdat(lruvec)->lru_lock);
+
+       /*
+        * NOTE(review): presumably move_pages_to_lru() leaves on the list only
+        * the pages that are to be freed -- confirm against its definition.
+        */
+       mem_cgroup_uncharge_list(&list);
+       free_unref_page_list(&list);
+
+       sc->nr_reclaimed += reclaimed;
+
+       return scanned;
+}
+
+/*
+ * Work out how many pages evict_pages() should scan from @lruvec.
+ *
+ * Returns 0 if the memcg is below min, or below low while
+ * sc->memcg_low_reclaim is not set. If should_run_aging() says no aging is
+ * needed, the count it produced is returned as is. Otherwise, unless the
+ * priority is DEF_PRIORITY, kswapd returns 0 and any other caller bumps
+ * max_seq; the count is then returned only if
+ * min_seq[!can_swap] + MIN_NR_GENS <= max_seq, using the sequence numbers
+ * sampled on entry.
+ */
+static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
+                                   bool can_swap)
+{
+       unsigned long nr_to_scan;
+       struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+       DEFINE_MAX_SEQ(lruvec);
+       DEFINE_MIN_SEQ(lruvec);
+
+       if (mem_cgroup_below_min(memcg))
+               return 0;
+
+       if (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim)
+               return 0;
+
+       if (!should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan))
+               return nr_to_scan;
+
+       /* the aging path is skipped at the default priority */
+       if (sc->priority != DEF_PRIORITY) {
+               /* leave the work to lru_gen_age_node() */
+               if (current_is_kswapd())
+                       return 0;
+
+               inc_max_seq(lruvec, max_seq, can_swap);
+       }
+
+       if (min_seq[!can_swap] + MIN_NR_GENS > max_seq)
+               return 0;
+
+       return nr_to_scan;
+}
+
+/*
+ * Multi-gen LRU replacement for the body of shrink_lruvec(), which calls this
+ * and returns when lru_gen_enabled().
+ *
+ * Repeatedly evicts pages from @lruvec until get_nr_to_scan() or evict_pages()
+ * returns 0, or until the running total of scanned pages reaches the target
+ * computed in the current iteration.
+ */
+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+       unsigned long total = 0;
+       struct blk_plug plug;
+
+       lru_add_drain();
+
+       blk_start_plug(&plug);
+
+       for (;;) {
+               unsigned long nr_to_scan;
+               int swappiness = 0;
+               int nr;
+
+               /*
+                * Without may_swap, swappiness is clamped to 1 for
+                * non-cgroup reclaim if it would otherwise be nonzero, and
+                * stays 0 in every other case.
+                */
+               if (sc->may_swap)
+                       swappiness = get_swappiness(lruvec, sc);
+               else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc))
+                       swappiness = 1;
+
+               nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
+               if (!nr_to_scan)
+                       break;
+
+               nr = evict_pages(lruvec, sc, swappiness);
+               if (!nr)
+                       break;
+
+               total += nr;
+               if (total >= nr_to_scan)
+                       break;
+
+               cond_resched();
+       }
+
+       blk_finish_plug(&plug);
+}
+
  /******************************************************************************
   *                          initialization
   ******************************************************************************/
@@ -2604,6 +3358,16 @@ static int __init init_lru_gen(void)
  };
  late_initcall(init_lru_gen);
  
+#else /* !CONFIG_LRU_GEN */
+
+/* Empty stub used when CONFIG_LRU_GEN is not set. */
+static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+}
+
+/* Empty stub used when CONFIG_LRU_GEN is not set. */
+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+}
+
  #endif /* CONFIG_LRU_GEN */
  
  static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
@@ -2617,6 +3381,11 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
         struct blk_plug plug;
         bool scan_adjusted;
  
+       if (lru_gen_enabled()) {
+               lru_gen_shrink_lruvec(lruvec, sc);
+               return;
+       }
+
         get_scan_count(lruvec, sc, nr);
  
         /* Record the original scan target for proportional adjustments later */
@@ -3077,6 +3846,9 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
         struct lruvec *target_lruvec;
         unsigned long refaults;
  
+       if (lru_gen_enabled())
+               return;
+
         target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
         refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE);
         target_lruvec->refaults = refaults;
@@ -3453,12 +4225,17 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
  }
  #endif
  
-static void age_active_anon(struct pglist_data *pgdat,
-                               struct scan_control *sc)
+static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc)
  {
         struct mem_cgroup *memcg;
         struct lruvec *lruvec;
  
+       if (lru_gen_enabled()) {
+               lru_gen_age_node(pgdat, sc);
+               return;
+       }
+
+       /* FIXME? */
         if (!total_swap_pages)
                 return;
  
@@ -3741,12 +4518,11 @@ restart:
                 sc.may_swap = !nr_boost_reclaim;
  
                 /*
-                * Do some background aging of the anon list, to give
-                * pages a chance to be referenced before reclaiming. All
-                * pages are rotated regardless of classzone as this is
-                * about consistent aging.
+                * Do some background aging, to give pages a chance to be
+                * referenced before reclaiming. All pages are rotated
+                * regardless of classzone as this is about consistent aging.
                  */
-               age_active_anon(pgdat, &sc);
+               kswapd_age_node(pgdat, &sc);
  
                 /*
                  * If we're getting trouble reclaiming, start doing writepage
diff --git a/mm/workingset.c b/mm/workingset.c

index 474186b..745fd72 100644 (file)
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -15,6 +15,7 @@
  #include <linux/dax.h>
  #include <linux/fs.h>
  #include <linux/mm.h>
+#include <linux/mm_inline.h>
  
  /*
   *             Double CLOCK lists
@@ -184,7 +185,6 @@ static unsigned int bucket_order __read_mostly;
  static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
                          bool workingset)
  {
-       eviction >>= bucket_order;
         eviction &= EVICTION_MASK;
         eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
         eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
@@ -209,11 +209,109 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
  
         *memcgidp = memcgid;
         *pgdat = NODE_DATA(nid);
-       *evictionp = entry << bucket_order;
+       *evictionp = entry;
         *workingsetp = workingset;
  }
  
+#ifdef CONFIG_LRU_GEN
+
+/*
+ * Build the shadow entry for @page and account the eviction.
+ *
+ * The token packed into the shadow is min_seq of the page's type shifted left
+ * by LRU_REFS_WIDTH, with max(refs - 1, 0) in the low bits. The evicted
+ * counter indexed by history slot, type and tier is increased by the number
+ * of pages in @page.
+ */
+static void *lru_gen_eviction(struct page *page)
+{
+       unsigned long seq, token;
+       struct lru_gen_struct *lrugen;
+       int type = page_is_file_cache(page);
+       int nr_pages = hpage_nr_pages(page);
+       int refs = page_lru_refs(page);
+       int tier = lru_tier_from_refs(refs);
+       struct mem_cgroup *memcg = page_memcg(page);
+       struct pglist_data *pgdat = page_pgdat(page);
+
+       BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);
+
+       lrugen = &mem_cgroup_lruvec(memcg, pgdat)->lrugen;
+       seq = READ_ONCE(lrugen->min_seq[type]);
+
+       atomic_long_add(nr_pages, &lrugen->evicted[lru_hist_from_seq(seq)][type][tier]);
+
+       token = (seq << LRU_REFS_WIDTH) | max(refs - 1, 0);
+
+       /* refs lands in pack_shadow()'s bool workingset parameter */
+       return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs);
+}
+
+/*
+ * Account a refault of @page using the shadow entry left behind at eviction.
+ *
+ * Nothing is counted if the shadow was recorded for a different node or memcg
+ * than the page's current ones, or if the sequence number stored in the token
+ * does not match the current min_seq of the page's type. Otherwise the
+ * refaulted counter for the matching history slot, type and tier and
+ * WORKINGSET_REFAULT are increased by the number of pages in @page.
+ */
+static void lru_gen_refault(struct page *page, void *shadow)
+{
+       int hist, tier, refs;
+       int memcg_id;
+       bool workingset;
+       unsigned long token;
+       unsigned long min_seq;
+       struct lruvec *lruvec;
+       struct lru_gen_struct *lrugen;
+       struct mem_cgroup *memcg;
+       struct pglist_data *pgdat;
+       int type = page_is_file_cache(page);
+       int delta = hpage_nr_pages(page);
+
+       unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset);
+
+       /* the shadow belongs to another node */
+       if (pgdat != page_pgdat(page))
+               return;
+
+       rcu_read_lock();
+
+       /* the shadow belongs to another memcg */
+       memcg = page_memcg_rcu(page);
+       if (memcg_id != mem_cgroup_id(memcg))
+               goto unlock;
+
+       lruvec = mem_cgroup_lruvec(memcg, pgdat);
+       lrugen = &lruvec->lrugen;
+
+       /*
+        * The bits of the token above LRU_REFS_WIDTH hold the sequence number
+        * stored at eviction; it is compared with the current min_seq
+        * truncated to the same number of bits.
+        */
+       min_seq = READ_ONCE(lrugen->min_seq[type]);
+       if ((token >> LRU_REFS_WIDTH) != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH)))
+               goto unlock;
+
+       hist = lru_hist_from_seq(min_seq);
+       /* see the comment in page_lru_refs() */
+       refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset;
+       tier = lru_tier_from_refs(refs);
+
+       atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
+       mod_lruvec_state(lruvec, WORKINGSET_REFAULT, delta);
+
+       /*
+        * Count the following two cases as stalls:
+        * 1. For pages accessed through page tables, hotter pages pushed out
+        *    hot pages which refaulted immediately.
+        * 2. For pages accessed multiple times through file descriptors,
+        *    numbers of accesses might have been out of the range.
+        */
+       if (lru_gen_in_fault() || refs == BIT(LRU_REFS_WIDTH)) {
+               SetPageWorkingset(page);
+               mod_lruvec_state(lruvec, WORKINGSET_RESTORE, delta);
+       }
+unlock:
+       rcu_read_unlock();
+}
+
+#else /* !CONFIG_LRU_GEN */
+
+/* Stub used when CONFIG_LRU_GEN is not set; always returns NULL. */
+static void *lru_gen_eviction(struct page *page)
+{
+       return NULL;
+}
+
+/* Empty stub used when CONFIG_LRU_GEN is not set. */
+static void lru_gen_refault(struct page *page, void *shadow)
+{
+}
+
+#endif /* CONFIG_LRU_GEN */
+
  static void advance_inactive_age(struct mem_cgroup *memcg, pg_data_t *pgdat)
+
  {
         /*
          * Reclaiming a cgroup means reclaiming all its children in a
@@ -254,12 +352,16 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
         VM_BUG_ON_PAGE(page_count(page), page);
         VM_BUG_ON_PAGE(!PageLocked(page), page);
  
+       if (lru_gen_enabled())
+               return lru_gen_eviction(page);
+
         advance_inactive_age(page_memcg(page), pgdat);
  
         lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
         /* XXX: target_memcg can be NULL, go through lruvec */
         memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
         eviction = atomic_long_read(&lruvec->inactive_age);
+       eviction >>= bucket_order;
         return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
  }
  
@@ -286,7 +388,13 @@ void workingset_refault(struct page *page, void *shadow)
         bool workingset;
         int memcgid;
  
+       if (lru_gen_enabled()) {
+               lru_gen_refault(page, shadow);
+               return;
+       }
+
         unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
+       eviction <<= bucket_order;
  
         rcu_read_lock();
         /*
author	Yu Zhao <yuzhao@google.com>
	Sun, 18 Sep 2022 08:00:03 +0000 (02:00 -0600)
committer	Marek Szyprowski <m.szyprowski@samsung.com>
	Wed, 17 Jan 2024 17:15:54 +0000 (18:15 +0100)
include/linux/mm_inline.h		patch \| blob \| history
include/linux/mmzone.h		patch \| blob \| history
include/linux/page-flags-layout.h		patch \| blob \| history
kernel/bounds.c		patch \| blob \| history
mm/Kconfig		patch \| blob \| history
mm/swap.c		patch \| blob \| history
mm/vmscan.c		patch \| blob \| history
mm/workingset.c		patch \| blob \| history