mm/demotion: demote pages according to allocation fallback order

author Jagdish Gediya <jvgediya.oss@gmail.com>

Thu, 18 Aug 2022 13:10:40 +0000 (18:40 +0530)

committer Andrew Morton <akpm@linux-foundation.org>

Tue, 27 Sep 2022 02:46:12 +0000 (19:46 -0700)
author Jagdish Gediya <jvgediya.oss@gmail.com>
Thu, 18 Aug 2022 13:10:40 +0000 (18:40 +0530)
committer Andrew Morton <akpm@linux-foundation.org>
Tue, 27 Sep 2022 02:46:12 +0000 (19:46 -0700)
diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h

index 7ca52ad2789f557c5bbd3f41c6899ea18a0e7326..42791554b9b97e386c7c4769b06d880c4d5b7af4 100644 (file)
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -5,6 +5,7 @@
  #include <linux/types.h>
  #include <linux/nodemask.h>
  #include <linux/kref.h>
+#include <linux/mmzone.h>
  /*
   * Each tier cover a abstrace distance chunk size of 128
   */
@@ -38,11 +39,17 @@ void init_node_memory_type(int node, struct memory_dev_type *default_type);
  void clear_node_memory_type(int node, struct memory_dev_type *memtype);
  #ifdef CONFIG_MIGRATION
  int next_demotion_node(int node);
+void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
  #else
  static inline int next_demotion_node(int node)
  {
         return NUMA_NO_NODE;
  }
+
+static inline void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
+{
+       *targets = NODE_MASK_NONE;
+}
  #endif
  
  #else
@@ -75,5 +82,10 @@ static inline int next_demotion_node(int node)
  {
         return NUMA_NO_NODE;
  }
+
+static inline void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
+{
+       *targets = NODE_MASK_NONE;
+}
  #endif /* CONFIG_NUMA */
  #endif  /* _LINUX_MEMORY_TIERS_H */
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c

index 0e2bd32375d6ceb3947f12be92731d1a21a925f2..45dd6fa4e2d169d2696ade491177804e9c01c94b 100644 (file)
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -4,7 +4,6 @@
  #include <linux/sysfs.h>
  #include <linux/kobject.h>
  #include <linux/memory.h>
-#include <linux/mmzone.h>
  #include <linux/memory-tiers.h>
  
  #include "internal.h"
@@ -20,6 +19,8 @@ struct memory_tier {
          * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
          */
         int adistance_start;
+       /* All the nodes that are part of all the lower memory tiers. */
+       nodemask_t lower_tier_mask;
  };
  
  struct demotion_nodes {
@@ -161,6 +162,24 @@ static struct memory_tier *__node_get_memory_tier(int node)
  }
  
  #ifdef CONFIG_MIGRATION
+void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
+{
+       struct memory_tier *memtier;
+
+       /*
+        * pg_data_t.memtier updates include a synchronize_rcu()
+        * which ensures that we either find NULL or a valid memtier
+        * in NODE_DATA. Protect the access via rcu_read_lock().
+        */
+       rcu_read_lock();
+       memtier = rcu_dereference(pgdat->memtier);
+       if (memtier)
+               *targets = memtier->lower_tier_mask;
+       else
+               *targets = NODE_MASK_NONE;
+       rcu_read_unlock();
+}
+
  /**
   * next_demotion_node() - Get the next node in the demotion path
   * @node: The starting node to lookup the next node
@@ -208,10 +227,19 @@ int next_demotion_node(int node)
  
  static void disable_all_demotion_targets(void)
  {
+       struct memory_tier *memtier;
         int node;
  
-       for_each_node_state(node, N_MEMORY)
+       for_each_node_state(node, N_MEMORY) {
                 node_demotion[node].preferred = NODE_MASK_NONE;
+               /*
+                * We are holding memory_tier_lock, it is safe
+                * to access pgdat->memtier.
+                */
+               memtier = __node_get_memory_tier(node);
+               if (memtier)
+                       memtier->lower_tier_mask = NODE_MASK_NONE;
+       }
         /*
          * Ensure that the "disable" is visible across the system.
          * Readers will see either a combination of before+disable
@@ -243,7 +271,7 @@ static void establish_demotion_targets(void)
         struct demotion_nodes *nd;
         int target = NUMA_NO_NODE, node;
         int distance, best_distance;
-       nodemask_t tier_nodes;
+       nodemask_t tier_nodes, lower_tier;
  
         lockdep_assert_held_once(&memory_tier_lock);
  
@@ -291,6 +319,23 @@ static void establish_demotion_targets(void)
                         }
                 } while (1);
         }
+       /*
+        * Now build the lower_tier mask for each node by collecting the node
+        * masks of all memory tiers below it. This allows demotion page
+        * allocation to fall back to a set of nodes that is closer to the
+        * above selected preferred node.
+        */
+       lower_tier = node_states[N_MEMORY];
+       list_for_each_entry(memtier, &memory_tiers, list) {
+               /*
+                * Keep removing the current tier from the lower_tier nodes.
+                * This removes all nodes in the current memory tier and in
+                * all tiers above it from the lower_tier mask.
+                */
+               tier_nodes = get_memtier_nodemask(memtier);
+               nodes_andnot(lower_tier, lower_tier, tier_nodes);
+               memtier->lower_tier_mask = lower_tier;
+       }
  }
  
  #else
diff --git a/mm/vmscan.c b/mm/vmscan.c

index b7e9d8f8f649ae7f88878e7d9a2caacc7f65fc76..809df16c7c0df256e91938d37ab6105864294a29 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1533,21 +1533,34 @@ static void folio_check_dirty_writeback(struct folio *folio,
                 mapping->a_ops->is_dirty_writeback(folio, dirty, writeback);
  }
  
-static struct page *alloc_demote_page(struct page *page, unsigned long node)
+static struct page *alloc_demote_page(struct page *page, unsigned long private)
  {
-       struct migration_target_control mtc = {
-               /*
-                * Allocate from 'node', or fail quickly and quietly.
-                * When this happens, 'page' will likely just be discarded
-                * instead of migrated.
-                */
-               .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
-                           __GFP_THISNODE  | __GFP_NOWARN |
-                           __GFP_NOMEMALLOC | GFP_NOWAIT,
-               .nid = node
-       };
+       struct page *target_page;
+       nodemask_t *allowed_mask;
+       struct migration_target_control *mtc;
+
+       mtc = (struct migration_target_control *)private;
+
+       allowed_mask = mtc->nmask;
+       /*
+        * Make sure we allocate from the target node first, also trying to
+        * demote or reclaim pages from the target node via kswapd if we are
+        * low on free memory on target node. If we don't do this and if
+        * we have free memory on the slower(lower) memtier, we would start
+        * allocating pages from slower(lower) memory tiers without even forcing
+        * a demotion of cold pages from the target memtier. This can result
+        * in the kernel placing hot pages in slower(lower) memory tiers.
+        */
+       mtc->nmask = NULL;
+       mtc->gfp_mask |= __GFP_THISNODE;
+       target_page = alloc_migration_target(page, (unsigned long)mtc);
+       if (target_page)
+               return target_page;
  
-       return alloc_migration_target(page, (unsigned long)&mtc);
+       mtc->gfp_mask &= ~__GFP_THISNODE;
+       mtc->nmask = allowed_mask;
+
+       return alloc_migration_target(page, (unsigned long)mtc);
  }
  
  /*
@@ -1560,6 +1573,19 @@ static unsigned int demote_page_list(struct list_head *demote_pages,
  {
         int target_nid = next_demotion_node(pgdat->node_id);
         unsigned int nr_succeeded;
+       nodemask_t allowed_mask;
+
+       struct migration_target_control mtc = {
+               /*
+                * Allocate from 'node', or fail quickly and quietly.
+                * When this happens, 'page' will likely just be discarded
+                * instead of migrated.
+                */
+               .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN |
+                       __GFP_NOMEMALLOC | GFP_NOWAIT,
+               .nid = target_nid,
+               .nmask = &allowed_mask
+       };
  
         if (list_empty(demote_pages))
                 return 0;
@@ -1567,10 +1593,12 @@ static unsigned int demote_page_list(struct list_head *demote_pages,
         if (target_nid == NUMA_NO_NODE)
                 return 0;
  
+       node_get_allowed_targets(pgdat, &allowed_mask);
+
         /* Demotion ignores all cpuset and mempolicy settings */
         migrate_pages(demote_pages, alloc_demote_page, NULL,
-                           target_nid, MIGRATE_ASYNC, MR_DEMOTION,
-                           &nr_succeeded);
+                     (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
+                     &nr_succeeded);
  
         if (current_is_kswapd())
                 __count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded);
author	Jagdish Gediya <jvgediya.oss@gmail.com>
	Thu, 18 Aug 2022 13:10:40 +0000 (18:40 +0530)
committer	Andrew Morton <akpm@linux-foundation.org>
	Tue, 27 Sep 2022 02:46:12 +0000 (19:46 -0700)
include/linux/memory-tiers.h		patch \| blob \| history
mm/memory-tiers.c		patch \| blob \| history
mm/vmscan.c		patch \| blob \| history