memory tiering: rate limit NUMA migration throughput
authorHuang Ying <ying.huang@intel.com>
Wed, 13 Jul 2022 08:39:52 +0000 (16:39 +0800)
committerAndrew Morton <akpm@linux-foundation.org>
Mon, 12 Sep 2022 03:25:54 +0000 (20:25 -0700)
In NUMA balancing memory tiering mode, if there are hot pages in the slow
memory node and cold pages in the fast memory node, we need to promote/demote
the hot/cold pages between the fast and slow memory nodes.

One choice is to promote/demote as fast as possible.  But the CPU cycles and
memory bandwidth consumed by such high promotion/demotion throughput will
hurt the latency of some workloads, because of access latency inflation and
contention for the slow memory bandwidth.

A way to resolve this issue is to restrict the maximum promotion/demotion
throughput.  It will take longer to finish the promotion/demotion, but the
workload latency will be better.  This patch implements that as a page
promotion rate limit mechanism.

The number of candidate pages to be promoted to the fast memory node via
NUMA balancing is counted.  If the count exceeds the limit specified by the
user, NUMA balancing promotion will be stopped until the next second.
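
In outline, the check works as follows (a condensed sketch of the
numa_promotion_rate_limit() function added in the kernel/sched/fair.c hunk
below, with explanatory comments added; all field and counter names are
taken from the patch):

	static bool numa_promotion_rate_limit(struct pglist_data *pgdat,
					      unsigned long rate_limit, int nr)
	{
		unsigned long nr_cand;
		unsigned int now, start;

		now = jiffies_to_msecs(jiffies);
		/* Account the faulting pages as promotion candidates. */
		mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr);
		nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
		start = pgdat->nbp_rl_start;
		/*
		 * At most once per ~1s window, the cmpxchg() winner records
		 * the running candidate count as the new window's baseline.
		 */
		if (now - start > MSEC_PER_SEC &&
		    cmpxchg(&pgdat->nbp_rl_start, start, now) == start)
			pgdat->nbp_rl_nr_cand = nr_cand;
		/* Throttle once this window's candidates reach the limit. */
		if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit)
			return true;
		return false;
	}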

A new sysctl knob, kernel.numa_balancing_promote_rate_limit_MBps, is added
for users to specify the limit.
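
For example, "sysctl -w kernel.numa_balancing_promote_rate_limit_MBps=200"
(or writing 200 to /proc/sys/kernel/numa_balancing_promote_rate_limit_MBps)
caps promotion at roughly 200 MB/s per target node; the default is 65536
MB/s, as set in the kernel/sched/fair.c hunk below.  Internally the MB/s
value is converted to a per-second page budget as
rate_limit_MBps << (20 - PAGE_SHIFT), so with 4 KiB base pages 200 MB/s
corresponds to at most 200 * 256 = 51200 candidate base pages per node per
second.  The 200 MB/s value and the 4 KiB page size here are only
illustrative.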

Link: https://lkml.kernel.org/r/20220713083954.34196-3-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Tested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: osalvador <osalvador@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Wei Xu <weixugc@google.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Zhong Jiang <zhongjiang-ali@linux.alibaba.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Documentation/admin-guide/sysctl/kernel.rst
include/linux/mmzone.h
include/linux/sched/sysctl.h
kernel/sched/fair.c
kernel/sysctl.c
mm/vmstat.c

index ee6572b1edadaffb3968047f2d568f777203c2ca..835c8844bba48c08dfd939c592cb42a409be8512 100644 (file)
@@ -635,6 +635,17 @@ different types of memory (represented as different NUMA nodes) to
 place the hot pages in the fast memory.  This is implemented based on
 unmapping and page fault too.
 
+numa_balancing_promote_rate_limit_MBps
+======================================
+
+Too high promotion/demotion throughput between different memory types
+may hurt application latency.  This can be used to rate limit the
+promotion throughput.  The per-node max promotion throughput in MB/s
+will be limited to be no more than the set value.
+
+A rule of thumb is to set this to less than 1/10 of the PMEM node
+write bandwidth.
+
 oops_all_cpu_backtrace
 ======================
 
index 8f571dc7c5247f79b6b8ac5bbd08ada8a022e269..a0003eaa751f3c21ad092b7dfb315263b3112dda 100644 (file)
@@ -221,6 +221,7 @@ enum node_stat_item {
 #endif
 #ifdef CONFIG_NUMA_BALANCING
        PGPROMOTE_SUCCESS,      /* promote successfully */
+       PGPROMOTE_CANDIDATE,    /* candidate pages to promote */
 #endif
        NR_VM_NODE_STAT_ITEMS
 };
@@ -998,6 +999,12 @@ typedef struct pglist_data {
        struct deferred_split deferred_split_queue;
 #endif
 
+#ifdef CONFIG_NUMA_BALANCING
+       /* start time in ms of current promote rate limit period */
+       unsigned int nbp_rl_start;
+       /* number of promote candidate pages at start time of current rate limit period */
+       unsigned long nbp_rl_nr_cand;
+#endif
        /* Fields commonly accessed by the page reclaim scanner */
 
        /*
index e650946816d008eb77eb72cd7a8c8104d2a85ef3..303ee7dd0c7e2a67d9901755d994003f1ae7fc8b 100644 (file)
@@ -27,6 +27,7 @@ enum sched_tunable_scaling {
 
 #ifdef CONFIG_NUMA_BALANCING
 extern int sysctl_numa_balancing_mode;
+extern unsigned int sysctl_numa_balancing_promote_rate_limit;
 #else
 #define sysctl_numa_balancing_mode     0
 #endif
index 06db566c76609d3a747f14dc266ee55d8c69d51a..1d1dd88daaab9b938f6d77cf8d8adbbfa583aa2c 100644 (file)
@@ -1097,6 +1097,9 @@ unsigned int sysctl_numa_balancing_scan_delay = 1000;
 /* The page with hint page fault latency < threshold in ms is considered hot */
 unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC;
 
+/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
+unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
+
 struct numa_group {
        refcount_t refcount;
 
@@ -1501,6 +1504,29 @@ static int numa_hint_fault_latency(struct page *page)
        return (time - last_time) & PAGE_ACCESS_TIME_MASK;
 }
 
+/*
+ * For memory tiering mode, too high promotion/demotion throughput may
+ * hurt application latency.  So we provide a mechanism to rate limit
+ * the number of pages that are tried to be promoted.
+ */
+static bool numa_promotion_rate_limit(struct pglist_data *pgdat,
+                                     unsigned long rate_limit, int nr)
+{
+       unsigned long nr_cand;
+       unsigned int now, start;
+
+       now = jiffies_to_msecs(jiffies);
+       mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr);
+       nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+       start = pgdat->nbp_rl_start;
+       if (now - start > MSEC_PER_SEC &&
+           cmpxchg(&pgdat->nbp_rl_start, start, now) == start)
+               pgdat->nbp_rl_nr_cand = nr_cand;
+       if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit)
+               return true;
+       return false;
+}
+
 bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
                                int src_nid, int dst_cpu)
 {
@@ -1515,7 +1541,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
        if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
            !node_is_toptier(src_nid)) {
                struct pglist_data *pgdat;
-               unsigned long latency, th;
+               unsigned long rate_limit, latency, th;
 
                pgdat = NODE_DATA(dst_nid);
                if (pgdat_free_space_enough(pgdat))
@@ -1526,7 +1552,10 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
                if (latency >= th)
                        return false;
 
-               return true;
+               rate_limit = sysctl_numa_balancing_promote_rate_limit << \
+                       (20 - PAGE_SHIFT);
+               return !numa_promotion_rate_limit(pgdat, rate_limit,
+                                                 thp_nr_pages(page));
        }
 
        this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
index 205d605cacc5bb24fd7967168d220c038ec515e8..f10a610aa834791a46810239aca0cbd07bfabbcf 100644 (file)
@@ -1641,6 +1641,14 @@ static struct ctl_table kern_table[] = {
                .extra1         = SYSCTL_ZERO,
                .extra2         = SYSCTL_FOUR,
        },
+       {
+               .procname       = "numa_balancing_promote_rate_limit_MBps",
+               .data           = &sysctl_numa_balancing_promote_rate_limit,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = SYSCTL_ZERO,
+       },
 #endif /* CONFIG_NUMA_BALANCING */
        {
                .procname       = "panic",
index 90af9a8572f5a7073520ddaf2f4d1d3aaec2b7ac..c109167a669c1ab9562bc416598fed27e1e606fa 100644 (file)
@@ -1252,6 +1252,7 @@ const char * const vmstat_text[] = {
 #endif
 #ifdef CONFIG_NUMA_BALANCING
        "pgpromote_success",
+       "pgpromote_candidate",
 #endif
 
        /* enum writeback_stat_item counters */