Merge tag 'mm-stable-2022-10-08' of git://git.kernel.org/pub/scm/linux/kernel/git...

[platform/kernel/linux-starfive.git] / kernel / sched / fair.c
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 5ffec43..e4a0b8b 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -40,6 +40,7 @@
  
  #include <linux/cpuidle.h>
  #include <linux/interrupt.h>
+#include <linux/memory-tiers.h>
  #include <linux/mempolicy.h>
  #include <linux/mutex_api.h>
  #include <linux/profile.h>
@@ -1090,6 +1091,12 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
  /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
  unsigned int sysctl_numa_balancing_scan_delay = 1000;
  
+/*
+ * Hot threshold, in ms: a page whose hint page fault latency is below
+ * this value is considered hot.  Defaults to MSEC_PER_SEC.
+ */
+unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC;
+
+/* Limit on the NUMA promotion throughput (in MB/s) for each target node. */
+unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
+
  struct numa_group {
         refcount_t refcount;
  
@@ -1432,6 +1439,120 @@ static inline unsigned long group_weight(struct task_struct *p, int nid,
         return 1000 * faults / total_faults;
  }
  
+/*
+ * If memory tiering mode is enabled, the cpupid of a slow memory page is
+ * used to record the scan time instead of the CPU and PID.  When tiering
+ * mode is disabled at run time, that scan time (in cpupid) will be
+ * interpreted as a CPU and PID.  So the CPU needs to be checked to avoid
+ * an out-of-bounds array access.
+ *
+ * Returns true if the CPU decoded from @cpupid is below nr_cpu_ids.
+ */
+static inline bool cpupid_valid(int cpupid)
+{
+       return cpupid_to_cpu(cpupid) < nr_cpu_ids;
+}
+
+/*
+ * For memory tiering mode, if there are enough free pages (more than
+ * the "enough" watermark defined here) in the fast memory node, all
+ * recently accessed slow memory pages will be migrated to the fast
+ * memory node without considering the hot threshold, to take full
+ * advantage of the fast memory capacity.
+ *
+ * The extra headroom required on top of WMARK_PROMO is the larger of
+ * 1GB worth of pages and 1/16 of the node's present pages.
+ *
+ * Returns true as soon as one populated zone of @pgdat passes
+ * zone_watermark_ok() with that raised mark, false if none does.
+ */
+static bool pgdat_free_space_enough(struct pglist_data *pgdat)
+{
+       int z;
+       unsigned long enough_wmark;
+
+       enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT, /* 1GB in pages */
+                          pgdat->node_present_pages >> 4); /* present pages / 16 */
+       /* Walk the node's zones from the highest index down to zone 0. */
+       for (z = pgdat->nr_zones - 1; z >= 0; z--) {
+               struct zone *zone = pgdat->node_zones + z;
+
+               /* Zones without any pages cannot contribute free space. */
+               if (!populated_zone(zone))
+                       continue;
+
+               if (zone_watermark_ok(zone, 0,
+                                     wmark_pages(zone, WMARK_PROMO) + enough_wmark,
+                                     ZONE_MOVABLE, 0))
+                       return true;
+       }
+       return false;
+}
+
+/*
+ * For memory tiering mode, when page tables are scanned, the scan
+ * time is recorded in struct page in addition to making the page
+ * PROT_NONE for a slow memory page.  So when the page is accessed, the
+ * hint page fault handler calculates the hint page fault latency via
+ *
+ *     hint page fault latency = hint page fault time - scan time
+ *
+ * The smaller the hint page fault latency, the higher the possibility
+ * for the page to be hot.
+ *
+ * Returns the latency in ms, masked with PAGE_ACCESS_TIME_MASK.  As a
+ * side effect, the current time replaces the time recorded in @page.
+ */
+static int numa_hint_fault_latency(struct page *page)
+{
+       int last_time, time;
+
+       time = jiffies_to_msecs(jiffies);
+       /*
+        * Store the fault time and fetch what was recorded before.
+        * NOTE(review): presumably the previous value is the scan time
+        * described above - confirm against xchg_page_access_time().
+        */
+       last_time = xchg_page_access_time(page, time);
+
+       /*
+        * NOTE(review): the mask presumably confines the difference to
+        * the bits of the time that are actually stored - confirm.
+        */
+       return (time - last_time) & PAGE_ACCESS_TIME_MASK;
+}
+
+/*
+ * For memory tiering mode, too high a promotion/demotion throughput
+ * may hurt application latency.  So we provide a mechanism to rate
+ * limit the number of pages that are tried to be promoted.
+ *
+ * @pgdat:      node whose promotion candidates are being counted
+ * @rate_limit: number of candidate pages allowed per window
+ * @nr:         number of pages in the current candidate
+ *
+ * The @nr pages are always added to the node's PGPROMOTE_CANDIDATE
+ * counter.  The window is restarted once more than MSEC_PER_SEC ms
+ * have passed since pgdat->nbp_rl_start.
+ *
+ * Returns true if the candidates counted since the start of the
+ * current window have reached @rate_limit (i.e. the caller should not
+ * promote), false otherwise.
+ */
+static bool numa_promotion_rate_limit(struct pglist_data *pgdat,
+                                     unsigned long rate_limit, int nr)
+{
+       unsigned long nr_cand;
+       unsigned int now, start;
+
+       now = jiffies_to_msecs(jiffies);
+       /* Account this candidate before checking the limit. */
+       mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr);
+       nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+       start = pgdat->nbp_rl_start;
+       /*
+        * Open a new window.  Only the caller whose cmpxchg() succeeds
+        * in replacing the old start time records the new baseline.
+        */
+       if (now - start > MSEC_PER_SEC &&
+           cmpxchg(&pgdat->nbp_rl_start, start, now) == start)
+               pgdat->nbp_rl_nr_cand = nr_cand;
+       /* Candidates seen since the baseline of the current window. */
+       if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit)
+               return true;
+       return false;
+}
+
+/* One adjustment step is (2 * reference threshold) divided by this. */
+#define NUMA_MIGRATION_ADJUST_STEPS    16
+
+/*
+ * Periodically re-tune the hot threshold of @pgdat so that the number
+ * of promotion candidates tracks @rate_limit.
+ *
+ * @pgdat:      node whose nbp_threshold is adjusted
+ * @rate_limit: number of candidate pages allowed per second
+ * @ref_th:     reference hot threshold, used as the starting value
+ *              when pgdat->nbp_threshold is 0
+ *
+ * An adjustment happens at most once per
+ * sysctl_numa_balancing_scan_period_max ms, and only in the caller
+ * that wins the cmpxchg() on pgdat->nbp_th_start.  If the candidates
+ * seen since the last adjustment exceed the reference count by more
+ * than 10%, the threshold is lowered by one step (but not below one
+ * step); if they fall short by more than 10%, it is raised by one
+ * step (but not above 2 * @ref_th).  Otherwise it is left unchanged.
+ */
+static void numa_promotion_adjust_threshold(struct pglist_data *pgdat,
+                                           unsigned long rate_limit,
+                                           unsigned int ref_th)
+{
+       unsigned int now, start, th_period, unit_th, th;
+       unsigned long nr_cand, ref_cand, diff_cand;
+
+       now = jiffies_to_msecs(jiffies);
+       th_period = sysctl_numa_balancing_scan_period_max;
+       start = pgdat->nbp_th_start;
+       if (now - start > th_period &&
+           cmpxchg(&pgdat->nbp_th_start, start, now) == start) {
+               /* Candidates expected in one period at the rate limit. */
+               ref_cand = rate_limit *
+                       sysctl_numa_balancing_scan_period_max / MSEC_PER_SEC;
+               nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+               /* Candidates actually seen since the last adjustment. */
+               diff_cand = nr_cand - pgdat->nbp_th_nr_cand;
+               unit_th = ref_th * 2 / NUMA_MIGRATION_ADJUST_STEPS;
+               th = pgdat->nbp_threshold ? : ref_th;
+               /*
+                * NOTE(review): th and unit_th are unsigned, so
+                * "th - unit_th" wraps if th < unit_th.  That looks
+                * possible only if ref_th grows between calls while a
+                * smaller threshold is still stored - confirm.
+                */
+               if (diff_cand > ref_cand * 11 / 10)
+                       th = max(th - unit_th, unit_th);
+               else if (diff_cand < ref_cand * 9 / 10)
+                       th = min(th + unit_th, ref_th * 2);
+               pgdat->nbp_th_nr_cand = nr_cand;
+               pgdat->nbp_threshold = th;
+       }
+}
+
  bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
                                 int src_nid, int dst_cpu)
  {
@@ -1439,9 +1560,44 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
         int dst_nid = cpu_to_node(dst_cpu);
         int last_cpupid, this_cpupid;
  
+       /*
+        * The pages in slow memory node should be migrated according
+        * to hot/cold instead of private/shared.
+        */
+       if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
+           !node_is_toptier(src_nid)) {
+               struct pglist_data *pgdat;
+               unsigned long rate_limit;
+               unsigned int latency, th, def_th;
+
+               pgdat = NODE_DATA(dst_nid);
+               if (pgdat_free_space_enough(pgdat)) {
+                       /* workload changed, reset hot threshold */
+                       pgdat->nbp_threshold = 0;
+                       return true;
+               }
+
+               def_th = sysctl_numa_balancing_hot_threshold;
+               rate_limit = sysctl_numa_balancing_promote_rate_limit << \
+                       (20 - PAGE_SHIFT);
+               numa_promotion_adjust_threshold(pgdat, rate_limit, def_th);
+
+               th = pgdat->nbp_threshold ? : def_th;
+               latency = numa_hint_fault_latency(page);
+               if (latency >= th)
+                       return false;
+
+               return !numa_promotion_rate_limit(pgdat, rate_limit,
+                                                 thp_nr_pages(page));
+       }
+
         this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
         last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
  
+       if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+           !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid))
+               return false;
+
         /*
          * Allow first faults or private faults to migrate immediately early in
          * the lifetime of a task. The magic number 4 is based on waiting for
@@ -2681,6 +2837,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
         if (!p->mm)
                 return;
  
+       /*
+        * NUMA faults statistics are unnecessary for the slow memory
+        * node for memory tiering mode.
+        */
+       if (!node_is_toptier(mem_node) &&
+           (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ||
+            !cpupid_valid(last_cpupid)))
+               return;
+
         /* Allocate buffer to track faults on a per-node basis */
         if (unlikely(!p->numa_faults)) {
                 int size = sizeof(*p->numa_faults) *
@@ -2761,6 +2926,7 @@ static void task_numa_work(struct callback_head *work)
         struct task_struct *p = current;
         struct mm_struct *mm = p->mm;
         u64 runtime = p->se.sum_exec_runtime;
+       MA_STATE(mas, &mm->mm_mt, 0, 0);
         struct vm_area_struct *vma;
         unsigned long start, end;
         unsigned long nr_pte_updates = 0;
@@ -2817,13 +2983,16 @@ static void task_numa_work(struct callback_head *work)
  
         if (!mmap_read_trylock(mm))
                 return;
-       vma = find_vma(mm, start);
+       mas_set(&mas, start);
+       vma = mas_find(&mas, ULONG_MAX);
         if (!vma) {
                 reset_ptenuma_scan(p);
                 start = 0;
-               vma = mm->mmap;
+               mas_set(&mas, start);
+               vma = mas_find(&mas, ULONG_MAX);
         }
-       for (; vma; vma = vma->vm_next) {
+
+       for (; vma; vma = mas_find(&mas, ULONG_MAX)) {
                 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
                         is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
                         continue;