mm: numa: Limit NUMA scanning to migrate-on-fault VMAs

[platform/adaptation/renesas_rcar/renesas_kernel.git] / kernel / sched / fair.c
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 0bb3e0a..d98175d 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -877,38 +877,149 @@ static unsigned int task_scan_max(struct task_struct *p)
         return max(smin, smax);
  }
  
+/*
+ * Once a preferred node is selected the scheduler balancer will prefer moving
+ * a task to that node for sysctl_numa_balancing_settle_count number of PTE
+ * scans. This will give the process the chance to accumulate more faults on
+ * the preferred node but still allow the scheduler to move the task again if
+ * the node's CPUs are overloaded.
+ */
+unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
+
+/*
+ * Map (@nid, @priv) to a slot in the per-task fault arrays. Every node owns
+ * two adjacent slots: 2 * nid for priv == 0 (non-private accesses) and
+ * 2 * nid + 1 for priv == 1 (private accesses).
+ */
+static inline int task_faults_idx(int nid, int priv)
+{
+       return 2 * nid + priv;
+}
+
+/*
+ * Total faults recorded for @p on node @nid: the sum of the priv == 0 and
+ * priv == 1 counters in p->numa_faults. Returns 0 when @p has no
+ * numa_faults array.
+ */
+static inline unsigned long task_faults(struct task_struct *p, int nid)
+{
+       if (!p->numa_faults)
+               return 0;
+
+       return p->numa_faults[task_faults_idx(nid, 0)] +
+               p->numa_faults[task_faults_idx(nid, 1)];
+}
+
+static unsigned long weighted_cpuload(const int cpu);
+
+
+/*
+ * Return the CPU on node @nid with the lowest weighted_cpuload(). On equal
+ * load the CPU visited first wins. If the node's cpumask yields no CPU,
+ * @this_cpu is returned unchanged.
+ *
+ * @this_cpu must not already be on @nid; that case triggers BUG_ON().
+ */
+static int
+find_idlest_cpu_node(int this_cpu, int nid)
+{
+       unsigned long load, min_load = ULONG_MAX;
+       int i, idlest_cpu = this_cpu;
+
+       BUG_ON(cpu_to_node(this_cpu) == nid);
+
+       rcu_read_lock();
+       for_each_cpu(i, cpumask_of_node(nid)) {
+               load = weighted_cpuload(i);
+
+               if (load < min_load) {
+                       min_load = load;
+                       idlest_cpu = i;
+               }
+       }
+       rcu_read_unlock();
+
+       return idlest_cpu;
+}
+
  static void task_numa_placement(struct task_struct *p)
  {
-       int seq;
+       int seq, nid, max_nid = -1;
+       unsigned long max_faults = 0;
  
-       if (!p->mm)     /* for example, ksmd faulting in a user's mm */
-               return;
+       /*
+        * Once per change of mm->numa_scan_seq: fold the buffered faults
+        * into the decayed per-node counters and re-evaluate the preferred
+        * node. p->mm is dereferenced unconditionally; task_numa_fault()
+        * returns early for a NULL mm before calling this function.
+        */
         seq = ACCESS_ONCE(p->mm->numa_scan_seq);
         if (p->numa_scan_seq == seq)
                 return;
         p->numa_scan_seq = seq;
+       p->numa_migrate_seq++;
         p->numa_scan_period_max = task_scan_max(p);
  
-       /* FIXME: Scheduling placement policy hints go here */
+       /* Find the node with the highest number of private faults */
+       for_each_online_node(nid) {
+               unsigned long faults;
+               int priv, i;
+
+               for (priv = 0; priv < 2; priv++) {
+                       i = task_faults_idx(nid, priv);
+
+                       /* Decay existing window, copy faults since last scan */
+                       p->numa_faults[i] >>= 1;
+                       p->numa_faults[i] += p->numa_faults_buffer[i];
+                       p->numa_faults_buffer[i] = 0;
+               }
+
+               /* Find maximum private faults */
+               faults = p->numa_faults[task_faults_idx(nid, 1)];
+               if (faults > max_faults) {
+                       max_faults = faults;
+                       max_nid = nid;
+               }
+       }
+
+       /*
+        * Record the preferred node as the node with the most private
+        * faults and requeue the task to run on the idlest CPU of that
+        * node. numa_migrate_seq restarts at 1; it is the counter the
+        * migrate_*_locality() checks compare against
+        * sysctl_numa_balancing_settle_count.
+        */
+       if (max_faults && max_nid != p->numa_preferred_nid) {
+               int preferred_cpu;
+
+               /*
+                * If the task is not on the preferred node then find the most
+                * idle CPU to migrate to.
+                */
+               preferred_cpu = task_cpu(p);
+               if (cpu_to_node(preferred_cpu) != max_nid) {
+                       preferred_cpu = find_idlest_cpu_node(preferred_cpu,
+                                                            max_nid);
+               }
+
+               /* Update the preferred nid and migrate task if possible */
+               p->numa_preferred_nid = max_nid;
+               p->numa_migrate_seq = 1;
+               migrate_task_to(p, preferred_cpu);
+       }
  }
  
  /*
   * Got a PROT_NONE fault for a page on @node.
   */
-void task_numa_fault(int node, int pages, bool migrated)
+void task_numa_fault(int last_nidpid, int node, int pages, bool migrated)
  {
         struct task_struct *p = current;
+       int priv;
  
         if (!numabalancing_enabled)
                 return;
  
+       /* for example, ksmd faulting in a user's mm */
+       if (!p->mm)
+               return;
+
+       /*
+        * First accesses are treated as private, otherwise consider accesses
+        * to be private if the accessing pid has not changed
+        */
+       if (!nidpid_pid_unset(last_nidpid))
+               priv = ((p->pid & LAST__PID_MASK) == nidpid_to_pid(last_nidpid));
+       else
+               priv = 1;
+
         /* Allocate buffer to track faults on a per-node basis */
         if (unlikely(!p->numa_faults)) {
-               int size = sizeof(*p->numa_faults) * nr_node_ids;
+               int size = sizeof(*p->numa_faults) * 2 * nr_node_ids;
  
-               p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
+               /* numa_faults and numa_faults_buffer share the allocation */
+               p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN);
                 if (!p->numa_faults)
                         return;
+
+               BUG_ON(p->numa_faults_buffer);
+               p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
         }
  
         /*
@@ -926,7 +1037,7 @@ void task_numa_fault(int node, int pages, bool migrated)
  
         task_numa_placement(p);
  
-       p->numa_faults[node] += pages;
+       p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
  }
  
  static void reset_ptenuma_scan(struct task_struct *p)
@@ -1019,11 +1130,7 @@ void task_numa_work(struct callback_head *work)
                 vma = mm->mmap;
         }
         for (; vma; vma = vma->vm_next) {
-               if (!vma_migratable(vma))
-                       continue;
-
-               /* Skip small VMAs. They are not likely to be of relevance */
-               if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
+               if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
                         continue;
  
                 do {
@@ -4014,6 +4121,20 @@ static void move_task(struct task_struct *p, struct lb_env *env)
         set_task_cpu(p, env->dst_cpu);
         activate_task(env->dst_rq, p, 0);
         check_preempt_curr(env->dst_rq, p, 0);
+#ifdef CONFIG_NUMA_BALANCING
+       if (p->numa_preferred_nid != -1) {
+               int src_nid = cpu_to_node(env->src_cpu);
+               int dst_nid = cpu_to_node(env->dst_cpu);
+
+               /*
+                * If the load balancer has moved the task then limit
+                * migrations from taking place in the short term in
+                * case this is a short-lived migration.
+                */
+               if (src_nid != dst_nid && dst_nid != p->numa_preferred_nid)
+                       p->numa_migrate_seq = 0;
+       }
+#endif
  }
  
  /*
@@ -4048,6 +4169,69 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
         return delta < (s64)sysctl_sched_migration_cost;
  }
  
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * Returns true if moving @p from env->src_cpu to env->dst_cpu crosses nodes
+ * and the destination is either @p's preferred node or a node on which @p
+ * has incurred more faults than on the source node.
+ *
+ * Always false when the NUMA_FAVOUR_HIGHER feature is off, @p has no fault
+ * statistics, the domain lacks SD_NUMA, or p->numa_migrate_seq has reached
+ * sysctl_numa_balancing_settle_count.
+ */
+static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
+{
+       int src_nid, dst_nid;
+
+       if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
+           !(env->sd->flags & SD_NUMA)) {
+               return false;
+       }
+
+       src_nid = cpu_to_node(env->src_cpu);
+       dst_nid = cpu_to_node(env->dst_cpu);
+
+       if (src_nid == dst_nid ||
+           p->numa_migrate_seq >= sysctl_numa_balancing_settle_count)
+               return false;
+
+       if (dst_nid == p->numa_preferred_nid ||
+           task_faults(p, dst_nid) > task_faults(p, src_nid))
+               return true;
+
+       return false;
+}
+
+
+/*
+ * Returns true if moving @p from env->src_cpu to env->dst_cpu crosses nodes
+ * and @p has incurred fewer faults on the destination node than on the
+ * source node.
+ *
+ * Always false when the NUMA or NUMA_RESIST_LOWER feature is off, @p has no
+ * fault statistics, the domain lacks SD_NUMA, or p->numa_migrate_seq has
+ * reached sysctl_numa_balancing_settle_count.
+ */
+static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
+{
+       int src_nid, dst_nid;
+
+       if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
+               return false;
+
+       if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
+               return false;
+
+       src_nid = cpu_to_node(env->src_cpu);
+       dst_nid = cpu_to_node(env->dst_cpu);
+
+       if (src_nid == dst_nid ||
+           p->numa_migrate_seq >= sysctl_numa_balancing_settle_count)
+               return false;
+
+       if (task_faults(p, dst_nid) < task_faults(p, src_nid))
+               return true;
+
+       return false;
+}
+
+#else
+/* Stub for builds without CONFIG_NUMA_BALANCING: never reports a gain. */
+static inline bool migrate_improves_locality(struct task_struct *p,
+                                            struct lb_env *env)
+{
+       return false;
+}
+
+/* Stub for builds without CONFIG_NUMA_BALANCING: never reports a loss. */
+static inline bool migrate_degrades_locality(struct task_struct *p,
+                                            struct lb_env *env)
+{
+       return false;
+}
+#endif
+
  /*
   * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
   */
@@ -4105,11 +4289,24 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
  
         /*
          * Aggressive migration if:
-        * 1) task is cache cold, or
-        * 2) too many balance attempts have failed.
+        * 1) the destination NUMA node is preferred, or
+        * 2) task is cache cold, or
+        * 3) too many balance attempts have failed.
          */
-
         tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
+       if (!tsk_cache_hot)
+               tsk_cache_hot = migrate_degrades_locality(p, env);
+
+       if (migrate_improves_locality(p, env)) {
+#ifdef CONFIG_SCHEDSTATS
+               if (tsk_cache_hot) {
+                       schedstat_inc(env->sd, lb_hot_gained[env->idle]);
+                       schedstat_inc(p, se.statistics.nr_forced_migrations);
+               }
+#endif
+               return 1;
+       }
+
         if (!tsk_cache_hot ||
                 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {