sched/numa: Take false sharing into account when adapting scan rate
[platform/adaptation/renesas_rcar/renesas_kernel.git] / kernel / sched / fair.c
index b1de7c5..d26a16e 100644
@@ -681,6 +681,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 #ifdef CONFIG_SMP
+static unsigned long task_h_load(struct task_struct *p);
+
 static inline void __update_task_entity_contrib(struct sched_entity *se);
 
 /* Give a new task initial runnable load values so its load looks heavy early on */
@@ -884,7 +886,37 @@ static unsigned int task_scan_max(struct task_struct *p)
  * the preferred node but still allow the scheduler to move the task again if
 * the node's CPUs are overloaded.
  */
-unsigned int sysctl_numa_balancing_settle_count __read_mostly = 3;
+unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
+
+static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{
+       rq->nr_numa_running += (p->numa_preferred_nid != -1);
+       rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
+}
+
+static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{
+       rq->nr_numa_running -= (p->numa_preferred_nid != -1);
+       rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
+}
+
+struct numa_group {
+       atomic_t refcount;
+
+       spinlock_t lock; /* protects nr_tasks, task_list */
+       int nr_tasks;
+       pid_t gid;
+       struct list_head task_list;
+
+       struct rcu_head rcu;
+       atomic_long_t total_faults;
+       atomic_long_t faults[0];
+};
+
+pid_t task_numa_group_id(struct task_struct *p)
+{
+       return p->numa_group ? p->numa_group->gid : 0;
+}
 
 static inline int task_faults_idx(int nid, int priv)
 {
@@ -900,35 +932,355 @@ static inline unsigned long task_faults(struct task_struct *p, int nid)
                p->numa_faults[task_faults_idx(nid, 1)];
 }
 
+static inline unsigned long group_faults(struct task_struct *p, int nid)
+{
+       if (!p->numa_group)
+               return 0;
+
+       return atomic_long_read(&p->numa_group->faults[2*nid]) +
+              atomic_long_read(&p->numa_group->faults[2*nid+1]);
+}
+
+/*
+ * These return the fraction of accesses done by a particular task, or
+ * task group, on a particular numa node.  The group weight is given a
+ * larger multiplier, in order to group tasks together that are almost
+ * evenly spread out between numa nodes.
+ */
+static inline unsigned long task_weight(struct task_struct *p, int nid)
+{
+       unsigned long total_faults;
+
+       if (!p->numa_faults)
+               return 0;
+
+       total_faults = p->total_numa_faults;
+
+       if (!total_faults)
+               return 0;
+
+       return 1000 * task_faults(p, nid) / total_faults;
+}
+
+static inline unsigned long group_weight(struct task_struct *p, int nid)
+{
+       unsigned long total_faults;
+
+       if (!p->numa_group)
+               return 0;
+
+       total_faults = atomic_long_read(&p->numa_group->total_faults);
+
+       if (!total_faults)
+               return 0;
+
+       return 1000 * group_faults(p, nid) / total_faults;
+}
+
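For intuition, the two helpers above just express per-node fault counts as per-mille fractions of the task's (or group's) total faults. A minimal standalone sketch of that arithmetic, with invented fault counts (illustrative only, not part of the patch):

    /* Sketch of the per-mille weighting used by task_weight()/group_weight(). */
    #include <stdio.h>

    static unsigned long weight(unsigned long node_faults, unsigned long total_faults)
    {
        if (!total_faults)
            return 0;
        return 1000 * node_faults / total_faults;
    }

    int main(void)
    {
        /* A task with 300 of its 1000 faults on a node scores 300 per mille... */
        printf("task weight:  %lu\n", weight(300, 1000));
        /* ...while its group with 4500 of 6000 faults there scores 750. */
        printf("group weight: %lu\n", weight(4500, 6000));
        return 0;
    }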
 static unsigned long weighted_cpuload(const int cpu);
+static unsigned long source_load(int cpu, int type);
+static unsigned long target_load(int cpu, int type);
+static unsigned long power_of(int cpu);
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
+
+/* Cached statistics for all CPUs within a node */
+struct numa_stats {
+       unsigned long nr_running;
+       unsigned long load;
 
+       /* Total compute capacity of CPUs on a node */
+       unsigned long power;
 
-static int
-find_idlest_cpu_node(int this_cpu, int nid)
+       /* Approximate capacity in terms of runnable tasks on a node */
+       unsigned long capacity;
+       int has_capacity;
+};
+
+/*
+ * XXX borrowed from update_sg_lb_stats
+ */
+static void update_numa_stats(struct numa_stats *ns, int nid)
 {
-       unsigned long load, min_load = ULONG_MAX;
-       int i, idlest_cpu = this_cpu;
+       int cpu;
+
+       memset(ns, 0, sizeof(*ns));
+       for_each_cpu(cpu, cpumask_of_node(nid)) {
+               struct rq *rq = cpu_rq(cpu);
+
+               ns->nr_running += rq->nr_running;
+               ns->load += weighted_cpuload(cpu);
+               ns->power += power_of(cpu);
+       }
+
+       ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
+       ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
+       ns->has_capacity = (ns->nr_running < ns->capacity);
+}
+
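Roughly, update_numa_stats() normalises the node's raw load by its compute power and derives a task capacity from that same power. A standalone sketch with assumed numbers, taking SCHED_POWER_SCALE as 1024 (an assumption for illustration, not part of the patch):

    /* Sketch of the update_numa_stats() arithmetic; figures are invented. */
    #include <stdio.h>

    #define SCHED_POWER_SCALE 1024UL
    #define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))

    int main(void)
    {
        /* Pretend the node has 4 full-power CPUs, 5 runnable tasks, raw load 5120. */
        unsigned long power = 4 * SCHED_POWER_SCALE;
        unsigned long nr_running = 5;
        unsigned long load = 5120;

        unsigned long scaled_load = load * SCHED_POWER_SCALE / power;         /* 1280 */
        unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); /* 4 tasks */
        int has_capacity = nr_running < capacity;                             /* 0: node is full */

        printf("load=%lu capacity=%lu has_capacity=%d\n",
               scaled_load, capacity, has_capacity);
        return 0;
    }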
+struct task_numa_env {
+       struct task_struct *p;
 
-       BUG_ON(cpu_to_node(this_cpu) == nid);
+       int src_cpu, src_nid;
+       int dst_cpu, dst_nid;
+
+       struct numa_stats src_stats, dst_stats;
+
+       int imbalance_pct, idx;
+
+       struct task_struct *best_task;
+       long best_imp;
+       int best_cpu;
+};
+
+static void task_numa_assign(struct task_numa_env *env,
+                            struct task_struct *p, long imp)
+{
+       if (env->best_task)
+               put_task_struct(env->best_task);
+       if (p)
+               get_task_struct(p);
+
+       env->best_task = p;
+       env->best_imp = imp;
+       env->best_cpu = env->dst_cpu;
+}
+
+/*
+ * This checks if the overall compute and NUMA accesses of the system would
+ * be improved if the source task were migrated to the target dst_cpu, taking
+ * into account that it might be best to exchange the source task with the
+ * task currently running on the dst_cpu.
+ */
+static void task_numa_compare(struct task_numa_env *env,
+                             long taskimp, long groupimp)
+{
+       struct rq *src_rq = cpu_rq(env->src_cpu);
+       struct rq *dst_rq = cpu_rq(env->dst_cpu);
+       struct task_struct *cur;
+       long dst_load, src_load;
+       long load;
+       long imp = (groupimp > 0) ? groupimp : taskimp;
 
        rcu_read_lock();
-       for_each_cpu(i, cpumask_of_node(nid)) {
-               load = weighted_cpuload(i);
+       cur = ACCESS_ONCE(dst_rq->curr);
+       if (cur->pid == 0) /* idle */
+               cur = NULL;
 
-               if (load < min_load) {
-                       min_load = load;
-                       idlest_cpu = i;
+       /*
+        * "imp" is the fault differential for the source task between the
+        * source and destination node. Calculate the total differential for
+        * the source task and potential destination task. The more negative
+        * the value is, the more remote accesses would be expected to be
+        * incurred if the tasks were swapped.
+        */
+       if (cur) {
+               /* Skip this swap candidate if it cannot move to the source cpu */
+               if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
+                       goto unlock;
+
+               /*
+                * If dst and source tasks are in the same NUMA group, or not
+                * in any group, then look only at task weights.
+                */
+               if (cur->numa_group == env->p->numa_group) {
+                       imp = taskimp + task_weight(cur, env->src_nid) -
+                             task_weight(cur, env->dst_nid);
+                       /*
+                        * Add some hysteresis to prevent swapping the
+                        * tasks within a group over tiny differences.
+                        */
+                       if (cur->numa_group)
+                               imp -= imp/16;
+               } else {
+                       /*
+                        * Compare the group weights. If a task is all by
+                        * itself (not part of a group), use the task weight
+                        * instead.
+                        */
+                       if (env->p->numa_group)
+                               imp = groupimp;
+                       else
+                               imp = taskimp;
+
+                       if (cur->numa_group)
+                               imp += group_weight(cur, env->src_nid) -
+                                      group_weight(cur, env->dst_nid);
+                       else
+                               imp += task_weight(cur, env->src_nid) -
+                                      task_weight(cur, env->dst_nid);
                }
        }
+
+       if (imp < env->best_imp)
+               goto unlock;
+
+       if (!cur) {
+               /* Is there capacity at our destination? */
+               if (env->src_stats.has_capacity &&
+                   !env->dst_stats.has_capacity)
+                       goto unlock;
+
+               goto balance;
+       }
+
+       /* Balance doesn't matter much if we're running a task per cpu */
+       if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
+               goto assign;
+
+       /*
+        * In the overloaded case, try and keep the load balanced.
+        */
+balance:
+       dst_load = env->dst_stats.load;
+       src_load = env->src_stats.load;
+
+       /* XXX missing power terms */
+       load = task_h_load(env->p);
+       dst_load += load;
+       src_load -= load;
+
+       if (cur) {
+               load = task_h_load(cur);
+               dst_load -= load;
+               src_load += load;
+       }
+
+       /* make src_load the smaller */
+       if (dst_load < src_load)
+               swap(dst_load, src_load);
+
+       if (src_load * env->imbalance_pct < dst_load * 100)
+               goto unlock;
+
+assign:
+       task_numa_assign(env, cur, imp);
+unlock:
        rcu_read_unlock();
+}
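To make the scoring concrete for the same-group case above: if the source task would gain 200 weight points by moving to dst_nid and the candidate task would gain 100 by moving the other way, the combined improvement is 300, then damped by 1/16 as hysteresis. A toy sketch with those invented numbers (not kernel code):

    /* Sketch of the same-group swap scoring in task_numa_compare(); numbers invented. */
    #include <stdio.h>

    int main(void)
    {
        long taskimp = 200;    /* p's weight gain from src_nid to dst_nid */
        long cur_src = 400;    /* cur's weight on src_nid (where it would move to) */
        long cur_dst = 300;    /* cur's weight on dst_nid (where it runs now) */

        long imp = taskimp + cur_src - cur_dst;  /* 300: both tasks gain from the swap */
        imp -= imp / 16;                         /* hysteresis against tiny wins: 282 */

        printf("swap improvement: %ld\n", imp);  /* compared against env->best_imp */
        return 0;
    }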
 
-       return idlest_cpu;
+static void task_numa_find_cpu(struct task_numa_env *env,
+                               long taskimp, long groupimp)
+{
+       int cpu;
+
+       for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
+               /* Skip this CPU if the source task cannot migrate */
+               if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
+                       continue;
+
+               env->dst_cpu = cpu;
+               task_numa_compare(env, taskimp, groupimp);
+       }
+}
+
+static int task_numa_migrate(struct task_struct *p)
+{
+       struct task_numa_env env = {
+               .p = p,
+
+               .src_cpu = task_cpu(p),
+               .src_nid = task_node(p),
+
+               .imbalance_pct = 112,
+
+               .best_task = NULL,
+               .best_imp = 0,
+               .best_cpu = -1
+       };
+       struct sched_domain *sd;
+       unsigned long taskweight, groupweight;
+       int nid, ret;
+       long taskimp, groupimp;
+
+       /*
+        * Pick the lowest SD_NUMA domain, as that would have the smallest
+        * imbalance and would be the first to start moving tasks about.
+        *
+        * And we want to avoid any unnecessary movement of tasks, as that
+        * would create random task movement -- counter to the NUMA conditions
+        * we're trying to satisfy here.
+        */
+       rcu_read_lock();
+       sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
+       env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
+       rcu_read_unlock();
+
+       taskweight = task_weight(p, env.src_nid);
+       groupweight = group_weight(p, env.src_nid);
+       update_numa_stats(&env.src_stats, env.src_nid);
+       env.dst_nid = p->numa_preferred_nid;
+       taskimp = task_weight(p, env.dst_nid) - taskweight;
+       groupimp = group_weight(p, env.dst_nid) - groupweight;
+       update_numa_stats(&env.dst_stats, env.dst_nid);
+
+       /* If the preferred nid has capacity, try to use it. */
+       if (env.dst_stats.has_capacity)
+               task_numa_find_cpu(&env, taskimp, groupimp);
+
+       /* No space available on the preferred nid. Look elsewhere. */
+       if (env.best_cpu == -1) {
+               for_each_online_node(nid) {
+                       if (nid == env.src_nid || nid == p->numa_preferred_nid)
+                               continue;
+
+                       /* Only consider nodes where both the task and its group benefit */
+                       taskimp = task_weight(p, nid) - taskweight;
+                       groupimp = group_weight(p, nid) - groupweight;
+                       if (taskimp < 0 && groupimp < 0)
+                               continue;
+
+                       env.dst_nid = nid;
+                       update_numa_stats(&env.dst_stats, env.dst_nid);
+                       task_numa_find_cpu(&env, taskimp, groupimp);
+               }
+       }
+
+       /* No better CPU than the current one was found. */
+       if (env.best_cpu == -1)
+               return -EAGAIN;
+
+       sched_setnuma(p, env.dst_nid);
+
+       if (env.best_task == NULL) {
+               int ret = migrate_task_to(p, env.best_cpu);
+               return ret;
+       }
+
+       ret = migrate_swap(p, env.best_task);
+       put_task_struct(env.best_task);
+       return ret;
+}
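For the imbalance handling in this function: with a typical sd->imbalance_pct of 125 (an assumed default), the halved value is 112, and task_numa_compare() rejects a swap that would leave the busier side more than about 12% above the quieter one. A small sketch with invented loads:

    /* Sketch of the imbalance test shared by task_numa_migrate()/task_numa_compare(). */
    #include <stdio.h>

    int main(void)
    {
        int sd_imbalance_pct = 125;                               /* assumed domain default */
        int imbalance_pct = 100 + (sd_imbalance_pct - 100) / 2;   /* 112 */

        /* Hypothetical loads after the proposed move, with src_load made the smaller. */
        long src_load = 1000, dst_load = 1150;

        int too_imbalanced = src_load * imbalance_pct < dst_load * 100;
        printf("imbalance_pct=%d reject=%d\n", imbalance_pct, too_imbalanced);  /* 112 1 */
        return 0;
    }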
+
+/* Attempt to migrate a task to a CPU on the preferred node. */
+static void numa_migrate_preferred(struct task_struct *p)
+{
+       /* Success if the task is already running on the preferred node */
+       p->numa_migrate_retry = 0;
+       if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) {
+               /*
+                * If migration is temporarily disabled due to a task migration
+                * then re-enable it now as the task is running on its
+                * preferred node and memory should migrate locally
+                */
+               if (!p->numa_migrate_seq)
+                       p->numa_migrate_seq++;
+               return;
+       }
+
+       /* This task has no NUMA fault statistics yet */
+       if (unlikely(p->numa_preferred_nid == -1))
+               return;
+
+       /* Otherwise, try migrate to a CPU on the preferred node */
+       if (task_numa_migrate(p) != 0)
+               p->numa_migrate_retry = jiffies + HZ*5;
 }
 
 static void task_numa_placement(struct task_struct *p)
 {
-       int seq, nid, max_nid = -1;
-       unsigned long max_faults = 0;
+       int seq, nid, max_nid = -1, max_group_nid = -1;
+       unsigned long max_faults = 0, max_group_faults = 0;
+       spinlock_t *group_lock = NULL;
 
        seq = ACCESS_ONCE(p->mm->numa_scan_seq);
        if (p->numa_scan_seq == seq)
@@ -937,60 +1289,230 @@ static void task_numa_placement(struct task_struct *p)
        p->numa_migrate_seq++;
        p->numa_scan_period_max = task_scan_max(p);
 
+       /* If the task is part of a group, prevent parallel updates to group stats */
+       if (p->numa_group) {
+               group_lock = &p->numa_group->lock;
+               spin_lock(group_lock);
+       }
+
        /* Find the node with the highest number of faults */
        for_each_online_node(nid) {
-               unsigned long faults;
+               unsigned long faults = 0, group_faults = 0;
                int priv, i;
 
                for (priv = 0; priv < 2; priv++) {
+                       long diff;
+
                        i = task_faults_idx(nid, priv);
+                       diff = -p->numa_faults[i];
 
                        /* Decay existing window, copy faults since last scan */
                        p->numa_faults[i] >>= 1;
                        p->numa_faults[i] += p->numa_faults_buffer[i];
                        p->numa_faults_buffer[i] = 0;
+
+                       faults += p->numa_faults[i];
+                       diff += p->numa_faults[i];
+                       p->total_numa_faults += diff;
+                       if (p->numa_group) {
+                               /* safe because we can only change our own group */
+                               atomic_long_add(diff, &p->numa_group->faults[i]);
+                               atomic_long_add(diff, &p->numa_group->total_faults);
+                               group_faults += atomic_long_read(&p->numa_group->faults[i]);
+                       }
                }
 
-               /* Find maximum private faults */
-               faults = p->numa_faults[task_faults_idx(nid, 1)];
                if (faults > max_faults) {
                        max_faults = faults;
                        max_nid = nid;
                }
-       }
 
-       /*
-        * Record the preferred node as the node with the most faults,
-        * requeue the task to be running on the idlest CPU on the
-        * preferred node and reset the scanning rate to recheck
-        * the working set placement.
-        */
-       if (max_faults && max_nid != p->numa_preferred_nid) {
-               int preferred_cpu;
+               if (group_faults > max_group_faults) {
+                       max_group_faults = group_faults;
+                       max_group_nid = nid;
+               }
+       }
 
+       if (p->numa_group) {
                /*
-                * If the task is not on the preferred node then find the most
-                * idle CPU to migrate to.
+                * If the preferred task and group nids are different,
+                * iterate over the nodes again to find the best place.
                 */
-               preferred_cpu = task_cpu(p);
-               if (cpu_to_node(preferred_cpu) != max_nid) {
-                       preferred_cpu = find_idlest_cpu_node(preferred_cpu,
-                                                            max_nid);
+               if (max_nid != max_group_nid) {
+                       unsigned long weight, max_weight = 0;
+
+                       for_each_online_node(nid) {
+                               weight = task_weight(p, nid) + group_weight(p, nid);
+                               if (weight > max_weight) {
+                                       max_weight = weight;
+                                       max_nid = nid;
+                               }
+                       }
                }
 
+               spin_unlock(group_lock);
+       }
+
+       /* Set the preferred node to the node with the most faults */
+       if (max_faults && max_nid != p->numa_preferred_nid) {
                /* Update the preferred nid and migrate task if possible */
-               p->numa_preferred_nid = max_nid;
-               p->numa_migrate_seq = 0;
-               migrate_task_to(p, preferred_cpu);
+               sched_setnuma(p, max_nid);
+               numa_migrate_preferred(p);
+       }
+}
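The per-node fault counters updated above behave as a decaying window: each placement pass halves the old count and folds in the faults recorded since the last scan, so nodes that stop being touched fade out. A standalone sketch of that decay with invented per-scan counts (illustrative only):

    /* Sketch of the fault-window decay performed in task_numa_placement(). */
    #include <stdio.h>

    int main(void)
    {
        unsigned long faults = 0;
        unsigned long buffer[] = { 64, 64, 0, 0 };  /* faults seen between successive scans */

        for (int scan = 0; scan < 4; scan++) {
            faults >>= 1;            /* decay the existing window */
            faults += buffer[scan];  /* add faults accumulated since the last scan */
            printf("after scan %d: %lu\n", scan, faults);
        }
        /* Prints 64, 96, 48, 24: old accesses fade once a node stops being used. */
        return 0;
    }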
+
+static inline int get_numa_group(struct numa_group *grp)
+{
+       return atomic_inc_not_zero(&grp->refcount);
+}
+
+static inline void put_numa_group(struct numa_group *grp)
+{
+       if (atomic_dec_and_test(&grp->refcount))
+               kfree_rcu(grp, rcu);
+}
+
+static void double_lock(spinlock_t *l1, spinlock_t *l2)
+{
+       if (l1 > l2)
+               swap(l1, l2);
+
+       spin_lock(l1);
+       spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
+}
+
+static void task_numa_group(struct task_struct *p, int cpupid, int flags,
+                       int *priv)
+{
+       struct numa_group *grp, *my_grp;
+       struct task_struct *tsk;
+       bool join = false;
+       int cpu = cpupid_to_cpu(cpupid);
+       int i;
+
+       if (unlikely(!p->numa_group)) {
+               unsigned int size = sizeof(struct numa_group) +
+                                   2*nr_node_ids*sizeof(atomic_long_t);
+
+               grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
+               if (!grp)
+                       return;
+
+               atomic_set(&grp->refcount, 1);
+               spin_lock_init(&grp->lock);
+               INIT_LIST_HEAD(&grp->task_list);
+               grp->gid = p->pid;
+
+               for (i = 0; i < 2*nr_node_ids; i++)
+                       atomic_long_set(&grp->faults[i], p->numa_faults[i]);
+
+               atomic_long_set(&grp->total_faults, p->total_numa_faults);
+
+               list_add(&p->numa_entry, &grp->task_list);
+               grp->nr_tasks++;
+               rcu_assign_pointer(p->numa_group, grp);
+       }
+
+       rcu_read_lock();
+       tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
+
+       if (!cpupid_match_pid(tsk, cpupid))
+               goto unlock;
+
+       grp = rcu_dereference(tsk->numa_group);
+       if (!grp)
+               goto unlock;
+
+       my_grp = p->numa_group;
+       if (grp == my_grp)
+               goto unlock;
+
+       /*
+        * Only join the other group if it is bigger; if we're the bigger group,
+        * the other task will join us.
+        */
+       if (my_grp->nr_tasks > grp->nr_tasks)
+               goto unlock;
+
+       /*
+        * Tie-break on the grp address.
+        */
+       if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
+               goto unlock;
+
+       /* Always join threads in the same process. */
+       if (tsk->mm == current->mm)
+               join = true;
+
+       /* Simple filter to avoid false positives due to PID collisions */
+       if (flags & TNF_SHARED)
+               join = true;
+
+       /* Update priv based on whether false sharing was detected */
+       *priv = !join;
+
+       if (join && !get_numa_group(grp))
+               join = false;
+
+unlock:
+       rcu_read_unlock();
+
+       if (!join)
+               return;
+
+       for (i = 0; i < 2*nr_node_ids; i++) {
+               atomic_long_sub(p->numa_faults[i], &my_grp->faults[i]);
+               atomic_long_add(p->numa_faults[i], &grp->faults[i]);
        }
+       atomic_long_sub(p->total_numa_faults, &my_grp->total_faults);
+       atomic_long_add(p->total_numa_faults, &grp->total_faults);
+
+       double_lock(&my_grp->lock, &grp->lock);
+
+       list_move(&p->numa_entry, &grp->task_list);
+       my_grp->nr_tasks--;
+       grp->nr_tasks++;
+
+       spin_unlock(&my_grp->lock);
+       spin_unlock(&grp->lock);
+
+       rcu_assign_pointer(p->numa_group, grp);
+
+       put_numa_group(my_grp);
+}
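Condensed, the join policy implemented above is: yield to the larger group (tie-breaking on the group address so exactly one side moves), and only merge at all when the two tasks share an mm or the fault was flagged TNF_SHARED. A simplified predicate with the kernel structures stubbed out (stand-in types, not the real implementation):

    /* Simplified sketch of the join decision in task_numa_group(). */
    #include <stdbool.h>
    #include <stdio.h>

    struct fake_group { int nr_tasks; };

    static bool should_join(const struct fake_group *mine, const struct fake_group *theirs,
                            bool same_mm, bool shared_fault)
    {
        /* Only join the other group if it is bigger; otherwise they join us. */
        if (mine->nr_tasks > theirs->nr_tasks)
            return false;
        /* Tie-break on the group address so exactly one side yields. */
        if (mine->nr_tasks == theirs->nr_tasks && mine > theirs)
            return false;
        /* Threads of one process always group; otherwise require a shared fault. */
        return same_mm || shared_fault;
    }

    int main(void)
    {
        struct fake_group a = { .nr_tasks = 2 }, b = { .nr_tasks = 5 };
        printf("join: %d\n", should_join(&a, &b, false, true));  /* 1: b is bigger, fault shared */
        return 0;
    }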
+
+void task_numa_free(struct task_struct *p)
+{
+       struct numa_group *grp = p->numa_group;
+       int i;
+       void *numa_faults = p->numa_faults;
+
+       if (grp) {
+               for (i = 0; i < 2*nr_node_ids; i++)
+                       atomic_long_sub(p->numa_faults[i], &grp->faults[i]);
+
+               atomic_long_sub(p->total_numa_faults, &grp->total_faults);
+
+               spin_lock(&grp->lock);
+               list_del(&p->numa_entry);
+               grp->nr_tasks--;
+               spin_unlock(&grp->lock);
+               rcu_assign_pointer(p->numa_group, NULL);
+               put_numa_group(grp);
+       }
+
+       p->numa_faults = NULL;
+       p->numa_faults_buffer = NULL;
+       kfree(numa_faults);
 }
 
 /*
  * Got a PROT_NONE fault for a page on @node.
  */
-void task_numa_fault(int last_nidpid, int node, int pages, bool migrated)
+void task_numa_fault(int last_cpupid, int node, int pages, int flags)
 {
        struct task_struct *p = current;
+       bool migrated = flags & TNF_MIGRATED;
        int priv;
 
        if (!numabalancing_enabled)
@@ -1000,14 +1522,9 @@ void task_numa_fault(int last_nidpid, int node, int pages, bool migrated)
        if (!p->mm)
                return;
 
-       /*
-        * First accesses are treated as private, otherwise consider accesses
-        * to be private if the accessing pid has not changed
-        */
-       if (!nidpid_pid_unset(last_nidpid))
-               priv = ((p->pid & LAST__PID_MASK) == nidpid_to_pid(last_nidpid));
-       else
-               priv = 1;
+       /* Do not worry about placement if exiting */
+       if (p->state == TASK_DEAD)
+               return;
 
        /* Allocate buffer to track faults on a per-node basis */
        if (unlikely(!p->numa_faults)) {
@@ -1020,6 +1537,19 @@ void task_numa_fault(int last_nidpid, int node, int pages, bool migrated)
 
                BUG_ON(p->numa_faults_buffer);
                p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
+               p->total_numa_faults = 0;
+       }
+
+       /*
+        * First accesses are treated as private, otherwise consider accesses
+        * to be private if the accessing pid has not changed
+        */
+       if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
+               priv = 1;
+       } else {
+               priv = cpupid_match_pid(p, last_cpupid);
+               if (!priv && !(flags & TNF_NO_GROUP))
+                       task_numa_group(p, last_cpupid, flags, &priv);
        }
 
        /*
@@ -1037,6 +1567,13 @@ void task_numa_fault(int last_nidpid, int node, int pages, bool migrated)
 
        task_numa_placement(p);
 
+       /* Retry task-to-preferred-node migration if it previously failed */
+       if (p->numa_migrate_retry && time_after(jiffies, p->numa_migrate_retry))
+               numa_migrate_preferred(p);
+
+       if (migrated)
+               p->numa_pages_migrated += pages;
+
        p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
 }
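In short, a fault is counted as private when the pid packed into the page's last_cpupid field matches the current task, and shared otherwise; shared faults are also what feed group formation above. A simplified userspace sketch of that classification, reducing the cpupid packing to a plain pid (an assumption; the real field also encodes the cpu and truncates the pid):

    /* Simplified sketch of the private/shared classification in task_numa_fault(). */
    #include <stdio.h>

    #define LAST_PID_UNSET (-1)

    static int fault_is_private(int current_pid, int last_pid)
    {
        /* First access to the page: treat it as private. */
        if (last_pid == LAST_PID_UNSET)
            return 1;
        /* Private only if the same task touched the page last time. */
        return current_pid == last_pid;
    }

    int main(void)
    {
        printf("%d %d %d\n",
               fault_is_private(100, LAST_PID_UNSET),  /* 1: first touch */
               fault_is_private(100, 100),             /* 1: same task again */
               fault_is_private(100, 101));            /* 0: shared, may trigger grouping */
        return 0;
    }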
 
@@ -1130,7 +1667,17 @@ void task_numa_work(struct callback_head *work)
                vma = mm->mmap;
        }
        for (; vma; vma = vma->vm_next) {
-               if (!vma_migratable(vma))
+               if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
+                       continue;
+
+               /*
+                * Shared library pages mapped by multiple processes are not
+                * migrated as it is expected they are cache replicated. Avoid
+                * hinting faults in read-only file-backed mappings or the vdso
+                * as migrating the pages will be of marginal benefit.
+                */
+               if (!vma->vm_mm ||
+                   (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
                        continue;
 
                do {
@@ -1217,6 +1764,14 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
 {
 }
+
+static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 static void
@@ -1226,8 +1781,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
        if (!parent_entity(se))
                update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
 #ifdef CONFIG_SMP
-       if (entity_is_task(se))
-               list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
+       if (entity_is_task(se)) {
+               struct rq *rq = rq_of(cfs_rq);
+
+               account_numa_enqueue(rq, task_of(se));
+               list_add(&se->group_node, &rq->cfs_tasks);
+       }
 #endif
        cfs_rq->nr_running++;
 }
@@ -1238,8 +1797,10 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
        update_load_sub(&cfs_rq->load, se->load.weight);
        if (!parent_entity(se))
                update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
-       if (entity_is_task(se))
+       if (entity_is_task(se)) {
+               account_numa_dequeue(rq_of(cfs_rq), task_of(se));
                list_del_init(&se->group_node);
+       }
        cfs_rq->nr_running--;
 }
 
@@ -3292,7 +3853,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
        struct sched_entity *se = tg->se[cpu];
 
-       if (!tg->parent)        /* the trivial, non-cgroup case */
+       if (!tg->parent || !wl) /* the trivial, non-cgroup case */
                return wl;
 
        for_each_sched_entity(se) {
@@ -3345,8 +3906,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 }
 #else
 
-static inline unsigned long effective_load(struct task_group *tg, int cpu,
-               unsigned long wl, unsigned long wg)
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
        return wl;
 }
@@ -3599,11 +4159,10 @@ done:
  * preempt must be disabled.
  */
 static int
-select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
+select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
 {
        struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
        int cpu = smp_processor_id();
-       int prev_cpu = task_cpu(p);
        int new_cpu = cpu;
        int want_affine = 0;
        int sync = wake_flags & WF_SYNC;
@@ -4083,6 +4642,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
 
 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
+enum fbq_type { regular, remote, all };
+
 #define LBF_ALL_PINNED 0x01
 #define LBF_NEED_BREAK 0x02
 #define LBF_DST_PINNED  0x04
@@ -4109,6 +4670,8 @@ struct lb_env {
        unsigned int            loop;
        unsigned int            loop_break;
        unsigned int            loop_max;
+
+       enum fbq_type           fbq_type;
 };
 
 /*
@@ -4121,6 +4684,20 @@ static void move_task(struct task_struct *p, struct lb_env *env)
        set_task_cpu(p, env->dst_cpu);
        activate_task(env->dst_rq, p, 0);
        check_preempt_curr(env->dst_rq, p, 0);
+#ifdef CONFIG_NUMA_BALANCING
+       if (p->numa_preferred_nid != -1) {
+               int src_nid = cpu_to_node(env->src_cpu);
+               int dst_nid = cpu_to_node(env->dst_cpu);
+
+               /*
+                * If the load balancer has moved the task off its preferred
+                * node then hold back further NUMA migrations for a short
+                * while, in case this move turns out to be short-lived.
+                */
+               if (src_nid != dst_nid && dst_nid != p->numa_preferred_nid)
+                       p->numa_migrate_seq = 0;
+       }
+#endif
 }
 
 /*
@@ -4169,12 +4746,16 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
        src_nid = cpu_to_node(env->src_cpu);
        dst_nid = cpu_to_node(env->dst_cpu);
 
-       if (src_nid == dst_nid ||
-           p->numa_migrate_seq >= sysctl_numa_balancing_settle_count)
+       if (src_nid == dst_nid)
                return false;
 
-       if (dst_nid == p->numa_preferred_nid ||
-           task_faults(p, dst_nid) > task_faults(p, src_nid))
+       /* Always encourage migration to the preferred node. */
+       if (dst_nid == p->numa_preferred_nid)
+               return true;
+
+       /* If both task and group weight improve, this move is a winner. */
+       if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&
+           group_weight(p, dst_nid) > group_weight(p, src_nid))
                return true;
 
        return false;
@@ -4194,11 +4775,16 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
        src_nid = cpu_to_node(env->src_cpu);
        dst_nid = cpu_to_node(env->dst_cpu);
 
-       if (src_nid == dst_nid ||
-           p->numa_migrate_seq >= sysctl_numa_balancing_settle_count)
+       if (src_nid == dst_nid)
                return false;
 
-       if (task_faults(p, dst_nid) < task_faults(p, src_nid))
+       /* Migrating away from the preferred node is always bad. */
+       if (src_nid == p->numa_preferred_nid)
+               return true;
+
+       /* If either task or group weight gets worse, don't do it. */
+       if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
+           group_weight(p, dst_nid) < group_weight(p, src_nid))
                return true;
 
        return false;
@@ -4335,8 +4921,6 @@ static int move_one_task(struct lb_env *env)
        return 0;
 }
 
-static unsigned long task_h_load(struct task_struct *p);
-
 static const unsigned int sched_nr_migrate_break = 32;
 
 /*
@@ -4549,6 +5133,10 @@ struct sg_lb_stats {
        unsigned int group_weight;
        int group_imb; /* Is there an imbalance in the group ? */
        int group_has_capacity; /* Is there extra capacity in the group? */
+#ifdef CONFIG_NUMA_BALANCING
+       unsigned int nr_numa_running;
+       unsigned int nr_preferred_running;
+#endif
 };
 
 /*
@@ -4866,6 +5454,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
                sgs->group_load += load;
                sgs->sum_nr_running += nr_running;
+#ifdef CONFIG_NUMA_BALANCING
+               sgs->nr_numa_running += rq->nr_numa_running;
+               sgs->nr_preferred_running += rq->nr_preferred_running;
+#endif
                sgs->sum_weighted_load += weighted_cpuload(i);
                if (idle_cpu(i))
                        sgs->idle_cpus++;
@@ -4931,14 +5523,43 @@ static bool update_sd_pick_busiest(struct lb_env *env,
        return false;
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
+{
+       if (sgs->sum_nr_running > sgs->nr_numa_running)
+               return regular;
+       if (sgs->sum_nr_running > sgs->nr_preferred_running)
+               return remote;
+       return all;
+}
+
+static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+{
+       if (rq->nr_running > rq->nr_numa_running)
+               return regular;
+       if (rq->nr_running > rq->nr_preferred_running)
+               return remote;
+       return all;
+}
+#else
+static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
+{
+       return all;
+}
+
+static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+{
+       return regular;
+}
+#endif /* CONFIG_NUMA_BALANCING */
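Note the enum order matters: regular < remote < all, and find_busiest_queue() below skips any runqueue whose class is larger than the group's (rt > env->fbq_type), so runqueues holding only well-placed NUMA tasks are left alone while better options exist. A small sketch of that filter with assumed task counts (illustrative only):

    /* Sketch of the fbq_type ordering used by find_busiest_queue(); counts invented. */
    #include <stdio.h>

    enum fbq_type { regular, remote, all };

    static enum fbq_type classify(unsigned int nr_running, unsigned int nr_numa,
                                  unsigned int nr_preferred)
    {
        if (nr_running > nr_numa)        /* some tasks have no NUMA preference at all */
            return regular;
        if (nr_running > nr_preferred)   /* some NUMA tasks run on the wrong node */
            return remote;
        return all;                      /* everything already runs where it wants to */
    }

    int main(void)
    {
        enum fbq_type group_type = classify(6, 4, 2);  /* busiest group: regular */
        enum fbq_type rq_type    = classify(2, 2, 2);  /* this rq: all tasks well placed */

        /* Skip the rq: it only holds ideally placed tasks and better options exist. */
        printf("skip=%d\n", rq_type > group_type);     /* 1 */
        return 0;
    }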
+
 /**
  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
  * @env: The load balancing environment.
  * @balance: Should we balance.
  * @sds: variable to hold the statistics for this sched_domain.
  */
-static inline void update_sd_lb_stats(struct lb_env *env,
-                                       struct sd_lb_stats *sds)
+static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
        struct sched_domain *child = env->sd->child;
        struct sched_group *sg = env->sd->groups;
@@ -4995,6 +5616,9 @@ next_group:
 
                sg = sg->next;
        } while (sg != env->sd->groups);
+
+       if (env->sd->flags & SD_NUMA)
+               env->fbq_type = fbq_classify_group(&sds->busiest_stat);
 }
 
 /**
@@ -5298,15 +5922,39 @@ static struct rq *find_busiest_queue(struct lb_env *env,
        int i;
 
        for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
-               unsigned long power = power_of(i);
-               unsigned long capacity = DIV_ROUND_CLOSEST(power,
-                                                          SCHED_POWER_SCALE);
-               unsigned long wl;
+               unsigned long power, capacity, wl;
+               enum fbq_type rt;
+
+               rq = cpu_rq(i);
+               rt = fbq_classify_rq(rq);
 
+               /*
+                * We classify groups/runqueues into three groups:
+                *  - regular: there are !numa tasks
+                *  - remote:  there are numa tasks that run on the 'wrong' node
+                *  - all:     there is no distinction
+                *
+                * In order to avoid migrating ideally placed numa tasks,
+                * ignore those when there are better options.
+                *
+                * If we ignore the actual busiest queue to migrate another
+                * task, the next balance pass can still reduce the busiest
+                * queue by moving tasks around inside the node.
+                *
+                * If we cannot move enough load due to this classification
+                * the next pass will adjust the group classification and
+                * allow migration of more tasks.
+                *
+                * Both cases only affect the total convergence complexity.
+                */
+               if (rt > env->fbq_type)
+                       continue;
+
+               power = power_of(i);
+               capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
                if (!capacity)
                        capacity = fix_small_capacity(env->sd, group);
 
-               rq = cpu_rq(i);
                wl = weighted_cpuload(i);
 
                /*
@@ -5423,6 +6071,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                .idle           = idle,
                .loop_break     = sched_nr_migrate_break,
                .cpus           = cpus,
+               .fbq_type       = all,
        };
 
        /*