+ if (!cur) {
+ /* Is there capacity at our destination? */
+ if (env->src_stats.has_capacity &&
+ !env->dst_stats.has_capacity)
+ goto unlock;
+
+ goto balance;
+ }
+
+ /* Balance doesn't matter much if we're running a task per cpu */
+ if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
+ goto assign;
+
+ /*
+ * In the overloaded case, try to keep the load balanced.
+ */
+balance:
+ dst_load = env->dst_stats.load;
+ src_load = env->src_stats.load;
+
+ /* XXX missing power terms */
+ load = task_h_load(env->p);
+ dst_load += load;
+ src_load -= load;
+
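+ /* If we swap with cur, its load moves the opposite way */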
+ if (cur) {
+ load = task_h_load(cur);
+ dst_load -= load;
+ src_load += load;
+ }
+
+ /* make src_load the smaller */
+ if (dst_load < src_load)
+ swap(dst_load, src_load);
+
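+ /* Reject the move if the larger load exceeds the smaller by more than imbalance_pct */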
+ if (src_load * env->imbalance_pct < dst_load * 100)
+ goto unlock;
+
+assign:
+ task_numa_assign(env, cur, imp);
+unlock:
+ rcu_read_unlock();
+}
+
+static void task_numa_find_cpu(struct task_numa_env *env,
+ long taskimp, long groupimp)
+{
+ int cpu;
+
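+ /* Try every allowed CPU on the destination node as a migration or swap target */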
+ for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
+ /* Skip this CPU if the source task cannot migrate */
+ if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
+ continue;
+
+ env->dst_cpu = cpu;
+ task_numa_compare(env, taskimp, groupimp);
+ }
+}
+
+static int task_numa_migrate(struct task_struct *p)
+{
+ struct task_numa_env env = {
+ .p = p,
+
+ .src_cpu = task_cpu(p),
+ .src_nid = task_node(p),
+
+ .imbalance_pct = 112,
+
+ .best_task = NULL,
+ .best_imp = 0,
+ .best_cpu = -1
+ };
+ struct sched_domain *sd;
+ unsigned long taskweight, groupweight;
+ int nid, ret;
+ long taskimp, groupimp;
+
+ /*
+ * Pick the lowest SD_NUMA domain, as that would have the smallest
+ * imbalance and would be the first to start moving tasks about.
+ *
+ * We also want to avoid moving tasks about unnecessarily, as that would
+ * create random task movement -- which runs counter to the NUMA conditions
+ * we're trying to satisfy here.
+ */
+ rcu_read_lock();
+ sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
+ /* sd_numa may be NULL on some configurations; fall back to the default imbalance_pct */
+ if (sd)
+ env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
+ rcu_read_unlock();
+
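+ /* Estimate how much the task and its group would gain by moving to the preferred node */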
+ taskweight = task_weight(p, env.src_nid);
+ groupweight = group_weight(p, env.src_nid);
+ update_numa_stats(&env.src_stats, env.src_nid);
+ env.dst_nid = p->numa_preferred_nid;
+ taskimp = task_weight(p, env.dst_nid) - taskweight;
+ groupimp = group_weight(p, env.dst_nid) - groupweight;
+ update_numa_stats(&env.dst_stats, env.dst_nid);
+
+ /* If the preferred nid has capacity, try to use it. */
+ if (env.dst_stats.has_capacity)
+ task_numa_find_cpu(&env, taskimp, groupimp);
+
+ /* No space available on the preferred nid. Look elsewhere. */
+ if (env.best_cpu == -1) {
+ for_each_online_node(nid) {
+ if (nid == env.src_nid || nid == p->numa_preferred_nid)
+ continue;
+
+ /* Only consider nodes where the task or its group would benefit */
+ taskimp = task_weight(p, nid) - taskweight;
+ groupimp = group_weight(p, nid) - groupweight;
+ if (taskimp < 0 && groupimp < 0)
+ continue;
+
+ env.dst_nid = nid;
+ update_numa_stats(&env.dst_stats, env.dst_nid);
+ task_numa_find_cpu(&env, taskimp, groupimp);
+ }
+ }
+
+ /* No better CPU than the current one was found. */
+ if (env.best_cpu == -1)
+ return -EAGAIN;
+
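+ /* Update the preferred node before attempting the move */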
+ sched_setnuma(p, env.dst_nid);
+
+ if (env.best_task == NULL)
+ return migrate_task_to(p, env.best_cpu);
+
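+ /* Otherwise, swap places with the task running on the chosen CPU */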
+ ret = migrate_swap(p, env.best_task);
+ put_task_struct(env.best_task);
+ return ret;
+}
+
+/* Attempt to migrate a task to a CPU on the preferred node. */
+static void numa_migrate_preferred(struct task_struct *p)
+{
+ /* Success if task is already running on preferred CPU */
+ p->numa_migrate_retry = 0;
+ if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) {
+ /*
+ * If migration is temporarily disabled due to a task migration,
+ * re-enable it now: the task is running on its preferred node,
+ * so memory should migrate locally.
+ */
+ if (!p->numa_migrate_seq)
+ p->numa_migrate_seq++;
+ return;
+ }
+
+ /* This task has no NUMA fault statistics yet */
+ if (unlikely(p->numa_preferred_nid == -1))