}
#ifdef CONFIG_SMP
+static unsigned long task_h_load(struct task_struct *p);
+
static inline void __update_task_entity_contrib(struct sched_entity *se);
/* Give new task start runnable values to heavy its load in infant time */
* the preferred node but still allow the scheduler to move the task again if
* the nodes CPUs are overloaded.
*/
-unsigned int sysctl_numa_balancing_settle_count __read_mostly = 3;
+unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
-static void task_numa_placement(struct task_struct *p)
+static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{
+ rq->nr_numa_running += (p->numa_preferred_nid != -1);
+ rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
+}
+
+static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{
+ rq->nr_numa_running -= (p->numa_preferred_nid != -1);
+ rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
+}
+
+struct numa_group {
+ atomic_t refcount;
+
+ spinlock_t lock; /* nr_tasks, tasks */
+ int nr_tasks;
+ pid_t gid;
+ struct list_head task_list;
+
+ struct rcu_head rcu;
+ atomic_long_t total_faults;
+ atomic_long_t faults[0];
+};
+
+pid_t task_numa_group_id(struct task_struct *p)
+{
+ return p->numa_group ? p->numa_group->gid : 0;
+}
+
+static inline int task_faults_idx(int nid, int priv)
+{
+ return 2 * nid + priv;
+}
+
+static inline unsigned long task_faults(struct task_struct *p, int nid)
+{
+ if (!p->numa_faults)
+ return 0;
+
+ return p->numa_faults[task_faults_idx(nid, 0)] +
+ p->numa_faults[task_faults_idx(nid, 1)];
+}
+
+static inline unsigned long group_faults(struct task_struct *p, int nid)
+{
+ if (!p->numa_group)
+ return 0;
+
+ return atomic_long_read(&p->numa_group->faults[2*nid]) +
+ atomic_long_read(&p->numa_group->faults[2*nid+1]);
+}
+
+/*
+ * These return the fraction of accesses done by a particular task, or
+ * task group, on a particular numa node. The group weight is given a
+ * larger multiplier, in order to group tasks together that are almost
+ * evenly spread out between numa nodes.
+ */
+static inline unsigned long task_weight(struct task_struct *p, int nid)
+{
+ unsigned long total_faults;
+
+ if (!p->numa_faults)
+ return 0;
+
+ total_faults = p->total_numa_faults;
+
+ if (!total_faults)
+ return 0;
+
+ return 1000 * task_faults(p, nid) / total_faults;
+}
+
+static inline unsigned long group_weight(struct task_struct *p, int nid)
+{
+ unsigned long total_faults;
+
+ if (!p->numa_group)
+ return 0;
+
+ total_faults = atomic_long_read(&p->numa_group->total_faults);
+
+ if (!total_faults)
+ return 0;
+
+ return 1000 * group_faults(p, nid) / total_faults;
+}
+
+static unsigned long weighted_cpuload(const int cpu);
+static unsigned long source_load(int cpu, int type);
+static unsigned long target_load(int cpu, int type);
+static unsigned long power_of(int cpu);
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
+
+/* Cached statistics for all CPUs within a node */
+struct numa_stats {
+ unsigned long nr_running;
+ unsigned long load;
+
+ /* Total compute capacity of CPUs on a node */
+ unsigned long power;
+
+ /* Approximate capacity in terms of runnable tasks on a node */
+ unsigned long capacity;
+ int has_capacity;
+};
+
+/*
+ * XXX borrowed from update_sg_lb_stats
+ */
+static void update_numa_stats(struct numa_stats *ns, int nid)
+{
+ int cpu;
+
+ memset(ns, 0, sizeof(*ns));
+ for_each_cpu(cpu, cpumask_of_node(nid)) {
+ struct rq *rq = cpu_rq(cpu);
+
+ ns->nr_running += rq->nr_running;
+ ns->load += weighted_cpuload(cpu);
+ ns->power += power_of(cpu);
+ }
+
+ ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
+ ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
+ ns->has_capacity = (ns->nr_running < ns->capacity);
+}
+
+struct task_numa_env {
+ struct task_struct *p;
+
+ int src_cpu, src_nid;
+ int dst_cpu, dst_nid;
+
+ struct numa_stats src_stats, dst_stats;
+
+ int imbalance_pct, idx;
+
+ struct task_struct *best_task;
+ long best_imp;
+ int best_cpu;
+};
+
+static void task_numa_assign(struct task_numa_env *env,
+ struct task_struct *p, long imp)
+{
+ if (env->best_task)
+ put_task_struct(env->best_task);
+ if (p)
+ get_task_struct(p);
+
+ env->best_task = p;
+ env->best_imp = imp;
+ env->best_cpu = env->dst_cpu;
+}
+
+/*
+ * This checks if the overall compute and NUMA accesses of the system would
+ * be improved if the source tasks was migrated to the target dst_cpu taking
+ * into account that it might be best if task running on the dst_cpu should
+ * be exchanged with the source task
+ */
+static void task_numa_compare(struct task_numa_env *env,
+ long taskimp, long groupimp)
{
- int seq, nid, max_nid = -1;
- unsigned long max_faults = 0;
+ struct rq *src_rq = cpu_rq(env->src_cpu);
+ struct rq *dst_rq = cpu_rq(env->dst_cpu);
+ struct task_struct *cur;
+ long dst_load, src_load;
+ long load;
+ long imp = (groupimp > 0) ? groupimp : taskimp;
+
+ rcu_read_lock();
+ cur = ACCESS_ONCE(dst_rq->curr);
+ if (cur->pid == 0) /* idle */
+ cur = NULL;
+
+ /*
+ * "imp" is the fault differential for the source task between the
+ * source and destination node. Calculate the total differential for
+ * the source task and potential destination task. The more negative
+ * the value is, the more rmeote accesses that would be expected to
+ * be incurred if the tasks were swapped.
+ */
+ if (cur) {
+ /* Skip this swap candidate if cannot move to the source cpu */
+ if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
+ goto unlock;
+
+ /*
+ * If dst and source tasks are in the same NUMA group, or not
+ * in any group then look only at task weights.
+ */
+ if (cur->numa_group == env->p->numa_group) {
+ imp = taskimp + task_weight(cur, env->src_nid) -
+ task_weight(cur, env->dst_nid);
+ /*
+ * Add some hysteresis to prevent swapping the
+ * tasks within a group over tiny differences.
+ */
+ if (cur->numa_group)
+ imp -= imp/16;
+ } else {
+ /*
+ * Compare the group weights. If a task is all by
+ * itself (not part of a group), use the task weight
+ * instead.
+ */
+ if (env->p->numa_group)
+ imp = groupimp;
+ else
+ imp = taskimp;
+
+ if (cur->numa_group)
+ imp += group_weight(cur, env->src_nid) -
+ group_weight(cur, env->dst_nid);
+ else
+ imp += task_weight(cur, env->src_nid) -
+ task_weight(cur, env->dst_nid);
+ }
+ }
+
+ if (imp < env->best_imp)
+ goto unlock;
+
+ if (!cur) {
+ /* Is there capacity at our destination? */
+ if (env->src_stats.has_capacity &&
+ !env->dst_stats.has_capacity)
+ goto unlock;
+
+ goto balance;
+ }
+
+ /* Balance doesn't matter much if we're running a task per cpu */
+ if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
+ goto assign;
+
+ /*
+ * In the overloaded case, try and keep the load balanced.
+ */
+balance:
+ dst_load = env->dst_stats.load;
+ src_load = env->src_stats.load;
+
+ /* XXX missing power terms */
+ load = task_h_load(env->p);
+ dst_load += load;
+ src_load -= load;
+
+ if (cur) {
+ load = task_h_load(cur);
+ dst_load -= load;
+ src_load += load;
+ }
+
+ /* make src_load the smaller */
+ if (dst_load < src_load)
+ swap(dst_load, src_load);
+
+ if (src_load * env->imbalance_pct < dst_load * 100)
+ goto unlock;
+
+assign:
+ task_numa_assign(env, cur, imp);
+unlock:
+ rcu_read_unlock();
+}
+
+static void task_numa_find_cpu(struct task_numa_env *env,
+ long taskimp, long groupimp)
+{
+ int cpu;
+
+ for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
+ /* Skip this CPU if the source task cannot migrate */
+ if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
+ continue;
+
+ env->dst_cpu = cpu;
+ task_numa_compare(env, taskimp, groupimp);
+ }
+}
+
+static int task_numa_migrate(struct task_struct *p)
+{
+ struct task_numa_env env = {
+ .p = p,
+
+ .src_cpu = task_cpu(p),
+ .src_nid = task_node(p),
+
+ .imbalance_pct = 112,
+
+ .best_task = NULL,
+ .best_imp = 0,
+ .best_cpu = -1
+ };
+ struct sched_domain *sd;
+ unsigned long taskweight, groupweight;
+ int nid, ret;
+ long taskimp, groupimp;
+
+ /*
+ * Pick the lowest SD_NUMA domain, as that would have the smallest
+ * imbalance and would be the first to start moving tasks about.
+ *
+ * And we want to avoid any moving of tasks about, as that would create
+ * random movement of tasks -- counter the numa conditions we're trying
+ * to satisfy here.
+ */
+ rcu_read_lock();
+ sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
+ env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
+ rcu_read_unlock();
+
+ taskweight = task_weight(p, env.src_nid);
+ groupweight = group_weight(p, env.src_nid);
+ update_numa_stats(&env.src_stats, env.src_nid);
+ env.dst_nid = p->numa_preferred_nid;
+ taskimp = task_weight(p, env.dst_nid) - taskweight;
+ groupimp = group_weight(p, env.dst_nid) - groupweight;
+ update_numa_stats(&env.dst_stats, env.dst_nid);
+
+ /* If the preferred nid has capacity, try to use it. */
+ if (env.dst_stats.has_capacity)
+ task_numa_find_cpu(&env, taskimp, groupimp);
+
+ /* No space available on the preferred nid. Look elsewhere. */
+ if (env.best_cpu == -1) {
+ for_each_online_node(nid) {
+ if (nid == env.src_nid || nid == p->numa_preferred_nid)
+ continue;
+
+ /* Only consider nodes where both task and groups benefit */
+ taskimp = task_weight(p, nid) - taskweight;
+ groupimp = group_weight(p, nid) - groupweight;
+ if (taskimp < 0 && groupimp < 0)
+ continue;
+
+ env.dst_nid = nid;
+ update_numa_stats(&env.dst_stats, env.dst_nid);
+ task_numa_find_cpu(&env, taskimp, groupimp);
+ }
+ }
+
+ /* No better CPU than the current one was found. */
+ if (env.best_cpu == -1)
+ return -EAGAIN;
+
+ sched_setnuma(p, env.dst_nid);
+
+ if (env.best_task == NULL) {
+ int ret = migrate_task_to(p, env.best_cpu);
+ return ret;
+ }
+
+ ret = migrate_swap(p, env.best_task);
+ put_task_struct(env.best_task);
+ return ret;
+}
+
+/* Attempt to migrate a task to a CPU on the preferred node. */
+static void numa_migrate_preferred(struct task_struct *p)
+{
+ /* Success if task is already running on preferred CPU */
+ p->numa_migrate_retry = 0;
+ if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) {
+ /*
+ * If migration is temporarily disabled due to a task migration
+ * then re-enable it now as the task is running on its
+ * preferred node and memory should migrate locally
+ */
+ if (!p->numa_migrate_seq)
+ p->numa_migrate_seq++;
+ return;
+ }
- if (!p->mm) /* for example, ksmd faulting in a user's mm */
+ /* This task has no NUMA fault statistics yet */
+ if (unlikely(p->numa_preferred_nid == -1))
return;
+
+ /* Otherwise, try migrate to a CPU on the preferred node */
+ if (task_numa_migrate(p) != 0)
+ p->numa_migrate_retry = jiffies + HZ*5;
+}
+
+static void task_numa_placement(struct task_struct *p)
+{
+ int seq, nid, max_nid = -1, max_group_nid = -1;
+ unsigned long max_faults = 0, max_group_faults = 0;
+ spinlock_t *group_lock = NULL;
+
seq = ACCESS_ONCE(p->mm->numa_scan_seq);
if (p->numa_scan_seq == seq)
return;
p->numa_migrate_seq++;
p->numa_scan_period_max = task_scan_max(p);
+ /* If the task is part of a group prevent parallel updates to group stats */
+ if (p->numa_group) {
+ group_lock = &p->numa_group->lock;
+ spin_lock(group_lock);
+ }
+
/* Find the node with the highest number of faults */
for_each_online_node(nid) {
- unsigned long faults;
-
- /* Decay existing window and copy faults since last scan */
- p->numa_faults[nid] >>= 1;
- p->numa_faults[nid] += p->numa_faults_buffer[nid];
- p->numa_faults_buffer[nid] = 0;
+ unsigned long faults = 0, group_faults = 0;
+ int priv, i;
+
+ for (priv = 0; priv < 2; priv++) {
+ long diff;
+
+ i = task_faults_idx(nid, priv);
+ diff = -p->numa_faults[i];
+
+ /* Decay existing window, copy faults since last scan */
+ p->numa_faults[i] >>= 1;
+ p->numa_faults[i] += p->numa_faults_buffer[i];
+ p->numa_faults_buffer[i] = 0;
+
+ faults += p->numa_faults[i];
+ diff += p->numa_faults[i];
+ p->total_numa_faults += diff;
+ if (p->numa_group) {
+ /* safe because we can only change our own group */
+ atomic_long_add(diff, &p->numa_group->faults[i]);
+ atomic_long_add(diff, &p->numa_group->total_faults);
+ group_faults += atomic_long_read(&p->numa_group->faults[i]);
+ }
+ }
- faults = p->numa_faults[nid];
if (faults > max_faults) {
max_faults = faults;
max_nid = nid;
}
+
+ if (group_faults > max_group_faults) {
+ max_group_faults = group_faults;
+ max_group_nid = nid;
+ }
+ }
+
+ if (p->numa_group) {
+ /*
+ * If the preferred task and group nids are different,
+ * iterate over the nodes again to find the best place.
+ */
+ if (max_nid != max_group_nid) {
+ unsigned long weight, max_weight = 0;
+
+ for_each_online_node(nid) {
+ weight = task_weight(p, nid) + group_weight(p, nid);
+ if (weight > max_weight) {
+ max_weight = weight;
+ max_nid = nid;
+ }
+ }
+ }
+
+ spin_unlock(group_lock);
}
- /* Update the tasks preferred node if necessary */
+ /* Preferred node as the node with the most faults */
if (max_faults && max_nid != p->numa_preferred_nid) {
- p->numa_preferred_nid = max_nid;
- p->numa_migrate_seq = 0;
+ /* Update the preferred nid and migrate task if possible */
+ sched_setnuma(p, max_nid);
+ numa_migrate_preferred(p);
+ }
+}
+
+static inline int get_numa_group(struct numa_group *grp)
+{
+ return atomic_inc_not_zero(&grp->refcount);
+}
+
+static inline void put_numa_group(struct numa_group *grp)
+{
+ if (atomic_dec_and_test(&grp->refcount))
+ kfree_rcu(grp, rcu);
+}
+
+static void double_lock(spinlock_t *l1, spinlock_t *l2)
+{
+ if (l1 > l2)
+ swap(l1, l2);
+
+ spin_lock(l1);
+ spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
+}
+
+static void task_numa_group(struct task_struct *p, int cpupid, int flags,
+ int *priv)
+{
+ struct numa_group *grp, *my_grp;
+ struct task_struct *tsk;
+ bool join = false;
+ int cpu = cpupid_to_cpu(cpupid);
+ int i;
+
+ if (unlikely(!p->numa_group)) {
+ unsigned int size = sizeof(struct numa_group) +
+ 2*nr_node_ids*sizeof(atomic_long_t);
+
+ grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
+ if (!grp)
+ return;
+
+ atomic_set(&grp->refcount, 1);
+ spin_lock_init(&grp->lock);
+ INIT_LIST_HEAD(&grp->task_list);
+ grp->gid = p->pid;
+
+ for (i = 0; i < 2*nr_node_ids; i++)
+ atomic_long_set(&grp->faults[i], p->numa_faults[i]);
+
+ atomic_long_set(&grp->total_faults, p->total_numa_faults);
+
+ list_add(&p->numa_entry, &grp->task_list);
+ grp->nr_tasks++;
+ rcu_assign_pointer(p->numa_group, grp);
}
+
+ rcu_read_lock();
+ tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
+
+ if (!cpupid_match_pid(tsk, cpupid))
+ goto unlock;
+
+ grp = rcu_dereference(tsk->numa_group);
+ if (!grp)
+ goto unlock;
+
+ my_grp = p->numa_group;
+ if (grp == my_grp)
+ goto unlock;
+
+ /*
+ * Only join the other group if its bigger; if we're the bigger group,
+ * the other task will join us.
+ */
+ if (my_grp->nr_tasks > grp->nr_tasks)
+ goto unlock;
+
+ /*
+ * Tie-break on the grp address.
+ */
+ if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
+ goto unlock;
+
+ /* Always join threads in the same process. */
+ if (tsk->mm == current->mm)
+ join = true;
+
+ /* Simple filter to avoid false positives due to PID collisions */
+ if (flags & TNF_SHARED)
+ join = true;
+
+ /* Update priv based on whether false sharing was detected */
+ *priv = !join;
+
+ if (join && !get_numa_group(grp))
+ join = false;
+
+unlock:
+ rcu_read_unlock();
+
+ if (!join)
+ return;
+
+ for (i = 0; i < 2*nr_node_ids; i++) {
+ atomic_long_sub(p->numa_faults[i], &my_grp->faults[i]);
+ atomic_long_add(p->numa_faults[i], &grp->faults[i]);
+ }
+ atomic_long_sub(p->total_numa_faults, &my_grp->total_faults);
+ atomic_long_add(p->total_numa_faults, &grp->total_faults);
+
+ double_lock(&my_grp->lock, &grp->lock);
+
+ list_move(&p->numa_entry, &grp->task_list);
+ my_grp->nr_tasks--;
+ grp->nr_tasks++;
+
+ spin_unlock(&my_grp->lock);
+ spin_unlock(&grp->lock);
+
+ rcu_assign_pointer(p->numa_group, grp);
+
+ put_numa_group(my_grp);
+}
+
+void task_numa_free(struct task_struct *p)
+{
+ struct numa_group *grp = p->numa_group;
+ int i;
+ void *numa_faults = p->numa_faults;
+
+ if (grp) {
+ for (i = 0; i < 2*nr_node_ids; i++)
+ atomic_long_sub(p->numa_faults[i], &grp->faults[i]);
+
+ atomic_long_sub(p->total_numa_faults, &grp->total_faults);
+
+ spin_lock(&grp->lock);
+ list_del(&p->numa_entry);
+ grp->nr_tasks--;
+ spin_unlock(&grp->lock);
+ rcu_assign_pointer(p->numa_group, NULL);
+ put_numa_group(grp);
+ }
+
+ p->numa_faults = NULL;
+ p->numa_faults_buffer = NULL;
+ kfree(numa_faults);
}
/*
* Got a PROT_NONE fault for a page on @node.
*/
-void task_numa_fault(int node, int pages, bool migrated)
+void task_numa_fault(int last_cpupid, int node, int pages, int flags)
{
struct task_struct *p = current;
+ bool migrated = flags & TNF_MIGRATED;
+ int priv;
if (!numabalancing_enabled)
return;
+ /* for example, ksmd faulting in a user's mm */
+ if (!p->mm)
+ return;
+
+ /* Do not worry about placement if exiting */
+ if (p->state == TASK_DEAD)
+ return;
+
/* Allocate buffer to track faults on a per-node basis */
if (unlikely(!p->numa_faults)) {
- int size = sizeof(*p->numa_faults) * nr_node_ids;
+ int size = sizeof(*p->numa_faults) * 2 * nr_node_ids;
/* numa_faults and numa_faults_buffer share the allocation */
p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN);
return;
BUG_ON(p->numa_faults_buffer);
- p->numa_faults_buffer = p->numa_faults + nr_node_ids;
+ p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
+ p->total_numa_faults = 0;
+ }
+
+ /*
+ * First accesses are treated as private, otherwise consider accesses
+ * to be private if the accessing pid has not changed
+ */
+ if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
+ priv = 1;
+ } else {
+ priv = cpupid_match_pid(p, last_cpupid);
+ if (!priv && !(flags & TNF_NO_GROUP))
+ task_numa_group(p, last_cpupid, flags, &priv);
}
/*
task_numa_placement(p);
- p->numa_faults_buffer[node] += pages;
+ /* Retry task to preferred node migration if it previously failed */
+ if (p->numa_migrate_retry && time_after(jiffies, p->numa_migrate_retry))
+ numa_migrate_preferred(p);
+
+ if (migrated)
+ p->numa_pages_migrated += pages;
+
+ p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
}
static void reset_ptenuma_scan(struct task_struct *p)
vma = mm->mmap;
}
for (; vma; vma = vma->vm_next) {
- if (!vma_migratable(vma))
+ if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
continue;
- /* Skip small VMAs. They are not likely to be of relevance */
- if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
+ /*
+ * Shared library pages mapped by multiple processes are not
+ * migrated as it is expected they are cache replicated. Avoid
+ * hinting faults in read-only file-backed mappings or the vdso
+ * as migrating the pages will be of marginal benefit.
+ */
+ if (!vma->vm_mm ||
+ (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
continue;
do {
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
}
+
+static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{
+}
#endif /* CONFIG_NUMA_BALANCING */
static void
if (!parent_entity(se))
update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
#ifdef CONFIG_SMP
- if (entity_is_task(se))
- list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
+ if (entity_is_task(se)) {
+ struct rq *rq = rq_of(cfs_rq);
+
+ account_numa_enqueue(rq, task_of(se));
+ list_add(&se->group_node, &rq->cfs_tasks);
+ }
#endif
cfs_rq->nr_running++;
}
update_load_sub(&cfs_rq->load, se->load.weight);
if (!parent_entity(se))
update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
- if (entity_is_task(se))
+ if (entity_is_task(se)) {
+ account_numa_dequeue(rq_of(cfs_rq), task_of(se));
list_del_init(&se->group_node);
+ }
cfs_rq->nr_running--;
}
{
struct sched_entity *se = tg->se[cpu];
- if (!tg->parent) /* the trivial, non-cgroup case */
+ if (!tg->parent || !wl) /* the trivial, non-cgroup case */
return wl;
for_each_sched_entity(se) {
}
#else
-static inline unsigned long effective_load(struct task_group *tg, int cpu,
- unsigned long wl, unsigned long wg)
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
{
return wl;
}
* preempt must be disabled.
*/
static int
-select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
+select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
{
struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
int cpu = smp_processor_id();
- int prev_cpu = task_cpu(p);
int new_cpu = cpu;
int want_affine = 0;
int sync = wake_flags & WF_SYNC;
static unsigned long __read_mostly max_load_balance_interval = HZ/10;
+enum fbq_type { regular, remote, all };
+
#define LBF_ALL_PINNED 0x01
#define LBF_NEED_BREAK 0x02
#define LBF_DST_PINNED 0x04
unsigned int loop;
unsigned int loop_break;
unsigned int loop_max;
+
+ enum fbq_type fbq_type;
};
/*
set_task_cpu(p, env->dst_cpu);
activate_task(env->dst_rq, p, 0);
check_preempt_curr(env->dst_rq, p, 0);
+#ifdef CONFIG_NUMA_BALANCING
+ if (p->numa_preferred_nid != -1) {
+ int src_nid = cpu_to_node(env->src_cpu);
+ int dst_nid = cpu_to_node(env->dst_cpu);
+
+ /*
+ * If the load balancer has moved the task then limit
+ * migrations from taking place in the short term in
+ * case this is a short-lived migration.
+ */
+ if (src_nid != dst_nid && dst_nid != p->numa_preferred_nid)
+ p->numa_migrate_seq = 0;
+ }
+#endif
}
/*
src_nid = cpu_to_node(env->src_cpu);
dst_nid = cpu_to_node(env->dst_cpu);
- if (src_nid == dst_nid ||
- p->numa_migrate_seq >= sysctl_numa_balancing_settle_count)
+ if (src_nid == dst_nid)
return false;
- if (dst_nid == p->numa_preferred_nid ||
- p->numa_faults[dst_nid] > p->numa_faults[src_nid])
+ /* Always encourage migration to the preferred node. */
+ if (dst_nid == p->numa_preferred_nid)
+ return true;
+
+ /* If both task and group weight improve, this move is a winner. */
+ if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&
+ group_weight(p, dst_nid) > group_weight(p, src_nid))
return true;
return false;
src_nid = cpu_to_node(env->src_cpu);
dst_nid = cpu_to_node(env->dst_cpu);
- if (src_nid == dst_nid ||
- p->numa_migrate_seq >= sysctl_numa_balancing_settle_count)
+ if (src_nid == dst_nid)
return false;
- if (p->numa_faults[dst_nid] < p->numa_faults[src_nid])
+ /* Migrating away from the preferred node is always bad. */
+ if (src_nid == p->numa_preferred_nid)
+ return true;
+
+ /* If either task or group weight get worse, don't do it. */
+ if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
+ group_weight(p, dst_nid) < group_weight(p, src_nid))
return true;
return false;
return 0;
}
-static unsigned long task_h_load(struct task_struct *p);
-
static const unsigned int sched_nr_migrate_break = 32;
/*
unsigned int group_weight;
int group_imb; /* Is there an imbalance in the group ? */
int group_has_capacity; /* Is there extra capacity in the group? */
+#ifdef CONFIG_NUMA_BALANCING
+ unsigned int nr_numa_running;
+ unsigned int nr_preferred_running;
+#endif
};
/*
sgs->group_load += load;
sgs->sum_nr_running += nr_running;
+#ifdef CONFIG_NUMA_BALANCING
+ sgs->nr_numa_running += rq->nr_numa_running;
+ sgs->nr_preferred_running += rq->nr_preferred_running;
+#endif
sgs->sum_weighted_load += weighted_cpuload(i);
if (idle_cpu(i))
sgs->idle_cpus++;
return false;
}
+#ifdef CONFIG_NUMA_BALANCING
+static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
+{
+ if (sgs->sum_nr_running > sgs->nr_numa_running)
+ return regular;
+ if (sgs->sum_nr_running > sgs->nr_preferred_running)
+ return remote;
+ return all;
+}
+
+static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+{
+ if (rq->nr_running > rq->nr_numa_running)
+ return regular;
+ if (rq->nr_running > rq->nr_preferred_running)
+ return remote;
+ return all;
+}
+#else
+static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
+{
+ return all;
+}
+
+static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+{
+ return regular;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @env: The load balancing environment.
* @balance: Should we balance.
* @sds: variable to hold the statistics for this sched_domain.
*/
-static inline void update_sd_lb_stats(struct lb_env *env,
- struct sd_lb_stats *sds)
+static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
sg = sg->next;
} while (sg != env->sd->groups);
+
+ if (env->sd->flags & SD_NUMA)
+ env->fbq_type = fbq_classify_group(&sds->busiest_stat);
}
/**
int i;
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
- unsigned long power = power_of(i);
- unsigned long capacity = DIV_ROUND_CLOSEST(power,
- SCHED_POWER_SCALE);
- unsigned long wl;
+ unsigned long power, capacity, wl;
+ enum fbq_type rt;
+
+ rq = cpu_rq(i);
+ rt = fbq_classify_rq(rq);
+ /*
+ * We classify groups/runqueues into three groups:
+ * - regular: there are !numa tasks
+ * - remote: there are numa tasks that run on the 'wrong' node
+ * - all: there is no distinction
+ *
+ * In order to avoid migrating ideally placed numa tasks,
+ * ignore those when there's better options.
+ *
+ * If we ignore the actual busiest queue to migrate another
+ * task, the next balance pass can still reduce the busiest
+ * queue by moving tasks around inside the node.
+ *
+ * If we cannot move enough load due to this classification
+ * the next pass will adjust the group classification and
+ * allow migration of more tasks.
+ *
+ * Both cases only affect the total convergence complexity.
+ */
+ if (rt > env->fbq_type)
+ continue;
+
+ power = power_of(i);
+ capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
if (!capacity)
capacity = fix_small_capacity(env->sd, group);
- rq = cpu_rq(i);
wl = weighted_cpuload(i);
/*
.idle = idle,
.loop_break = sched_nr_migrate_break,
.cpus = cpus,
+ .fbq_type = all,
};
/*