* the preferred node but still allow the scheduler to move the task again if
* the nodes CPUs are overloaded.
*/
-unsigned int sysctl_numa_balancing_settle_count __read_mostly = 3;
+unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
+
+static inline int task_faults_idx(int nid, int priv)
+{
+ return 2 * nid + priv;
+}
+
+static inline unsigned long task_faults(struct task_struct *p, int nid)
+{
+ if (!p->numa_faults)
+ return 0;
+
+ return p->numa_faults[task_faults_idx(nid, 0)] +
+ p->numa_faults[task_faults_idx(nid, 1)];
+}
static unsigned long weighted_cpuload(const int cpu);
int seq, nid, max_nid = -1;
unsigned long max_faults = 0;
- if (!p->mm) /* for example, ksmd faulting in a user's mm */
- return;
seq = ACCESS_ONCE(p->mm->numa_scan_seq);
if (p->numa_scan_seq == seq)
return;
/* Find the node with the highest number of faults */
for_each_online_node(nid) {
unsigned long faults;
+ int priv, i;
+
+ for (priv = 0; priv < 2; priv++) {
+ i = task_faults_idx(nid, priv);
- /* Decay existing window and copy faults since last scan */
- p->numa_faults[nid] >>= 1;
- p->numa_faults[nid] += p->numa_faults_buffer[nid];
- p->numa_faults_buffer[nid] = 0;
+ /* Decay existing window, copy faults since last scan */
+ p->numa_faults[i] >>= 1;
+ p->numa_faults[i] += p->numa_faults_buffer[i];
+ p->numa_faults_buffer[i] = 0;
+ }
- faults = p->numa_faults[nid];
+ /* Find maximum private faults */
+ faults = p->numa_faults[task_faults_idx(nid, 1)];
if (faults > max_faults) {
max_faults = faults;
max_nid = nid;
/* Update the preferred nid and migrate task if possible */
p->numa_preferred_nid = max_nid;
- p->numa_migrate_seq = 0;
+ p->numa_migrate_seq = 1;
migrate_task_to(p, preferred_cpu);
}
}
/*
* Got a PROT_NONE fault for a page on @node.
*/
-void task_numa_fault(int node, int pages, bool migrated)
+void task_numa_fault(int last_nidpid, int node, int pages, bool migrated)
{
struct task_struct *p = current;
+ int priv;
if (!numabalancing_enabled)
return;
+ /* for example, ksmd faulting in a user's mm */
+ if (!p->mm)
+ return;
+
+ /*
+ * First accesses are treated as private, otherwise consider accesses
+ * to be private if the accessing pid has not changed
+ */
+ if (!nidpid_pid_unset(last_nidpid))
+ priv = ((p->pid & LAST__PID_MASK) == nidpid_to_pid(last_nidpid));
+ else
+ priv = 1;
+
/* Allocate buffer to track faults on a per-node basis */
if (unlikely(!p->numa_faults)) {
- int size = sizeof(*p->numa_faults) * nr_node_ids;
+ int size = sizeof(*p->numa_faults) * 2 * nr_node_ids;
/* numa_faults and numa_faults_buffer share the allocation */
p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN);
return;
BUG_ON(p->numa_faults_buffer);
- p->numa_faults_buffer = p->numa_faults + nr_node_ids;
+ p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
}
/*
task_numa_placement(p);
- p->numa_faults_buffer[node] += pages;
+ p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
}
static void reset_ptenuma_scan(struct task_struct *p)
vma = mm->mmap;
}
for (; vma; vma = vma->vm_next) {
- if (!vma_migratable(vma))
- continue;
-
- /* Skip small VMAs. They are not likely to be of relevance */
- if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
+ if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
continue;
do {
set_task_cpu(p, env->dst_cpu);
activate_task(env->dst_rq, p, 0);
check_preempt_curr(env->dst_rq, p, 0);
+#ifdef CONFIG_NUMA_BALANCING
+ if (p->numa_preferred_nid != -1) {
+ int src_nid = cpu_to_node(env->src_cpu);
+ int dst_nid = cpu_to_node(env->dst_cpu);
+
+ /*
+ * If the load balancer has moved the task then limit
+ * migrations from taking place in the short term in
+ * case this is a short-lived migration.
+ */
+ if (src_nid != dst_nid && dst_nid != p->numa_preferred_nid)
+ p->numa_migrate_seq = 0;
+ }
+#endif
}
/*
return false;
if (dst_nid == p->numa_preferred_nid ||
- p->numa_faults[dst_nid] > p->numa_faults[src_nid])
+ task_faults(p, dst_nid) > task_faults(p, src_nid))
return true;
return false;
p->numa_migrate_seq >= sysctl_numa_balancing_settle_count)
return false;
- if (p->numa_faults[dst_nid] < p->numa_faults[src_nid])
+ if (task_faults(p, dst_nid) < task_faults(p, src_nid))
return true;
return false;