feature is too high then the rate the kernel samples for NUMA hinting
faults may be controlled by the numa_balancing_scan_period_min_ms,
numa_balancing_scan_delay_ms, numa_balancing_scan_period_reset,
-numa_balancing_scan_period_max_ms and numa_balancing_scan_size_mb sysctls.
+numa_balancing_scan_period_max_ms, numa_balancing_scan_size_mb and
+numa_balancing_settle_count sysctls.
==============================================================
numa_balancing_scan_period_reset is a blunt instrument that controls how
often a tasks scan delay is reset to detect sudden changes in task behaviour.
+numa_balancing_settle_count is how many scan periods must complete before
+the schedule balancer stops pushing the task towards a preferred node. This
+gives the scheduler a chance to place the task on an alternative node if the
+preferred node is overloaded.
+
==============================================================
osrelease, ostype & version:
return max(smin, smax);
}
+/*
+ * Once a preferred node is selected the scheduler balancer will prefer moving
+ * a task to that node for sysctl_numa_balancing_settle_count number of PTE
+ * scans. This will give the process the chance to accumulate more faults on
+ * the preferred node but still allow the scheduler to move the task again if
+ * the nodes CPUs are overloaded.
+ */
+unsigned int sysctl_numa_balancing_settle_count __read_mostly = 3;
+
static void task_numa_placement(struct task_struct *p)
{
int seq, nid, max_nid = -1;
if (p->numa_scan_seq == seq)
return;
p->numa_scan_seq = seq;
+ p->numa_migrate_seq++;
p->numa_scan_period_max = task_scan_max(p);
/* Find the node with the highest number of faults */
}
/* Update the tasks preferred node if necessary */
- if (max_faults && max_nid != p->numa_preferred_nid)
+ if (max_faults && max_nid != p->numa_preferred_nid) {
p->numa_preferred_nid = max_nid;
+ p->numa_migrate_seq = 0;
+ }
}
/*
return delta < (s64)sysctl_sched_migration_cost;
}
+#ifdef CONFIG_NUMA_BALANCING
+/* Returns true if the destination node has incurred more faults */
+static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
+{
+ int src_nid, dst_nid;
+
+ if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
+ !(env->sd->flags & SD_NUMA)) {
+ return false;
+ }
+
+ src_nid = cpu_to_node(env->src_cpu);
+ dst_nid = cpu_to_node(env->dst_cpu);
+
+ if (src_nid == dst_nid ||
+ p->numa_migrate_seq >= sysctl_numa_balancing_settle_count)
+ return false;
+
+ if (dst_nid == p->numa_preferred_nid ||
+ p->numa_faults[dst_nid] > p->numa_faults[src_nid])
+ return true;
+
+ return false;
+}
+#else
+static inline bool migrate_improves_locality(struct task_struct *p,
+ struct lb_env *env)
+{
+ return false;
+}
+#endif
+
/*
* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
*/
/*
* Aggressive migration if:
- * 1) task is cache cold, or
- * 2) too many balance attempts have failed.
+ * 1) destination numa is preferred
+ * 2) task is cache cold, or
+ * 3) too many balance attempts have failed.
*/
-
tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
+
+ if (migrate_improves_locality(p, env)) {
+#ifdef CONFIG_SCHEDSTATS
+ if (tsk_cache_hot) {
+ schedstat_inc(env->sd, lb_hot_gained[env->idle]);
+ schedstat_inc(p, se.statistics.nr_forced_migrations);
+ }
+#endif
+ return 1;
+ }
+
if (!tsk_cache_hot ||
env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "numa_balancing_settle_count",
+ .data = &sysctl_numa_balancing_settle_count,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_SCHED_DEBUG */
{