#ifdef CONFIG_NUMA_BALANCING
/*
- * numa task sample period in ms
+ * Approximate time to scan a full NUMA task in ms. The task scan period is
+ * calculated based on the tasks virtual memory size and
+ * numa_balancing_scan_size.
*/
-unsigned int sysctl_numa_balancing_scan_period_min = 100;
-unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
-unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
+unsigned int sysctl_numa_balancing_scan_period_min = 1000;
+unsigned int sysctl_numa_balancing_scan_period_max = 60000;
+unsigned int sysctl_numa_balancing_scan_period_reset = 60000;
/* Portion of address space to scan in MB */
unsigned int sysctl_numa_balancing_scan_size = 256;
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
+static unsigned int task_nr_scan_windows(struct task_struct *p)
+{
+ unsigned long rss = 0;
+ unsigned long nr_scan_pages;
+
+ /*
+ * Calculations based on RSS as non-present and empty pages are skipped
+ * by the PTE scanner and NUMA hinting faults should be trapped based
+ * on resident pages
+ */
+ nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
+ rss = get_mm_rss(p->mm);
+ if (!rss)
+ rss = nr_scan_pages;
+
+ rss = round_up(rss, nr_scan_pages);
+ return rss / nr_scan_pages;
+}
+
+/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
+#define MAX_SCAN_WINDOW 2560
+
+static unsigned int task_scan_min(struct task_struct *p)
+{
+ unsigned int scan, floor;
+ unsigned int windows = 1;
+
+ if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
+ windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
+ floor = 1000 / windows;
+
+ scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
+ return max_t(unsigned int, floor, scan);
+}
+
+static unsigned int task_scan_max(struct task_struct *p)
+{
+ unsigned int smin = task_scan_min(p);
+ unsigned int smax;
+
+ /* Watch for min being lower than max due to floor calculations */
+ smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
+ return max(smin, smax);
+}
+
static void task_numa_placement(struct task_struct *p)
{
int seq;
if (p->numa_scan_seq == seq)
return;
p->numa_scan_seq = seq;
+ p->numa_scan_period_max = task_scan_max(p);
/* FIXME: Scheduling placement policy hints go here */
}
* If pages are properly placed (did not migrate) then scan slower.
* This is reset periodically in case of phase changes
*/
- if (!migrated)
- p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
- p->numa_scan_period + jiffies_to_msecs(10));
+ if (!migrated) {
+ /* Initialise if necessary */
+ if (!p->numa_scan_period_max)
+ p->numa_scan_period_max = task_scan_max(p);
+
+ p->numa_scan_period = min(p->numa_scan_period_max,
+ p->numa_scan_period + 10);
+ }
task_numa_placement(p);
}
struct mm_struct *mm = p->mm;
struct vm_area_struct *vma;
unsigned long start, end;
+ unsigned long nr_pte_updates = 0;
long pages;
WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
*/
migrate = mm->numa_next_reset;
if (time_after(now, migrate)) {
- p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+ p->numa_scan_period = task_scan_min(p);
next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
xchg(&mm->numa_next_reset, next_scan);
}
if (time_before(now, migrate))
return;
- if (p->numa_scan_period == 0)
- p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+ if (p->numa_scan_period == 0) {
+ p->numa_scan_period_max = task_scan_max(p);
+ p->numa_scan_period = task_scan_min(p);
+ }
next_scan = now + msecs_to_jiffies(p->numa_scan_period);
if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
start = max(start, vma->vm_start);
end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
end = min(end, vma->vm_end);
- pages -= change_prot_numa(vma, start, end);
+ nr_pte_updates += change_prot_numa(vma, start, end);
+
+ /*
+ * Scan sysctl_numa_balancing_scan_size but ensure that
+ * at least one PTE is updated so that unused virtual
+ * address space is quickly skipped.
+ */
+ if (nr_pte_updates)
+ pages -= (end - start) >> PAGE_SHIFT;
start = end;
if (pages <= 0)
if (now - curr->node_stamp > period) {
if (!curr->node_stamp)
- curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+ curr->numa_scan_period = task_scan_min(curr);
curr->node_stamp += period;
if (!time_before(jiffies, curr->mm->numa_next_scan)) {