config NUMA
bool "Non Uniform Memory Access (NUMA) Support"
depends on MMU && SYS_SUPPORTS_NUMA && EXPERIMENTAL
+ select ARCH_WANT_NUMA_VARIABLE_LOCALITY
default n
help
Some SH systems have many various memories scattered around
def_bool y
select HAVE_AOUT if X86_32
select HAVE_UNSTABLE_SCHED_CLOCK
+ select ARCH_SUPPORTS_NUMA_BALANCING
+ select ARCH_WANTS_PROT_NUMA_PROT_NONE
select HAVE_IDE
select HAVE_OPROFILE
select HAVE_PCSPKR_PLATFORM
#ifdef CONFIG_CPUMASK_OFFSTACK
struct cpumask cpumask_allocation;
#endif
+#ifdef CONFIG_NUMA_BALANCING
+ /*
+ * numa_next_scan is the next time when the PTEs will me marked
+ * pte_numa to gather statistics and migrate pages to new nodes
+ * if necessary
+ */
+ unsigned long numa_next_scan;
+
+ /* numa_scan_seq prevents two threads setting pte_numa */
+ int numa_scan_seq;
+#endif
struct uprobes_state uprobes_state;
};
short il_next;
short pref_node_fork;
#endif
+#ifdef CONFIG_NUMA_BALANCING
+ int numa_scan_seq;
+ int numa_migrate_seq;
+ unsigned int numa_scan_period;
+ u64 node_stamp; /* migration stamp */
+ struct callback_head numa_work;
+#endif /* CONFIG_NUMA_BALANCING */
+
struct rcu_head rcu;
/*
/* Future-safe accessor for struct task_struct's cpus_allowed. */
#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
+#ifdef CONFIG_NUMA_BALANCING
+extern void task_numa_fault(int node, int pages);
+#else
+static inline void task_numa_fault(int node, int pages)
+{
+}
+#endif
+
/*
* Priority of a process goes from 0..MAX_PRIO-1, valid RT
* priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
};
extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
+extern unsigned int sysctl_numa_balancing_scan_period_min;
+extern unsigned int sysctl_numa_balancing_scan_period_max;
+extern unsigned int sysctl_numa_balancing_settle_count;
+
#ifdef CONFIG_SCHED_DEBUG
extern unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_nr_migrate;
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif
+
+#ifdef CONFIG_NUMA_BALANCING
+ if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
+ p->mm->numa_next_scan = jiffies;
+ p->mm->numa_scan_seq = 0;
+ }
+
+ p->node_stamp = 0ULL;
+ p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
+ p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
+ p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+ p->numa_work.next = &p->numa_work;
+#endif /* CONFIG_NUMA_BALANCING */
}
/*
#include <linux/slab.h>
#include <linux/profile.h>
#include <linux/interrupt.h>
+#include <linux/mempolicy.h>
+#include <linux/task_work.h>
#include <trace/events/sched.h>
* Scheduling class queueing methods:
*/
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * numa task sample period in ms: 5s
+ */
+unsigned int sysctl_numa_balancing_scan_period_min = 5000;
+unsigned int sysctl_numa_balancing_scan_period_max = 5000*16;
+
+static void task_numa_placement(struct task_struct *p)
+{
+ int seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+
+ if (p->numa_scan_seq == seq)
+ return;
+ p->numa_scan_seq = seq;
+
+ /* FIXME: Scheduling placement policy hints go here */
+}
+
+/*
+ * Got a PROT_NONE fault for a page on @node.
+ */
+void task_numa_fault(int node, int pages)
+{
+ struct task_struct *p = current;
+
+ /* FIXME: Allocate task-specific structure for placement policy here */
+
+ task_numa_placement(p);
+}
+
+/*
+ * The expensive part of numa migration is done from task_work context.
+ * Triggered from task_tick_numa().
+ */
+void task_numa_work(struct callback_head *work)
+{
+ unsigned long migrate, next_scan, now = jiffies;
+ struct task_struct *p = current;
+ struct mm_struct *mm = p->mm;
+
+ WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
+
+ work->next = work; /* protect against double add */
+ /*
+ * Who cares about NUMA placement when they're dying.
+ *
+ * NOTE: make sure not to dereference p->mm before this check,
+ * exit_task_work() happens _after_ exit_mm() so we could be called
+ * without p->mm even though we still had it when we enqueued this
+ * work.
+ */
+ if (p->flags & PF_EXITING)
+ return;
+
+ /*
+ * Enforce maximal scan/migration frequency..
+ */
+ migrate = mm->numa_next_scan;
+ if (time_before(now, migrate))
+ return;
+
+ if (p->numa_scan_period == 0)
+ p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+
+ next_scan = now + 2*msecs_to_jiffies(p->numa_scan_period);
+ if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
+ return;
+
+ ACCESS_ONCE(mm->numa_scan_seq)++;
+ {
+ struct vm_area_struct *vma;
+
+ down_read(&mm->mmap_sem);
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (!vma_migratable(vma))
+ continue;
+ change_prot_numa(vma, vma->vm_start, vma->vm_end);
+ }
+ up_read(&mm->mmap_sem);
+ }
+}
+
+/*
+ * Drive the periodic memory faults..
+ */
+void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+ struct callback_head *work = &curr->numa_work;
+ u64 period, now;
+
+ /*
+ * We don't care about NUMA placement if we don't have memory.
+ */
+ if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
+ return;
+
+ /*
+ * Using runtime rather than walltime has the dual advantage that
+ * we (mostly) drive the selection from busy threads and that the
+ * task needs to have done some actual work before we bother with
+ * NUMA placement.
+ */
+ now = curr->se.sum_exec_runtime;
+ period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
+
+ if (now - curr->node_stamp > period) {
+ curr->node_stamp = now;
+
+ if (!time_before(jiffies, curr->mm->numa_next_scan)) {
+ init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
+ task_work_add(curr, work, true);
+ }
+ }
+}
+#else
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
cfs_rq = cfs_rq_of(se);
entity_tick(cfs_rq, se, queued);
}
+
+ if (sched_feat_numa(NUMA))
+ task_tick_numa(rq, curr);
}
/*
SCHED_FEAT(FORCE_SD_OVERLAP, false)
SCHED_FEAT(RT_RUNTIME_SHARE, true)
SCHED_FEAT(LB_MIN, false)
+
+/*
+ * Apply the automatic NUMA scheduling policy
+ */
+#ifdef CONFIG_NUMA_BALANCING
+SCHED_FEAT(NUMA, true)
+#endif
#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
+#ifdef CONFIG_NUMA_BALANCING
+#define sched_feat_numa(x) sched_feat(x)
+#else
+#define sched_feat_numa(x) (0)
+#endif
+
static inline u64 global_rt_period(void)
{
return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
static int min_wakeup_granularity_ns; /* 0 usecs */
static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
+#ifdef CONFIG_SMP
static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
-#endif
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_SCHED_DEBUG */
#ifdef CONFIG_COMPACTION
static int min_extfrag_threshold;
.extra1 = &min_wakeup_granularity_ns,
.extra2 = &max_wakeup_granularity_ns,
},
+#ifdef CONFIG_SMP
{
.procname = "sched_tunable_scaling",
.data = &sysctl_sched_tunable_scaling,
.extra1 = &zero,
.extra2 = &one,
},
-#endif
+#endif /* CONFIG_SMP */
+#ifdef CONFIG_NUMA_BALANCING
+ {
+ .procname = "numa_balancing_scan_period_min_ms",
+ .data = &sysctl_numa_balancing_scan_period_min,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "numa_balancing_scan_period_max_ms",
+ .data = &sysctl_numa_balancing_scan_period_max,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif /* CONFIG_NUMA_BALANCING */
+#endif /* CONFIG_SCHED_DEBUG */
{
.procname = "sched_rt_period_us",
.data = &sysctl_sched_rt_period,
*/
split_huge_page(page);
put_page(page);
+
return 0;
clear_pmdnuma:
out_unlock:
spin_unlock(&mm->page_table_lock);
- if (page)
+ if (page) {
put_page(page);
+ task_numa_fault(numa_node_id(), HPAGE_PMD_NR);
+ }
return 0;
}
{
struct page *page = NULL;
spinlock_t *ptl;
- int current_nid, target_nid;
+ int current_nid = -1;
+ int target_nid;
/*
* The "pte" at this point cannot be used safely without
current_nid = target_nid;
out:
+ task_numa_fault(current_nid, 1);
return 0;
}
for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
pte_t pteval = *pte;
struct page *page;
+ int curr_nid;
if (!pte_present(pteval))
continue;
if (!pte_numa(pteval))
page = vm_normal_page(vma, addr, pteval);
if (unlikely(!page))
continue;
+ /* only check non-shared pages */
+ if (unlikely(page_mapcount(page) != 1))
+ continue;
+ pte_unmap_unlock(pte, ptl);
+
+ curr_nid = page_to_nid(page);
+ task_numa_fault(curr_nid, 1);
+
+ pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
}
pte_unmap_unlock(orig_pte, ptl);