#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
+#include <linux/mmu_notifier.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
.flags = MPOL_F_LOCAL,
};
+static struct mempolicy preferred_node_policy[MAX_NUMNODES];
+
+static struct mempolicy *get_task_policy(struct task_struct *p)
+{
+ struct mempolicy *pol = p->mempolicy;
+ int node;
+
+ if (!pol) {
+ node = numa_node_id();
+ if (node != -1)
+ pol = &preferred_node_policy[node];
+
+ /* preferred_node_policy is not initialised early in boot */
+ if (!pol->mode)
+ pol = NULL;
+ }
+
+ return pol;
+}
+
static const struct mempolicy_operations {
int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
/*
if (mode == MPOL_DEFAULT) {
if (nodes && !nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
- return NULL; /* simply delete any existing policy */
+ return NULL;
}
VM_BUG_ON(!nodes);
(flags & MPOL_F_RELATIVE_NODES)))
return ERR_PTR(-EINVAL);
}
+ } else if (mode == MPOL_LOCAL) {
+ if (!nodes_empty(*nodes))
+ return ERR_PTR(-EINVAL);
+ mode = MPOL_PREFERRED;
} else if (nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
return 0;
}
+#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
+/*
+ * This is used to mark a range of virtual addresses to be inaccessible.
+ * These are later cleared by a NUMA hinting fault. Depending on these
+ * faults, pages may be migrated for better NUMA placement.
+ *
+ * This is assuming that NUMA faults are handled using PROT_NONE. If
+ * an architecture makes a different choice, it will need further
+ * changes to the core.
+ */
+unsigned long change_prot_numa(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ int nr_updated;
+ BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
+
+ nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
+ if (nr_updated)
+ count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
+
+ return nr_updated;
+}
+#else
+static unsigned long change_prot_numa(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ return 0;
+}
+#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
+
/*
* Check if all pages in a range are on a set of nodes.
* If pagelist != NULL then isolate pages from the LRU and
return ERR_PTR(-EFAULT);
prev = NULL;
for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
+ unsigned long endvma = vma->vm_end;
+
+ if (endvma > end)
+ endvma = end;
+ if (vma->vm_start > start)
+ start = vma->vm_start;
+
if (!(flags & MPOL_MF_DISCONTIG_OK)) {
if (!vma->vm_next && vma->vm_end < end)
return ERR_PTR(-EFAULT);
if (prev && prev->vm_end < vma->vm_start)
return ERR_PTR(-EFAULT);
}
- if (!is_vm_hugetlb_page(vma) &&
- ((flags & MPOL_MF_STRICT) ||
+
+ if (is_vm_hugetlb_page(vma))
+ goto next;
+
+ if (flags & MPOL_MF_LAZY) {
+ change_prot_numa(vma, start, endvma);
+ goto next;
+ }
+
+ if ((flags & MPOL_MF_STRICT) ||
((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
- vma_migratable(vma)))) {
- unsigned long endvma = vma->vm_end;
+ vma_migratable(vma))) {
- if (endvma > end)
- endvma = end;
- if (vma->vm_start > start)
- start = vma->vm_start;
err = check_pgd_range(vma, start, endvma, nodes,
flags, private);
if (err) {
break;
}
}
+next:
prev = vma;
}
return first;
}
+/*
+ * Apply policy to a single VMA
+ * This must be called with the mmap_sem held for writing.
+ */
+static int vma_replace_policy(struct vm_area_struct *vma,
+ struct mempolicy *pol)
+{
+ int err;
+ struct mempolicy *old;
+ struct mempolicy *new;
+
+ pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
+ vma->vm_start, vma->vm_end, vma->vm_pgoff,
+ vma->vm_ops, vma->vm_file,
+ vma->vm_ops ? vma->vm_ops->set_policy : NULL);
+
+ new = mpol_dup(pol);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ if (vma->vm_ops && vma->vm_ops->set_policy) {
+ err = vma->vm_ops->set_policy(vma, new);
+ if (err)
+ goto err_out;
+ }
+
+ old = vma->vm_policy;
+ vma->vm_policy = new; /* protected by mmap_sem */
+ mpol_put(old);
+
+ return 0;
+ err_out:
+ mpol_put(new);
+ return err;
+}
+
/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct mm_struct *mm, unsigned long start,
unsigned long end, struct mempolicy *new_pol)
if (err)
goto out;
}
-
- /*
- * Apply policy to a single VMA. The reference counting of
- * policy for vma_policy linkages has already been handled by
- * vma_merge and split_vma as necessary. If this is a shared
- * policy then ->set_policy will increment the reference count
- * for an sp node.
- */
- pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
- vma->vm_start, vma->vm_end, vma->vm_pgoff,
- vma->vm_ops, vma->vm_file,
- vma->vm_ops ? vma->vm_ops->set_policy : NULL);
- if (vma->vm_ops && vma->vm_ops->set_policy) {
- err = vma->vm_ops->set_policy(vma, new_pol);
- if (err)
- goto out;
- }
+ err = vma_replace_policy(vma, new_pol);
+ if (err)
+ goto out;
}
out:
nodemask_t nmask;
LIST_HEAD(pagelist);
int err = 0;
- struct vm_area_struct *vma;
nodes_clear(nmask);
node_set(source, nmask);
- vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
+ /*
+ * This does not "check" the range but isolates all pages that
+ * need migration. Between passing in the full user address
+ * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
+ */
+ VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
+ check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
- if (IS_ERR(vma))
- return PTR_ERR(vma);
if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, new_node_page, dest,
- false, MIGRATE_SYNC);
+ false, MIGRATE_SYNC,
+ MR_SYSCALL);
if (err)
putback_lru_pages(&pagelist);
}
int err;
LIST_HEAD(pagelist);
- if (flags & ~(unsigned long)(MPOL_MF_STRICT |
- MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ if (flags & ~(unsigned long)MPOL_MF_VALID)
return -EINVAL;
if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
return -EPERM;
if (IS_ERR(new))
return PTR_ERR(new);
+ if (flags & MPOL_MF_LAZY)
+ new->flags |= MPOL_F_MOF;
+
/*
* If we are using the default policy then operation
* on discontinuous address spaces is okay after all
vma = check_range(mm, start, end, nmask,
flags | MPOL_MF_INVERT, &pagelist);
- err = PTR_ERR(vma);
- if (!IS_ERR(vma)) {
- int nr_failed = 0;
-
+ err = PTR_ERR(vma); /* maybe ... */
+ if (!IS_ERR(vma))
err = mbind_range(mm, start, end, new);
+ if (!err) {
+ int nr_failed = 0;
+
if (!list_empty(&pagelist)) {
+ WARN_ON_ONCE(flags & MPOL_MF_LAZY);
nr_failed = migrate_pages(&pagelist, new_vma_page,
(unsigned long)vma,
- false, MIGRATE_SYNC);
+ false, MIGRATE_SYNC,
+ MR_MEMPOLICY_MBIND);
if (nr_failed)
putback_lru_pages(&pagelist);
}
- if (!err && nr_failed && (flags & MPOL_MF_STRICT))
+ if (nr_failed && (flags & MPOL_MF_STRICT))
err = -EIO;
} else
putback_lru_pages(&pagelist);
*
* Returns effective policy for a VMA at specified address.
* Falls back to @task or system default policy, as necessary.
- * Current or other task's task mempolicy and non-shared vma policies
- * are protected by the task's mmap_sem, which must be held for read by
- * the caller.
+ * Current or other task's task mempolicy and non-shared vma policies must be
+ * protected by task_lock(task) by the caller.
* Shared policies [those marked as MPOL_F_SHARED] require an extra reference
* count--added by the get_policy() vm_op, as appropriate--to protect against
* freeing by another task. It is the caller's responsibility to free the
struct mempolicy *get_vma_policy(struct task_struct *task,
struct vm_area_struct *vma, unsigned long addr)
{
- struct mempolicy *pol = task->mempolicy;
+ struct mempolicy *pol = get_task_policy(task);
if (vma) {
if (vma->vm_ops && vma->vm_ops->get_policy) {
addr);
if (vpol)
pol = vpol;
- } else if (vma->vm_policy)
+ } else if (vma->vm_policy) {
pol = vma->vm_policy;
+
+ /*
+ * shmem_alloc_page() passes MPOL_F_SHARED policy with
+ * a pseudo vma whose vma->vm_ops=NULL. Take a reference
+ * count on these policies which will be dropped by
+ * mpol_cond_put() later
+ */
+ if (mpol_needs_cond_ref(pol))
+ mpol_get(pol);
+ }
}
if (!pol)
pol = &default_policy;
*/
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
- struct mempolicy *pol = current->mempolicy;
+ struct mempolicy *pol = get_task_policy(current);
struct page *page;
unsigned int cpuset_mems_cookie;
*/
/* lookup first element intersecting start-end */
-/* Caller holds sp->lock */
+/* Caller holds sp->mutex */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
if (!sp->root.rb_node)
return NULL;
- spin_lock(&sp->lock);
+ mutex_lock(&sp->mutex);
sn = sp_lookup(sp, idx, idx+1);
if (sn) {
mpol_get(sn->policy);
pol = sn->policy;
}
- spin_unlock(&sp->lock);
+ mutex_unlock(&sp->mutex);
return pol;
}
+static void sp_free(struct sp_node *n)
+{
+ mpol_put(n->policy);
+ kmem_cache_free(sn_cache, n);
+}
+
+/**
+ * mpol_misplaced - check whether current page node is valid in policy
+ *
+ * @page - page to be checked
+ * @vma - vm area where page mapped
+ * @addr - virtual address where page mapped
+ *
+ * Lookup current policy node id for vma,addr and "compare to" page's
+ * node id.
+ *
+ * Returns:
+ * -1 - not misplaced, page is in the right node
+ * node - node id where the page should be
+ *
+ * Policy determination "mimics" alloc_page_vma().
+ * Called from fault path where we know the vma and faulting address.
+ */
+int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
+{
+ struct mempolicy *pol;
+ struct zone *zone;
+ int curnid = page_to_nid(page);
+ unsigned long pgoff;
+ int polnid = -1;
+ int ret = -1;
+
+ BUG_ON(!vma);
+
+ pol = get_vma_policy(current, vma, addr);
+ if (!(pol->flags & MPOL_F_MOF))
+ goto out;
+
+ switch (pol->mode) {
+ case MPOL_INTERLEAVE:
+ BUG_ON(addr >= vma->vm_end);
+ BUG_ON(addr < vma->vm_start);
+
+ pgoff = vma->vm_pgoff;
+ pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
+ polnid = offset_il_node(pol, vma, pgoff);
+ break;
+
+ case MPOL_PREFERRED:
+ if (pol->flags & MPOL_F_LOCAL)
+ polnid = numa_node_id();
+ else
+ polnid = pol->v.preferred_node;
+ break;
+
+ case MPOL_BIND:
+ /*
+ * allows binding to multiple nodes.
+ * use current page if in policy nodemask,
+ * else select nearest allowed node, if any.
+ * If no allowed nodes, use current [!misplaced].
+ */
+ if (node_isset(curnid, pol->v.nodes))
+ goto out;
+ (void)first_zones_zonelist(
+ node_zonelist(numa_node_id(), GFP_HIGHUSER),
+ gfp_zone(GFP_HIGHUSER),
+ &pol->v.nodes, &zone);
+ polnid = zone->node;
+ break;
+
+ default:
+ BUG();
+ }
+
+ /* Migrate the page towards the node whose CPU is referencing it */
+ if (pol->flags & MPOL_F_MORON)
+ polnid = numa_node_id();
+
+ if (curnid != polnid)
+ ret = polnid;
+out:
+ mpol_cond_put(pol);
+
+ return ret;
+}
+
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
pr_debug("deleting %lx-l%lx\n", n->start, n->end);
rb_erase(&n->nd, &sp->root);
- mpol_put(n->policy);
- kmem_cache_free(sn_cache, n);
+ sp_free(n);
}
static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
struct mempolicy *pol)
{
- struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
+ struct sp_node *n;
+ struct mempolicy *newpol;
+ n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
if (!n)
return NULL;
+
+ newpol = mpol_dup(pol);
+ if (IS_ERR(newpol)) {
+ kmem_cache_free(sn_cache, n);
+ return NULL;
+ }
+ newpol->flags |= MPOL_F_SHARED;
+
n->start = start;
n->end = end;
- mpol_get(pol);
- pol->flags |= MPOL_F_SHARED; /* for unref */
- n->policy = pol;
+ n->policy = newpol;
+
return n;
}
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
unsigned long end, struct sp_node *new)
{
- struct sp_node *n, *new2 = NULL;
+ struct sp_node *n;
+ int ret = 0;
-restart:
- spin_lock(&sp->lock);
+ mutex_lock(&sp->mutex);
n = sp_lookup(sp, start, end);
/* Take care of old policies in the same range. */
while (n && n->start < end) {
} else {
/* Old policy spanning whole new range. */
if (n->end > end) {
+ struct sp_node *new2;
+ new2 = sp_alloc(end, n->end, n->policy);
if (!new2) {
- spin_unlock(&sp->lock);
- new2 = sp_alloc(end, n->end, n->policy);
- if (!new2)
- return -ENOMEM;
- goto restart;
+ ret = -ENOMEM;
+ goto out;
}
n->end = start;
sp_insert(sp, new2);
- new2 = NULL;
break;
} else
n->end = start;
}
if (new)
sp_insert(sp, new);
- spin_unlock(&sp->lock);
- if (new2) {
- mpol_put(new2->policy);
- kmem_cache_free(sn_cache, new2);
- }
- return 0;
+out:
+ mutex_unlock(&sp->mutex);
+ return ret;
}
/**
int ret;
sp->root = RB_ROOT; /* empty tree == default mempolicy */
- spin_lock_init(&sp->lock);
+ mutex_init(&sp->mutex);
if (mpol) {
struct vm_area_struct pvma;
}
err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
if (err && new)
- kmem_cache_free(sn_cache, new);
+ sp_free(new);
return err;
}
if (!p->root.rb_node)
return;
- spin_lock(&p->lock);
+ mutex_lock(&p->mutex);
next = rb_first(&p->root);
while (next) {
n = rb_entry(next, struct sp_node, nd);
next = rb_next(&n->nd);
- rb_erase(&n->nd, &p->root);
- mpol_put(n->policy);
- kmem_cache_free(sn_cache, n);
+ sp_delete(p, n);
}
- spin_unlock(&p->lock);
+ mutex_unlock(&p->mutex);
}
/* assumes fs == KERNEL_DS */
sizeof(struct sp_node),
0, SLAB_PANIC, NULL);
+ for_each_node(nid) {
+ preferred_node_policy[nid] = (struct mempolicy) {
+ .refcnt = ATOMIC_INIT(1),
+ .mode = MPOL_PREFERRED,
+ .flags = MPOL_F_MOF | MPOL_F_MORON,
+ .v = { .preferred_node = nid, },
+ };
+ }
+
/*
* Set interleaving policy for system init. Interleaving is only
* enabled across suitably sized nodes (default is >= 16MB), or
* "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag
* Used only for mpol_parse_str() and mpol_to_str()
*/
-#define MPOL_LOCAL MPOL_MAX
static const char * const policy_modes[] =
{
[MPOL_DEFAULT] = "default",
[MPOL_PREFERRED] = "prefer",
[MPOL_BIND] = "bind",
[MPOL_INTERLEAVE] = "interleave",
- [MPOL_LOCAL] = "local"
+ [MPOL_LOCAL] = "local",
};
if (flags)
*flags++ = '\0'; /* terminate mode string */
- for (mode = 0; mode <= MPOL_LOCAL; mode++) {
+ for (mode = 0; mode < MPOL_MAX; mode++) {
if (!strcmp(str, policy_modes[mode])) {
break;
}
}
- if (mode > MPOL_LOCAL)
+ if (mode >= MPOL_MAX)
goto out;
switch (mode) {