mm: numa: Migrate on reference policy

[platform/adaptation/renesas_rcar/renesas_kernel.git] / mm / mempolicy.c
diff --git a/mm/mempolicy.c b/mm/mempolicy.c

index 51d3ebd..4c1c8d8 100644 (file)
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -118,6 +118,26 @@ static struct mempolicy default_policy = {
         .flags = MPOL_F_LOCAL,
  };
  
+static struct mempolicy preferred_node_policy[MAX_NUMNODES];
+
+/* Policy of @p, else the preferred policy of the numa_node_id() node. */
+static struct mempolicy *get_task_policy(struct task_struct *p)
+{
+       struct mempolicy *pol = p->mempolicy;
+       int node;
+
+       if (pol)
+               return pol;
+
+       node = numa_node_id();
+       if (node == -1)         /* no fallback entry; pol is NULL, do not touch it */
+               return NULL;
+
+       /* preferred_node_policy is not initialised early in boot */
+       pol = &preferred_node_policy[node];
+       return pol->mode ? pol : NULL;
+}
+
  static const struct mempolicy_operations {
         int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
         /*
@@ -252,7 +272,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
         pr_debug("setting mode %d flags %d nodes[0] %lx\n",
                  mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
  
-       if (mode == MPOL_DEFAULT || mode == MPOL_NOOP) {
+       if (mode == MPOL_DEFAULT) {
                 if (nodes && !nodes_empty(*nodes))
                         return ERR_PTR(-EINVAL);
                 return NULL;
@@ -568,134 +588,25 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
  
  #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
  /*
- * Here we search for not shared page mappings (mapcount == 1) and we
- * set up the pmd/pte_numa on those mappings so the very next access
- * will fire a NUMA hinting page fault.
+ * This is used to mark a range of virtual addresses to be inaccessible.
+ * These are later cleared by a NUMA hinting fault. Depending on these
+ * faults, pages may be migrated for better NUMA placement.
+ *
+ * This is assuming that NUMA faults are handled using PROT_NONE. If
+ * an architecture makes a different choice, it will need further
+ * changes to the core.
   */
-static int
-change_prot_numa_range(struct mm_struct *mm, struct vm_area_struct *vma,
-                       unsigned long address)
-{
-       pgd_t *pgd;
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t *pte, *_pte;
-       struct page *page;
-       unsigned long _address, end;
-       spinlock_t *ptl;
-       int ret = 0;
-
-       VM_BUG_ON(address & ~PAGE_MASK);
-
-       pgd = pgd_offset(mm, address);
-       if (!pgd_present(*pgd))
-               goto out;
-
-       pud = pud_offset(pgd, address);
-       if (!pud_present(*pud))
-               goto out;
-
-       pmd = pmd_offset(pud, address);
-       if (pmd_none(*pmd))
-               goto out;
-
-       if (pmd_trans_huge_lock(pmd, vma) == 1) {
-               int page_nid;
-               ret = HPAGE_PMD_NR;
-
-               VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-
-               if (pmd_numa(*pmd)) {
-                       spin_unlock(&mm->page_table_lock);
-                       goto out;
-               }
-
-               page = pmd_page(*pmd);
-
-               /* only check non-shared pages */
-               if (page_mapcount(page) != 1) {
-                       spin_unlock(&mm->page_table_lock);
-                       goto out;
-               }
-
-               page_nid = page_to_nid(page);
-
-               if (pmd_numa(*pmd)) {
-                       spin_unlock(&mm->page_table_lock);
-                       goto out;
-               }
-
-               set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd));
-               ret += HPAGE_PMD_NR;
-               /* defer TLB flush to lower the overhead */
-               spin_unlock(&mm->page_table_lock);
-               goto out;
-       }
-
-       if (pmd_trans_unstable(pmd))
-               goto out;
-       VM_BUG_ON(!pmd_present(*pmd));
-
-       end = min(vma->vm_end, (address + PMD_SIZE) & PMD_MASK);
-       pte = pte_offset_map_lock(mm, pmd, address, &ptl);
-       for (_address = address, _pte = pte; _address < end;
-            _pte++, _address += PAGE_SIZE) {
-               pte_t pteval = *_pte;
-               if (!pte_present(pteval))
-                       continue;
-               if (pte_numa(pteval))
-                       continue;
-               page = vm_normal_page(vma, _address, pteval);
-               if (unlikely(!page))
-                       continue;
-               /* only check non-shared pages */
-               if (page_mapcount(page) != 1)
-                       continue;
-
-               set_pte_at(mm, _address, _pte, pte_mknuma(pteval));
-
-               /* defer TLB flush to lower the overhead */
-               ret++;
-       }
-       pte_unmap_unlock(pte, ptl);
-
-       if (ret && !pmd_numa(*pmd)) {
-               spin_lock(&mm->page_table_lock);
-               set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd));
-               spin_unlock(&mm->page_table_lock);
-               /* defer TLB flush to lower the overhead */
-       }
-
-out:
-       return ret;
-}
-
-/* Assumes mmap_sem is held */
-void
-change_prot_numa(struct vm_area_struct *vma,
-                       unsigned long address, unsigned long end)
+unsigned long change_prot_numa(struct vm_area_struct *vma,
+                       unsigned long addr, unsigned long end)
  {
-       struct mm_struct *mm = vma->vm_mm;
-       int progress = 0;
-
-       while (address < end) {
-               VM_BUG_ON(address < vma->vm_start ||
-                         address + PAGE_SIZE > vma->vm_end);
+       int nr_updated;         /* value handed back by change_protection() */
+       BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);     /* the PROT_NONE assumption stated above */
  
-               progress += change_prot_numa_range(mm, vma, address);
-               address = (address + PMD_SIZE) & PMD_MASK;
-       }
+       nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
+       if (nr_updated)         /* only account when something was updated */
+               count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
  
-       /*
-        * Flush the TLB for the mm to start the NUMA hinting
-        * page faults after we finish scanning this vma part
-        * if there were any PTE updates
-        */
-       if (progress) {
-               mmu_notifier_invalidate_range_start(vma->vm_mm, address, end);
-               flush_tlb_range(vma, address, end);
-               mmu_notifier_invalidate_range_end(vma->vm_mm, address, end);
-       }
+       return nr_updated;      /* NOTE(review): held in an int, returned as unsigned long - confirm change_protection()'s return type */
  }
  #else
  static unsigned long change_prot_numa(struct vm_area_struct *vma,
@@ -1297,7 +1208,7 @@ static long do_mbind(unsigned long start, unsigned long len,
         if (start & ~PAGE_MASK)
                 return -EINVAL;
  
-       if (mode == MPOL_DEFAULT || mode == MPOL_NOOP)
+       if (mode == MPOL_DEFAULT)
                 flags &= ~MPOL_MF_STRICT;
  
         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
@@ -1352,7 +1263,7 @@ static long do_mbind(unsigned long start, unsigned long len,
                           flags | MPOL_MF_INVERT, &pagelist);
  
         err = PTR_ERR(vma);     /* maybe ... */
-       if (!IS_ERR(vma) && mode != MPOL_NOOP)
+       if (!IS_ERR(vma))
                 err = mbind_range(mm, start, end, new);
  
         if (!err) {
@@ -1707,7 +1618,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
  struct mempolicy *get_vma_policy(struct task_struct *task,
                 struct vm_area_struct *vma, unsigned long addr)
  {
-       struct mempolicy *pol = task->mempolicy;
+       struct mempolicy *pol = get_task_policy(task);
  
         if (vma) {
                 if (vma->vm_ops && vma->vm_ops->get_policy) {
@@ -2130,7 +2041,7 @@ retry_cpuset:
   */
  struct page *alloc_pages_current(gfp_t gfp, unsigned order)
  {
-       struct mempolicy *pol = current->mempolicy;
+       struct mempolicy *pol = get_task_policy(current);
         struct page *page;
         unsigned int cpuset_mems_cookie;
  
@@ -2404,6 +2315,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
         default:
                 BUG();
         }
+
+       /* Migrate the page towards the node whose CPU is referencing it */
+       if (pol->flags & MPOL_F_MORON)
+               polnid = numa_node_id();
+
         if (curnid != polnid)
                 ret = polnid;
  out:
@@ -2592,6 +2508,15 @@ void __init numa_policy_init(void)
                                      sizeof(struct sp_node),
                                      0, SLAB_PANIC, NULL);
  
+       for_each_node(nid) {
+               preferred_node_policy[nid] = (struct mempolicy) {
+                       .refcnt = ATOMIC_INIT(1),
+                       .mode = MPOL_PREFERRED,
+                       .flags = MPOL_F_MOF | MPOL_F_MORON,
+                       .v = { .preferred_node = nid, },
+               };
+       }
+
         /*
          * Set interleaving policy for system init. Interleaving is only
          * enabled across suitably sized nodes (default is >= 16MB), or
@@ -2641,7 +2566,6 @@ static const char * const policy_modes[] =
         [MPOL_BIND]       = "bind",
         [MPOL_INTERLEAVE] = "interleave",
         [MPOL_LOCAL]      = "local",
-       [MPOL_NOOP]       = "noop",     /* should not actually be used */
  };
  
  
@@ -2692,7 +2616,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
                         break;
                 }
         }
-       if (mode >= MPOL_MAX || mode == MPOL_NOOP)
+       if (mode >= MPOL_MAX)
                 goto out;
  
         switch (mode) {