mm: Only flush TLBs if a transhuge PMD is modified for NUMA pte scanning
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d94f7de..de8d5cf 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -422,7 +422,7 @@ static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
        unsigned long msecs;
        int err;
 
-       err = strict_strtoul(buf, 10, &msecs);
+       err = kstrtoul(buf, 10, &msecs);
        if (err || msecs > UINT_MAX)
                return -EINVAL;
 
@@ -449,7 +449,7 @@ static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
        unsigned long msecs;
        int err;
 
-       err = strict_strtoul(buf, 10, &msecs);
+       err = kstrtoul(buf, 10, &msecs);
        if (err || msecs > UINT_MAX)
                return -EINVAL;
 
@@ -475,7 +475,7 @@ static ssize_t pages_to_scan_store(struct kobject *kobj,
        int err;
        unsigned long pages;
 
-       err = strict_strtoul(buf, 10, &pages);
+       err = kstrtoul(buf, 10, &pages);
        if (err || !pages || pages > UINT_MAX)
                return -EINVAL;
 
@@ -543,7 +543,7 @@ static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
        int err;
        unsigned long max_ptes_none;
 
-       err = strict_strtoul(buf, 10, &max_ptes_none);
+       err = kstrtoul(buf, 10, &max_ptes_none);
        if (err || max_ptes_none > HPAGE_PMD_NR-1)
                return -EINVAL;
 
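The four hunks above are a mechanical conversion from the deprecated strict_strtoul() to kstrtoul(); both return 0 on success or a negative errno and write the parsed value through the result pointer. A minimal sketch of the resulting handler pattern (the variable and function names here are hypothetical, not part of the patch):

/*
 * Illustrative sysfs store handler following the converted pattern above.
 * example_sleep_millisecs and the function name are hypothetical.
 */
static unsigned long example_sleep_millisecs;

static ssize_t example_millisecs_store(struct kobject *kobj,
				       struct kobj_attribute *attr,
				       const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	/* kstrtoul() returns 0 on success or a negative errno on failure */
	err = kstrtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
		return -EINVAL;

	example_sleep_millisecs = msecs;
	return count;
}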
@@ -695,11 +695,10 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
        return pmd;
 }
 
-static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma)
+static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
 {
        pmd_t entry;
-       entry = mk_pmd(page, vma->vm_page_prot);
-       entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+       entry = mk_pmd(page, prot);
        entry = pmd_mkhuge(entry);
        return entry;
 }
@@ -732,7 +731,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
                pte_free(mm, pgtable);
        } else {
                pmd_t entry;
-               entry = mk_huge_pmd(page, vma);
+               entry = mk_huge_pmd(page, vma->vm_page_prot);
+               entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                page_add_new_anon_rmap(page, vma, haddr);
                pgtable_trans_huge_deposit(mm, pmd, pgtable);
                set_pmd_at(mm, haddr, pmd, entry);
@@ -788,77 +788,57 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 {
        struct page *page;
        unsigned long haddr = address & HPAGE_PMD_MASK;
-       pte_t *pte;
 
-       if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) {
-               if (unlikely(anon_vma_prepare(vma)))
-                       return VM_FAULT_OOM;
-               if (unlikely(khugepaged_enter(vma)))
+       if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
+               return VM_FAULT_FALLBACK;
+       if (unlikely(anon_vma_prepare(vma)))
+               return VM_FAULT_OOM;
+       if (unlikely(khugepaged_enter(vma)))
+               return VM_FAULT_OOM;
+       if (!(flags & FAULT_FLAG_WRITE) &&
+                       transparent_hugepage_use_zero_page()) {
+               pgtable_t pgtable;
+               struct page *zero_page;
+               bool set;
+               pgtable = pte_alloc_one(mm, haddr);
+               if (unlikely(!pgtable))
                        return VM_FAULT_OOM;
-               if (!(flags & FAULT_FLAG_WRITE) &&
-                               transparent_hugepage_use_zero_page()) {
-                       pgtable_t pgtable;
-                       struct page *zero_page;
-                       bool set;
-                       pgtable = pte_alloc_one(mm, haddr);
-                       if (unlikely(!pgtable))
-                               return VM_FAULT_OOM;
-                       zero_page = get_huge_zero_page();
-                       if (unlikely(!zero_page)) {
-                               pte_free(mm, pgtable);
-                               count_vm_event(THP_FAULT_FALLBACK);
-                               goto out;
-                       }
-                       spin_lock(&mm->page_table_lock);
-                       set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
-                                       zero_page);
-                       spin_unlock(&mm->page_table_lock);
-                       if (!set) {
-                               pte_free(mm, pgtable);
-                               put_huge_zero_page();
-                       }
-                       return 0;
-               }
-               page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
-                                         vma, haddr, numa_node_id(), 0);
-               if (unlikely(!page)) {
+               zero_page = get_huge_zero_page();
+               if (unlikely(!zero_page)) {
+                       pte_free(mm, pgtable);
                        count_vm_event(THP_FAULT_FALLBACK);
-                       goto out;
-               }
-               count_vm_event(THP_FAULT_ALLOC);
-               if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
-                       put_page(page);
-                       goto out;
+                       return VM_FAULT_FALLBACK;
                }
-               if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd,
-                                                         page))) {
-                       mem_cgroup_uncharge_page(page);
-                       put_page(page);
-                       goto out;
+               spin_lock(&mm->page_table_lock);
+               set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
+                               zero_page);
+               spin_unlock(&mm->page_table_lock);
+               if (!set) {
+                       pte_free(mm, pgtable);
+                       put_huge_zero_page();
                }
-
                return 0;
        }
-out:
-       /*
-        * Use __pte_alloc instead of pte_alloc_map, because we can't
-        * run pte_offset_map on the pmd, if an huge pmd could
-        * materialize from under us from a different thread.
-        */
-       if (unlikely(pmd_none(*pmd)) &&
-           unlikely(__pte_alloc(mm, vma, pmd, address)))
-               return VM_FAULT_OOM;
-       /* if an huge pmd materialized from under us just retry later */
-       if (unlikely(pmd_trans_huge(*pmd)))
-               return 0;
-       /*
-        * A regular pmd is established and it can't morph into a huge pmd
-        * from under us anymore at this point because we hold the mmap_sem
-        * read mode and khugepaged takes it in write mode. So now it's
-        * safe to run pte_offset_map().
-        */
-       pte = pte_offset_map(pmd, address);
-       return handle_pte_fault(mm, vma, address, pte, pmd, flags);
+       page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
+                       vma, haddr, numa_node_id(), 0);
+       if (unlikely(!page)) {
+               count_vm_event(THP_FAULT_FALLBACK);
+               return VM_FAULT_FALLBACK;
+       }
+       if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
+               put_page(page);
+               count_vm_event(THP_FAULT_FALLBACK);
+               return VM_FAULT_FALLBACK;
+       }
+       if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) {
+               mem_cgroup_uncharge_page(page);
+               put_page(page);
+               count_vm_event(THP_FAULT_FALLBACK);
+               return VM_FAULT_FALLBACK;
+       }
+
+       count_vm_event(THP_FAULT_ALLOC);
+       return 0;
 }
 
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
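Note that the rewritten do_huge_pmd_anonymous_page() no longer drops back to handle_pte_fault() itself: it returns VM_FAULT_FALLBACK and leaves the retry with regular pages to the generic fault path. A simplified, hypothetical sketch of the caller-side pattern this relies on (loosely modelled on the fault handling in mm/memory.c; the function name is illustrative):

/*
 * Simplified, hypothetical caller: react to VM_FAULT_FALLBACK by handling
 * the fault with regular ptes instead of a huge PMD.  Loosely modelled on
 * the generic fault path; not part of this patch.
 */
static int example_anon_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			      unsigned long address, pmd_t *pmd,
			      unsigned int flags)
{
	pte_t *pte;

	if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
		int ret = do_huge_pmd_anonymous_page(mm, vma, address,
						     pmd, flags);
		if (!(ret & VM_FAULT_FALLBACK))
			return ret;
		/* fall through and retry with a regular page table */
	}

	/*
	 * Use __pte_alloc() instead of pte_alloc_map(): a huge pmd could
	 * still materialise from under us from a different thread.
	 */
	if (unlikely(pmd_none(*pmd)) &&
	    unlikely(__pte_alloc(mm, vma, pmd, address)))
		return VM_FAULT_OOM;
	/* if a huge pmd materialised from under us, just retry later */
	if (unlikely(pmd_trans_huge(*pmd)))
		return 0;
	pte = pte_offset_map(pmd, address);
	return handle_pte_fault(mm, vma, address, pte, pmd, flags);
}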
@@ -1170,7 +1150,6 @@ alloc:
                new_page = NULL;
 
        if (unlikely(!new_page)) {
-               count_vm_event(THP_FAULT_FALLBACK);
                if (is_huge_zero_pmd(orig_pmd)) {
                        ret = do_huge_pmd_wp_zero_page_fallback(mm, vma,
                                        address, pmd, orig_pmd, haddr);
@@ -1181,9 +1160,9 @@ alloc:
                                split_huge_page(page);
                        put_page(page);
                }
+               count_vm_event(THP_FAULT_FALLBACK);
                goto out;
        }
-       count_vm_event(THP_FAULT_ALLOC);
 
        if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
                put_page(new_page);
@@ -1191,10 +1170,13 @@ alloc:
                        split_huge_page(page);
                        put_page(page);
                }
+               count_vm_event(THP_FAULT_FALLBACK);
                ret |= VM_FAULT_OOM;
                goto out;
        }
 
+       count_vm_event(THP_FAULT_ALLOC);
+
        if (is_huge_zero_pmd(orig_pmd))
                clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
        else
@@ -1215,7 +1197,8 @@ alloc:
                goto out_mn;
        } else {
                pmd_t entry;
-               entry = mk_huge_pmd(new_page, vma);
+               entry = mk_huge_pmd(new_page, vma->vm_page_prot);
+               entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                pmdp_clear_flush(vma, haddr, pmd);
                page_add_new_anon_rmap(new_page, vma, haddr);
                set_pmd_at(mm, haddr, pmd, entry);
@@ -1295,64 +1278,90 @@ out:
 int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
                                unsigned long addr, pmd_t pmd, pmd_t *pmdp)
 {
+       struct anon_vma *anon_vma = NULL;
        struct page *page;
        unsigned long haddr = addr & HPAGE_PMD_MASK;
+       int page_nid = -1, this_nid = numa_node_id();
        int target_nid;
-       int current_nid = -1;
-       bool migrated;
+       bool page_locked;
+       bool migrated = false;
 
        spin_lock(&mm->page_table_lock);
        if (unlikely(!pmd_same(pmd, *pmdp)))
                goto out_unlock;
 
        page = pmd_page(pmd);
-       get_page(page);
-       current_nid = page_to_nid(page);
+       page_nid = page_to_nid(page);
        count_vm_numa_event(NUMA_HINT_FAULTS);
-       if (current_nid == numa_node_id())
+       if (page_nid == this_nid)
                count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
 
+       /*
+        * Acquire the page lock to serialise THP migrations but avoid dropping
+        * page_table_lock if at all possible
+        */
+       page_locked = trylock_page(page);
        target_nid = mpol_misplaced(page, vma, haddr);
        if (target_nid == -1) {
-               put_page(page);
-               goto clear_pmdnuma;
+               /* If the page was locked, there are no parallel migrations */
+               if (page_locked)
+                       goto clear_pmdnuma;
+
+               /*
+                * Otherwise wait for potential migrations and retry. We do
+                * relock and check_same as the page may no longer be mapped.
+                * As the fault is being retried, do not account for it.
+                */
+               spin_unlock(&mm->page_table_lock);
+               wait_on_page_locked(page);
+               page_nid = -1;
+               goto out;
        }
 
-       /* Acquire the page lock to serialise THP migrations */
+       /* Page is misplaced, serialise migrations and parallel THP splits */
+       get_page(page);
        spin_unlock(&mm->page_table_lock);
-       lock_page(page);
+       if (!page_locked)
+               lock_page(page);
+       anon_vma = page_lock_anon_vma_read(page);
 
-       /* Confirm the PTE did not while locked */
+       /* Confirm the PMD did not change while page_table_lock was released */
        spin_lock(&mm->page_table_lock);
        if (unlikely(!pmd_same(pmd, *pmdp))) {
                unlock_page(page);
                put_page(page);
+               page_nid = -1;
                goto out_unlock;
        }
-       spin_unlock(&mm->page_table_lock);
 
-       /* Migrate the THP to the requested node */
+       /*
+        * Migrate the THP to the requested node, returns with page unlocked
+        * and pmd_numa cleared.
+        */
+       spin_unlock(&mm->page_table_lock);
        migrated = migrate_misplaced_transhuge_page(mm, vma,
                                pmdp, pmd, addr, page, target_nid);
-       if (!migrated)
-               goto check_same;
+       if (migrated)
+               page_nid = target_nid;
 
-       task_numa_fault(target_nid, HPAGE_PMD_NR, true);
-       return 0;
-
-check_same:
-       spin_lock(&mm->page_table_lock);
-       if (unlikely(!pmd_same(pmd, *pmdp)))
-               goto out_unlock;
+       goto out;
 clear_pmdnuma:
+       BUG_ON(!PageLocked(page));
        pmd = pmd_mknonnuma(pmd);
        set_pmd_at(mm, haddr, pmdp, pmd);
        VM_BUG_ON(pmd_numa(*pmdp));
        update_mmu_cache_pmd(vma, addr, pmdp);
+       unlock_page(page);
 out_unlock:
        spin_unlock(&mm->page_table_lock);
-       if (current_nid != -1)
-               task_numa_fault(current_nid, HPAGE_PMD_NR, false);
+
+out:
+       if (anon_vma)
+               page_unlock_anon_vma_read(anon_vma);
+
+       if (page_nid != -1)
+               task_numa_fault(page_nid, HPAGE_PMD_NR, migrated);
+
        return 0;
 }
 
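The reworked do_huge_pmd_numa_page() serialises against parallel THP migration through the page lock: a migration holds the page lock, so a successful trylock_page() under page_table_lock proves no migration is in flight, while a failed trylock on a correctly placed page means the fault should simply wait and be retried. The same idea expressed as a small, hypothetical helper (illustration only, not from the patch):

/*
 * Illustrative helper, not part of the patch: wait out a possible THP
 * migration of @page without holding the page table lock.  Migration
 * holds the page lock, so a successful trylock_page() proves no
 * migration is in flight.
 */
static bool example_wait_for_thp_migration(struct page *page, spinlock_t *ptl)
{
	if (trylock_page(page))
		return true;		/* no migration; page locked, ptl still held */

	spin_unlock(ptl);
	wait_on_page_locked(page);	/* sleep until the migration unlocks it */
	/* caller must retry the fault: the PMD may have changed meanwhile */
	return false;
}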
@@ -1449,6 +1458,12 @@ out:
        return ret;
 }
 
+/*
+ * Returns
+ *  - 0 if PMD could not be locked
+ *  - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
+ *  - HPAGE_PMD_NR if protections changed and TLB flush necessary
+ */
 int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long addr, pgprot_t newprot, int prot_numa)
 {
@@ -1457,9 +1472,11 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 
        if (__pmd_trans_huge_lock(pmd, vma) == 1) {
                pmd_t entry;
-               entry = pmdp_get_and_clear(mm, addr, pmd);
+               ret = 1;
                if (!prot_numa) {
+                       entry = pmdp_get_and_clear(mm, addr, pmd);
                        entry = pmd_modify(entry, newprot);
+                       ret = HPAGE_PMD_NR;
                        BUG_ON(pmd_write(entry));
                } else {
                        struct page *page = pmd_page(*pmd);
@@ -1467,12 +1484,17 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                        /* only check non-shared pages */
                        if (page_mapcount(page) == 1 &&
                            !pmd_numa(*pmd)) {
+                               entry = pmdp_get_and_clear(mm, addr, pmd);
                                entry = pmd_mknuma(entry);
+                               ret = HPAGE_PMD_NR;
                        }
                }
-               set_pmd_at(mm, addr, pmd, entry);
+
+               /* Set PMD if cleared earlier */
+               if (ret == HPAGE_PMD_NR)
+                       set_pmd_at(mm, addr, pmd, entry);
+
                spin_unlock(&vma->vm_mm->page_table_lock);
-               ret = 1;
        }
 
        return ret;
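This tri-state return value is what enables the TLB-flush avoidance named in the subject line: only HPAGE_PMD_NR indicates the PMD was actually cleared and rewritten, so only that case requires a flush. A hedged sketch of the caller-side accounting, loosely modelled on the pmd-range walk in mm/mprotect.c (names simplified and hypothetical):

/*
 * Illustrative only: how a range walk can use change_huge_pmd()'s return
 * value to skip TLB flushes when NUMA pte scanning changed nothing.
 * Loosely modelled on change_pmd_range(); names are hypothetical.
 */
static unsigned long example_change_pmd(struct vm_area_struct *vma, pmd_t *pmd,
					unsigned long addr, pgprot_t newprot,
					int prot_numa)
{
	unsigned long pages = 0;

	if (pmd_trans_huge(*pmd)) {
		int nr_ptes = change_huge_pmd(vma, pmd, addr, newprot,
					      prot_numa);
		if (nr_ptes) {
			/* HPAGE_PMD_NR: PMD rewritten, account and flush */
			if (nr_ptes == HPAGE_PMD_NR)
				pages += HPAGE_PMD_NR;
			/* 1: huge PMD locked but untouched, nothing to flush */
			return pages;
		}
		/* 0: could not lock as a huge PMD, fall through to ptes */
	}

	/* ... pte-level protection changes would be counted here ... */

	return pages;
}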
@@ -1666,7 +1688,6 @@ static void __split_huge_page_refcount(struct page *page,
        BUG_ON(atomic_read(&page->_count) <= 0);
 
        __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
-       __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
 
        ClearPageCompound(page);
        compound_unlock(page);
@@ -2301,6 +2322,8 @@ static void collapse_huge_page(struct mm_struct *mm,
                goto out;
 
        vma = find_vma(mm, address);
+       if (!vma)
+               goto out;
        hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
        hend = vma->vm_end & HPAGE_PMD_MASK;
        if (address < hstart || address + HPAGE_PMD_SIZE > hend)
@@ -2362,7 +2385,8 @@ static void collapse_huge_page(struct mm_struct *mm,
        __SetPageUptodate(new_page);
        pgtable = pmd_pgtable(_pmd);
 
-       _pmd = mk_huge_pmd(new_page, vma);
+       _pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
+       _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
 
        /*
         * spin_lock() below is not the equivalent of smp_wmb(), so